diff --git a/lib/sources/pulp_mhsa_fp16.c b/lib/sources/pulp_mhsa_fp16.c index e7b62e8a..45c0aacd 100644 --- a/lib/sources/pulp_mhsa_fp16.c +++ b/lib/sources/pulp_mhsa_fp16.c @@ -34,7 +34,7 @@ void pulp_mhsa_fp16_fw_cl(void *Mhsa_args) { fp16 *inputData = mhsa_args->input->data; // Input vector (Transposed, E x L) fp16 *outData = mhsa_args->output->data; // Output sequence (Transposed, E x L) fp16 *temp = mhsa_args->temp_buffer; // Support buffer used in the attention head loop - fp16 *softmax_buffer = mhsa_args->softmax_buffer->data; // Buffer containing the softmax results (necessary to save for backward pass) + fp16 *softmax_buffer = mhsa_args->softmax_buffer->data; // Buffer containing the softmax results (necessary to save for backward pass); TODO: Only save in its entirety for bw; otherwise just for the respective head part fp16 *maxes = mhsa_args->maxes; // Buffer containing the row-wise maxes in the softmax process fp16 *sums = mhsa_args->sums; // Buffer containing the row-wise exponential sums in the softmax process fp16 *q = mhsa_args->q->data; // Pointer to the first element of Q @@ -48,9 +48,9 @@ void pulp_mhsa_fp16_fw_cl(void *Mhsa_args) { int E = mhsa_args->input->W; // Input Sequence element size int F = mhsa_args->attention_map->W; // Hidden dimension of attention (N. Heads * Head dimension) -#ifdef DEBUG + #ifdef DEBUG printf("\n~~~~~~~~~~~~~~~FORWARD PASS~~~~~~~~~~~~~~~\n\nPrinting the parameters: L-%d, E-%d, F-%d", L, E, F); -#endif + #endif int H = F / n_heads; // Head dimension fp16 scaling = (fp16)(1 / sqrt(H)); // Scaling factor to avoid vanishing gradients @@ -76,8 +76,8 @@ void pulp_mhsa_fp16_fw_cl(void *Mhsa_args) { // T0_q struct transp_args_fp16 transp_args0_q; - int dim[] = {E, F}; - int tr_axes[] = {1, 0}; + int dim[2] = {E, F}; + int tr_axes[2] = {1, 0}; transp_args0_q.in_matrix = coeffDataWinQ; transp_args0_q.out_matrix = temp; @@ -87,21 +87,21 @@ void pulp_mhsa_fp16_fw_cl(void *Mhsa_args) { pi_cl_team_fork(NUM_CORES, transpose_fp16, &transp_args0_q); -#ifdef DEBUG + #ifdef DEBUG printf("\n\n\nT0_q result\n\ncoeffDataWinQ [^T]: %d %d\n", E, F); for (int j=0; j F ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -604,19 +614,20 @@ void pulp_mhsa_fp16_fw_cl(void *Mhsa_args) { matMul_args4.K = F; matMul_args4.M = L; matMul_args4.trans_B = 0; + matMul_args4.USE_BIASES = 0; -#ifndef OPTIMIZE + #ifndef OPTIMIZE pi_cl_team_fork(NUM_CORES, mm_fp16, &matMul_args4); -#else + #else struct mm_manager_args_fp16 man_args4; man_args4.mm_args = &matMul_args4; man_args4.layer_type = LAYER_LINEAR; man_args4.step_type = STEP_FW; man_args4.matmul_type = opt_matmul_type; //MATMUL_TYPE pi_cl_team_fork(NUM_CORES, mm_manager_fp16, &man_args4); -#endif + #endif -#ifdef DEBUG + #ifdef DEBUG printf("\n\n\nM4 result\n\ncoeffDataWout: %d %d\n", E, F); for (int j=0; jmatrix + i * tile_h + j * tile_w * M), (uint32_t) (IN_DATA), 2 * tile_dim, 2 * M, 2 * tile_h, PI_CL_DMA_DIR_EXT2LOC, cmd_load); + pi_cl_dma_cmd_2d((uint32_t) (args->in_matrix + i * tile_h + j * tile_w * M), (uint32_t) (IN_DATA), 2 * tile_dim, 2 * M, 2 * tile_h, PI_CL_DMA_DIR_EXT2LOC, cmd_load); pi_cl_dma_cmd_wait(cmd_load); pi_cl_team_fork(NUM_CORES, transpose_fp16, &args_l1); - pi_cl_dma_cmd_2d((uint32_t) (args->transp_matrix + j * tile_w + i * tile_h * N), (uint32_t) (OUT_DATA), 2 * tile_dim, 2 * N, 2 * tile_w, PI_CL_DMA_DIR_LOC2EXT, cmd_store); + pi_cl_dma_cmd_2d((uint32_t) (args->out_matrix + j * tile_w + i * tile_h * N), (uint32_t) (OUT_DATA), 2 * tile_dim, 2 * N, 2 * tile_w, PI_CL_DMA_DIR_LOC2EXT, cmd_store); 
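A minimal sketch, not part of the patch, of how the renamed transpose interface is driven after this change. It assumes the fp16 argument struct mirrors the fp32 `transp_args` fields used elsewhere in this diff (`dim`, `transposed_axes`, `n_dim`); the buffers and shapes below are placeholders:

    // Hypothetical helper: transpose a rows x cols fp16 matrix using the renamed
    // argument fields introduced by this patch (in_matrix / out_matrix plus an
    // explicit dim / transposed_axes description of the permutation).
    static void transpose_2d_fp16_sketch(fp16 *src, fp16 *dst, int rows, int cols) {
        struct transp_args_fp16 t_args;
        int dim[2]     = {rows, cols};  // input shape
        int tr_axes[2] = {1, 0};        // swap the two axes: a plain 2D transpose

        t_args.in_matrix       = src;   // was ".matrix" before this patch
        t_args.out_matrix      = dst;   // was ".transp_matrix" before this patch
        t_args.dim             = dim;   // field names assumed to match the fp32 struct
        t_args.transposed_axes = tr_axes;
        t_args.n_dim           = 2;

        // Run the parallel transpose kernel on the cluster cores, as the patch does.
        pi_cl_team_fork(NUM_CORES, transpose_fp16, &t_args);
    }

The same pattern (local `dim[]` / `tr_axes[]` arrays declared next to the argument struct) appears in every converted call site in this diff.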
pi_cl_dma_cmd_wait(cmd_store); } } @@ -2225,8 +2256,8 @@ void tiled_mhsa_fp16(void *Mhsa_args, void* Tiled_mhsa_matmul_args){ for (int i = 0; i < n_heads; i++) { // T1 struct transp_args_fp16 transp_args1; - transp_args1.matrix = kt + L * i * H; - transp_args1.transp_matrix = temp; + transp_args1.in_matrix = kt + L * i * H; + transp_args1.out_matrix = temp; transp_args1.N = H; transp_args1.M = L; @@ -2279,8 +2310,8 @@ void tiled_mhsa_fp16(void *Mhsa_args, void* Tiled_mhsa_matmul_args){ // row-wise max and sums, therefore it is necessary to transpose the current head buffer. // T2 struct transp_args_fp16 transp_args2; - transp_args2.matrix = softmax_buffer + i * L * L; - transp_args2.transp_matrix = temp; + transp_args2.in_matrix = softmax_buffer + i * L * L; + transp_args2.out_matrix = temp; transp_args2.N = L; transp_args2.M = L; @@ -2329,8 +2360,8 @@ void tiled_mhsa_fp16(void *Mhsa_args, void* Tiled_mhsa_matmul_args){ // Each head result has to be appended to the full attention map, to do so we require to store the current // softmax buffer data following the H x L convention, therefore we need to transpose the memory buffer again. struct transp_args_fp16 transp_args3; - transp_args3.matrix = softmax_buffer + i * L * L; - transp_args3.transp_matrix = temp; + transp_args3.in_matrix = softmax_buffer + i * L * L; + transp_args3.out_matrix = temp; transp_args3.N = L; transp_args3.M = L; @@ -2376,8 +2407,8 @@ void tiled_mhsa_fp16(void *Mhsa_args, void* Tiled_mhsa_matmul_args){ // T4 // The last transpose to original shape struct transp_args_fp16 transp_args4; - transp_args4.matrix = temp; - transp_args4.transp_matrix = outData; + transp_args4.in_matrix = temp; + transp_args4.out_matrix = outData; transp_args4.N = mhsa_args->input_bn->W; transp_args4.M = L; diff --git a/lib/sources/pulp_mhsa_fp32.c b/lib/sources/pulp_mhsa_fp32.c index 8bb70edf..b4147044 100644 --- a/lib/sources/pulp_mhsa_fp32.c +++ b/lib/sources/pulp_mhsa_fp32.c @@ -75,7 +75,7 @@ void pulp_mhsa_fp32_fw_cl(void *Mhsa_args) { // T0_q int dims[] = {E, F}; - int t_axes = {1, 0}; + int t_axes[] = {1, 0}; struct transp_args transp_args0_q; transp_args0_q.in_matrix = coeffDataWinQ; @@ -90,14 +90,14 @@ void pulp_mhsa_fp32_fw_cl(void *Mhsa_args) { printf("\n\n\nT0_q result\n\ncoeffDataWinQ [^T]: %d %d\n", E, F); for (int j=0; j L x L ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ // T1 - dims = {H, L}; + dims[0] = H; + dims[1] = L; struct transp_args transp_args1; @@ -351,14 +354,14 @@ void pulp_mhsa_fp32_fw_cl(void *Mhsa_args) { printf("\n\n\nHead %d - T1 result\n\nk: %d %d\n", i, H, L); for (int j=0; j H x L ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ // T6 - dims[] = {H, L}; + dims[0] = H; + dims[1] = L; struct transp_args transp_args6; @@ -1229,14 +1239,14 @@ void pulp_mhsa_fp32_bw_cl(void *Mhsa_args) { printf("\n\n\nHead %d - T6 result\n\nq: %d %d\n", i, H, L); for (int j=0; j E x L ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ // T8 - dims[] = {E, L}; + dims[0] = E; + dims[1] = L; struct transp_args transp_args8; @@ -1439,14 +1451,14 @@ void pulp_mhsa_fp32_bw_cl(void *Mhsa_args) { printf("\n\n\nT8 result\n\ninputData: %d %d\n", E, L); for (int j=0; jmatrix + i * tile_h + j * tile_w * M), (uint32_t) (IN_DATA), 4 * tile_dim, 4 * M, 4 * tile_h, PI_CL_DMA_DIR_EXT2LOC, cmd_load); + pi_cl_dma_cmd_2d((uint32_t) (args->in_matrix + i * tile_h + j * tile_w * M), (uint32_t) (IN_DATA), 4 * tile_dim, 4 * M, 4 * tile_h, PI_CL_DMA_DIR_EXT2LOC, cmd_load); pi_cl_dma_cmd_wait(cmd_load); pi_cl_team_fork(NUM_CORES, transpose, &args_l1); - pi_cl_dma_cmd_2d((uint32_t) 
(args->transp_matrix + j * tile_w + i * tile_h * N), (uint32_t) (OUT_DATA), 4 * tile_dim, 4 * N, 4 * tile_w, PI_CL_DMA_DIR_LOC2EXT, cmd_store); + pi_cl_dma_cmd_2d((uint32_t) (args->out_matrix + j * tile_w + i * tile_h * N), (uint32_t) (OUT_DATA), 4 * tile_dim, 4 * N, 4 * tile_w, PI_CL_DMA_DIR_LOC2EXT, cmd_store); pi_cl_dma_cmd_wait(cmd_store); } } @@ -2248,8 +2253,8 @@ void tiled_mhsa_fp32(void *Mhsa_args, void* Tiled_mhsa_matmul_args){ for (int i = 0; i < n_heads; i++) { // T1 struct transp_args transp_args1; - transp_args1.matrix = kt + L * i * H; - transp_args1.transp_matrix = temp; + transp_args1.in_matrix = kt + L * i * H; + transp_args1.out_matrix = temp; transp_args1.N = H; transp_args1.M = L; @@ -2303,8 +2308,8 @@ void tiled_mhsa_fp32(void *Mhsa_args, void* Tiled_mhsa_matmul_args){ // row-wise max and sums, therefore it is necessary to transpose the current head buffer. // T2 struct transp_args transp_args2; - transp_args2.matrix = softmax_buffer + i * L * L; - transp_args2.transp_matrix = temp; + transp_args2.in_matrix = softmax_buffer + i * L * L; + transp_args2.out_matrix = temp; transp_args2.N = L; transp_args2.M = L; @@ -2354,8 +2359,8 @@ void tiled_mhsa_fp32(void *Mhsa_args, void* Tiled_mhsa_matmul_args){ // Each head result has to be appended to the full attention map, to do so we require to store the current // softmax buffer data following the H x L convention, therefore we need to transpose the memory buffer again. struct transp_args transp_args3; - transp_args3.matrix = softmax_buffer + i * L * L; - transp_args3.transp_matrix = temp; + transp_args3.in_matrix = softmax_buffer + i * L * L; + transp_args3.out_matrix = temp; transp_args3.N = L; transp_args3.M = L; @@ -2401,8 +2406,8 @@ void tiled_mhsa_fp32(void *Mhsa_args, void* Tiled_mhsa_matmul_args){ // T4 // The last transpose to original shape struct transp_args transp_args4; - transp_args4.matrix = temp; - transp_args4.transp_matrix = outData; + transp_args4.in_matrix = temp; + transp_args4.out_matrix = outData; transp_args4.N = mhsa_args->input_bn->W; transp_args4.M = L; @@ -2460,8 +2465,8 @@ void pulp_mhsa_mobilebert_inference_fp32_bw_cl(void *Mhsa_args){ // T1 struct transp_args transp_args1; - transp_args1.matrix = attention_map; - transp_args1.transp_matrix = temp; + transp_args1.in_matrix = attention_map; + transp_args1.out_matrix = temp; transp_args1.N = F; transp_args1.M = L; @@ -2491,8 +2496,8 @@ void pulp_mhsa_mobilebert_inference_fp32_bw_cl(void *Mhsa_args){ // T2 struct transp_args transp_args2; - transp_args2.matrix = coeffDataWout; - transp_args2.transp_matrix = temp; + transp_args2.in_matrix = coeffDataWout; + transp_args2.out_matrix = temp; transp_args2.N = E; transp_args2.M = F; @@ -2546,8 +2551,8 @@ void pulp_mhsa_mobilebert_inference_fp32_bw_cl(void *Mhsa_args){ // T3 struct transp_args transp_args3; - transp_args3.matrix = v + i * L * H; - transp_args3.transp_matrix = temp; + transp_args3.in_matrix = v + i * L * H; + transp_args3.out_matrix = temp; transp_args3.N = H; transp_args3.M = L; @@ -2601,8 +2606,8 @@ void pulp_mhsa_mobilebert_inference_fp32_bw_cl(void *Mhsa_args){ for (int i = 0; i < n_heads; i++) { // T4 struct transp_args transp_args4; - transp_args4.matrix = softmax_buffer_diff + i * L * L; - transp_args4.transp_matrix = temp; + transp_args4.in_matrix = softmax_buffer_diff + i * L * L; + transp_args4.out_matrix = temp; transp_args4.N = L; transp_args4.M = L; @@ -2621,8 +2626,8 @@ void pulp_mhsa_mobilebert_inference_fp32_bw_cl(void *Mhsa_args){ // T5 struct transp_args transp_args5; - 
transp_args5.matrix = grad; - transp_args5.transp_matrix = temp; + transp_args5.in_matrix = grad; + transp_args5.out_matrix = temp; transp_args5.N = L; transp_args5.M = L; @@ -2645,8 +2650,8 @@ void pulp_mhsa_mobilebert_inference_fp32_bw_cl(void *Mhsa_args){ // T6 struct transp_args transp_args6; - transp_args6.matrix = q + i * L * H; - transp_args6.transp_matrix = temp; + transp_args6.in_matrix = q + i * L * H; + transp_args6.out_matrix = temp; transp_args6.N = H; transp_args6.M = L; @@ -2676,8 +2681,8 @@ void pulp_mhsa_mobilebert_inference_fp32_bw_cl(void *Mhsa_args){ // T7 struct transp_args transp_args7; - transp_args7.matrix = k_diff + i * L * H; - transp_args7.transp_matrix = temp; + transp_args7.in_matrix = k_diff + i * L * H; + transp_args7.out_matrix = temp; transp_args7.N = L; transp_args7.M = H; diff --git a/lib/sources/pulp_residual_fp16.c b/lib/sources/pulp_residual_fp16.c index ba0917b8..72d033c0 100644 --- a/lib/sources/pulp_residual_fp16.c +++ b/lib/sources/pulp_residual_fp16.c @@ -41,20 +41,14 @@ void pulp_residualconn_fp16_fw(void *SkipConn_args_fp16) { return; } - int dims[] = {out->dim}; - struct vect_sum_args_fp16 args_sum; + args_sum.op_1 = skip->data; args_sum.op_2 = lout->data; args_sum.dest = out->data; + args_sum.size = out->dim; - args_sum.op_1_dims = dims; - args_sum.op_2_dims = dims; - - args_sum.op_1_dims_len = 1; - args_sum.op_2_dims_len = 1; - - pi_cl_team_fork(NUM_CORES, array_broadcast_sum_fp16, &args_sum); + pi_cl_team_fork(NUM_CORES, vect_sum_fp16, &args_sum); } @@ -77,21 +71,14 @@ void pulp_sumnode_fp16_bw(void *SkipConn_args_fp16) { return; } - int dims[] = {skip->dim}; - struct vect_sum_args_fp16 args_sum; args_sum.op_1 = out->diff; args_sum.op_2 = skip->diff; args_sum.dest = skip->diff; + args_sum.size = skip->dim; - args_sum.op_1_dims = dims; - args_sum.op2_dims = dims; - - args_sum.op_1_dims_len = 1; - args_sum.op_2_dims_len = 1; - - pi_cl_team_fork(NUM_CORES, array_broadcast_sum_fp16, &args_sum); + pi_cl_team_fork(NUM_CORES, vect_sum_fp16, &args_sum); } } diff --git a/lib/sources/pulp_rnn_fp32.c b/lib/sources/pulp_rnn_fp32.c index 8852a715..a8b9ce43 100644 --- a/lib/sources/pulp_rnn_fp32.c +++ b/lib/sources/pulp_rnn_fp32.c @@ -244,13 +244,13 @@ void pulp_rnn_fp32_bw_cl(void *Rnn_args) { // Calculate gradient for State Weights // Transpose State - int dims[] = {N, M}; - int t_axes[] = {1, 0}; + dims[0] = N; + dims[1] = M; struct transp_args transp_args2; - transp_args2.matrix = hiddState; - transp_args2.transp_matrix = temp; + transp_args2.in_matrix = hiddState; + transp_args2.out_matrix = temp; transp_args2.dim = dims; transp_args2.transposed_axes = t_axes; transp_args2.n_dim = 2; @@ -301,7 +301,8 @@ void pulp_rnn_fp32_bw_cl(void *Rnn_args) { // Calculate the Gradient of the Input // Transpose Input Weights - dims = {K, M}; + dims[0] = K; + dims[1] = M; struct transp_args transp_args3; diff --git a/tests/test_matmul/utils/GM.py b/tests/test_matmul/utils/GM.py index dec5b153..33b952e6 100644 --- a/tests/test_matmul/utils/GM.py +++ b/tests/test_matmul/utils/GM.py @@ -127,12 +127,41 @@ B.transpose(0, 1) else: C = torch.mm(input=A, mat2=B, out=C) + elif (data_type == 'bf16'): + # Matrices to be multiplied + A = torch.Tensor(in_size, mid_size).to(torch.bfloat16) + if transp == '1': + B = torch.Tensor(out_size, mid_size).to(torch.bfloat16) + else: + B = torch.Tensor(mid_size, out_size).to(torch.bfloat16) + C = torch.Tensor(in_size, out_size).to(torch.bfloat16) + + A = torch.div(torch.randn(in_size, mid_size), divider).to(torch.bfloat16) + for i in 
range(A.shape[0]): + for j in range(A.shape[1]): + A[i][j] += (i+j+0.1)/divider + + if transp == '1': + B = torch.zeros(out_size, mid_size).to(torch.bfloat16) + else: + B = torch.zeros(mid_size, out_size).to(torch.bfloat16) + + for i in range(B.shape[0]): + for j in range(B.shape[1]): + B[i][j] = i*j+0.1 + + if transp == '1': + C = torch.mm(input=A, mat2=B.transpose(0, 1), out=C) + B.transpose(0, 1) + else: + C = torch.mm(input=A, mat2=B, out=C) else : # Error message print('Invalid data type selection!!') exit() - + if data_type == 'bf16': + data_type = 'fp16' # Print data and create data header file f = open('net_args.h', "w") diff --git a/tests/test_mhsa_fp16/Makefile b/tests/test_mhsa_fp16/Makefile index 0b4acd74..107907a1 100644 --- a/tests/test_mhsa_fp16/Makefile +++ b/tests/test_mhsa_fp16/Makefile @@ -1,10 +1,10 @@ APP = mhsa_fp16 # User settings -IN_H?=196 # Sequence Length -IN_W?=160 # Token Size -N_HEADS?=5 -ATT_DIM?=160 #Hidden dimension +IN_H?=20 # Sequence Length +IN_W?=40 # Token Size +N_HEADS?=2 +ATT_DIM?=40 #Hidden dimension IN_CH?=1 OUT_CH?=1 diff --git a/tests/test_mhsa_fp16/net.h b/tests/test_mhsa_fp16/net.h index 7e4fabca..2149e63d 100644 --- a/tests/test_mhsa_fp16/net.h +++ b/tests/test_mhsa_fp16/net.h @@ -34,8 +34,8 @@ #define Tker_l0 (Tin_l0*Tout_l0) // Tensor checksum definition -#define CHECK_TOLERANCE 0.001 -#define ERROR_TOLERANCE 0.001 +#define CHECK_TOLERANCE 0x00000021 +#define ERROR_TOLERANCE 0x00000001 // PULP DEFINES #define STACK_SIZE 4096 diff --git a/tests/test_mhsa_fp16/utils/GM.py b/tests/test_mhsa_fp16/utils/GM.py index 6dd68596..78b6a95d 100644 --- a/tests/test_mhsa_fp16/utils/GM.py +++ b/tests/test_mhsa_fp16/utils/GM.py @@ -1,402 +1,402 @@ -""" -Copyright (C) 2021-2022 ETH Zurich and University of Bologna -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
- -Authors: Francesco Conoscenti (francesco.conoscenti@studio.unibo.it), Alberto Dequino (alberto.dequino@unibo.it), - Calin Diaconu (calin.diaconu@studio.unibo.it) -""" - -import argparse -from copy import deepcopy - -import numpy as np # Matrix and vector computation package -import torch -import torch.nn as nn - -import dump_utils as dump -import mhsa - - -class MyNet(nn.Module): - # Define a simple network with a mhsa layer for testing - def __init__(self, in_w, n_heads, att_dim, bf16_format): - super().__init__() - self.mhsa = mhsa.MultiHeadedSelfAttention( - dim=in_w, num_heads=n_heads, att_dim=att_dim, bf16_format=bf16_format - ) - - def forward(self, x, tgt_len): - return self.mhsa(x=x, tgt_len=tgt_len) - - -def hook_fn1(_, __, o): - # Hook to write output gradients - f = open("mhsa-grads.h", "w") - - print("------------Output Grad------------") - for grad in o: - try: - output_grad = torch.transpose(grad, 0, 1) - f.write("#define G_OUTPUT_SIZE " + str(output_grad.numel()) + "\n") - print(output_grad) - - if current_step == "BACKWARD": - f.write( - "PI_L2 fp16 OUTPUT_GRAD[G_OUTPUT_SIZE] = {" - + dump.tensor_to_string(output_grad) - + "};\n" - ) - else: - f.write( - "PI_L2 fp16 OUTPUT_GRAD[G_OUTPUT_SIZE] = {" - + dump.tensor_to_string(output_grad) - + "};\n" - ) - except AttributeError: - print("None found for Gradient (output)") - - f.close() - - -def hook_fn2(_, __, o): - # Hook for writing output to file - cont = 0 - f = open("mhsa-output.h", "w") - - print("------------Output------------") - for grad in o: - try: - if cont == 0: - output_grad = grad - f.write("#define OUTPUT_SIZE " + str(output_grad.numel()) + "\n") - - if bf16_format == 0: - print(output_grad.half()) - f.write( - "PI_L2 fp16 OUTPUT[OUTPUT_SIZE] = {" - + dump.tensor_to_string(output_grad) - + "};\n" - ) - else: - print(output_grad.bfloat16()) - f.write( - "PI_L2 fp16 OUTPUT[OUTPUT_SIZE] = {" - + dump.tensor_to_string(output_grad) - + "};\n" - ) - - cont += 1 - except AttributeError: - print("None found for Output") - - f.close() - - -if __name__ == "__main__": - # ~~~~~~~~~~ INTRO ~~~~~~~~~~ - # Set the seed for reproducibility - np.random.seed(seed=1) # <----- Sneed - torch.manual_seed(0) - - # Visualize data with more precision - torch.set_printoptions(precision=10, sci_mode=False) - - # Set up parser - parser = argparse.ArgumentParser("MHSA Layer Test") - parser.add_argument("--in_width", type=int, default=8) # Token size - parser.add_argument("--in_height", type=int, default=4) # Sequence length - parser.add_argument("--ch_in", type=int, default=1) - parser.add_argument("--ch_out", type=int, default=1) - parser.add_argument("--n_heads", type=int, default=8) - parser.add_argument("--weight", type=float, default=0.1) - parser.add_argument("--att_dim", type=int, default=8) - parser.add_argument( - "--bf16_format", type=int, default=1 - ) # if == 1, data format if bfloat16, if 0 is float16 - # Possible steps: FORWARD, BACKWARD_GRAD, BACKWARD_ERROR - parser.add_argument("--step", type=str, default="FORWARD") - - args = parser.parse_args() - - # Read arguments - in_h = args.in_height - in_w = args.in_width - ch_in = args.ch_in - ch_out = args.ch_out - n_heads = args.n_heads - current_step = args.step - weight_init = args.weight - att_dim = args.att_dim - head_dim = int(att_dim / n_heads) - bf16_format = args.bf16_format - - # Write net step to file - f_step = open("step-check.h", "w") - f_step.write("#define " + str(current_step) + "\n") - f_step.close() - - # Write input/output weights to file - f = 
open("init-defines.h", "w") - - f.write("#define Tin_C_l1 " + str(ch_in) + "\n") - f.write("#define Tin_H_l1 " + str(in_h) + "\n") - f.write("#define Tin_W_l1 " + str(in_w) + "\n") - f.write("#define Tout_C_l1 " + str(ch_out) + "\n") - f.write("#define Tn_heads_l1 " + str(n_heads) + "\n") - f.write("#define Tatt_dim_l1 " + str(att_dim) + "\n") - f.write("#define Thead_dim_l1 " + str(head_dim) + "\n") - if current_step == "FORWARD": - f.write( - "#define Ttemp_max " + str(int(max(in_h * head_dim, in_h * in_h))) + "\n" - ) - else: - f.write( - "#define Ttemp_max " - + str( - int(max(in_h * att_dim, 3 * att_dim * in_w, in_h * in_h, in_h * in_w)) - ) - + "\n" - ) - - f.close() - - # Define network and add hook - if bf16_format == 0: - net = MyNet( - in_w=in_w, n_heads=n_heads, att_dim=att_dim, bf16_format=bf16_format - ).half() - elif bf16_format == 1: - net = MyNet( - in_w=in_w, n_heads=n_heads, att_dim=att_dim, bf16_format=bf16_format - ).bfloat16() - net.zero_grad() - - gradsRnn = net.mhsa.register_full_backward_hook(hook_fn1) - - # ~~~~~~~~~~ MANAGE INPUT ~~~~~~~~~~ - # Generate random input data - inp = torch.randn(ch_in, in_h, in_w) - - # Print input data to terminal - print("------------Input sequence------------") - print(inp) - - # Write transpose of input data to file - inp_copy = torch.transpose(inp, -1, -2) - - f = open("input-sequence.h", "w") - f.write("#define INPUT_SIZE " + str(inp.numel()) + "\n") - f.write( - "PI_L2 fp16 INPUT[INPUT_SIZE] = {" + dump.tensor_to_string(inp_copy) + "};\n" - ) - f.close() - - if bf16_format == 0: - inp = inp.half() - else: - inp = inp.bfloat16() - inp.requires_grad = True - - # ~~~~~~~~~~ MANAGE INPUT WEIGHTS ~~~~~~~~~~ - # Generate random input weights - in_wgt_init_tensor_q = torch.randn(att_dim, in_w) - in_wgt_init_tensor_k = torch.randn(att_dim, in_w) - in_wgt_init_tensor_v = torch.randn(att_dim, in_w) - - in_bias_init_tensor_q = torch.randn(att_dim) - in_bias_init_tensor_k = torch.randn(att_dim) - in_bias_init_tensor_v = torch.randn(att_dim) - - # Copy input weights to network - with torch.no_grad(): - if bf16_format == 0: - net.mhsa.proj_q.weight.data = deepcopy(in_wgt_init_tensor_q.half()) - net.mhsa.proj_k.weight.data = deepcopy(in_wgt_init_tensor_k.half()) - net.mhsa.proj_v.weight.data = deepcopy(in_wgt_init_tensor_v.half()) - - net.mhsa.proj_q.bias.data = deepcopy(in_bias_init_tensor_q.half()) - net.mhsa.proj_k.bias.data = deepcopy(in_bias_init_tensor_k.half()) - net.mhsa.proj_v.bias.data = deepcopy(in_bias_init_tensor_v.half()) - else: - net.mhsa.proj_q.weight.data = deepcopy(in_wgt_init_tensor_q.bfloat16()) - net.mhsa.proj_k.weight.data = deepcopy(in_wgt_init_tensor_k.bfloat16()) - net.mhsa.proj_v.weight.data = deepcopy(in_wgt_init_tensor_v.bfloat16()) - - net.mhsa.proj_q.bias.data = deepcopy(in_bias_init_tensor_q.bfloat16()) - net.mhsa.proj_k.bias.data = deepcopy(in_bias_init_tensor_k.bfloat16()) - net.mhsa.proj_v.bias.data = deepcopy(in_bias_init_tensor_v.bfloat16()) - - # Print input weights to terminal - print("Shape input weights:") - print(net.mhsa.proj_q.weight.shape) - print("Shape input biases:") - print(net.mhsa.proj_q.bias.shape) - print("q:") - print(net.mhsa.proj_q.weight.data) - print("k:") - print(net.mhsa.proj_k.weight.data) - print("v:") - print(net.mhsa.proj_v.weight.data) - print("\n") - - # Write input weights to init file - f = open("init-defines.h", "a") - f.write("\n\n// Input Projections Weight Initialization\n") - f.write("#define INPUT_WGT_SIZE (" + str(in_wgt_init_tensor_q.numel()) + ")\n") - f.write( - 
"PI_L2 fp16 INPUT_WEIGHTS_Q[INPUT_WGT_SIZE] = {" - + dump.tensor_to_string(in_wgt_init_tensor_q.transpose(0, 1)) - + "};\n" - ) - f.write( - "PI_L2 fp16 INPUT_WEIGHTS_K[INPUT_WGT_SIZE] = {" - + dump.tensor_to_string(in_wgt_init_tensor_k.transpose(0, 1)) - + "};\n" - ) - f.write( - "PI_L2 fp16 INPUT_WEIGHTS_V[INPUT_WGT_SIZE] = {" - + dump.tensor_to_string(in_wgt_init_tensor_v.transpose(0, 1)) - + "};\n" - ) - - f.write("\n\n// Input Projections Biases Initialization\n") - f.write("#define INPUT_BIAS_SIZE (" + str(in_bias_init_tensor_q.numel()) + ")\n") - f.write( - "PI_L2 fp16 INPUT_BIASES_Q[INPUT_BIAS_SIZE] = {" - + dump.tensor_to_string(in_bias_init_tensor_q) - + "};\n" - ) - f.write( - "PI_L2 fp16 INPUT_BIASES_K[INPUT_BIAS_SIZE] = {" - + dump.tensor_to_string(in_bias_init_tensor_k) - + "};\n" - ) - f.write( - "PI_L2 fp16 INPUT_BIASES_V[INPUT_BIAS_SIZE] = {" - + dump.tensor_to_string(in_bias_init_tensor_v) - + "};\n" - ) - f.close() - - # ~~~~~~~~~~ MANAGE OUTPUT WEIGHTS ~~~~~~~~~~ - # Generate random output weights - output_proj_wgt_init_tensor = torch.randn(in_w, att_dim) - - # Copy output weights to network - with torch.no_grad(): - if bf16_format == 0: - net.mhsa.proj_out.weight.data = deepcopy(output_proj_wgt_init_tensor.half()) - else: - net.mhsa.proj_out.weight.data = deepcopy( - output_proj_wgt_init_tensor.bfloat16() - ) - - # Print output weights to terminal - print("Shape output projection weights:") - print(net.mhsa.proj_out.weight.data.shape) - print(net.mhsa.proj_out.weight.data) - print("\n") - - # Write output weights to init file - f = open("init-defines.h", "a") - f.write("\n\n") - f.write( - "#define OUTPUT_WGT_SIZE (" + str(output_proj_wgt_init_tensor.numel()) + ")\n" - ) - f.write( - "PI_L2 fp16 OUTPUT_WEIGHTS[OUTPUT_WGT_SIZE] = {" - + dump.tensor_to_string(output_proj_wgt_init_tensor) - + "};\n" - ) - f.close() - - # ~~~~~~~~~~ COMPUTE OUTPUT ~~~~~~~~~~ - if bf16_format == 0: - label = torch.ones(in_h, in_w).half() - else: - label = torch.ones(in_h, in_w).bfloat16() - criterion = nn.MSELoss() - out = net(x=inp, tgt_len=in_h) - - # Print output to terminal - print("out: ") - print(out.size()) - print(label.size()) - print(out) - - # Compute loss - loss = criterion(out.float(), label.float()) - - # Write output to file - out_copy = torch.transpose(out, -1, -2) - - f = open("mhsa-output.h", "w") - f.write("#define OUTPUT_SIZE " + str(out.numel()) + "\n") - f.write( - "PI_L2 fp16 OUTPUT[OUTPUT_SIZE] = {" + dump.tensor_to_string(out_copy) + "};\n" - ) - f.close() - - # Compute gradients - net.zero_grad() - loss.backward() - - input_wgt_grad_q = net.mhsa.proj_q.weight.grad - input_wgt_grad_k = net.mhsa.proj_k.weight.grad - input_wgt_grad_v = net.mhsa.proj_v.weight.grad - output_wgt_grad = net.mhsa.proj_out.weight.grad - input_grad = inp.grad.transpose(1, 2) - - # Write gradients to file - f = open("mhsa-grads.h", "a") - - f.write("#define G_INPUT_WGT_SIZE " + str(input_wgt_grad_q.numel()) + "\n") - f.write( - "PI_L2 fp16 INPUT_WGT_GRAD_Q[G_INPUT_WGT_SIZE] = {" - + dump.tensor_to_string(input_wgt_grad_q) - + "};\n" - ) - f.write( - "PI_L2 fp16 INPUT_WGT_GRAD_K[G_INPUT_WGT_SIZE] = {" - + dump.tensor_to_string(input_wgt_grad_k) - + "};\n" - ) - f.write( - "PI_L2 fp16 INPUT_WGT_GRAD_V[G_INPUT_WGT_SIZE] = {" - + dump.tensor_to_string(input_wgt_grad_v) - + "};\n" - ) - - f.write("#define G_OUTPUT_WGT_SIZE " + str(output_wgt_grad.numel()) + "\n") - f.write( - "PI_L2 fp16 OUTPUT_WGT_GRAD[G_OUTPUT_WGT_SIZE] = {" - + dump.tensor_to_string(output_wgt_grad) - + "};\n" - ) - - 
f.write("#define G_IN_SIZE " + str(input_grad.numel()) + "\n") - f.write( - "PI_L2 fp16 INPUT_GRAD[G_IN_SIZE] = {" - + dump.tensor_to_string(input_grad) - + "};\n" - ) - - f.close() - - # Write attention scores to file - f = open("attention_scores.h", "w") - f.write("#define ATTENTION_S_LENGTH " + str(net.mhsa.scores.numel()) + "\n") - f.write( - "PI_L2 fp16 ATTENTION_SCORES[ATTENTION_S_LENGTH] = {" - + dump.tensor_to_string(torch.transpose(net.mhsa.scores, 0, 1)) - + "};\n" - ) - f.close() +""" +Copyright (C) 2021-2022 ETH Zurich and University of Bologna +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. + +Authors: Francesco Conoscenti (francesco.conoscenti@studio.unibo.it), Alberto Dequino (alberto.dequino@unibo.it), + Calin Diaconu (calin.diaconu@studio.unibo.it) +""" + +import argparse +from copy import deepcopy + +import numpy as np # Matrix and vector computation package +import torch +import torch.nn as nn + +import dump_utils as dump +import mhsa + + +class MyNet(nn.Module): + # Define a simple network with a mhsa layer for testing + def __init__(self, in_w, n_heads, att_dim, bf16_format): + super().__init__() + self.mhsa = mhsa.MultiHeadedSelfAttention( + dim=in_w, num_heads=n_heads, att_dim=att_dim, bf16_format=bf16_format + ) + + def forward(self, x, tgt_len): + return self.mhsa(x=x, tgt_len=tgt_len) + + +def hook_fn1(_, __, o): + # Hook to write output gradients + f = open("mhsa-grads.h", "w") + + print("------------Output Grad------------") + for grad in o: + try: + output_grad = torch.transpose(grad, 0, 1) + f.write("#define G_OUTPUT_SIZE " + str(output_grad.numel()) + "\n") + print(output_grad) + + if current_step == "BACKWARD": + f.write( + "PI_L2 fp16 OUTPUT_GRAD[G_OUTPUT_SIZE] = {" + + dump.tensor_to_string(output_grad) + + "};\n" + ) + else: + f.write( + "PI_L2 fp16 OUTPUT_GRAD[G_OUTPUT_SIZE] = {" + + dump.tensor_to_string(output_grad) + + "};\n" + ) + except AttributeError: + print("None found for Gradient (output)") + + f.close() + + +def hook_fn2(_, __, o): + # Hook for writing output to file + cont = 0 + f = open("mhsa-output.h", "w") + + print("------------Output------------") + for grad in o: + try: + if cont == 0: + output_grad = grad + f.write("#define OUTPUT_SIZE " + str(output_grad.numel()) + "\n") + + if bf16_format == 0: + print(output_grad.half()) + f.write( + "PI_L2 fp16 OUTPUT[OUTPUT_SIZE] = {" + + dump.tensor_to_string(output_grad) + + "};\n" + ) + else: + print(output_grad.bfloat16()) + f.write( + "PI_L2 fp16 OUTPUT[OUTPUT_SIZE] = {" + + dump.tensor_to_string(output_grad) + + "};\n" + ) + + cont += 1 + except AttributeError: + print("None found for Output") + + f.close() + + +if __name__ == "__main__": + # ~~~~~~~~~~ INTRO ~~~~~~~~~~ + # Set the seed for reproducibility + np.random.seed(seed=1) # <----- Sneed + torch.manual_seed(0) + + # Visualize data with more precision + torch.set_printoptions(precision=10, sci_mode=False) + + # Set up parser + parser = argparse.ArgumentParser("MHSA Layer Test") + parser.add_argument("--in_width", type=int, default=8) # Token size + 
parser.add_argument("--in_height", type=int, default=4) # Sequence length + parser.add_argument("--ch_in", type=int, default=1) + parser.add_argument("--ch_out", type=int, default=1) + parser.add_argument("--n_heads", type=int, default=8) + parser.add_argument("--weight", type=float, default=0.1) + parser.add_argument("--att_dim", type=int, default=8) + parser.add_argument( + "--bf16_format", type=int, default=1 + ) # if == 1, data format if bfloat16, if 0 is float16 + # Possible steps: FORWARD, BACKWARD_GRAD, BACKWARD_ERROR + parser.add_argument("--step", type=str, default="FORWARD") + + args = parser.parse_args() + + # Read arguments + in_h = args.in_height + in_w = args.in_width + ch_in = args.ch_in + ch_out = args.ch_out + n_heads = args.n_heads + current_step = args.step + weight_init = args.weight + att_dim = args.att_dim + head_dim = int(att_dim / n_heads) + bf16_format = args.bf16_format + + # Write net step to file + f_step = open("step-check.h", "w") + f_step.write("#define " + str(current_step) + "\n") + f_step.close() + + # Write input/output weights to file + f = open("init-defines.h", "w") + + f.write("#define Tin_C_l1 " + str(ch_in) + "\n") + f.write("#define Tin_H_l1 " + str(in_h) + "\n") + f.write("#define Tin_W_l1 " + str(in_w) + "\n") + f.write("#define Tout_C_l1 " + str(ch_out) + "\n") + f.write("#define Tn_heads_l1 " + str(n_heads) + "\n") + f.write("#define Tatt_dim_l1 " + str(att_dim) + "\n") + f.write("#define Thead_dim_l1 " + str(head_dim) + "\n") + if current_step == "FORWARD": + f.write( + "#define Ttemp_max " + str(int(max(in_w * head_dim, in_h * head_dim, in_h * in_h, in_w * in_w))) + "\n" + ) + else: + f.write( + "#define Ttemp_max " + + str( + int(max(in_h * att_dim, 3 * att_dim * in_w, in_h * in_h, in_h * in_w)) + ) + + "\n" + ) + + f.close() + + # Define network and add hook + if bf16_format == 0: + net = MyNet( + in_w=in_w, n_heads=n_heads, att_dim=att_dim, bf16_format=bf16_format + ).half() + elif bf16_format == 1: + net = MyNet( + in_w=in_w, n_heads=n_heads, att_dim=att_dim, bf16_format=bf16_format + ).bfloat16() + net.zero_grad() + + gradsRnn = net.mhsa.register_full_backward_hook(hook_fn1) + + # ~~~~~~~~~~ MANAGE INPUT ~~~~~~~~~~ + # Generate random input data + inp = torch.randn(ch_in, in_h, in_w) + + # Print input data to terminal + print("------------Input sequence------------") + print(inp) + + # Write transpose of input data to file + inp_copy = torch.transpose(inp, -1, -2) + + f = open("input-sequence.h", "w") + f.write("#define INPUT_SIZE " + str(inp.numel()) + "\n") + f.write( + "PI_L2 fp16 INPUT[INPUT_SIZE] = {" + dump.tensor_to_string(inp_copy) + "};\n" + ) + f.close() + + if bf16_format == 0: + inp = inp.half() + else: + inp = inp.bfloat16() + inp.requires_grad = True + + # ~~~~~~~~~~ MANAGE INPUT WEIGHTS ~~~~~~~~~~ + # Generate random input weights + in_wgt_init_tensor_q = torch.randn(att_dim, in_w) + in_wgt_init_tensor_k = torch.randn(att_dim, in_w) + in_wgt_init_tensor_v = torch.randn(att_dim, in_w) + + in_bias_init_tensor_q = torch.randn(att_dim) + in_bias_init_tensor_k = torch.randn(att_dim) + in_bias_init_tensor_v = torch.randn(att_dim) + + # Copy input weights to network + with torch.no_grad(): + if bf16_format == 0: + net.mhsa.proj_q.weight.data = deepcopy(in_wgt_init_tensor_q.half()) + net.mhsa.proj_k.weight.data = deepcopy(in_wgt_init_tensor_k.half()) + net.mhsa.proj_v.weight.data = deepcopy(in_wgt_init_tensor_v.half()) + + net.mhsa.proj_q.bias.data = deepcopy(in_bias_init_tensor_q.half()) + net.mhsa.proj_k.bias.data = 
deepcopy(in_bias_init_tensor_k.half()) + net.mhsa.proj_v.bias.data = deepcopy(in_bias_init_tensor_v.half()) + else: + net.mhsa.proj_q.weight.data = deepcopy(in_wgt_init_tensor_q.bfloat16()) + net.mhsa.proj_k.weight.data = deepcopy(in_wgt_init_tensor_k.bfloat16()) + net.mhsa.proj_v.weight.data = deepcopy(in_wgt_init_tensor_v.bfloat16()) + + net.mhsa.proj_q.bias.data = deepcopy(in_bias_init_tensor_q.bfloat16()) + net.mhsa.proj_k.bias.data = deepcopy(in_bias_init_tensor_k.bfloat16()) + net.mhsa.proj_v.bias.data = deepcopy(in_bias_init_tensor_v.bfloat16()) + + # Print input weights to terminal + print("Shape input weights:") + print(net.mhsa.proj_q.weight.shape) + print("Shape input biases:") + print(net.mhsa.proj_q.bias.shape) + print("q:") + print(net.mhsa.proj_q.weight.data) + print("k:") + print(net.mhsa.proj_k.weight.data) + print("v:") + print(net.mhsa.proj_v.weight.data) + print("\n") + + # Write input weights to init file + f = open("init-defines.h", "a") + f.write("\n\n// Input Projections Weight Initialization\n") + f.write("#define INPUT_WGT_SIZE (" + str(in_wgt_init_tensor_q.numel()) + ")\n") + f.write( + "PI_L2 fp16 INPUT_WEIGHTS_Q[INPUT_WGT_SIZE] = {" + + dump.tensor_to_string(in_wgt_init_tensor_q.transpose(0, 1)) + + "};\n" + ) + f.write( + "PI_L2 fp16 INPUT_WEIGHTS_K[INPUT_WGT_SIZE] = {" + + dump.tensor_to_string(in_wgt_init_tensor_k.transpose(0, 1)) + + "};\n" + ) + f.write( + "PI_L2 fp16 INPUT_WEIGHTS_V[INPUT_WGT_SIZE] = {" + + dump.tensor_to_string(in_wgt_init_tensor_v.transpose(0, 1)) + + "};\n" + ) + + f.write("\n\n// Input Projections Biases Initialization\n") + f.write("#define INPUT_BIAS_SIZE (" + str(in_bias_init_tensor_q.numel()) + ")\n") + f.write( + "PI_L2 fp16 INPUT_BIASES_Q[INPUT_BIAS_SIZE] = {" + + dump.tensor_to_string(in_bias_init_tensor_q) + + "};\n" + ) + f.write( + "PI_L2 fp16 INPUT_BIASES_K[INPUT_BIAS_SIZE] = {" + + dump.tensor_to_string(in_bias_init_tensor_k) + + "};\n" + ) + f.write( + "PI_L2 fp16 INPUT_BIASES_V[INPUT_BIAS_SIZE] = {" + + dump.tensor_to_string(in_bias_init_tensor_v) + + "};\n" + ) + f.close() + + # ~~~~~~~~~~ MANAGE OUTPUT WEIGHTS ~~~~~~~~~~ + # Generate random output weights + output_proj_wgt_init_tensor = torch.randn(in_w, att_dim) + + # Copy output weights to network + with torch.no_grad(): + if bf16_format == 0: + net.mhsa.proj_out.weight.data = deepcopy(output_proj_wgt_init_tensor.half()) + else: + net.mhsa.proj_out.weight.data = deepcopy( + output_proj_wgt_init_tensor.bfloat16() + ) + + # Print output weights to terminal + print("Shape output projection weights:") + print(net.mhsa.proj_out.weight.data.shape) + print(net.mhsa.proj_out.weight.data) + print("\n") + + # Write output weights to init file + f = open("init-defines.h", "a") + f.write("\n\n") + f.write( + "#define OUTPUT_WGT_SIZE (" + str(output_proj_wgt_init_tensor.numel()) + ")\n" + ) + f.write( + "PI_L2 fp16 OUTPUT_WEIGHTS[OUTPUT_WGT_SIZE] = {" + + dump.tensor_to_string(output_proj_wgt_init_tensor) + + "};\n" + ) + f.close() + + # ~~~~~~~~~~ COMPUTE OUTPUT ~~~~~~~~~~ + if bf16_format == 0: + label = torch.ones(in_h, in_w).half() + else: + label = torch.ones(in_h, in_w).bfloat16() + criterion = nn.MSELoss() + out = net(x=inp, tgt_len=in_h) + + # Print output to terminal + print("out: ") + print(out.size()) + print(label.size()) + print(out) + + # Compute loss + loss = criterion(out.float(), label.float()) + + # Write output to file + out_copy = torch.transpose(out, -1, -2) + + f = open("mhsa-output.h", "w") + f.write("#define OUTPUT_SIZE " + str(out.numel()) + "\n") + 
f.write( + "PI_L2 fp16 OUTPUT[OUTPUT_SIZE] = {" + dump.tensor_to_string(out_copy) + "};\n" + ) + f.close() + + # Compute gradients + net.zero_grad() + loss.backward() + + input_wgt_grad_q = net.mhsa.proj_q.weight.grad + input_wgt_grad_k = net.mhsa.proj_k.weight.grad + input_wgt_grad_v = net.mhsa.proj_v.weight.grad + output_wgt_grad = net.mhsa.proj_out.weight.grad + input_grad = inp.grad.transpose(1, 2) + + # Write gradients to file + f = open("mhsa-grads.h", "a") + + f.write("#define G_INPUT_WGT_SIZE " + str(input_wgt_grad_q.numel()) + "\n") + f.write( + "PI_L2 fp16 INPUT_WGT_GRAD_Q[G_INPUT_WGT_SIZE] = {" + + dump.tensor_to_string(input_wgt_grad_q) + + "};\n" + ) + f.write( + "PI_L2 fp16 INPUT_WGT_GRAD_K[G_INPUT_WGT_SIZE] = {" + + dump.tensor_to_string(input_wgt_grad_k) + + "};\n" + ) + f.write( + "PI_L2 fp16 INPUT_WGT_GRAD_V[G_INPUT_WGT_SIZE] = {" + + dump.tensor_to_string(input_wgt_grad_v) + + "};\n" + ) + + f.write("#define G_OUTPUT_WGT_SIZE " + str(output_wgt_grad.numel()) + "\n") + f.write( + "PI_L2 fp16 OUTPUT_WGT_GRAD[G_OUTPUT_WGT_SIZE] = {" + + dump.tensor_to_string(output_wgt_grad) + + "};\n" + ) + + f.write("#define G_IN_SIZE " + str(input_grad.numel()) + "\n") + f.write( + "PI_L2 fp16 INPUT_GRAD[G_IN_SIZE] = {" + + dump.tensor_to_string(input_grad) + + "};\n" + ) + + f.close() + + # Write attention scores to file + f = open("attention_scores.h", "w") + f.write("#define ATTENTION_S_LENGTH " + str(net.mhsa.scores.numel()) + "\n") + f.write( + "PI_L2 fp16 ATTENTION_SCORES[ATTENTION_S_LENGTH] = {" + + dump.tensor_to_string(torch.transpose(net.mhsa.scores, 0, 1)) + + "};\n" + ) + f.close() diff --git a/tests/test_mhsa_fp32/utils/GM.py b/tests/test_mhsa_fp32/utils/GM.py index 33ddd070..19d22e99 100644 --- a/tests/test_mhsa_fp32/utils/GM.py +++ b/tests/test_mhsa_fp32/utils/GM.py @@ -142,7 +142,7 @@ def hook_fn2(_, __, o): f.write("#define Thead_dim_l1 " + str(head_dim) + "\n") if current_step == "FORWARD": f.write( - "#define Ttemp_max " + str(int(max(in_h * head_dim, in_h * in_h))) + "\n" + "#define Ttemp_max " + str(int(max(in_w * head_dim, in_h * head_dim, in_h * in_h, in_w * in_w))) + "\n" ) else: f.write( diff --git a/tests/test_mhsa_paper_fp16/Makefile b/tests/test_mhsa_paper_fp16/Makefile index a2977183..f6acbbea 100644 --- a/tests/test_mhsa_paper_fp16/Makefile +++ b/tests/test_mhsa_paper_fp16/Makefile @@ -49,7 +49,7 @@ NUM_MATMULS?=24 # When profiling with multiple matmul algorithms NUM_SIZES?=3 # When profiling multiple sizes of the network # End of user settings -TRAIN_LIB=/home/alberto/pulp-trainlib/lib +TRAIN_LIB=../../lib TRAIN_LIB_SRCS=$(TRAIN_LIB)/sources APP_SRCS = main.c net_l1.c diff --git a/tests/test_residual/Makefile b/tests/test_residual/Makefile index 1a9651c8..74dfbb55 100644 --- a/tests/test_residual/Makefile +++ b/tests/test_residual/Makefile @@ -1,8 +1,8 @@ APP = test_residual -CI?=64 -HI?=56 -WI?=56 +CI?=8 +HI?=4 +WI?=6 KER?=1 NUM_CORES?=8 HWC?=0 @@ -20,6 +20,8 @@ APP_SRCS += main.c net.c APP_SRCS += $(TRAIN_LIB_SRCS)/pulp_conv2d_fp32.c APP_SRCS += $(TRAIN_LIB_SRCS)/pulp_conv2d_fp16.c +APP_SRCS += $(TRAIN_LIB_SRCS)/pulp_conv_naive_fp32.c +APP_SRCS += $(TRAIN_LIB_SRCS)/pulp_conv_naive_fp16.c APP_SRCS += $(TRAIN_LIB_SRCS)/pulp_train_utils_fp32.c APP_SRCS += $(TRAIN_LIB_SRCS)/pulp_train_utils_fp16.c APP_SRCS += $(TRAIN_LIB_SRCS)/pulp_losses_fp32.c diff --git a/tests/test_tiny_vit_fp32/Makefile b/tests/test_tiny_vit_fp32/Makefile index 909ffd7e..bf4b8542 100644 --- a/tests/test_tiny_vit_fp32/Makefile +++ b/tests/test_tiny_vit_fp32/Makefile @@ -5,7 +5,8 @@ 
 NUM_CORES = 8
 MATMUL_TYPE?=9
 DATA_TYPE?=32
-CONFIG_NAME = "TINY_VIT_5M"
+CONFIG_NAME = "DEMO_TINY_VIT_CONFIG"
+# CONFIG_NAME = "TINY_VIT_5M"
 # End of user code
 
 TASK_NAME=sst-2
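For reference, a worked example of the revised FORWARD-step `Ttemp_max` sizing introduced in the fp16/fp32 GM.py generators above, assuming the new `test_mhsa_fp16` Makefile defaults (IN_H=20, IN_W=40, N_HEADS=2, ATT_DIM=40, hence head_dim=20) are forwarded to the generator as `--in_height/--in_width/--n_heads/--att_dim`. The generated `init-defines.h` would then contain roughly:

    /* Hypothetical excerpt of a generated init-defines.h, for illustration only. */
    #define Tin_H_l1     20   /* sequence length (IN_H)  */
    #define Tin_W_l1     40   /* token size (IN_W)       */
    #define Tn_heads_l1  2
    #define Tatt_dim_l1  40
    #define Thead_dim_l1 20   /* ATT_DIM / N_HEADS       */
    /* New rule: max(in_w*head_dim, in_h*head_dim, in_h*in_h, in_w*in_w)
       = max(800, 400, 400, 1600) = 1600 elements.
       The old rule max(in_h*head_dim, in_h*in_h) would have given 400,
       which does not cover the in_w x in_w (40 x 40) intermediate that the
       updated formula now accounts for. */
    #define Ttemp_max    1600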