From 0fab1bda8885f767acb7504aca7ee21f0cd1f0ff Mon Sep 17 00:00:00 2001 From: diaco Date: Tue, 13 May 2025 17:31:04 +0200 Subject: [PATCH 1/9] Fixed crashes in mhsa fp16 due to new transposition. Test not passing due to linear layer issues --- lib/sources/pulp_mhsa_fp16.c | 139 +++++++++++++++++++--------------- tests/test_mhsa_fp16/Makefile | 8 +- 2 files changed, 80 insertions(+), 67 deletions(-) diff --git a/lib/sources/pulp_mhsa_fp16.c b/lib/sources/pulp_mhsa_fp16.c index e7b62e8a..7fe5e4e8 100644 --- a/lib/sources/pulp_mhsa_fp16.c +++ b/lib/sources/pulp_mhsa_fp16.c @@ -76,8 +76,8 @@ void pulp_mhsa_fp16_fw_cl(void *Mhsa_args) { // T0_q struct transp_args_fp16 transp_args0_q; - int dim[] = {E, F}; - int tr_axes[] = {1, 0}; + int dim[2] = {E, F}; + int tr_axes[2] = {1, 0}; transp_args0_q.in_matrix = coeffDataWinQ; transp_args0_q.out_matrix = temp; @@ -91,14 +91,14 @@ void pulp_mhsa_fp16_fw_cl(void *Mhsa_args) { printf("\n\n\nT0_q result\n\ncoeffDataWinQ [^T]: %d %d\n", E, F); for (int j=0; jmatrix + i * tile_h + j * tile_w * M), (uint32_t) (IN_DATA), 2 * tile_dim, 2 * M, 2 * tile_h, PI_CL_DMA_DIR_EXT2LOC, cmd_load); + pi_cl_dma_cmd_2d((uint32_t) (args->in_matrix + i * tile_h + j * tile_w * M), (uint32_t) (IN_DATA), 2 * tile_dim, 2 * M, 2 * tile_h, PI_CL_DMA_DIR_EXT2LOC, cmd_load); pi_cl_dma_cmd_wait(cmd_load); pi_cl_team_fork(NUM_CORES, transpose_fp16, &args_l1); - pi_cl_dma_cmd_2d((uint32_t) (args->transp_matrix + j * tile_w + i * tile_h * N), (uint32_t) (OUT_DATA), 2 * tile_dim, 2 * N, 2 * tile_w, PI_CL_DMA_DIR_LOC2EXT, cmd_store); + pi_cl_dma_cmd_2d((uint32_t) (args->out_matrix + j * tile_w + i * tile_h * N), (uint32_t) (OUT_DATA), 2 * tile_dim, 2 * N, 2 * tile_w, PI_CL_DMA_DIR_LOC2EXT, cmd_store); pi_cl_dma_cmd_wait(cmd_store); } } @@ -2225,8 +2238,8 @@ void tiled_mhsa_fp16(void *Mhsa_args, void* Tiled_mhsa_matmul_args){ for (int i = 0; i < n_heads; i++) { // T1 struct transp_args_fp16 transp_args1; - transp_args1.matrix = kt + L * i * H; - transp_args1.transp_matrix = temp; + transp_args1.in_matrix = kt + L * i * H; + transp_args1.out_matrix = temp; transp_args1.N = H; transp_args1.M = L; @@ -2279,8 +2292,8 @@ void tiled_mhsa_fp16(void *Mhsa_args, void* Tiled_mhsa_matmul_args){ // row-wise max and sums, therefore it is necessary to transpose the current head buffer. // T2 struct transp_args_fp16 transp_args2; - transp_args2.matrix = softmax_buffer + i * L * L; - transp_args2.transp_matrix = temp; + transp_args2.in_matrix = softmax_buffer + i * L * L; + transp_args2.out_matrix = temp; transp_args2.N = L; transp_args2.M = L; @@ -2329,8 +2342,8 @@ void tiled_mhsa_fp16(void *Mhsa_args, void* Tiled_mhsa_matmul_args){ // Each head result has to be appended to the full attention map, to do so we require to store the current // softmax buffer data following the H x L convention, therefore we need to transpose the memory buffer again. 
struct transp_args_fp16 transp_args3; - transp_args3.matrix = softmax_buffer + i * L * L; - transp_args3.transp_matrix = temp; + transp_args3.in_matrix = softmax_buffer + i * L * L; + transp_args3.out_matrix = temp; transp_args3.N = L; transp_args3.M = L; @@ -2376,8 +2389,8 @@ void tiled_mhsa_fp16(void *Mhsa_args, void* Tiled_mhsa_matmul_args){ // T4 // The last transpose to original shape struct transp_args_fp16 transp_args4; - transp_args4.matrix = temp; - transp_args4.transp_matrix = outData; + transp_args4.in_matrix = temp; + transp_args4.out_matrix = outData; transp_args4.N = mhsa_args->input_bn->W; transp_args4.M = L; diff --git a/tests/test_mhsa_fp16/Makefile b/tests/test_mhsa_fp16/Makefile index 0b4acd74..107907a1 100644 --- a/tests/test_mhsa_fp16/Makefile +++ b/tests/test_mhsa_fp16/Makefile @@ -1,10 +1,10 @@ APP = mhsa_fp16 # User settings -IN_H?=196 # Sequence Length -IN_W?=160 # Token Size -N_HEADS?=5 -ATT_DIM?=160 #Hidden dimension +IN_H?=20 # Sequence Length +IN_W?=40 # Token Size +N_HEADS?=2 +ATT_DIM?=40 #Hidden dimension IN_CH?=1 OUT_CH?=1 From 11dbbf6ae3acaa4f98803ef185b029c87b2ebddd Mon Sep 17 00:00:00 2001 From: diaco Date: Tue, 13 May 2025 17:59:06 +0200 Subject: [PATCH 2/9] Fixed mhsa fp32 kernel issues --- lib/sources/pulp_mhsa_fp32.c | 179 ++++++++++++++++++----------------- 1 file changed, 92 insertions(+), 87 deletions(-) diff --git a/lib/sources/pulp_mhsa_fp32.c b/lib/sources/pulp_mhsa_fp32.c index 8bb70edf..b4147044 100644 --- a/lib/sources/pulp_mhsa_fp32.c +++ b/lib/sources/pulp_mhsa_fp32.c @@ -75,7 +75,7 @@ void pulp_mhsa_fp32_fw_cl(void *Mhsa_args) { // T0_q int dims[] = {E, F}; - int t_axes = {1, 0}; + int t_axes[] = {1, 0}; struct transp_args transp_args0_q; transp_args0_q.in_matrix = coeffDataWinQ; @@ -90,14 +90,14 @@ void pulp_mhsa_fp32_fw_cl(void *Mhsa_args) { printf("\n\n\nT0_q result\n\ncoeffDataWinQ [^T]: %d %d\n", E, F); for (int j=0; j L x L ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ // T1 - dims = {H, L}; + dims[0] = H; + dims[1] = L; struct transp_args transp_args1; @@ -351,14 +354,14 @@ void pulp_mhsa_fp32_fw_cl(void *Mhsa_args) { printf("\n\n\nHead %d - T1 result\n\nk: %d %d\n", i, H, L); for (int j=0; j H x L ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ // T6 - dims[] = {H, L}; + dims[0] = H; + dims[1] = L; struct transp_args transp_args6; @@ -1229,14 +1239,14 @@ void pulp_mhsa_fp32_bw_cl(void *Mhsa_args) { printf("\n\n\nHead %d - T6 result\n\nq: %d %d\n", i, H, L); for (int j=0; j E x L ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ // T8 - dims[] = {E, L}; + dims[0] = E; + dims[1] = L; struct transp_args transp_args8; @@ -1439,14 +1451,14 @@ void pulp_mhsa_fp32_bw_cl(void *Mhsa_args) { printf("\n\n\nT8 result\n\ninputData: %d %d\n", E, L); for (int j=0; jmatrix + i * tile_h + j * tile_w * M), (uint32_t) (IN_DATA), 4 * tile_dim, 4 * M, 4 * tile_h, PI_CL_DMA_DIR_EXT2LOC, cmd_load); + pi_cl_dma_cmd_2d((uint32_t) (args->in_matrix + i * tile_h + j * tile_w * M), (uint32_t) (IN_DATA), 4 * tile_dim, 4 * M, 4 * tile_h, PI_CL_DMA_DIR_EXT2LOC, cmd_load); pi_cl_dma_cmd_wait(cmd_load); pi_cl_team_fork(NUM_CORES, transpose, &args_l1); - pi_cl_dma_cmd_2d((uint32_t) (args->transp_matrix + j * tile_w + i * tile_h * N), (uint32_t) (OUT_DATA), 4 * tile_dim, 4 * N, 4 * tile_w, PI_CL_DMA_DIR_LOC2EXT, cmd_store); + pi_cl_dma_cmd_2d((uint32_t) (args->out_matrix + j * tile_w + i * tile_h * N), (uint32_t) (OUT_DATA), 4 * tile_dim, 4 * N, 4 * tile_w, PI_CL_DMA_DIR_LOC2EXT, cmd_store); pi_cl_dma_cmd_wait(cmd_store); } } @@ -2248,8 +2253,8 @@ void tiled_mhsa_fp32(void *Mhsa_args, void* 
Tiled_mhsa_matmul_args){ for (int i = 0; i < n_heads; i++) { // T1 struct transp_args transp_args1; - transp_args1.matrix = kt + L * i * H; - transp_args1.transp_matrix = temp; + transp_args1.in_matrix = kt + L * i * H; + transp_args1.out_matrix = temp; transp_args1.N = H; transp_args1.M = L; @@ -2303,8 +2308,8 @@ void tiled_mhsa_fp32(void *Mhsa_args, void* Tiled_mhsa_matmul_args){ // row-wise max and sums, therefore it is necessary to transpose the current head buffer. // T2 struct transp_args transp_args2; - transp_args2.matrix = softmax_buffer + i * L * L; - transp_args2.transp_matrix = temp; + transp_args2.in_matrix = softmax_buffer + i * L * L; + transp_args2.out_matrix = temp; transp_args2.N = L; transp_args2.M = L; @@ -2354,8 +2359,8 @@ void tiled_mhsa_fp32(void *Mhsa_args, void* Tiled_mhsa_matmul_args){ // Each head result has to be appended to the full attention map, to do so we require to store the current // softmax buffer data following the H x L convention, therefore we need to transpose the memory buffer again. struct transp_args transp_args3; - transp_args3.matrix = softmax_buffer + i * L * L; - transp_args3.transp_matrix = temp; + transp_args3.in_matrix = softmax_buffer + i * L * L; + transp_args3.out_matrix = temp; transp_args3.N = L; transp_args3.M = L; @@ -2401,8 +2406,8 @@ void tiled_mhsa_fp32(void *Mhsa_args, void* Tiled_mhsa_matmul_args){ // T4 // The last transpose to original shape struct transp_args transp_args4; - transp_args4.matrix = temp; - transp_args4.transp_matrix = outData; + transp_args4.in_matrix = temp; + transp_args4.out_matrix = outData; transp_args4.N = mhsa_args->input_bn->W; transp_args4.M = L; @@ -2460,8 +2465,8 @@ void pulp_mhsa_mobilebert_inference_fp32_bw_cl(void *Mhsa_args){ // T1 struct transp_args transp_args1; - transp_args1.matrix = attention_map; - transp_args1.transp_matrix = temp; + transp_args1.in_matrix = attention_map; + transp_args1.out_matrix = temp; transp_args1.N = F; transp_args1.M = L; @@ -2491,8 +2496,8 @@ void pulp_mhsa_mobilebert_inference_fp32_bw_cl(void *Mhsa_args){ // T2 struct transp_args transp_args2; - transp_args2.matrix = coeffDataWout; - transp_args2.transp_matrix = temp; + transp_args2.in_matrix = coeffDataWout; + transp_args2.out_matrix = temp; transp_args2.N = E; transp_args2.M = F; @@ -2546,8 +2551,8 @@ void pulp_mhsa_mobilebert_inference_fp32_bw_cl(void *Mhsa_args){ // T3 struct transp_args transp_args3; - transp_args3.matrix = v + i * L * H; - transp_args3.transp_matrix = temp; + transp_args3.in_matrix = v + i * L * H; + transp_args3.out_matrix = temp; transp_args3.N = H; transp_args3.M = L; @@ -2601,8 +2606,8 @@ void pulp_mhsa_mobilebert_inference_fp32_bw_cl(void *Mhsa_args){ for (int i = 0; i < n_heads; i++) { // T4 struct transp_args transp_args4; - transp_args4.matrix = softmax_buffer_diff + i * L * L; - transp_args4.transp_matrix = temp; + transp_args4.in_matrix = softmax_buffer_diff + i * L * L; + transp_args4.out_matrix = temp; transp_args4.N = L; transp_args4.M = L; @@ -2621,8 +2626,8 @@ void pulp_mhsa_mobilebert_inference_fp32_bw_cl(void *Mhsa_args){ // T5 struct transp_args transp_args5; - transp_args5.matrix = grad; - transp_args5.transp_matrix = temp; + transp_args5.in_matrix = grad; + transp_args5.out_matrix = temp; transp_args5.N = L; transp_args5.M = L; @@ -2645,8 +2650,8 @@ void pulp_mhsa_mobilebert_inference_fp32_bw_cl(void *Mhsa_args){ // T6 struct transp_args transp_args6; - transp_args6.matrix = q + i * L * H; - transp_args6.transp_matrix = temp; + transp_args6.in_matrix = q + i * L * H; + 
transp_args6.out_matrix = temp; transp_args6.N = H; transp_args6.M = L; @@ -2676,8 +2681,8 @@ void pulp_mhsa_mobilebert_inference_fp32_bw_cl(void *Mhsa_args){ // T7 struct transp_args transp_args7; - transp_args7.matrix = k_diff + i * L * H; - transp_args7.transp_matrix = temp; + transp_args7.in_matrix = k_diff + i * L * H; + transp_args7.out_matrix = temp; transp_args7.N = L; transp_args7.M = H; From c9deb9652a97b406d3bf7a5562d6a7fb77959a83 Mon Sep 17 00:00:00 2001 From: diaco Date: Tue, 13 May 2025 18:33:52 +0200 Subject: [PATCH 3/9] Fixed problems introduced into the residual test --- lib/sources/pulp_residual_fp16.c | 23 +++++------------------ tests/test_residual/Makefile | 8 +++++--- 2 files changed, 10 insertions(+), 21 deletions(-) diff --git a/lib/sources/pulp_residual_fp16.c b/lib/sources/pulp_residual_fp16.c index ba0917b8..72d033c0 100644 --- a/lib/sources/pulp_residual_fp16.c +++ b/lib/sources/pulp_residual_fp16.c @@ -41,20 +41,14 @@ void pulp_residualconn_fp16_fw(void *SkipConn_args_fp16) { return; } - int dims[] = {out->dim}; - struct vect_sum_args_fp16 args_sum; + args_sum.op_1 = skip->data; args_sum.op_2 = lout->data; args_sum.dest = out->data; + args_sum.size = out->dim; - args_sum.op_1_dims = dims; - args_sum.op_2_dims = dims; - - args_sum.op_1_dims_len = 1; - args_sum.op_2_dims_len = 1; - - pi_cl_team_fork(NUM_CORES, array_broadcast_sum_fp16, &args_sum); + pi_cl_team_fork(NUM_CORES, vect_sum_fp16, &args_sum); } @@ -77,21 +71,14 @@ void pulp_sumnode_fp16_bw(void *SkipConn_args_fp16) { return; } - int dims[] = {skip->dim}; - struct vect_sum_args_fp16 args_sum; args_sum.op_1 = out->diff; args_sum.op_2 = skip->diff; args_sum.dest = skip->diff; + args_sum.size = skip->dim; - args_sum.op_1_dims = dims; - args_sum.op2_dims = dims; - - args_sum.op_1_dims_len = 1; - args_sum.op_2_dims_len = 1; - - pi_cl_team_fork(NUM_CORES, array_broadcast_sum_fp16, &args_sum); + pi_cl_team_fork(NUM_CORES, vect_sum_fp16, &args_sum); } } diff --git a/tests/test_residual/Makefile b/tests/test_residual/Makefile index 1a9651c8..74dfbb55 100644 --- a/tests/test_residual/Makefile +++ b/tests/test_residual/Makefile @@ -1,8 +1,8 @@ APP = test_residual -CI?=64 -HI?=56 -WI?=56 +CI?=8 +HI?=4 +WI?=6 KER?=1 NUM_CORES?=8 HWC?=0 @@ -20,6 +20,8 @@ APP_SRCS += main.c net.c APP_SRCS += $(TRAIN_LIB_SRCS)/pulp_conv2d_fp32.c APP_SRCS += $(TRAIN_LIB_SRCS)/pulp_conv2d_fp16.c +APP_SRCS += $(TRAIN_LIB_SRCS)/pulp_conv_naive_fp32.c +APP_SRCS += $(TRAIN_LIB_SRCS)/pulp_conv_naive_fp16.c APP_SRCS += $(TRAIN_LIB_SRCS)/pulp_train_utils_fp32.c APP_SRCS += $(TRAIN_LIB_SRCS)/pulp_train_utils_fp16.c APP_SRCS += $(TRAIN_LIB_SRCS)/pulp_losses_fp32.c From f4c94f4010fea39cabd015262ef0dedaa6d882e3 Mon Sep 17 00:00:00 2001 From: diaco Date: Tue, 13 May 2025 18:39:11 +0200 Subject: [PATCH 4/9] Fixed issues introduced in rnn kernel --- lib/sources/pulp_rnn_fp32.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/lib/sources/pulp_rnn_fp32.c b/lib/sources/pulp_rnn_fp32.c index 8852a715..a8b9ce43 100644 --- a/lib/sources/pulp_rnn_fp32.c +++ b/lib/sources/pulp_rnn_fp32.c @@ -244,13 +244,13 @@ void pulp_rnn_fp32_bw_cl(void *Rnn_args) { // Calculate gradient for State Weights // Transpose State - int dims[] = {N, M}; - int t_axes[] = {1, 0}; + dims[0] = N; + dims[1] = M; struct transp_args transp_args2; - transp_args2.matrix = hiddState; - transp_args2.transp_matrix = temp; + transp_args2.in_matrix = hiddState; + transp_args2.out_matrix = temp; transp_args2.dim = dims; transp_args2.transposed_axes = t_axes; 
transp_args2.n_dim = 2; @@ -301,7 +301,8 @@ void pulp_rnn_fp32_bw_cl(void *Rnn_args) { // Calculate the Gradient of the Input // Transpose Input Weights - dims = {K, M}; + dims[0] = K; + dims[1] = M; struct transp_args transp_args3; From 95b84280b4b9b3d79f1bf61b1bdae38da60b40b3 Mon Sep 17 00:00:00 2001 From: diaco Date: Tue, 13 May 2025 18:41:26 +0200 Subject: [PATCH 5/9] Made default test demo model for tiny-vit --- tests/test_tiny_vit_fp32/Makefile | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/test_tiny_vit_fp32/Makefile b/tests/test_tiny_vit_fp32/Makefile index 909ffd7e..bf4b8542 100644 --- a/tests/test_tiny_vit_fp32/Makefile +++ b/tests/test_tiny_vit_fp32/Makefile @@ -5,7 +5,8 @@ NUM_CORES = 8 MATMUL_TYPE?=9 DATA_TYPE?=32 -CONFIG_NAME = "TINY_VIT_5M" +CONFIG_NAME = "DEMO_TINY_VIT_CONFIG" +# CONFIG_NAME = "TINY_VIT_5M" # End of user code TASK_NAME=sst-2 From 31cdf0611802125ba12a3a570c2af7f4795c4139 Mon Sep 17 00:00:00 2001 From: diaco Date: Wed, 14 May 2025 15:09:55 +0200 Subject: [PATCH 6/9] Added necessary "USE_BIAS = 0" to the MHSA fp16 test --- lib/sources/pulp_mhsa_fp16.c | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/lib/sources/pulp_mhsa_fp16.c b/lib/sources/pulp_mhsa_fp16.c index 7fe5e4e8..70719d47 100644 --- a/lib/sources/pulp_mhsa_fp16.c +++ b/lib/sources/pulp_mhsa_fp16.c @@ -113,6 +113,7 @@ void pulp_mhsa_fp16_fw_cl(void *Mhsa_args) { matMul_args1_q.K = E; matMul_args1_q.M = L; matMul_args1_q.trans_B = 0; + matMul_args1_q.USE_BIASES = 0; #ifndef OPTIMIZE pi_cl_team_fork(NUM_CORES, mm_fp16, &matMul_args1_q); @@ -197,6 +198,7 @@ void pulp_mhsa_fp16_fw_cl(void *Mhsa_args) { matMul_args1_k.K = E; matMul_args1_k.M = L; matMul_args1_k.trans_B = 0; + matMul_args1_k.USE_BIASES = 0; #ifndef OPTIMIZE pi_cl_team_fork(NUM_CORES, mm_fp16, &matMul_args1_k); @@ -281,6 +283,7 @@ void pulp_mhsa_fp16_fw_cl(void *Mhsa_args) { matMul_args1_v.K = E; matMul_args1_v.M = L; matMul_args1_v.trans_B = 0; + matMul_args1_v.USE_BIASES = 0; #ifndef OPTIMIZE pi_cl_team_fork(NUM_CORES, mm_fp16, &matMul_args1_v); @@ -375,6 +378,7 @@ void pulp_mhsa_fp16_fw_cl(void *Mhsa_args) { matMul_args2.K = H; matMul_args2.M = L; matMul_args2.trans_B = 0; + matMul_args2.USE_BIASES = 0; #ifndef OPTIMIZE pi_cl_team_fork(NUM_CORES, mm_fp16, &matMul_args2); @@ -557,6 +561,7 @@ void pulp_mhsa_fp16_fw_cl(void *Mhsa_args) { matMul_args3.K = L; matMul_args3.M = L; matMul_args3.trans_B = 0; + matMul_args3.USE_BIASES = 0; #ifndef OPTIMIZE pi_cl_team_fork(NUM_CORES, mm_fp16, &matMul_args3); @@ -609,6 +614,7 @@ void pulp_mhsa_fp16_fw_cl(void *Mhsa_args) { matMul_args4.K = F; matMul_args4.M = L; matMul_args4.trans_B = 0; + matMul_args4.USE_BIASES = 0; #ifndef OPTIMIZE pi_cl_team_fork(NUM_CORES, mm_fp16, &matMul_args4); @@ -748,6 +754,7 @@ void pulp_mhsa_fp16_bw_cl(void *Mhsa_args) { matMul_args1.K = L; matMul_args1.M = F; matMul_args1.trans_B = 0; + matMul_args1.USE_BIASES = 0; #ifndef OPTIMIZE pi_cl_team_fork(NUM_CORES, mm_fp16, &matMul_args1); @@ -822,6 +829,7 @@ void pulp_mhsa_fp16_bw_cl(void *Mhsa_args) { matMul_args2.K = E; matMul_args2.M = L; matMul_args2.trans_B = 0; + matMul_args2.USE_BIASES = 0; #ifndef OPTIMIZE pi_cl_team_fork(NUM_CORES, mm_fp16, &matMul_args2); @@ -889,6 +897,7 @@ void pulp_mhsa_fp16_bw_cl(void *Mhsa_args) { matMul_args3.K = L; matMul_args3.M = L; matMul_args3.trans_B = 0; + matMul_args3.USE_BIASES = 0; #ifndef OPTIMIZE pi_cl_team_fork(NUM_CORES, mm_fp16, &matMul_args3); @@ -963,6 +972,7 @@ void pulp_mhsa_fp16_bw_cl(void *Mhsa_args) { matMul_args4.K = H; 
matMul_args4.M = L; matMul_args4.trans_B = 0; + matMul_args4.USE_BIASES = 0; #ifndef OPTIMIZE pi_cl_team_fork(NUM_CORES, mm_fp16, &matMul_args4); @@ -1251,6 +1261,7 @@ void pulp_mhsa_fp16_bw_cl(void *Mhsa_args) { matMul_args5.K = L; matMul_args5.M = H; matMul_args5.trans_B = 0; + matMul_args5.USE_BIASES = 0; #ifndef OPTIMIZE pi_cl_team_fork(NUM_CORES, mm_fp16, &matMul_args5); @@ -1349,6 +1360,7 @@ void pulp_mhsa_fp16_bw_cl(void *Mhsa_args) { matMul_args6.K = L; matMul_args6.M = L; matMul_args6.trans_B = 0; + matMul_args6.USE_BIASES = 0; #ifndef OPTIMIZE pi_cl_team_fork(NUM_CORES, mm_fp16, &matMul_args6); @@ -1461,6 +1473,7 @@ void pulp_mhsa_fp16_bw_cl(void *Mhsa_args) { matMul_args7_q.K = L; matMul_args7_q.M = E; matMul_args7_q.trans_B = 0; + matMul_args7_q.USE_BIASES = 0; #ifndef OPTIMIZE pi_cl_team_fork(NUM_CORES, mm_fp16, &matMul_args7_q); @@ -1505,6 +1518,7 @@ void pulp_mhsa_fp16_bw_cl(void *Mhsa_args) { matMul_args7_k.K = L; matMul_args7_k.M = E; matMul_args7_k.trans_B = 0; + matMul_args7_k.USE_BIASES = 0; #ifndef OPTIMIZE pi_cl_team_fork(NUM_CORES, mm_fp16, &matMul_args7_k); @@ -1549,6 +1563,7 @@ void pulp_mhsa_fp16_bw_cl(void *Mhsa_args) { matMul_args7_v.K = L; matMul_args7_v.M = E; matMul_args7_v.trans_B = 0; + matMul_args7_v.USE_BIASES = 0; #ifndef OPTIMIZE pi_cl_team_fork(NUM_CORES, mm_fp16, &matMul_args7_v); @@ -1593,6 +1608,7 @@ void pulp_mhsa_fp16_bw_cl(void *Mhsa_args) { matMul_args8_q.K = F; matMul_args8_q.M = L; matMul_args8_q.trans_B = 0; + matMul_args8_q.USE_BIASES = 0; #ifndef OPTIMIZE pi_cl_team_fork(NUM_CORES, mm_fp16, &matMul_args8_q); @@ -1654,6 +1670,7 @@ void pulp_mhsa_fp16_bw_cl(void *Mhsa_args) { matMul_args8_k.K = F; matMul_args8_k.M = L; matMul_args8_k.trans_B = 0; + matMul_args8_k.USE_BIASES = 0; #ifndef OPTIMIZE pi_cl_team_fork(NUM_CORES, mm_fp16, &matMul_args8_k); @@ -1701,6 +1718,7 @@ void pulp_mhsa_fp16_bw_cl(void *Mhsa_args) { matMul_args8_v.K = F; matMul_args8_v.M = L; matMul_args8_v.trans_B = 0; + matMul_args8_v.USE_BIASES = 0; #ifndef OPTIMIZE pi_cl_team_fork(NUM_CORES, mm_fp16, &matMul_args8_v); From 4e4a7d02b59a5dad4a593d2a96b13df0be9a1af9 Mon Sep 17 00:00:00 2001 From: diaco Date: Wed, 14 May 2025 19:17:01 +0200 Subject: [PATCH 7/9] Fixed mhsa fp16 test issues (memory allocation and error check to hexa) --- lib/sources/pulp_mhsa_fp16.c | 104 ++-- tests/test_matmul/utils/GM.py | 31 +- tests/test_mhsa_fp16/net.h | 4 +- tests/test_mhsa_fp16/utils/GM.py | 804 ++++++++++++++-------------- tests/test_mhsa_fp32/utils/GM.py | 2 +- tests/test_mhsa_paper_fp16/Makefile | 2 +- 6 files changed, 488 insertions(+), 459 deletions(-) diff --git a/lib/sources/pulp_mhsa_fp16.c b/lib/sources/pulp_mhsa_fp16.c index 70719d47..45c0aacd 100644 --- a/lib/sources/pulp_mhsa_fp16.c +++ b/lib/sources/pulp_mhsa_fp16.c @@ -34,7 +34,7 @@ void pulp_mhsa_fp16_fw_cl(void *Mhsa_args) { fp16 *inputData = mhsa_args->input->data; // Input vector (Transposed, E x L) fp16 *outData = mhsa_args->output->data; // Output sequence (Transposed, E x L) fp16 *temp = mhsa_args->temp_buffer; // Support buffer used in the attention head loop - fp16 *softmax_buffer = mhsa_args->softmax_buffer->data; // Buffer containing the softmax results (necessary to save for backward pass) + fp16 *softmax_buffer = mhsa_args->softmax_buffer->data; // Buffer containing the softmax results (necessary to save for backward pass); TODO: Only save in its entirety for bw; otherwise just for the respective head part fp16 *maxes = mhsa_args->maxes; // Buffer containing the row-wise maxes in the softmax process fp16 *sums = 
mhsa_args->sums; // Buffer containing the row-wise exponential sums in the softmax process fp16 *q = mhsa_args->q->data; // Pointer to the first element of Q @@ -48,9 +48,9 @@ void pulp_mhsa_fp16_fw_cl(void *Mhsa_args) { int E = mhsa_args->input->W; // Input Sequence element size int F = mhsa_args->attention_map->W; // Hidden dimension of attention (N. Heads * Head dimension) -#ifdef DEBUG + #ifdef DEBUG printf("\n~~~~~~~~~~~~~~~FORWARD PASS~~~~~~~~~~~~~~~\n\nPrinting the parameters: L-%d, E-%d, F-%d", L, E, F); -#endif + #endif int H = F / n_heads; // Head dimension fp16 scaling = (fp16)(1 / sqrt(H)); // Scaling factor to avoid vanishing gradients @@ -87,7 +87,7 @@ void pulp_mhsa_fp16_fw_cl(void *Mhsa_args) { pi_cl_team_fork(NUM_CORES, transpose_fp16, &transp_args0_q); -#ifdef DEBUG + #ifdef DEBUG printf("\n\n\nT0_q result\n\ncoeffDataWinQ [^T]: %d %d\n", E, F); for (int j=0; j F ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -616,18 +616,18 @@ void pulp_mhsa_fp16_fw_cl(void *Mhsa_args) { matMul_args4.trans_B = 0; matMul_args4.USE_BIASES = 0; -#ifndef OPTIMIZE + #ifndef OPTIMIZE pi_cl_team_fork(NUM_CORES, mm_fp16, &matMul_args4); -#else + #else struct mm_manager_args_fp16 man_args4; man_args4.mm_args = &matMul_args4; man_args4.layer_type = LAYER_LINEAR; man_args4.step_type = STEP_FW; man_args4.matmul_type = opt_matmul_type; //MATMUL_TYPE pi_cl_team_fork(NUM_CORES, mm_manager_fp16, &man_args4); -#endif + #endif -#ifdef DEBUG + #ifdef DEBUG printf("\n\n\nM4 result\n\ncoeffDataWout: %d %d\n", E, F); for (int j=0; j Date: Mon, 26 May 2025 14:50:14 +0200 Subject: [PATCH 8/9] Add model shape inferencing --- tests/test_tiny_vit_fp32/utils/GM.py | 30 ++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/tests/test_tiny_vit_fp32/utils/GM.py b/tests/test_tiny_vit_fp32/utils/GM.py index cfc58f7e..b5f32ac0 100644 --- a/tests/test_tiny_vit_fp32/utils/GM.py +++ b/tests/test_tiny_vit_fp32/utils/GM.py @@ -131,6 +131,14 @@ def onnx_parser(onnx_model): used_data, all_used_data, all_elements, adapt_onnx_name(node.name) ) + if node.op_type == "Transpose": + new_shape = list(all_elements[node.output[0]]["shape"]) + + for reshape_output in onnx_model.graph.value_info: + if reshape_output.name == node.output[0]: + for iii, d in enumerate(reshape_output.type.tensor_type.shape.dim): + d.dim_value = new_shape[iii] + elif node.op_type == "Identity": the_data = all_elements[node.input[0]]["val"] @@ -225,6 +233,11 @@ def onnx_parser(onnx_model): "shape": tuple(new_shape), "data": all_elements[node.input[0]]["data"], } + + for reshape_output in onnx_model.graph.value_info: + if reshape_output.name == node.output[0]: + for iii, d in enumerate(reshape_output.type.tensor_type.shape.dim): + d.dim_value = new_shape[iii] else: raise NotImplementedError( f"Operation {node.op_type} is not implemented in the parser." 
@@ -354,6 +367,12 @@ def main(): training=torch.onnx.TrainingMode.EVAL, export_params=True, ) + + # Infer node output dimensions + onnx_model = onnx.load("TinyViT.onnx") + onnx_model = onnx.shape_inference.infer_shapes(onnx_model) + + onnx.save_model(onnx_model, "TinyViT.onnx") onnx_model = onnx.load("TinyViT.onnx") # Parse onnx @@ -367,6 +386,17 @@ def main(): input_name, ) = onnx_parser(onnx_model) + inputs_to_save = [ + sample_input.numpy(), + ] + list(parameter_arrays.values()) + + np.savez("inputs.npz", *inputs_to_save) + np.savez("outputs.npz", input=model(sample_input).detach()) + + onnx.save_model(onnx_model, "TinyViT.onnx") + + input("Stop if only exporting ONNX model.") + # Write input sequence input_writer( file_root_dir=root_dir, input_name=input_name, input_array=sample_input From 569b08a7aa7aeb26f676a9fda748b89c25f2f758 Mon Sep 17 00:00:00 2001 From: diaco Date: Mon, 26 May 2025 14:53:22 +0200 Subject: [PATCH 9/9] Revert "Add model shape inferencing" This reverts commit c457faa139249b598bc6df9c549a84d2a355bb1f. --- tests/test_tiny_vit_fp32/utils/GM.py | 30 ---------------------------- 1 file changed, 30 deletions(-) diff --git a/tests/test_tiny_vit_fp32/utils/GM.py b/tests/test_tiny_vit_fp32/utils/GM.py index b5f32ac0..cfc58f7e 100644 --- a/tests/test_tiny_vit_fp32/utils/GM.py +++ b/tests/test_tiny_vit_fp32/utils/GM.py @@ -131,14 +131,6 @@ def onnx_parser(onnx_model): used_data, all_used_data, all_elements, adapt_onnx_name(node.name) ) - if node.op_type == "Transpose": - new_shape = list(all_elements[node.output[0]]["shape"]) - - for reshape_output in onnx_model.graph.value_info: - if reshape_output.name == node.output[0]: - for iii, d in enumerate(reshape_output.type.tensor_type.shape.dim): - d.dim_value = new_shape[iii] - elif node.op_type == "Identity": the_data = all_elements[node.input[0]]["val"] @@ -233,11 +225,6 @@ def onnx_parser(onnx_model): "shape": tuple(new_shape), "data": all_elements[node.input[0]]["data"], } - - for reshape_output in onnx_model.graph.value_info: - if reshape_output.name == node.output[0]: - for iii, d in enumerate(reshape_output.type.tensor_type.shape.dim): - d.dim_value = new_shape[iii] else: raise NotImplementedError( f"Operation {node.op_type} is not implemented in the parser." @@ -367,12 +354,6 @@ def main(): training=torch.onnx.TrainingMode.EVAL, export_params=True, ) - - # Infer node output dimensions - onnx_model = onnx.load("TinyViT.onnx") - onnx_model = onnx.shape_inference.infer_shapes(onnx_model) - - onnx.save_model(onnx_model, "TinyViT.onnx") onnx_model = onnx.load("TinyViT.onnx") # Parse onnx @@ -386,17 +367,6 @@ def main(): input_name, ) = onnx_parser(onnx_model) - inputs_to_save = [ - sample_input.numpy(), - ] + list(parameter_arrays.values()) - - np.savez("inputs.npz", *inputs_to_save) - np.savez("outputs.npz", input=model(sample_input).detach()) - - onnx.save_model(onnx_model, "TinyViT.onnx") - - input("Stop if only exporting ONNX model.") - # Write input sequence input_writer( file_root_dir=root_dir, input_name=input_name, input_array=sample_input
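
---

Note on the transposition API used throughout this series: patches 1, 2, and 4 all perform the same migration, renaming the old `matrix`/`transp_matrix` fields to `in_matrix`/`out_matrix` and passing the shape through explicit `dim`/`transposed_axes` arrays with `n_dim`. Below is a minimal C sketch of that call pattern; the struct field names and the `pi_cl_team_fork(NUM_CORES, transpose, ...)` invocation mirror the fp32 hunks above, while the wrapper function, the header name, and the buffer handling are assumptions for illustration only.

    /* Minimal sketch of the transp_args convention this series migrates to.
     * Field names (in_matrix, out_matrix, dim, transposed_axes, n_dim) and
     * the pi_cl_team_fork(NUM_CORES, transpose, ...) call mirror the fp32
     * hunks above; the header name and wrapper function are assumptions. */
    #include "pulp_train_utils_fp32.h"

    void transpose_EF_example(float *src, float *dst, int E, int F) {
        int dims[2]    = {E, F};   /* input shape: E x F                    */
        int tr_axes[2] = {1, 0};   /* swap axes 0 and 1 -> output is F x E  */

        struct transp_args args;
        args.in_matrix       = src;   /* formerly args.matrix        */
        args.out_matrix      = dst;   /* formerly args.transp_matrix */
        args.dim             = dims;
        args.transposed_axes = tr_axes;
        args.n_dim           = 2;

        /* Run the transpose kernel on all cluster cores. */
        pi_cl_team_fork(NUM_CORES, transpose, &args);
    }

The fp16 path is identical except for the element type and kernel name (`struct transp_args_fp16` with `transpose_fp16`), and the tiled DMA variant keeps the separate `N`/`M` fields, as the hunks in `tiled_mhsa_fp16`/`tiled_mhsa_fp32` show.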