diff --git a/lib/sources/pulp_mhsa_fp16.c b/lib/sources/pulp_mhsa_fp16.c index e7b62e8a..45c0aacd 100644 --- a/lib/sources/pulp_mhsa_fp16.c +++ b/lib/sources/pulp_mhsa_fp16.c @@ -34,7 +34,7 @@ void pulp_mhsa_fp16_fw_cl(void *Mhsa_args) { fp16 *inputData = mhsa_args->input->data; // Input vector (Transposed, E x L) fp16 *outData = mhsa_args->output->data; // Output sequence (Transposed, E x L) fp16 *temp = mhsa_args->temp_buffer; // Support buffer used in the attention head loop - fp16 *softmax_buffer = mhsa_args->softmax_buffer->data; // Buffer containing the softmax results (necessary to save for backward pass) + fp16 *softmax_buffer = mhsa_args->softmax_buffer->data; // Buffer containing the softmax results (necessary to save for backward pass); TODO: Only save in its entirety for bw; otherwise just for the respective head part fp16 *maxes = mhsa_args->maxes; // Buffer containing the row-wise maxes in the softmax process fp16 *sums = mhsa_args->sums; // Buffer containing the row-wise exponential sums in the softmax process fp16 *q = mhsa_args->q->data; // Pointer to the first element of Q @@ -48,9 +48,9 @@ void pulp_mhsa_fp16_fw_cl(void *Mhsa_args) { int E = mhsa_args->input->W; // Input Sequence element size int F = mhsa_args->attention_map->W; // Hidden dimension of attention (N. Heads * Head dimension) -#ifdef DEBUG + #ifdef DEBUG printf("\n~~~~~~~~~~~~~~~FORWARD PASS~~~~~~~~~~~~~~~\n\nPrinting the parameters: L-%d, E-%d, F-%d", L, E, F); -#endif + #endif int H = F / n_heads; // Head dimension fp16 scaling = (fp16)(1 / sqrt(H)); // Scaling factor to avoid vanishing gradients @@ -76,8 +76,8 @@ void pulp_mhsa_fp16_fw_cl(void *Mhsa_args) { // T0_q struct transp_args_fp16 transp_args0_q; - int dim[] = {E, F}; - int tr_axes[] = {1, 0}; + int dim[2] = {E, F}; + int tr_axes[2] = {1, 0}; transp_args0_q.in_matrix = coeffDataWinQ; transp_args0_q.out_matrix = temp; @@ -87,21 +87,21 @@ void pulp_mhsa_fp16_fw_cl(void *Mhsa_args) { pi_cl_team_fork(NUM_CORES, transpose_fp16, &transp_args0_q); -#ifdef DEBUG + #ifdef DEBUG printf("\n\n\nT0_q result\n\ncoeffDataWinQ [^T]: %d %d\n", E, F); for (int j=0; j F ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -604,19 +614,20 @@ void pulp_mhsa_fp16_fw_cl(void *Mhsa_args) { matMul_args4.K = F; matMul_args4.M = L; matMul_args4.trans_B = 0; + matMul_args4.USE_BIASES = 0; -#ifndef OPTIMIZE + #ifndef OPTIMIZE pi_cl_team_fork(NUM_CORES, mm_fp16, &matMul_args4); -#else + #else struct mm_manager_args_fp16 man_args4; man_args4.mm_args = &matMul_args4; man_args4.layer_type = LAYER_LINEAR; man_args4.step_type = STEP_FW; man_args4.matmul_type = opt_matmul_type; //MATMUL_TYPE pi_cl_team_fork(NUM_CORES, mm_manager_fp16, &man_args4); -#endif + #endif -#ifdef DEBUG + #ifdef DEBUG printf("\n\n\nM4 result\n\ncoeffDataWout: %d %d\n", E, F); for (int j=0; jmatrix + i * tile_h + j * tile_w * M), (uint32_t) (IN_DATA), 2 * tile_dim, 2 * M, 2 * tile_h, PI_CL_DMA_DIR_EXT2LOC, cmd_load); + pi_cl_dma_cmd_2d((uint32_t) (args->in_matrix + i * tile_h + j * tile_w * M), (uint32_t) (IN_DATA), 2 * tile_dim, 2 * M, 2 * tile_h, PI_CL_DMA_DIR_EXT2LOC, cmd_load); pi_cl_dma_cmd_wait(cmd_load); pi_cl_team_fork(NUM_CORES, transpose_fp16, &args_l1); - pi_cl_dma_cmd_2d((uint32_t) (args->transp_matrix + j * tile_w + i * tile_h * N), (uint32_t) (OUT_DATA), 2 * tile_dim, 2 * N, 2 * tile_w, PI_CL_DMA_DIR_LOC2EXT, cmd_store); + pi_cl_dma_cmd_2d((uint32_t) (args->out_matrix + j * tile_w + i * tile_h * N), (uint32_t) (OUT_DATA), 2 * tile_dim, 2 * N, 2 * tile_w, PI_CL_DMA_DIR_LOC2EXT, cmd_store); 
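A minimal sketch, not part of the patch, of how the renamed transpose interface is driven after this change. It assumes the fp16 argument struct mirrors the fp32 `transp_args` fields used elsewhere in this diff (`dim`, `transposed_axes`, `n_dim`); the buffers and shapes below are placeholders:

    // Hypothetical helper: transpose a rows x cols fp16 matrix using the renamed
    // argument fields introduced by this patch (in_matrix / out_matrix plus an
    // explicit dim / transposed_axes description of the permutation).
    static void transpose_2d_fp16_sketch(fp16 *src, fp16 *dst, int rows, int cols) {
        struct transp_args_fp16 t_args;
        int dim[2]     = {rows, cols};  // input shape
        int tr_axes[2] = {1, 0};        // swap the two axes: a plain 2D transpose

        t_args.in_matrix       = src;   // was ".matrix" before this patch
        t_args.out_matrix      = dst;   // was ".transp_matrix" before this patch
        t_args.dim             = dim;   // field names assumed to match the fp32 struct
        t_args.transposed_axes = tr_axes;
        t_args.n_dim           = 2;

        // Run the parallel transpose kernel on the cluster cores, as the patch does.
        pi_cl_team_fork(NUM_CORES, transpose_fp16, &t_args);
    }

The same pattern (local `dim[]` / `tr_axes[]` arrays declared next to the argument struct) appears in every converted call site in this diff.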
pi_cl_dma_cmd_wait(cmd_store); } } @@ -2225,8 +2256,8 @@ void tiled_mhsa_fp16(void *Mhsa_args, void* Tiled_mhsa_matmul_args){ for (int i = 0; i < n_heads; i++) { // T1 struct transp_args_fp16 transp_args1; - transp_args1.matrix = kt + L * i * H; - transp_args1.transp_matrix = temp; + transp_args1.in_matrix = kt + L * i * H; + transp_args1.out_matrix = temp; transp_args1.N = H; transp_args1.M = L; @@ -2279,8 +2310,8 @@ void tiled_mhsa_fp16(void *Mhsa_args, void* Tiled_mhsa_matmul_args){ // row-wise max and sums, therefore it is necessary to transpose the current head buffer. // T2 struct transp_args_fp16 transp_args2; - transp_args2.matrix = softmax_buffer + i * L * L; - transp_args2.transp_matrix = temp; + transp_args2.in_matrix = softmax_buffer + i * L * L; + transp_args2.out_matrix = temp; transp_args2.N = L; transp_args2.M = L; @@ -2329,8 +2360,8 @@ void tiled_mhsa_fp16(void *Mhsa_args, void* Tiled_mhsa_matmul_args){ // Each head result has to be appended to the full attention map, to do so we require to store the current // softmax buffer data following the H x L convention, therefore we need to transpose the memory buffer again. struct transp_args_fp16 transp_args3; - transp_args3.matrix = softmax_buffer + i * L * L; - transp_args3.transp_matrix = temp; + transp_args3.in_matrix = softmax_buffer + i * L * L; + transp_args3.out_matrix = temp; transp_args3.N = L; transp_args3.M = L; @@ -2376,8 +2407,8 @@ void tiled_mhsa_fp16(void *Mhsa_args, void* Tiled_mhsa_matmul_args){ // T4 // The last transpose to original shape struct transp_args_fp16 transp_args4; - transp_args4.matrix = temp; - transp_args4.transp_matrix = outData; + transp_args4.in_matrix = temp; + transp_args4.out_matrix = outData; transp_args4.N = mhsa_args->input_bn->W; transp_args4.M = L; diff --git a/lib/sources/pulp_mhsa_fp32.c b/lib/sources/pulp_mhsa_fp32.c index 8bb70edf..b4147044 100644 --- a/lib/sources/pulp_mhsa_fp32.c +++ b/lib/sources/pulp_mhsa_fp32.c @@ -75,7 +75,7 @@ void pulp_mhsa_fp32_fw_cl(void *Mhsa_args) { // T0_q int dims[] = {E, F}; - int t_axes = {1, 0}; + int t_axes[] = {1, 0}; struct transp_args transp_args0_q; transp_args0_q.in_matrix = coeffDataWinQ; @@ -90,14 +90,14 @@ void pulp_mhsa_fp32_fw_cl(void *Mhsa_args) { printf("\n\n\nT0_q result\n\ncoeffDataWinQ [^T]: %d %d\n", E, F); for (int j=0; j L x L ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ // T1 - dims = {H, L}; + dims[0] = H; + dims[1] = L; struct transp_args transp_args1; @@ -351,14 +354,14 @@ void pulp_mhsa_fp32_fw_cl(void *Mhsa_args) { printf("\n\n\nHead %d - T1 result\n\nk: %d %d\n", i, H, L); for (int j=0; j H x L ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ // T6 - dims[] = {H, L}; + dims[0] = H; + dims[1] = L; struct transp_args transp_args6; @@ -1229,14 +1239,14 @@ void pulp_mhsa_fp32_bw_cl(void *Mhsa_args) { printf("\n\n\nHead %d - T6 result\n\nq: %d %d\n", i, H, L); for (int j=0; j E x L ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ // T8 - dims[] = {E, L}; + dims[0] = E; + dims[1] = L; struct transp_args transp_args8; @@ -1439,14 +1451,14 @@ void pulp_mhsa_fp32_bw_cl(void *Mhsa_args) { printf("\n\n\nT8 result\n\ninputData: %d %d\n", E, L); for (int j=0; jmatrix + i * tile_h + j * tile_w * M), (uint32_t) (IN_DATA), 4 * tile_dim, 4 * M, 4 * tile_h, PI_CL_DMA_DIR_EXT2LOC, cmd_load); + pi_cl_dma_cmd_2d((uint32_t) (args->in_matrix + i * tile_h + j * tile_w * M), (uint32_t) (IN_DATA), 4 * tile_dim, 4 * M, 4 * tile_h, PI_CL_DMA_DIR_EXT2LOC, cmd_load); pi_cl_dma_cmd_wait(cmd_load); pi_cl_team_fork(NUM_CORES, transpose, &args_l1); - pi_cl_dma_cmd_2d((uint32_t) 
(args->transp_matrix + j * tile_w + i * tile_h * N), (uint32_t) (OUT_DATA), 4 * tile_dim, 4 * N, 4 * tile_w, PI_CL_DMA_DIR_LOC2EXT, cmd_store); + pi_cl_dma_cmd_2d((uint32_t) (args->out_matrix + j * tile_w + i * tile_h * N), (uint32_t) (OUT_DATA), 4 * tile_dim, 4 * N, 4 * tile_w, PI_CL_DMA_DIR_LOC2EXT, cmd_store); pi_cl_dma_cmd_wait(cmd_store); } } @@ -2248,8 +2253,8 @@ void tiled_mhsa_fp32(void *Mhsa_args, void* Tiled_mhsa_matmul_args){ for (int i = 0; i < n_heads; i++) { // T1 struct transp_args transp_args1; - transp_args1.matrix = kt + L * i * H; - transp_args1.transp_matrix = temp; + transp_args1.in_matrix = kt + L * i * H; + transp_args1.out_matrix = temp; transp_args1.N = H; transp_args1.M = L; @@ -2303,8 +2308,8 @@ void tiled_mhsa_fp32(void *Mhsa_args, void* Tiled_mhsa_matmul_args){ // row-wise max and sums, therefore it is necessary to transpose the current head buffer. // T2 struct transp_args transp_args2; - transp_args2.matrix = softmax_buffer + i * L * L; - transp_args2.transp_matrix = temp; + transp_args2.in_matrix = softmax_buffer + i * L * L; + transp_args2.out_matrix = temp; transp_args2.N = L; transp_args2.M = L; @@ -2354,8 +2359,8 @@ void tiled_mhsa_fp32(void *Mhsa_args, void* Tiled_mhsa_matmul_args){ // Each head result has to be appended to the full attention map, to do so we require to store the current // softmax buffer data following the H x L convention, therefore we need to transpose the memory buffer again. struct transp_args transp_args3; - transp_args3.matrix = softmax_buffer + i * L * L; - transp_args3.transp_matrix = temp; + transp_args3.in_matrix = softmax_buffer + i * L * L; + transp_args3.out_matrix = temp; transp_args3.N = L; transp_args3.M = L; @@ -2401,8 +2406,8 @@ void tiled_mhsa_fp32(void *Mhsa_args, void* Tiled_mhsa_matmul_args){ // T4 // The last transpose to original shape struct transp_args transp_args4; - transp_args4.matrix = temp; - transp_args4.transp_matrix = outData; + transp_args4.in_matrix = temp; + transp_args4.out_matrix = outData; transp_args4.N = mhsa_args->input_bn->W; transp_args4.M = L; @@ -2460,8 +2465,8 @@ void pulp_mhsa_mobilebert_inference_fp32_bw_cl(void *Mhsa_args){ // T1 struct transp_args transp_args1; - transp_args1.matrix = attention_map; - transp_args1.transp_matrix = temp; + transp_args1.in_matrix = attention_map; + transp_args1.out_matrix = temp; transp_args1.N = F; transp_args1.M = L; @@ -2491,8 +2496,8 @@ void pulp_mhsa_mobilebert_inference_fp32_bw_cl(void *Mhsa_args){ // T2 struct transp_args transp_args2; - transp_args2.matrix = coeffDataWout; - transp_args2.transp_matrix = temp; + transp_args2.in_matrix = coeffDataWout; + transp_args2.out_matrix = temp; transp_args2.N = E; transp_args2.M = F; @@ -2546,8 +2551,8 @@ void pulp_mhsa_mobilebert_inference_fp32_bw_cl(void *Mhsa_args){ // T3 struct transp_args transp_args3; - transp_args3.matrix = v + i * L * H; - transp_args3.transp_matrix = temp; + transp_args3.in_matrix = v + i * L * H; + transp_args3.out_matrix = temp; transp_args3.N = H; transp_args3.M = L; @@ -2601,8 +2606,8 @@ void pulp_mhsa_mobilebert_inference_fp32_bw_cl(void *Mhsa_args){ for (int i = 0; i < n_heads; i++) { // T4 struct transp_args transp_args4; - transp_args4.matrix = softmax_buffer_diff + i * L * L; - transp_args4.transp_matrix = temp; + transp_args4.in_matrix = softmax_buffer_diff + i * L * L; + transp_args4.out_matrix = temp; transp_args4.N = L; transp_args4.M = L; @@ -2621,8 +2626,8 @@ void pulp_mhsa_mobilebert_inference_fp32_bw_cl(void *Mhsa_args){ // T5 struct transp_args transp_args5; - 
transp_args5.matrix = grad; - transp_args5.transp_matrix = temp; + transp_args5.in_matrix = grad; + transp_args5.out_matrix = temp; transp_args5.N = L; transp_args5.M = L; @@ -2645,8 +2650,8 @@ void pulp_mhsa_mobilebert_inference_fp32_bw_cl(void *Mhsa_args){ // T6 struct transp_args transp_args6; - transp_args6.matrix = q + i * L * H; - transp_args6.transp_matrix = temp; + transp_args6.in_matrix = q + i * L * H; + transp_args6.out_matrix = temp; transp_args6.N = H; transp_args6.M = L; @@ -2676,8 +2681,8 @@ void pulp_mhsa_mobilebert_inference_fp32_bw_cl(void *Mhsa_args){ // T7 struct transp_args transp_args7; - transp_args7.matrix = k_diff + i * L * H; - transp_args7.transp_matrix = temp; + transp_args7.in_matrix = k_diff + i * L * H; + transp_args7.out_matrix = temp; transp_args7.N = L; transp_args7.M = H; diff --git a/lib/sources/pulp_residual_fp16.c b/lib/sources/pulp_residual_fp16.c index ba0917b8..72d033c0 100644 --- a/lib/sources/pulp_residual_fp16.c +++ b/lib/sources/pulp_residual_fp16.c @@ -41,20 +41,14 @@ void pulp_residualconn_fp16_fw(void *SkipConn_args_fp16) { return; } - int dims[] = {out->dim}; - struct vect_sum_args_fp16 args_sum; + args_sum.op_1 = skip->data; args_sum.op_2 = lout->data; args_sum.dest = out->data; + args_sum.size = out->dim; - args_sum.op_1_dims = dims; - args_sum.op_2_dims = dims; - - args_sum.op_1_dims_len = 1; - args_sum.op_2_dims_len = 1; - - pi_cl_team_fork(NUM_CORES, array_broadcast_sum_fp16, &args_sum); + pi_cl_team_fork(NUM_CORES, vect_sum_fp16, &args_sum); } @@ -77,21 +71,14 @@ void pulp_sumnode_fp16_bw(void *SkipConn_args_fp16) { return; } - int dims[] = {skip->dim}; - struct vect_sum_args_fp16 args_sum; args_sum.op_1 = out->diff; args_sum.op_2 = skip->diff; args_sum.dest = skip->diff; + args_sum.size = skip->dim; - args_sum.op_1_dims = dims; - args_sum.op2_dims = dims; - - args_sum.op_1_dims_len = 1; - args_sum.op_2_dims_len = 1; - - pi_cl_team_fork(NUM_CORES, array_broadcast_sum_fp16, &args_sum); + pi_cl_team_fork(NUM_CORES, vect_sum_fp16, &args_sum); } } diff --git a/lib/sources/pulp_rnn_fp32.c b/lib/sources/pulp_rnn_fp32.c index 8852a715..a8b9ce43 100644 --- a/lib/sources/pulp_rnn_fp32.c +++ b/lib/sources/pulp_rnn_fp32.c @@ -244,13 +244,13 @@ void pulp_rnn_fp32_bw_cl(void *Rnn_args) { // Calculate gradient for State Weights // Transpose State - int dims[] = {N, M}; - int t_axes[] = {1, 0}; + dims[0] = N; + dims[1] = M; struct transp_args transp_args2; - transp_args2.matrix = hiddState; - transp_args2.transp_matrix = temp; + transp_args2.in_matrix = hiddState; + transp_args2.out_matrix = temp; transp_args2.dim = dims; transp_args2.transposed_axes = t_axes; transp_args2.n_dim = 2; @@ -301,7 +301,8 @@ void pulp_rnn_fp32_bw_cl(void *Rnn_args) { // Calculate the Gradient of the Input // Transpose Input Weights - dims = {K, M}; + dims[0] = K; + dims[1] = M; struct transp_args transp_args3; diff --git a/tests/test_matmul/utils/GM.py b/tests/test_matmul/utils/GM.py index dec5b153..33b952e6 100644 --- a/tests/test_matmul/utils/GM.py +++ b/tests/test_matmul/utils/GM.py @@ -127,12 +127,41 @@ B.transpose(0, 1) else: C = torch.mm(input=A, mat2=B, out=C) + elif (data_type == 'bf16'): + # Matrices to be multiplied + A = torch.Tensor(in_size, mid_size).to(torch.bfloat16) + if transp == '1': + B = torch.Tensor(out_size, mid_size).to(torch.bfloat16) + else: + B = torch.Tensor(mid_size, out_size).to(torch.bfloat16) + C = torch.Tensor(in_size, out_size).to(torch.bfloat16) + + A = torch.div(torch.randn(in_size, mid_size), divider).to(torch.bfloat16) + for i in 
range(A.shape[0]): + for j in range(A.shape[1]): + A[i][j] += (i+j+0.1)/divider + + if transp == '1': + B = torch.zeros(out_size, mid_size).to(torch.bfloat16) + else: + B = torch.zeros(mid_size, out_size).to(torch.bfloat16) + + for i in range(B.shape[0]): + for j in range(B.shape[1]): + B[i][j] = i*j+0.1 + + if transp == '1': + C = torch.mm(input=A, mat2=B.transpose(0, 1), out=C) + B.transpose(0, 1) + else: + C = torch.mm(input=A, mat2=B, out=C) else : # Error message print('Invalid data type selection!!') exit() - + if data_type == 'bf16': + data_type = 'fp16' # Print data and create data header file f = open('net_args.h', "w") diff --git a/tests/test_mhsa_fp16/Makefile b/tests/test_mhsa_fp16/Makefile index 0b4acd74..107907a1 100644 --- a/tests/test_mhsa_fp16/Makefile +++ b/tests/test_mhsa_fp16/Makefile @@ -1,10 +1,10 @@ APP = mhsa_fp16 # User settings -IN_H?=196 # Sequence Length -IN_W?=160 # Token Size -N_HEADS?=5 -ATT_DIM?=160 #Hidden dimension +IN_H?=20 # Sequence Length +IN_W?=40 # Token Size +N_HEADS?=2 +ATT_DIM?=40 #Hidden dimension IN_CH?=1 OUT_CH?=1 diff --git a/tests/test_mhsa_fp16/net.h b/tests/test_mhsa_fp16/net.h index 7e4fabca..2149e63d 100644 --- a/tests/test_mhsa_fp16/net.h +++ b/tests/test_mhsa_fp16/net.h @@ -34,8 +34,8 @@ #define Tker_l0 (Tin_l0*Tout_l0) // Tensor checksum definition -#define CHECK_TOLERANCE 0.001 -#define ERROR_TOLERANCE 0.001 +#define CHECK_TOLERANCE 0x00000021 +#define ERROR_TOLERANCE 0x00000001 // PULP DEFINES #define STACK_SIZE 4096 diff --git a/tests/test_mhsa_fp16/utils/GM.py b/tests/test_mhsa_fp16/utils/GM.py index 6dd68596..78b6a95d 100644 --- a/tests/test_mhsa_fp16/utils/GM.py +++ b/tests/test_mhsa_fp16/utils/GM.py @@ -1,402 +1,402 @@ -""" -Copyright (C) 2021-2022 ETH Zurich and University of Bologna -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
- -Authors: Francesco Conoscenti (francesco.conoscenti@studio.unibo.it), Alberto Dequino (alberto.dequino@unibo.it), - Calin Diaconu (calin.diaconu@studio.unibo.it) -""" - -import argparse -from copy import deepcopy - -import numpy as np # Matrix and vector computation package -import torch -import torch.nn as nn - -import dump_utils as dump -import mhsa - - -class MyNet(nn.Module): - # Define a simple network with a mhsa layer for testing - def __init__(self, in_w, n_heads, att_dim, bf16_format): - super().__init__() - self.mhsa = mhsa.MultiHeadedSelfAttention( - dim=in_w, num_heads=n_heads, att_dim=att_dim, bf16_format=bf16_format - ) - - def forward(self, x, tgt_len): - return self.mhsa(x=x, tgt_len=tgt_len) - - -def hook_fn1(_, __, o): - # Hook to write output gradients - f = open("mhsa-grads.h", "w") - - print("------------Output Grad------------") - for grad in o: - try: - output_grad = torch.transpose(grad, 0, 1) - f.write("#define G_OUTPUT_SIZE " + str(output_grad.numel()) + "\n") - print(output_grad) - - if current_step == "BACKWARD": - f.write( - "PI_L2 fp16 OUTPUT_GRAD[G_OUTPUT_SIZE] = {" - + dump.tensor_to_string(output_grad) - + "};\n" - ) - else: - f.write( - "PI_L2 fp16 OUTPUT_GRAD[G_OUTPUT_SIZE] = {" - + dump.tensor_to_string(output_grad) - + "};\n" - ) - except AttributeError: - print("None found for Gradient (output)") - - f.close() - - -def hook_fn2(_, __, o): - # Hook for writing output to file - cont = 0 - f = open("mhsa-output.h", "w") - - print("------------Output------------") - for grad in o: - try: - if cont == 0: - output_grad = grad - f.write("#define OUTPUT_SIZE " + str(output_grad.numel()) + "\n") - - if bf16_format == 0: - print(output_grad.half()) - f.write( - "PI_L2 fp16 OUTPUT[OUTPUT_SIZE] = {" - + dump.tensor_to_string(output_grad) - + "};\n" - ) - else: - print(output_grad.bfloat16()) - f.write( - "PI_L2 fp16 OUTPUT[OUTPUT_SIZE] = {" - + dump.tensor_to_string(output_grad) - + "};\n" - ) - - cont += 1 - except AttributeError: - print("None found for Output") - - f.close() - - -if __name__ == "__main__": - # ~~~~~~~~~~ INTRO ~~~~~~~~~~ - # Set the seed for reproducibility - np.random.seed(seed=1) # <----- Sneed - torch.manual_seed(0) - - # Visualize data with more precision - torch.set_printoptions(precision=10, sci_mode=False) - - # Set up parser - parser = argparse.ArgumentParser("MHSA Layer Test") - parser.add_argument("--in_width", type=int, default=8) # Token size - parser.add_argument("--in_height", type=int, default=4) # Sequence length - parser.add_argument("--ch_in", type=int, default=1) - parser.add_argument("--ch_out", type=int, default=1) - parser.add_argument("--n_heads", type=int, default=8) - parser.add_argument("--weight", type=float, default=0.1) - parser.add_argument("--att_dim", type=int, default=8) - parser.add_argument( - "--bf16_format", type=int, default=1 - ) # if == 1, data format if bfloat16, if 0 is float16 - # Possible steps: FORWARD, BACKWARD_GRAD, BACKWARD_ERROR - parser.add_argument("--step", type=str, default="FORWARD") - - args = parser.parse_args() - - # Read arguments - in_h = args.in_height - in_w = args.in_width - ch_in = args.ch_in - ch_out = args.ch_out - n_heads = args.n_heads - current_step = args.step - weight_init = args.weight - att_dim = args.att_dim - head_dim = int(att_dim / n_heads) - bf16_format = args.bf16_format - - # Write net step to file - f_step = open("step-check.h", "w") - f_step.write("#define " + str(current_step) + "\n") - f_step.close() - - # Write input/output weights to file - f = 
open("init-defines.h", "w") - - f.write("#define Tin_C_l1 " + str(ch_in) + "\n") - f.write("#define Tin_H_l1 " + str(in_h) + "\n") - f.write("#define Tin_W_l1 " + str(in_w) + "\n") - f.write("#define Tout_C_l1 " + str(ch_out) + "\n") - f.write("#define Tn_heads_l1 " + str(n_heads) + "\n") - f.write("#define Tatt_dim_l1 " + str(att_dim) + "\n") - f.write("#define Thead_dim_l1 " + str(head_dim) + "\n") - if current_step == "FORWARD": - f.write( - "#define Ttemp_max " + str(int(max(in_h * head_dim, in_h * in_h))) + "\n" - ) - else: - f.write( - "#define Ttemp_max " - + str( - int(max(in_h * att_dim, 3 * att_dim * in_w, in_h * in_h, in_h * in_w)) - ) - + "\n" - ) - - f.close() - - # Define network and add hook - if bf16_format == 0: - net = MyNet( - in_w=in_w, n_heads=n_heads, att_dim=att_dim, bf16_format=bf16_format - ).half() - elif bf16_format == 1: - net = MyNet( - in_w=in_w, n_heads=n_heads, att_dim=att_dim, bf16_format=bf16_format - ).bfloat16() - net.zero_grad() - - gradsRnn = net.mhsa.register_full_backward_hook(hook_fn1) - - # ~~~~~~~~~~ MANAGE INPUT ~~~~~~~~~~ - # Generate random input data - inp = torch.randn(ch_in, in_h, in_w) - - # Print input data to terminal - print("------------Input sequence------------") - print(inp) - - # Write transpose of input data to file - inp_copy = torch.transpose(inp, -1, -2) - - f = open("input-sequence.h", "w") - f.write("#define INPUT_SIZE " + str(inp.numel()) + "\n") - f.write( - "PI_L2 fp16 INPUT[INPUT_SIZE] = {" + dump.tensor_to_string(inp_copy) + "};\n" - ) - f.close() - - if bf16_format == 0: - inp = inp.half() - else: - inp = inp.bfloat16() - inp.requires_grad = True - - # ~~~~~~~~~~ MANAGE INPUT WEIGHTS ~~~~~~~~~~ - # Generate random input weights - in_wgt_init_tensor_q = torch.randn(att_dim, in_w) - in_wgt_init_tensor_k = torch.randn(att_dim, in_w) - in_wgt_init_tensor_v = torch.randn(att_dim, in_w) - - in_bias_init_tensor_q = torch.randn(att_dim) - in_bias_init_tensor_k = torch.randn(att_dim) - in_bias_init_tensor_v = torch.randn(att_dim) - - # Copy input weights to network - with torch.no_grad(): - if bf16_format == 0: - net.mhsa.proj_q.weight.data = deepcopy(in_wgt_init_tensor_q.half()) - net.mhsa.proj_k.weight.data = deepcopy(in_wgt_init_tensor_k.half()) - net.mhsa.proj_v.weight.data = deepcopy(in_wgt_init_tensor_v.half()) - - net.mhsa.proj_q.bias.data = deepcopy(in_bias_init_tensor_q.half()) - net.mhsa.proj_k.bias.data = deepcopy(in_bias_init_tensor_k.half()) - net.mhsa.proj_v.bias.data = deepcopy(in_bias_init_tensor_v.half()) - else: - net.mhsa.proj_q.weight.data = deepcopy(in_wgt_init_tensor_q.bfloat16()) - net.mhsa.proj_k.weight.data = deepcopy(in_wgt_init_tensor_k.bfloat16()) - net.mhsa.proj_v.weight.data = deepcopy(in_wgt_init_tensor_v.bfloat16()) - - net.mhsa.proj_q.bias.data = deepcopy(in_bias_init_tensor_q.bfloat16()) - net.mhsa.proj_k.bias.data = deepcopy(in_bias_init_tensor_k.bfloat16()) - net.mhsa.proj_v.bias.data = deepcopy(in_bias_init_tensor_v.bfloat16()) - - # Print input weights to terminal - print("Shape input weights:") - print(net.mhsa.proj_q.weight.shape) - print("Shape input biases:") - print(net.mhsa.proj_q.bias.shape) - print("q:") - print(net.mhsa.proj_q.weight.data) - print("k:") - print(net.mhsa.proj_k.weight.data) - print("v:") - print(net.mhsa.proj_v.weight.data) - print("\n") - - # Write input weights to init file - f = open("init-defines.h", "a") - f.write("\n\n// Input Projections Weight Initialization\n") - f.write("#define INPUT_WGT_SIZE (" + str(in_wgt_init_tensor_q.numel()) + ")\n") - f.write( - 
"PI_L2 fp16 INPUT_WEIGHTS_Q[INPUT_WGT_SIZE] = {" - + dump.tensor_to_string(in_wgt_init_tensor_q.transpose(0, 1)) - + "};\n" - ) - f.write( - "PI_L2 fp16 INPUT_WEIGHTS_K[INPUT_WGT_SIZE] = {" - + dump.tensor_to_string(in_wgt_init_tensor_k.transpose(0, 1)) - + "};\n" - ) - f.write( - "PI_L2 fp16 INPUT_WEIGHTS_V[INPUT_WGT_SIZE] = {" - + dump.tensor_to_string(in_wgt_init_tensor_v.transpose(0, 1)) - + "};\n" - ) - - f.write("\n\n// Input Projections Biases Initialization\n") - f.write("#define INPUT_BIAS_SIZE (" + str(in_bias_init_tensor_q.numel()) + ")\n") - f.write( - "PI_L2 fp16 INPUT_BIASES_Q[INPUT_BIAS_SIZE] = {" - + dump.tensor_to_string(in_bias_init_tensor_q) - + "};\n" - ) - f.write( - "PI_L2 fp16 INPUT_BIASES_K[INPUT_BIAS_SIZE] = {" - + dump.tensor_to_string(in_bias_init_tensor_k) - + "};\n" - ) - f.write( - "PI_L2 fp16 INPUT_BIASES_V[INPUT_BIAS_SIZE] = {" - + dump.tensor_to_string(in_bias_init_tensor_v) - + "};\n" - ) - f.close() - - # ~~~~~~~~~~ MANAGE OUTPUT WEIGHTS ~~~~~~~~~~ - # Generate random output weights - output_proj_wgt_init_tensor = torch.randn(in_w, att_dim) - - # Copy output weights to network - with torch.no_grad(): - if bf16_format == 0: - net.mhsa.proj_out.weight.data = deepcopy(output_proj_wgt_init_tensor.half()) - else: - net.mhsa.proj_out.weight.data = deepcopy( - output_proj_wgt_init_tensor.bfloat16() - ) - - # Print output weights to terminal - print("Shape output projection weights:") - print(net.mhsa.proj_out.weight.data.shape) - print(net.mhsa.proj_out.weight.data) - print("\n") - - # Write output weights to init file - f = open("init-defines.h", "a") - f.write("\n\n") - f.write( - "#define OUTPUT_WGT_SIZE (" + str(output_proj_wgt_init_tensor.numel()) + ")\n" - ) - f.write( - "PI_L2 fp16 OUTPUT_WEIGHTS[OUTPUT_WGT_SIZE] = {" - + dump.tensor_to_string(output_proj_wgt_init_tensor) - + "};\n" - ) - f.close() - - # ~~~~~~~~~~ COMPUTE OUTPUT ~~~~~~~~~~ - if bf16_format == 0: - label = torch.ones(in_h, in_w).half() - else: - label = torch.ones(in_h, in_w).bfloat16() - criterion = nn.MSELoss() - out = net(x=inp, tgt_len=in_h) - - # Print output to terminal - print("out: ") - print(out.size()) - print(label.size()) - print(out) - - # Compute loss - loss = criterion(out.float(), label.float()) - - # Write output to file - out_copy = torch.transpose(out, -1, -2) - - f = open("mhsa-output.h", "w") - f.write("#define OUTPUT_SIZE " + str(out.numel()) + "\n") - f.write( - "PI_L2 fp16 OUTPUT[OUTPUT_SIZE] = {" + dump.tensor_to_string(out_copy) + "};\n" - ) - f.close() - - # Compute gradients - net.zero_grad() - loss.backward() - - input_wgt_grad_q = net.mhsa.proj_q.weight.grad - input_wgt_grad_k = net.mhsa.proj_k.weight.grad - input_wgt_grad_v = net.mhsa.proj_v.weight.grad - output_wgt_grad = net.mhsa.proj_out.weight.grad - input_grad = inp.grad.transpose(1, 2) - - # Write gradients to file - f = open("mhsa-grads.h", "a") - - f.write("#define G_INPUT_WGT_SIZE " + str(input_wgt_grad_q.numel()) + "\n") - f.write( - "PI_L2 fp16 INPUT_WGT_GRAD_Q[G_INPUT_WGT_SIZE] = {" - + dump.tensor_to_string(input_wgt_grad_q) - + "};\n" - ) - f.write( - "PI_L2 fp16 INPUT_WGT_GRAD_K[G_INPUT_WGT_SIZE] = {" - + dump.tensor_to_string(input_wgt_grad_k) - + "};\n" - ) - f.write( - "PI_L2 fp16 INPUT_WGT_GRAD_V[G_INPUT_WGT_SIZE] = {" - + dump.tensor_to_string(input_wgt_grad_v) - + "};\n" - ) - - f.write("#define G_OUTPUT_WGT_SIZE " + str(output_wgt_grad.numel()) + "\n") - f.write( - "PI_L2 fp16 OUTPUT_WGT_GRAD[G_OUTPUT_WGT_SIZE] = {" - + dump.tensor_to_string(output_wgt_grad) - + "};\n" - ) - - 
f.write("#define G_IN_SIZE " + str(input_grad.numel()) + "\n") - f.write( - "PI_L2 fp16 INPUT_GRAD[G_IN_SIZE] = {" - + dump.tensor_to_string(input_grad) - + "};\n" - ) - - f.close() - - # Write attention scores to file - f = open("attention_scores.h", "w") - f.write("#define ATTENTION_S_LENGTH " + str(net.mhsa.scores.numel()) + "\n") - f.write( - "PI_L2 fp16 ATTENTION_SCORES[ATTENTION_S_LENGTH] = {" - + dump.tensor_to_string(torch.transpose(net.mhsa.scores, 0, 1)) - + "};\n" - ) - f.close() +""" +Copyright (C) 2021-2022 ETH Zurich and University of Bologna +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. + +Authors: Francesco Conoscenti (francesco.conoscenti@studio.unibo.it), Alberto Dequino (alberto.dequino@unibo.it), + Calin Diaconu (calin.diaconu@studio.unibo.it) +""" + +import argparse +from copy import deepcopy + +import numpy as np # Matrix and vector computation package +import torch +import torch.nn as nn + +import dump_utils as dump +import mhsa + + +class MyNet(nn.Module): + # Define a simple network with a mhsa layer for testing + def __init__(self, in_w, n_heads, att_dim, bf16_format): + super().__init__() + self.mhsa = mhsa.MultiHeadedSelfAttention( + dim=in_w, num_heads=n_heads, att_dim=att_dim, bf16_format=bf16_format + ) + + def forward(self, x, tgt_len): + return self.mhsa(x=x, tgt_len=tgt_len) + + +def hook_fn1(_, __, o): + # Hook to write output gradients + f = open("mhsa-grads.h", "w") + + print("------------Output Grad------------") + for grad in o: + try: + output_grad = torch.transpose(grad, 0, 1) + f.write("#define G_OUTPUT_SIZE " + str(output_grad.numel()) + "\n") + print(output_grad) + + if current_step == "BACKWARD": + f.write( + "PI_L2 fp16 OUTPUT_GRAD[G_OUTPUT_SIZE] = {" + + dump.tensor_to_string(output_grad) + + "};\n" + ) + else: + f.write( + "PI_L2 fp16 OUTPUT_GRAD[G_OUTPUT_SIZE] = {" + + dump.tensor_to_string(output_grad) + + "};\n" + ) + except AttributeError: + print("None found for Gradient (output)") + + f.close() + + +def hook_fn2(_, __, o): + # Hook for writing output to file + cont = 0 + f = open("mhsa-output.h", "w") + + print("------------Output------------") + for grad in o: + try: + if cont == 0: + output_grad = grad + f.write("#define OUTPUT_SIZE " + str(output_grad.numel()) + "\n") + + if bf16_format == 0: + print(output_grad.half()) + f.write( + "PI_L2 fp16 OUTPUT[OUTPUT_SIZE] = {" + + dump.tensor_to_string(output_grad) + + "};\n" + ) + else: + print(output_grad.bfloat16()) + f.write( + "PI_L2 fp16 OUTPUT[OUTPUT_SIZE] = {" + + dump.tensor_to_string(output_grad) + + "};\n" + ) + + cont += 1 + except AttributeError: + print("None found for Output") + + f.close() + + +if __name__ == "__main__": + # ~~~~~~~~~~ INTRO ~~~~~~~~~~ + # Set the seed for reproducibility + np.random.seed(seed=1) # <----- Sneed + torch.manual_seed(0) + + # Visualize data with more precision + torch.set_printoptions(precision=10, sci_mode=False) + + # Set up parser + parser = argparse.ArgumentParser("MHSA Layer Test") + parser.add_argument("--in_width", type=int, default=8) # Token size + 
parser.add_argument("--in_height", type=int, default=4) # Sequence length + parser.add_argument("--ch_in", type=int, default=1) + parser.add_argument("--ch_out", type=int, default=1) + parser.add_argument("--n_heads", type=int, default=8) + parser.add_argument("--weight", type=float, default=0.1) + parser.add_argument("--att_dim", type=int, default=8) + parser.add_argument( + "--bf16_format", type=int, default=1 + ) # if == 1, data format if bfloat16, if 0 is float16 + # Possible steps: FORWARD, BACKWARD_GRAD, BACKWARD_ERROR + parser.add_argument("--step", type=str, default="FORWARD") + + args = parser.parse_args() + + # Read arguments + in_h = args.in_height + in_w = args.in_width + ch_in = args.ch_in + ch_out = args.ch_out + n_heads = args.n_heads + current_step = args.step + weight_init = args.weight + att_dim = args.att_dim + head_dim = int(att_dim / n_heads) + bf16_format = args.bf16_format + + # Write net step to file + f_step = open("step-check.h", "w") + f_step.write("#define " + str(current_step) + "\n") + f_step.close() + + # Write input/output weights to file + f = open("init-defines.h", "w") + + f.write("#define Tin_C_l1 " + str(ch_in) + "\n") + f.write("#define Tin_H_l1 " + str(in_h) + "\n") + f.write("#define Tin_W_l1 " + str(in_w) + "\n") + f.write("#define Tout_C_l1 " + str(ch_out) + "\n") + f.write("#define Tn_heads_l1 " + str(n_heads) + "\n") + f.write("#define Tatt_dim_l1 " + str(att_dim) + "\n") + f.write("#define Thead_dim_l1 " + str(head_dim) + "\n") + if current_step == "FORWARD": + f.write( + "#define Ttemp_max " + str(int(max(in_w * head_dim, in_h * head_dim, in_h * in_h, in_w * in_w))) + "\n" + ) + else: + f.write( + "#define Ttemp_max " + + str( + int(max(in_h * att_dim, 3 * att_dim * in_w, in_h * in_h, in_h * in_w)) + ) + + "\n" + ) + + f.close() + + # Define network and add hook + if bf16_format == 0: + net = MyNet( + in_w=in_w, n_heads=n_heads, att_dim=att_dim, bf16_format=bf16_format + ).half() + elif bf16_format == 1: + net = MyNet( + in_w=in_w, n_heads=n_heads, att_dim=att_dim, bf16_format=bf16_format + ).bfloat16() + net.zero_grad() + + gradsRnn = net.mhsa.register_full_backward_hook(hook_fn1) + + # ~~~~~~~~~~ MANAGE INPUT ~~~~~~~~~~ + # Generate random input data + inp = torch.randn(ch_in, in_h, in_w) + + # Print input data to terminal + print("------------Input sequence------------") + print(inp) + + # Write transpose of input data to file + inp_copy = torch.transpose(inp, -1, -2) + + f = open("input-sequence.h", "w") + f.write("#define INPUT_SIZE " + str(inp.numel()) + "\n") + f.write( + "PI_L2 fp16 INPUT[INPUT_SIZE] = {" + dump.tensor_to_string(inp_copy) + "};\n" + ) + f.close() + + if bf16_format == 0: + inp = inp.half() + else: + inp = inp.bfloat16() + inp.requires_grad = True + + # ~~~~~~~~~~ MANAGE INPUT WEIGHTS ~~~~~~~~~~ + # Generate random input weights + in_wgt_init_tensor_q = torch.randn(att_dim, in_w) + in_wgt_init_tensor_k = torch.randn(att_dim, in_w) + in_wgt_init_tensor_v = torch.randn(att_dim, in_w) + + in_bias_init_tensor_q = torch.randn(att_dim) + in_bias_init_tensor_k = torch.randn(att_dim) + in_bias_init_tensor_v = torch.randn(att_dim) + + # Copy input weights to network + with torch.no_grad(): + if bf16_format == 0: + net.mhsa.proj_q.weight.data = deepcopy(in_wgt_init_tensor_q.half()) + net.mhsa.proj_k.weight.data = deepcopy(in_wgt_init_tensor_k.half()) + net.mhsa.proj_v.weight.data = deepcopy(in_wgt_init_tensor_v.half()) + + net.mhsa.proj_q.bias.data = deepcopy(in_bias_init_tensor_q.half()) + net.mhsa.proj_k.bias.data = 
deepcopy(in_bias_init_tensor_k.half()) + net.mhsa.proj_v.bias.data = deepcopy(in_bias_init_tensor_v.half()) + else: + net.mhsa.proj_q.weight.data = deepcopy(in_wgt_init_tensor_q.bfloat16()) + net.mhsa.proj_k.weight.data = deepcopy(in_wgt_init_tensor_k.bfloat16()) + net.mhsa.proj_v.weight.data = deepcopy(in_wgt_init_tensor_v.bfloat16()) + + net.mhsa.proj_q.bias.data = deepcopy(in_bias_init_tensor_q.bfloat16()) + net.mhsa.proj_k.bias.data = deepcopy(in_bias_init_tensor_k.bfloat16()) + net.mhsa.proj_v.bias.data = deepcopy(in_bias_init_tensor_v.bfloat16()) + + # Print input weights to terminal + print("Shape input weights:") + print(net.mhsa.proj_q.weight.shape) + print("Shape input biases:") + print(net.mhsa.proj_q.bias.shape) + print("q:") + print(net.mhsa.proj_q.weight.data) + print("k:") + print(net.mhsa.proj_k.weight.data) + print("v:") + print(net.mhsa.proj_v.weight.data) + print("\n") + + # Write input weights to init file + f = open("init-defines.h", "a") + f.write("\n\n// Input Projections Weight Initialization\n") + f.write("#define INPUT_WGT_SIZE (" + str(in_wgt_init_tensor_q.numel()) + ")\n") + f.write( + "PI_L2 fp16 INPUT_WEIGHTS_Q[INPUT_WGT_SIZE] = {" + + dump.tensor_to_string(in_wgt_init_tensor_q.transpose(0, 1)) + + "};\n" + ) + f.write( + "PI_L2 fp16 INPUT_WEIGHTS_K[INPUT_WGT_SIZE] = {" + + dump.tensor_to_string(in_wgt_init_tensor_k.transpose(0, 1)) + + "};\n" + ) + f.write( + "PI_L2 fp16 INPUT_WEIGHTS_V[INPUT_WGT_SIZE] = {" + + dump.tensor_to_string(in_wgt_init_tensor_v.transpose(0, 1)) + + "};\n" + ) + + f.write("\n\n// Input Projections Biases Initialization\n") + f.write("#define INPUT_BIAS_SIZE (" + str(in_bias_init_tensor_q.numel()) + ")\n") + f.write( + "PI_L2 fp16 INPUT_BIASES_Q[INPUT_BIAS_SIZE] = {" + + dump.tensor_to_string(in_bias_init_tensor_q) + + "};\n" + ) + f.write( + "PI_L2 fp16 INPUT_BIASES_K[INPUT_BIAS_SIZE] = {" + + dump.tensor_to_string(in_bias_init_tensor_k) + + "};\n" + ) + f.write( + "PI_L2 fp16 INPUT_BIASES_V[INPUT_BIAS_SIZE] = {" + + dump.tensor_to_string(in_bias_init_tensor_v) + + "};\n" + ) + f.close() + + # ~~~~~~~~~~ MANAGE OUTPUT WEIGHTS ~~~~~~~~~~ + # Generate random output weights + output_proj_wgt_init_tensor = torch.randn(in_w, att_dim) + + # Copy output weights to network + with torch.no_grad(): + if bf16_format == 0: + net.mhsa.proj_out.weight.data = deepcopy(output_proj_wgt_init_tensor.half()) + else: + net.mhsa.proj_out.weight.data = deepcopy( + output_proj_wgt_init_tensor.bfloat16() + ) + + # Print output weights to terminal + print("Shape output projection weights:") + print(net.mhsa.proj_out.weight.data.shape) + print(net.mhsa.proj_out.weight.data) + print("\n") + + # Write output weights to init file + f = open("init-defines.h", "a") + f.write("\n\n") + f.write( + "#define OUTPUT_WGT_SIZE (" + str(output_proj_wgt_init_tensor.numel()) + ")\n" + ) + f.write( + "PI_L2 fp16 OUTPUT_WEIGHTS[OUTPUT_WGT_SIZE] = {" + + dump.tensor_to_string(output_proj_wgt_init_tensor) + + "};\n" + ) + f.close() + + # ~~~~~~~~~~ COMPUTE OUTPUT ~~~~~~~~~~ + if bf16_format == 0: + label = torch.ones(in_h, in_w).half() + else: + label = torch.ones(in_h, in_w).bfloat16() + criterion = nn.MSELoss() + out = net(x=inp, tgt_len=in_h) + + # Print output to terminal + print("out: ") + print(out.size()) + print(label.size()) + print(out) + + # Compute loss + loss = criterion(out.float(), label.float()) + + # Write output to file + out_copy = torch.transpose(out, -1, -2) + + f = open("mhsa-output.h", "w") + f.write("#define OUTPUT_SIZE " + str(out.numel()) + "\n") + 
f.write( + "PI_L2 fp16 OUTPUT[OUTPUT_SIZE] = {" + dump.tensor_to_string(out_copy) + "};\n" + ) + f.close() + + # Compute gradients + net.zero_grad() + loss.backward() + + input_wgt_grad_q = net.mhsa.proj_q.weight.grad + input_wgt_grad_k = net.mhsa.proj_k.weight.grad + input_wgt_grad_v = net.mhsa.proj_v.weight.grad + output_wgt_grad = net.mhsa.proj_out.weight.grad + input_grad = inp.grad.transpose(1, 2) + + # Write gradients to file + f = open("mhsa-grads.h", "a") + + f.write("#define G_INPUT_WGT_SIZE " + str(input_wgt_grad_q.numel()) + "\n") + f.write( + "PI_L2 fp16 INPUT_WGT_GRAD_Q[G_INPUT_WGT_SIZE] = {" + + dump.tensor_to_string(input_wgt_grad_q) + + "};\n" + ) + f.write( + "PI_L2 fp16 INPUT_WGT_GRAD_K[G_INPUT_WGT_SIZE] = {" + + dump.tensor_to_string(input_wgt_grad_k) + + "};\n" + ) + f.write( + "PI_L2 fp16 INPUT_WGT_GRAD_V[G_INPUT_WGT_SIZE] = {" + + dump.tensor_to_string(input_wgt_grad_v) + + "};\n" + ) + + f.write("#define G_OUTPUT_WGT_SIZE " + str(output_wgt_grad.numel()) + "\n") + f.write( + "PI_L2 fp16 OUTPUT_WGT_GRAD[G_OUTPUT_WGT_SIZE] = {" + + dump.tensor_to_string(output_wgt_grad) + + "};\n" + ) + + f.write("#define G_IN_SIZE " + str(input_grad.numel()) + "\n") + f.write( + "PI_L2 fp16 INPUT_GRAD[G_IN_SIZE] = {" + + dump.tensor_to_string(input_grad) + + "};\n" + ) + + f.close() + + # Write attention scores to file + f = open("attention_scores.h", "w") + f.write("#define ATTENTION_S_LENGTH " + str(net.mhsa.scores.numel()) + "\n") + f.write( + "PI_L2 fp16 ATTENTION_SCORES[ATTENTION_S_LENGTH] = {" + + dump.tensor_to_string(torch.transpose(net.mhsa.scores, 0, 1)) + + "};\n" + ) + f.close() diff --git a/tests/test_mhsa_fp32/utils/GM.py b/tests/test_mhsa_fp32/utils/GM.py index 33ddd070..19d22e99 100644 --- a/tests/test_mhsa_fp32/utils/GM.py +++ b/tests/test_mhsa_fp32/utils/GM.py @@ -142,7 +142,7 @@ def hook_fn2(_, __, o): f.write("#define Thead_dim_l1 " + str(head_dim) + "\n") if current_step == "FORWARD": f.write( - "#define Ttemp_max " + str(int(max(in_h * head_dim, in_h * in_h))) + "\n" + "#define Ttemp_max " + str(int(max(in_w * head_dim, in_h * head_dim, in_h * in_h, in_w * in_w))) + "\n" ) else: f.write( diff --git a/tests/test_mhsa_paper_fp16/Makefile b/tests/test_mhsa_paper_fp16/Makefile index a2977183..f6acbbea 100644 --- a/tests/test_mhsa_paper_fp16/Makefile +++ b/tests/test_mhsa_paper_fp16/Makefile @@ -49,7 +49,7 @@ NUM_MATMULS?=24 # When profiling with multiple matmul algorithms NUM_SIZES?=3 # When profiling multiple sizes of the network # End of user settings -TRAIN_LIB=/home/alberto/pulp-trainlib/lib +TRAIN_LIB=../../lib TRAIN_LIB_SRCS=$(TRAIN_LIB)/sources APP_SRCS = main.c net_l1.c diff --git a/tests/test_residual/Makefile b/tests/test_residual/Makefile index 1a9651c8..74dfbb55 100644 --- a/tests/test_residual/Makefile +++ b/tests/test_residual/Makefile @@ -1,8 +1,8 @@ APP = test_residual -CI?=64 -HI?=56 -WI?=56 +CI?=8 +HI?=4 +WI?=6 KER?=1 NUM_CORES?=8 HWC?=0 @@ -20,6 +20,8 @@ APP_SRCS += main.c net.c APP_SRCS += $(TRAIN_LIB_SRCS)/pulp_conv2d_fp32.c APP_SRCS += $(TRAIN_LIB_SRCS)/pulp_conv2d_fp16.c +APP_SRCS += $(TRAIN_LIB_SRCS)/pulp_conv_naive_fp32.c +APP_SRCS += $(TRAIN_LIB_SRCS)/pulp_conv_naive_fp16.c APP_SRCS += $(TRAIN_LIB_SRCS)/pulp_train_utils_fp32.c APP_SRCS += $(TRAIN_LIB_SRCS)/pulp_train_utils_fp16.c APP_SRCS += $(TRAIN_LIB_SRCS)/pulp_losses_fp32.c diff --git a/tests/test_tiny_vit_fp32/Makefile b/tests/test_tiny_vit_fp32/Makefile index 909ffd7e..bf4b8542 100644 --- a/tests/test_tiny_vit_fp32/Makefile +++ b/tests/test_tiny_vit_fp32/Makefile @@ -5,7 +5,8 @@ 
 NUM_CORES = 8
 MATMUL_TYPE?=9
 DATA_TYPE?=32
-CONFIG_NAME = "TINY_VIT_5M"
+CONFIG_NAME = "DEMO_TINY_VIT_CONFIG"
+# CONFIG_NAME = "TINY_VIT_5M"
 # End of user code
 
 TASK_NAME=sst-2
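For reference, a worked example of the revised FORWARD-step `Ttemp_max` sizing introduced in the fp16/fp32 GM.py generators above, assuming the new `test_mhsa_fp16` Makefile defaults (IN_H=20, IN_W=40, N_HEADS=2, ATT_DIM=40, hence head_dim=20) are forwarded to the generator as `--in_height/--in_width/--n_heads/--att_dim`. The generated `init-defines.h` would then contain roughly:

    /* Hypothetical excerpt of a generated init-defines.h, for illustration only. */
    #define Tin_H_l1     20   /* sequence length (IN_H)  */
    #define Tin_W_l1     40   /* token size (IN_W)       */
    #define Tn_heads_l1  2
    #define Tatt_dim_l1  40
    #define Thead_dim_l1 20   /* ATT_DIM / N_HEADS       */
    /* New rule: max(in_w*head_dim, in_h*head_dim, in_h*in_h, in_w*in_w)
       = max(800, 400, 400, 1600) = 1600 elements.
       The old rule max(in_h*head_dim, in_h*in_h) would have given 400,
       which does not cover the in_w x in_w (40 x 40) intermediate that the
       updated formula now accounts for. */
    #define Ttemp_max    1600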