diff --git a/tools/TrainLib_Deployer/TrainLib_Deployer.py b/tools/TrainLib_Deployer/TrainLib_Deployer.py index d05c4968..e16305db 100644 --- a/tools/TrainLib_Deployer/TrainLib_Deployer.py +++ b/tools/TrainLib_Deployer/TrainLib_Deployer.py @@ -1,5 +1,5 @@ ''' -Copyright (C) 2021-2022 ETH Zurich and University of Bologna +Copyright (C) 2021-2024 ETH Zurich and University of Bologna Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -15,7 +15,7 @@ ''' ''' -Authors: Davide Nadalini +Authors: Davide Nadalini, Cristian Cioflan, Axel Vanoni ''' """ @@ -41,6 +41,15 @@ 'SGD' -> Stochastic Gradient Descent """ +import argparse +import onnx +import os + +from onnx import shape_inference, numpy_helper + +import numpy as np + + import deployer_utils.DNN_Reader as reader import deployer_utils.DNN_Composer as composer @@ -48,9 +57,21 @@ # --- USER SETTINGS --- # --------------------- +parser = argparse.ArgumentParser( + prog='Deployer', + description='Generating C code for on-device training') + +parser.add_argument('--project_name', type=str, default="Test_CNN", help='Project name') +parser.add_argument('--project_path', type=str, default="./", help='Project path') +parser.add_argument('--model_path', type=str, default=None, help='Pretrained model path') +parser.add_argument('--start_at', type=str, default=None, help='At which node to start generating') +args = parser.parse_args() + + # GENERAL PROPERTIES -project_name = 'Test_CNN' -project_path = './' +project_name = args.project_name +project_path = args.project_path proj_folder = project_path + project_name + '/' @@ -87,6 +108,8 @@ # Data type list for layer-by-layer deployment (mixed precision) data_type_list = ['FP16', 'FP16', 'FP16', 'FP16', 'FP16', 'FP16', 'FP16', 'FP16', 'FP16', 'FP16', 'FP16', 'FP16', 'FP16', 'FP16'] #data_type_list = ['FP32', 'FP32', 'FP32', 'FP32', 'FP32', 'FP32', 'FP32', 'FP32', 'FP32', 'FP32', 'FP32', 'FP32', 'FP32', 'FP32'] +# Placeholder for pretrained parameters +weight_list = [] # Data layout list (CHW or HWC) data_layout_list = ['CHW', 'CHW', 'CHW', 'CHW', 'CHW', 'CHW', 'CHW', 'CHW', 'CHW', 'CHW', 'CHW', 'CHW', 'CHW', 'CHW', 'CHW'] # TO DO # Bias @@ -111,7 +134,7 @@ PRINT_TRAIN_LOSS = True # Set to true if you want to print the train loss for each epoch # OTHER PROPERTIES # Select if to read the network from an external source -READ_MODEL_ARCH = False # NOT IMPLEMENTED!! 
+READ_MODEL_ARCH = args.model_path is not None # --------------------------- # --- END OF USER SETTING --- @@ -123,9 +146,311 @@ BACKEND """ +class ONNXGraphParser: + def __init__(self, onnx_model): + onnx.checker.check_model(onnx_model) + self.model = shape_inference.infer_shapes(onnx_model) + self.graph = self.model.graph + self.value_info_lookup = {v.name: i for i, v in enumerate(self.graph.value_info)} + self.node_lookup = {v.name: i for i, v in enumerate(self.graph.node)} + self.input_lookup = {v.name: i for i, v in enumerate(self.graph.input)} + self.output_lookup = {v.name: i for i, v in enumerate(self.graph.output)} + self.init_lookup = {v.name: i for i, v in enumerate(self.graph.initializer)} + self.get_precision() # make sure precision is fine + + def _get_type(self, node_name): + if node_name in self.value_info_lookup: + return self.graph.value_info[self.value_info_lookup[node_name]].type + elif node_name in self.input_lookup: + return self.graph.input[self.input_lookup[node_name]].type + elif node_name in self.output_lookup: + return self.graph.output[self.output_lookup[node_name]].type + else: + raise KeyError(f"Node {node_name} not found") + + def _get_node_attr(self, node_name, attr): + for a in self.graph.node[self.node_lookup[node_name]].attribute: + if a.name == attr: + return a + else: + raise ValueError(f"Node {node_name} has no {attr} attribute") + + def is_pointwise(self, node_name): + hk, wk = self.get_kernel_size(node_name) + return hk == wk == 1 + + def is_depthwise(self, node_name): + node = self.graph.node[self.node_lookup[node_name]] + try: + groups = self._get_node_attr(node_name, "group").i + except ValueError: + return False + in_ch = self.get_channel_count(node.input[0]) + out_ch = self.get_channel_count(node.output[0]) + if groups <= 1: + return False + assert in_ch == out_ch == groups, "For depthwise convolutions, input and output channels must be the same as groups" + return True + + def get_channel_count(self, node_name): + tensor_type = self._get_type(node_name) + # Data layout is B, C, H, W + return tensor_type.tensor_type.shape.dim[1].dim_value + + def get_hw(self, node_name): + tensor_type = self._get_type(node_name) + shape = tensor_type.tensor_type.shape.dim + return shape[2].dim_value, shape[3].dim_value + + def get_activation_size(self, node_name): + tensor_type = self._get_type(node_name) + dims = tensor_type.tensor_type.shape.dim + # Data layout is B, C, H, W + return dims[2].dim_value, dims[3].dim_value + + def get_init(self, node_name): + index = self.init_lookup.get(node_name, -1) + if index == -1: + raise KeyError(f"Node {node_name} has no initializer") + init = self.graph.initializer[index] + return numpy_helper.to_array(init) + + def get_kernel_size(self, node_name): + ksize = self._get_node_attr(node_name, "kernel_shape") + return ksize.ints[0], ksize.ints[1] + + def get_stride(self, node_name): + stride = self._get_node_attr(node_name, "strides") + return stride.ints[0], stride.ints[1] + + def get_pad(self, node_name): + pad = self._get_node_attr(node_name, "pads").ints + assert pad[0] == pad[2] and pad[1] == pad[3], "Only symmetric padding is supported." + return pad[0], pad[1] + + def get_precision(self): + elem_type = self.graph.value_info[0].type.tensor_type.elem_type + if elem_type == onnx.TensorProto.FLOAT: + return "FP32" + elif elem_type == onnx.TensorProto.FLOAT16: + return "FP16" + elif elem_type == onnx.TensorProto.BFLOAT16: + raise NotImplementedError("Numpy does not support bfloat16 and converts it to FP32. 
We need to change how we save and load weights.") + else: + raise ValueError("Only FP32 and FP16 are supported") + + + # Call the DNN Reader and then the DNN Composer if READ_MODEL_ARCH : - pass + + + layer_list = [] + in_ch_list = [] + out_ch_list = [] + hk_list = [] + wk_list = [] + hin_list = [] + win_list = [] + h_str_list = [] + w_str_list = [] + h_pad_list = [] + w_pad_list = [] + opt_mm_fw_list = [] + opt_mm_wg_list = [] + opt_mm_ig_list = [] + data_type_list = [] + data_layout_list = [] + sumnode_connections = [] + + if (args.model_path.split('.')[-1] == "onnx"): + onnx_model = onnx.load(args.model_path) + onnx.checker.check_model(onnx_model) + graph = ONNXGraphParser(onnx_model) + found_start = args.start_at is None + + if args.start_at is not None: + node_names = [n.op_type for n in graph.graph.node if n.op_type != 'Constant'] + assert args.start_at in node_names, f"{args.start_at} is not a valid layer name. Layer names are: {node_names}" + # CIOFLANC: temporary reconciling pseudo-sparse update implementations + update_layer_list = [1] * (len(node_names) - node_names.index(args.start_at)) + # update_layer_list[node_names.index(args.start_at)] = 1 + + for onnx_node in graph.graph.node: + + if not found_start: + if onnx_node.op_type != args.start_at: + continue + else: + found_start = True + + if (onnx_node.op_type == 'Gemm') or (onnx_node.op_type == 'MatMul'): + in_ch_list.append(graph.get_channel_count(onnx_node.input[0])) + out_ch_list.append(graph.get_channel_count(onnx_node.output[0])) + layer_list.append('linear') + hk_list.append(1) + wk_list.append(1) + hin_list.append(1) + win_list.append(1) + h_str_list.append(1) + w_str_list.append(1) + h_pad_list.append(0) + w_pad_list.append(0) + opt_mm_fw_list.append(0) + opt_mm_wg_list.append(0) + opt_mm_ig_list.append(0) + # TODO: Read from file + data_type_list.append(graph.get_precision()) + # TODO: Read from file + # Note that this also determines the read position for in_ch_list and out_ch_list + data_layout_list.append('CHW') + weight_init = graph.get_init(onnx_node.input[1]) + # Gemm node does y = x*B, but torch uses y = A*x, so transpose B to get A back + # This also aligns with how trainlib does things + weight_init = weight_init.transpose(1,0) + try: + bias_init = graph.get_init(onnx_node.input[2]) + raise NotImplementedError("Biases are not implemented in trainlib") + except (KeyError, IndexError): + bias_init = [] + weight_list.append((weight_init, bias_init)) + sumnode_connections.append(0) + elif onnx_node.op_type == 'AveragePool': + in_ch_list.append(graph.get_channel_count(onnx_node.input[0])) + out_ch_list.append(graph.get_channel_count(onnx_node.output[0])) + layer_list.append('AvgPool') + (hk, wk) = graph.get_kernel_size(onnx_node.name) + hk_list.append(hk) + wk_list.append(wk) + (hin, win) = graph.get_activation_size(onnx_node.input[0]) + hin_list.append(hin) + win_list.append(win) + (hstr, wstr) = graph.get_stride(onnx_node.name) + h_str_list.append(hstr) + w_str_list.append(wstr) + (hpad, wpad) = graph.get_pad(onnx_node.name) + h_pad_list.append(hpad) + w_pad_list.append(wpad) + opt_mm_fw_list.append(0) + opt_mm_wg_list.append(0) + opt_mm_ig_list.append(0) + data_type_list.append(graph.get_precision()) + data_layout_list.append('CHW') + weight_list.append(([], [])) # kernels + sumnode_connections.append(0) + elif onnx_node.op_type == 'GlobalAveragePool': + hk, wk = graph.get_hw(onnx_node.input[0]) + if hk == 1 and wk == 1: + # There is nothing to average, skip this node + continue + in_ch_list.append(graph.get_channel_count(onnx_node.input[0])) + 
out_ch_list.append(graph.get_channel_count(onnx_node.output[0])) + layer_list.append('AvgPool') + hk_list.append(hk) + wk_list.append(wk) + (hin, win) = graph.get_activation_size(onnx_node.input[0]) + hin_list.append(hin) + win_list.append(win) + h_str_list.append(1) + w_str_list.append(1) + h_pad_list.append(0) + w_pad_list.append(0) + opt_mm_fw_list.append(0) + opt_mm_wg_list.append(0) + opt_mm_ig_list.append(0) + data_type_list.append(graph.get_precision()) + data_layout_list.append('CHW') + weight_list.append(([], [])) # kernels + sumnode_connections.append(0) + elif onnx_node.op_type == 'Conv': + in_ch_list.append(graph.get_channel_count(onnx_node.input[0])) + out_ch_list.append(graph.get_channel_count(onnx_node.output[0])) + if graph.is_pointwise(onnx_node.name): + ty = "PW" + elif graph.is_depthwise(onnx_node.name): + ty = "DW" + else: + ty = "conv2d" + layer_list.append(ty) + (hk, wk) = graph.get_kernel_size(onnx_node.name) + hk_list.append(hk) + wk_list.append(wk) + (hin, win) = graph.get_activation_size(onnx_node.input[0]) + hin_list.append(hin) + win_list.append(win) + (hstr, wstr) = graph.get_stride(onnx_node.name) + h_str_list.append(hstr) + w_str_list.append(wstr) + (hpad, wpad) = graph.get_pad(onnx_node.name) + h_pad_list.append(hpad) + w_pad_list.append(wpad) + opt_mm_fw_list.append(0) + opt_mm_wg_list.append(0) + opt_mm_ig_list.append(0) + data_type_list.append(graph.get_precision()) + # TODO: Read from file + # Note that this also determines the read position for in_ch_list and out_ch_list + data_layout_list.append('CHW') + weight_init = graph.get_init(onnx_node.input[1]) + try: + bias_init = graph.get_init(onnx_node.input[2]) + raise NotImplementedError("Biases are not implemented in trainlib") + except (KeyError, IndexError): + # Ignore missing bias + bias_init = [] + weight_list.append((weight_init, bias_init)) # kernels + sumnode_connections.append(0) + elif onnx_node.op_type == 'Clip': + # Clip (e.g. ReLU6) is mapped to a plain ReLU; the upper bound is dropped, as trainlib does not support it + layer_list.append('ReLU') + in_ch_list.append(graph.get_channel_count(onnx_node.input[0])) + out_ch_list.append(graph.get_channel_count(onnx_node.output[0])) + hk_list.append(1) + wk_list.append(1) + hin_list.append(1) + win_list.append(1) + h_str_list.append(1) + w_str_list.append(1) + h_pad_list.append(0) + w_pad_list.append(0) + opt_mm_fw_list.append(0) + opt_mm_wg_list.append(0) + opt_mm_ig_list.append(0) + data_layout_list.append('CHW') + data_type_list.append(graph.get_precision()) + weight_list.append(([], [])) + sumnode_connections.append(0) + else: + raise NotImplementedError("Model format not supported.") + + data_dir = proj_folder+'data/' + if not os.path.exists(data_dir): + os.makedirs(data_dir) + for i, (weight_init, bias_init) in enumerate(weight_list): + np.save(data_dir+f"l{i}w.npy", np.array(weight_init, dtype=("float32" if data_type_list[i] == "FP32" else "float16"))) + np.save(data_dir+f"l{i}b.npy", np.array(bias_init, dtype=("float32" if data_type_list[i] == "FP32" else "float16"))) + + print("Generating project at location "+proj_folder) + + # Check if Residual Connections are valid + sumnode_connections = composer.AdjustResConnList(sumnode_connections) + + composer.CheckResConn(layer_list, in_ch_list, out_ch_list, hin_list, win_list, sumnode_connections, update_layer_list) + + # Check if the network training fits L1 + memocc = composer.DNN_Size_Checker(layer_list, in_ch_list, out_ch_list, hk_list, wk_list, hin_list, win_list, + h_str_list, w_str_list, 
h_pad_list, w_pad_list, + data_type_list, bias_list, update_layer_list, + L1_SIZE_BYTES, USE_DMA, CONV2D_USE_IM2COL) + + print("DNN memory occupation: {} bytes of {} available L1 bytes ({}%).".format(memocc, L1_SIZE_BYTES, (memocc/L1_SIZE_BYTES)*100)) + + # Call DNN Composer on the user-provided graph + composer.DNN_Composer(proj_folder, project_name, + layer_list, in_ch_list, out_ch_list, hk_list, wk_list, + hin_list, win_list, h_str_list, w_str_list, h_pad_list, w_pad_list, + epochs, batch_size, learning_rate, optimizer, loss_fn, + NUM_CORES, data_type_list, weight_list, bias_list, update_layer_list, opt_mm_fw_list, opt_mm_wg_list, opt_mm_ig_list, sumnode_connections, + USE_DMA, PROFILE_SINGLE_LAYERS, SEPARATE_BACKWARD_STEPS, CONV2D_USE_IM2COL, PRINT_TRAIN_LOSS) + + print("PULP project generation successful!") else: @@ -151,9 +476,9 @@ layer_list, in_ch_list, out_ch_list, hk_list, wk_list, hin_list, win_list, h_str_list, w_str_list, h_pad_list, w_pad_list, epochs, batch_size, learning_rate, optimizer, loss_fn, - NUM_CORES, data_type_list, bias_list, update_layer_list, opt_mm_fw_list, opt_mm_wg_list, opt_mm_ig_list, + NUM_CORES, data_type_list, weight_list, bias_list, update_layer_list, opt_mm_fw_list, opt_mm_wg_list, opt_mm_ig_list, sumnode_connections, USE_DMA, PROFILE_SINGLE_LAYERS, SEPARATE_BACKWARD_STEPS, CONV2D_USE_IM2COL, PRINT_TRAIN_LOSS) print("PULP project generation successful!") - pass + pass \ No newline at end of file diff --git a/tools/TrainLib_Deployer/deployer_utils/DNN_Composer.py b/tools/TrainLib_Deployer/deployer_utils/DNN_Composer.py index 95b5376e..440e9a07 100644 --- a/tools/TrainLib_Deployer/deployer_utils/DNN_Composer.py +++ b/tools/TrainLib_Deployer/deployer_utils/DNN_Composer.py @@ -1,5 +1,5 @@ ''' -Copyright (C) 2021-2022 ETH Zurich and University of Bologna +Copyright (C) 2021-2024 ETH Zurich and University of Bologna Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
@@ -15,7 +15,7 @@ ''' ''' -Authors: Davide Nadalini +Authors: Davide Nadalini, Cristian Cioflan, Axel Vanoni ''' import deployer_utils.deployment_utils_single_buffer as utilsSB @@ -213,7 +213,7 @@ def DNN_Composer (proj_folder_path, project_name, layers_l, in_ch_l, out_ch_l, hk_l, wk_l, hin_l, win_l, h_str_l, w_str_l, h_pad_l, w_pad_l, epochs, batch_size, learning_rate, optimizer, loss_fn, - NUM_CORES, data_type_l, bias_l, update_layer_l, opt_mm_fw_list, opt_mm_wg_list, opt_mm_ig_list, + NUM_CORES, data_type_l, weight_l, bias_l, update_layer_l, opt_mm_fw_list, opt_mm_wg_list, opt_mm_ig_list, sumnode_connections, USE_DMA, PROFILE_SINGLE_LAYERS, SEPARATE_BACKWARD_STEPS, CONV2D_USE_IM2COL, PRINT_TRAIN_LOSS): # Initialize project (copy the prefab files and create folder) @@ -227,7 +227,7 @@ def DNN_Composer (proj_folder_path, project_name, layers_l, in_ch_l, out_ch_l, hk_l, wk_l, hin_l, win_l, h_str_l, w_str_l, h_pad_l, w_pad_l, epochs, batch_size, learning_rate, optimizer, loss_fn, - data_type_l, bias_l, update_layer_l, sumnode_connections, USE_DMA) + data_type_l, weight_l, bias_l, update_layer_l, sumnode_connections, USE_DMA) global MAX_LAYER_DIM diff --git a/tools/TrainLib_Deployer/deployer_utils/deployment_utils.py b/tools/TrainLib_Deployer/deployer_utils/deployment_utils.py index 2e302484..5f861c7b 100644 --- a/tools/TrainLib_Deployer/deployer_utils/deployment_utils.py +++ b/tools/TrainLib_Deployer/deployer_utils/deployment_utils.py @@ -1,5 +1,5 @@ ''' -Copyright (C) 2021-2022 ETH Zurich and University of Bologna +Copyright (C) 2021-2024 ETH Zurich and University of Bologna Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -15,7 +15,7 @@ ''' ''' -Authors: Davide Nadalini, Giacomo Saporetti +Authors: Davide Nadalini, Giacomo Saporetti, Cristian Cioflan, Axel Vanoni ''' import os @@ -250,8 +250,8 @@ def InitProject(proj_folder_path): utils_folder = proj_folder + 'utils/' trainlib_dest_folder = proj_folder + 'lib/' - os.mkdir(proj_folder) - os.mkdir(utils_folder) + os.makedirs(proj_folder, exist_ok = True) + os.makedirs(utils_folder, exist_ok = True) shutil.copy2('./deployer_utils/srcfiles/main.c', proj_folder) shutil.copy2('./deployer_utils/srcfiles/stats.h', proj_folder) @@ -289,7 +289,7 @@ def GenerateMakefile(proj_folder_path, project_name, layers_l, NUM_CORES, data_t f.write('MATMUL_TYPE_IG_L'+str(layer)+'?='+str(opt_mm_ig_list[layer])+' # Selects which optimized matmul to be used in IN GRAD (see mm_manager_list.txt or "MM_manager()" body to verify which one is called)' + '\n') f.write('# End of user settings\n\n') - f.write('NUM_MATMULS?=24 # Available standard matmuls in the library' + '\n') + f.write('NUM_MATMULS?=24 # Available standard matmuls in the library' + '\n') f.write('TRAIN_LIB=./lib\n') f.write('TRAIN_LIB_SRCS=$(TRAIN_LIB)/sources\n') f.write('APP_SRCS = main.c net.c\n\n') @@ -370,7 +370,7 @@ def GenerateGM(proj_folder_path, project_name, layers_l, in_ch_l, out_ch_l, hk_l, wk_l, hin_l, win_l, h_str_l, w_str_l, h_pad_l, w_pad_l, epochs, batch_size, learning_rate, optimizer, loss_fn, - data_type_l, bias_l, update_layer_l, sumnode_connections, USE_DMA): + data_type_l, weight_l, bias_l, update_layer_l, sumnode_connections, USE_DMA): # Check if GPU is available, else keep fake FP16 cuda_is_on = torch.cuda.is_available() @@ -390,6 +390,7 @@ def GenerateGM(proj_folder_path, project_name, f.write("import torch.optim as optim\n") f.write("import dump_utils as dump\n") f.write("import math\n") + 
f.write("import numpy\n") f.write("\n") f.write("# Set device\n") @@ -574,7 +575,18 @@ def GenerateGM(proj_folder_path, project_name, f.write("\n# Initialize network\n") f.write("net = DNN().to(device)\n") f.write("for p in net.parameters():\n") - f.write("\tnn.init.normal_(p, mean=0.0, std=0.01)\n") + f.write("\tnn.init.normal_(p, mean=0.0, std=1.0)\n") + if (weight_l): + f.write("from pathlib import Path\n") + f.write("basedir = Path(__file__).resolve().parent.parent\n") + for layer in range(len(layers_l)): + if data_type_l[layer] == 'FP16': + to = ".to(device).half()" + else: + to = ".to(device)" + f.write(f"net.l{layer}.weight = torch.nn.Parameter(torch.from_numpy(numpy.load(basedir / 'data/l{layer}w.npy')){to}, requires_grad=True)\n") + # TODO: uncomment once biases are implemented in trainlib + # f.write(f"net.l{layer}.bias = torch.nn.Parameter(torch.from_numpy(numpy.load(basedir / 'data/l{layer}b.npy')){to}, requires_grad=True)\n") f.write("net.zero_grad()\n\n") # Freeze layers excluded from sparse update @@ -1805,5 +1817,4 @@ def GenerateNet(proj_folder_path, project_name, f.close() - return - + return \ No newline at end of file diff --git a/tools/TrainLib_Deployer/deployer_utils/deployment_utils_double_buffer.py b/tools/TrainLib_Deployer/deployer_utils/deployment_utils_double_buffer.py index 11c7963f..d8725182 100644 --- a/tools/TrainLib_Deployer/deployer_utils/deployment_utils_double_buffer.py +++ b/tools/TrainLib_Deployer/deployer_utils/deployment_utils_double_buffer.py @@ -81,7 +81,7 @@ def GenerateNet(proj_folder_path, project_name, layers_l, in_ch_l, out_ch_l, hk_l, wk_l, hin_l, win_l, h_str_l, w_str_l, h_pad_l, w_pad_l, epochs, batch_size, learning_rate, optimizer, loss_fn, - data_type_l, bias_l, update_layer_l, sumnode_connections, MAX_LAYER_DIM, + data_type_l, weight_l, bias_l, update_layer_l, sumnode_connections, MAX_LAYER_DIM, PROFILE_SINGLE_LAYERS, SEPARATE_BACKWARD_STEPS): @@ -159,7 +159,8 @@ def GenerateNet(proj_folder_path, project_name, f.write("\n// Define structures and pointers to data in L1 memory\n") if data_type == 'FP32': #f.write("PI_L1 float * D0, * d0, * W0, * w0, * D1, * d1, * W1, *w1;\n") - f.write("PI_L1 float BUFF[MAX_SIZE];\n") + # f.write("PI_L1 float BUFF[MAX_SIZE];\n") + f.write("PI_L1 float * BUFF;\n") f.write("PI_L1 struct blob d1_blob;\n") f.write("PI_L1 struct blob w1_blob;\n") f.write("PI_L1 struct blob b1_blob;\n") @@ -182,7 +183,8 @@ def GenerateNet(proj_folder_path, project_name, #f.write("PI_L1 float * t;\n") elif data_type == 'FP16': f.write("PI_L1 fp16 * D1, * d1, * W1, * w1, * D0, * d0, * W0, *w0;\n") - f.write("PI_L1 fp16 BUFF[MAX_SIZE];\n") + # f.write("PI_L1 fp16 BUFF[MAX_SIZE];\n") + f.write("PI_L1 fp16 * BUFF;\n") f.write("PI_L1 struct blob_fp16 d1_blob;\n") f.write("PI_L1 struct blob_fp16 w1_blob;\n") f.write("PI_L1 struct blob_fp16 d0_blob;\n") @@ -398,7 +400,7 @@ def GenerateNet(proj_folder_path, project_name, im2col_byte_length = 0 im2col_max_data_type = 'FP32' for layer in range(len(layers_l)): - if layers_l[layer] == 'conv2d' and CONV2D_USE_IM2COL == True: # or layers_l[layer] == 'DW': + if layers_l[layer] == 'conv2d': # or layers_l[layer] == 'DW': if data_type_l[layer] == 'FP32': im2col_byte_length = 4 elif data_type_l[layer] == 'FP16': @@ -436,6 +438,7 @@ def GenerateNet(proj_folder_path, project_name, else: print("[deployment_utils.GenerateNet] Invalid data type for im2col!!") exit() + # No im2col buffer allocate_no_im2col = False for layer in range(len(layers_l)): @@ -453,10 +456,10 @@ def GenerateNet(proj_folder_path, 
project_name, for layer in range(len(layers_l)): # Check layer data layout data_layout = 'CHW' # Change to input list of data layouts - if ((layers_l[layer] == 'conv2d' and CONV2D_USE_IM2COL == True) or layers_l[layer] == 'PW') and layer == 0: + if (layers_l[layer] == 'conv2d' or layers_l[layer] == 'PW') and layer == 0: bt_flag = True bt_layer_index = 0 - elif ((layers_l[layer] == 'conv2d' and CONV2D_USE_IM2COL == True) or layers_l[layer] == 'PW') and layer > 0: + elif (layers_l[layer] == 'conv2d' or layers_l[layer] == 'PW') and layer > 0: bt_flag = True bt_mem = in_ch_l[layer] * hk_l[layer] * wk_l[layer] * out_ch_l[layer] if bt_mem > bt_max_memocc: @@ -497,13 +500,7 @@ def GenerateNet(proj_folder_path, project_name, else: print("[deployment_utils.GenerateNet] Invalid data type for pw transp buffer definition!\n") exit() - # No blocktranspose buffer - if (bt_flag == False): - print("No blockstranspose buffer detected\n") - f.write("PI_L1 float bt_buffer[1];\n") - # Define label buffer - f.write("PI_L1 float label_temp[Tout_C_l"+str(len(layers_l)-1)+"*Tout_H_l"+str(len(layers_l)-1)+"*Tout_W_l"+str(len(layers_l)-1)+"];\n") # Define tensors to backpropagate the output error f.write("\n// Define error propagation tensors\n") @@ -594,6 +591,10 @@ def GenerateNet(proj_folder_path, project_name, f.write("\n// DNN initialization function\n") f.write("void DNN_init()\n{\n") f.write("\n// Assign pointers in L1\n") + if (data_type == "FP32"): + f.write(" BUFF = (float *) pi_l1_malloc(NULL, MAX_SIZE*sizeof(float));\n") + elif (data_type == "FP16"): + f.write(" BUFF = (fp16 *) pi_l1_malloc(NULL, MAX_SIZE*sizeof(float));\n") f.write(" d0_blob.data = BUFF;\n") f.write(" d0_blob.diff = BUFF;\n") f.write(" w0_blob.data = BUFF;\n") @@ -872,16 +873,16 @@ def GenerateNet(proj_folder_path, project_name, skip_inputgrad = 0 # Write configuration templates if layers_l[layer] == 'linear': - f.write(ntemp.linear_config_template(layer, skip_inputgrad, data_type_l[layer], 1)) + f.write(ntemp.linear_config_template(layer, skip_inputgrad, data_type_l[layer])) elif layers_l[layer] == 'conv2d': IM2COL_USEIT = 1 if CONV2D_USE_IM2COL == False: IM2COL_USEIT = 0 f.write(ntemp.conv2d_config_template(layer, h_pad_l[layer], w_pad_l[layer], h_str_l[layer], w_str_l[layer], skip_inputgrad, data_type_l[layer], bias_l[layer], IM2COL_USEIT, 1)) elif layers_l[layer] == 'PW': - f.write(ntemp.PW_config_template(layer, skip_inputgrad, data_type_l[layer], 1)) + f.write(ntemp.PW_config_template(layer, skip_inputgrad, data_type_l[layer])) elif layers_l[layer] == 'DW': - f.write(ntemp.DW_config_template(layer, h_pad_l[layer], w_pad_l[layer], h_str_l[layer], w_str_l[layer], skip_inputgrad, data_type_l[layer], 1)) + f.write(ntemp.DW_config_template(layer, h_pad_l[layer], w_pad_l[layer], h_str_l[layer], w_str_l[layer], skip_inputgrad, data_type_l[layer])) elif layers_l[layer] == 'ReLU': f.write(ntemp.ReLU_config_template(layer, data_type_l[layer])) elif layers_l[layer] == 'MaxPool': @@ -1312,25 +1313,7 @@ def GenerateNet(proj_folder_path, project_name, elif data_type_l[-1] == 'FP16': f.write(" pulp_MSELoss_fp16(&loss_args);\n") else: - print("[deployment_utils.GenerateNet]: Invalid loss type!") - exit() - f.write(f"\tstore((uint32_t) out.diff, (uint32_t) layer{len(layers_l)-1}_out.diff, {float_size}*OUT_SIZE);\n") - elif loss_fn == "CrossEntropyLoss": - float_size = 2 - if data_type_l[0] == 'FP32': - float_size = 4 - f.write("\tloss_args.output = &out;\n") - f.write("\tloss_args.target = out.diff;\n") - f.write("\tloss_args.wr_loss = &loss;\n") 
- f.write(f"\tload((uint32_t) LABEL, (uint32_t) out.diff, {float_size}*OUT_SIZE);\n") - f.write("\tpi_cl_dma_cmd_wait(cmd_load);\n") - - if data_type_l[-1] == 'FP32': - f.write(" pulp_CrossEntropyLoss(&loss_args);\n") - elif data_type_l[-1] == 'FP16': - f.write(" pulp_CrossEntropyLoss_fp16(&loss_args);\n") - else: - print("[deployment_utils.GenerateNet]: Invalid loss type!") + print("[deplyment_utils.GenerateNet]: Invalid loss type!") exit() f.write(f"\tstore((uint32_t) out.diff, (uint32_t) layer{len(layers_l)-1}_out.diff, {float_size}*OUT_SIZE);\n") else: @@ -1520,11 +1503,6 @@ def GenerateNet(proj_folder_path, project_name, f.write(" for (int epoch=0; epoch>> EPOCH %d: train_loss = %f (GM: %f)\\n\", epoch, loss, TRAIN_LOSS[epoch]);\n") - f.write(" /* Continue profiling */ pi_perf_start();\n") f.write(" backward();\n") f.write(" update_weights();\n") f.write(" }\n\n") @@ -1541,6 +1519,9 @@ def GenerateNet(proj_folder_path, project_name, f.write(" check_post_training_output();\n") f.write(" print_output();\n") + f.write(" // Free l1 buffer\n") + f.write(" pi_l1_free(NULL, BUFF, MAX_SIZE*sizeof(float));\n") + f.write("}\n") data_size = 0 diff --git a/tools/TrainLib_Deployer/deployer_utils/deployment_utils_single_buffer.py b/tools/TrainLib_Deployer/deployer_utils/deployment_utils_single_buffer.py index faf7677f..ce4252de 100644 --- a/tools/TrainLib_Deployer/deployer_utils/deployment_utils_single_buffer.py +++ b/tools/TrainLib_Deployer/deployer_utils/deployment_utils_single_buffer.py @@ -1798,5 +1798,4 @@ def GenerateNet(proj_folder_path, project_name, - return - + return \ No newline at end of file