From 07566995ac3912c9d5dacd57f76d173a5188bad2 Mon Sep 17 00:00:00 2001 From: Cristian Cioflan Date: Thu, 20 Jun 2024 11:36:57 +0200 Subject: [PATCH 1/6] Added ONNX parsing. Added argparse. TODO: Align start_at and update_layer_list --- tools/TrainLib_Deployer/TrainLib_Deployer.py | 326 +++++++++++++++++- .../deployer_utils/DNN_Composer.py | 10 +- .../deployer_utils/deployment_utils.py | 18 +- 3 files changed, 340 insertions(+), 14 deletions(-) diff --git a/tools/TrainLib_Deployer/TrainLib_Deployer.py b/tools/TrainLib_Deployer/TrainLib_Deployer.py index 1747bdba..eacf670b 100644 --- a/tools/TrainLib_Deployer/TrainLib_Deployer.py +++ b/tools/TrainLib_Deployer/TrainLib_Deployer.py @@ -1,5 +1,5 @@ ''' -Copyright (C) 2021-2022 ETH Zurich and University of Bologna +Copyright (C) 2021-2024 ETH Zurich and University of Bologna Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -15,7 +15,7 @@ ''' ''' -Authors: Davide Nadalini +Authors: Davide Nadalini, Cristian Cioflan, Axel Vanoni ''' """ @@ -48,9 +48,24 @@ # --- USER SETTINGS --- # --------------------- +parser = argparse.ArgumentParser( + prog='Deployer', + description='Generating C code for on-device training') + +parser = argparse.ArgumentParser() +parser._action_groups.pop() +required = parser.add_argument_group('required arguments') +optional = parser.add_argument_group('optional arguments') +required.add_argument('--project_name', type=str, default="Test_CNN", help='Project name', required=True) +required.add_argument('--project_path', type=str, default="./", help='Project path', required=True) +optional.add_argument('--model_path', type=str, default=None, help='Pretrained model path') +optional.add_argument('--start_at', type=str, default=None, help='At which node to start generating') +args = parser.parse_args() + + # GENERAL PROPERTIES -project_name = 'Test_CNN' -project_path = './' +project_name = args.project_name +project_path = args.project_path proj_folder = project_path + project_name + '/' # TRAINING PROPERTIES @@ -86,6 +101,8 @@ # Data type list for layer-by-layer deployment (mixed precision) #data_type_list = ['FP16', 'FP16', 'FP16', 'FP16', 'FP16', 'FP16', 'FP16', 'FP16', 'FP16', 'FP16', 'FP16', 'FP16'] data_type_list = ['FP32', 'FP32', 'FP32', 'FP32', 'FP32', 'FP32', 'FP32', 'FP32', 'FP32', 'FP32', 'FP32', 'FP32'] +# Placeholder for pretrained parameters +data_list = [] # Data layout list (CHW or HWC) data_layout_list = ['CHW', 'CHW', 'CHW', 'CHW', 'CHW', 'CHW', 'CHW', 'CHW', 'CHW', 'CHW', 'CHW', 'CHW'] # TO DO # Sparse Update @@ -122,9 +139,306 @@ BACKEND """ +class ONNXGraphParser: + def __init__(self, onnx_model): + onnx.checker.check_model(onnx_model) + self.model = shape_inference.infer_shapes(onnx_model) + self.graph = self.model.graph + self.value_info_lookup = {v.name: i for i, v in enumerate(self.graph.value_info)} + self.node_lookup = {v.name: i for i, v in enumerate(self.graph.node)} + self.input_lookup = {v.name: i for i, v in enumerate(self.graph.input)} + self.output_lookup = {v.name: i for i, v in enumerate(self.graph.output)} + self.init_lookup = {v.name: i for i, v in enumerate(self.graph.initializer)} + self.get_precision() # make sure precision is fine + + def _get_type(self, node_name): + if node_name in self.value_info_lookup: + return self.graph.value_info[self.value_info_lookup[node_name]].type + elif node_name in self.input_lookup: + return self.graph.input[self.input_lookup[node_name]].type + elif node_name in 
self.output_lookup: + return self.graph.output[self.output_lookup[node_name]].type + else: + raise KeyError(f"Node {node_name} not found") + + def _get_node_attr(self, node_name, attr): + for a in self.graph.node[self.node_lookup[node_name]].attribute: + if a.name == attr: + return a + else: + raise ValueError(f"Node {node_name} has no {attr} attribute") + + def is_pointwise(self, node_name): + hk, wk = self.get_kernel_size(node_name) + return hk == wk == 1 + + def is_depthwise(self, node_name): + node = self.graph.node[self.node_lookup[node_name]] + try: + groups = self._get_node_attr(node_name, "group").i + except ValueError: + return False + in_ch = self.get_channel_count(node.input[0]) + out_ch = self.get_channel_count(node.output[0]) + if groups <= 1: + return False + assert in_ch == out_ch == groups, "For depthwise convolutions, input and output channels must be the same as groups" + return True + + def get_channel_count(self, node_name): + tensor_type = self._get_type(node_name) + # Data layout is B, C, H, W + return tensor_type.tensor_type.shape.dim[1].dim_value + + def get_hw(self, node_name): + tensor_type = self._get_type(node_name) + shape = tensor_type.tensor_type.shape.dim + return shape[2].dim_value, shape[3].dim_value + + def get_activation_size(self, node_name): + tensor_type = self._get_type(node_name) + dims = tensor_type.tensor_type.shape.dim + # Data layout is B, C, H, W + return dims[2].dim_value, dims[3].dim_value + + def get_init(self, node_name): + index = self.init_lookup.get(node_name, -1) + if index == -1: + raise KeyError(f"Node {node_name} has no initializer") + init = self.graph.initializer[index] + return numpy_helper.to_array(init) + + def get_kernel_size(self, node_name): + ksize = self._get_node_attr(node_name, "kernel_shape") + return ksize.ints[0], ksize.ints[1] + + def get_stride(self, node_name): + stride = self._get_node_attr(node_name, "strides") + return stride.ints[0], stride.ints[1] + + def get_pad(self, node_name): + pad = self._get_node_attr(node_name, "pads").ints + assert pad[0] == pad[2] and pad[1] == pad[3], "Only symmetric padding is supported." + return pad[0], pad[1] + + def get_precision(self): + elem_type = self.graph.value_info[0].type.tensor_type.elem_type + if elem_type == onnx.TensorProto.FLOAT: + return "FP32" + elif elem_type == onnx.TensorProto.FLOAT16: + return "FP16" + elif elem_type == onnx.TensorProto.BFLOAT16: + raise NotImplementedError("Numpy does not support bfloat16 and converts it to FP32. We need to change how we save and load weights.") + else: + raise ValueError("Only FP32 and FP16 are supported") + + # Call the DNN Reader and then the DNN Composer if READ_MODEL_ARCH : - pass + + + layer_list = [] + in_ch_list = [] + out_ch_list = [] + hk_list = [] + wk_list = [] + hin_list = [] + win_list = [] + h_pad_list = [] + w_pad_list = [] + opt_mm_fw_list = [] + opt_mm_wg_list = [] + opt_mm_ig_list = [] + data_type_list = [] + data_layout_list = [] + + if (args.model_path.split('.')[-1] == "onnx"): + onnx_model = onnx.load(args.model_path) + onnx.checker.check_model(onnx_model) + graph = ONNXGraphParser(onnx_model) + found_start = args.start_at is None + + if args.start_at is not None: + node_names = [n.name for n in graph.graph.node if n.op_type != 'Constant'] + assert args.start_at in node_names, f"{args.start_at} is not a valid layer name. 
Layer names are: {node_names}" + + for onnx_node in graph.graph.node: + if not found_start: + if onnx_node.name != args.start_at: + continue + else: + found_start = True + + if (onnx_node.op_type == 'Gemm') or (onnx_node.op_type == 'MatMul'): + in_ch_list.append(graph.get_channel_count(onnx_node.input[0])) + out_ch_list.append(graph.get_channel_count(onnx_node.output[0])) + layer_list.append('linear') + hk_list.append(1) + wk_list.append(1) + hin_list.append(1) + win_list.append(1) + h_str_list.append(1) + w_str_list.append(1) + h_pad_list.append(0) + w_pad_list.append(0) + opt_mm_fw_list.append(0) + opt_mm_wg_list.append(0) + opt_mm_ig_list.append(0) + # TODO: Read from file + data_type_list.append(graph.get_precision()) + # TODO: Read from file + # Note that this also determines the read position for in_ch_list and out_ch_list + data_layout_list.append('CHW') + weight_init = graph.get_init(onnx_node.input[1]) + # Gemm node does y = x*B, but torch uses y = A*x, so transpose B to get A back + # This also aligns with how trainlib does things + weight_init = weight_init.transpose(1,0) + try: + bias_init = graph.get_init(onnx_node.input[2]) + raise NotImplementedError("Biases are not implemented in trainlib") + except (KeyError, IndexError): + bias_init = [] + data_list.append((weight_init, bias_init)) + sumnode_connections.append(0) + elif onnx_node.op_type == 'AveragePool': + in_ch_list.append(graph.get_channel_count(onnx_node.input[0])) + out_ch_list.append(graph.get_channel_count(onnx_node.output[0])) + layer_list.append('AvgPool') + (hk, wk) = graph.get_kernel_size(onnx_node.name) + hk_list.append(hk) + wk_list.append(wk) + (hin, win) = graph.get_activation_size(onnx_node.input[0]) + hin_list.append(hin) + win_list.append(win) + (hstr, wstr) = graph.get_stride(onnx_node.name) + h_str_list.append(hstr) + w_str_list.append(wstr) + (hpad, wpad) = graph.get_pad(onnx_node.name) + h_pad_list.append(hpad) + w_pad_list.append(wpad) + opt_mm_fw_list.append(0) + opt_mm_wg_list.append(0) + opt_mm_ig_list.append(0) + data_type_list.append(graph.get_precision()) + data_layout_list.append('CHW') + data_list.append(([], [])) # kernels + sumnode_connections.append(0) + elif onnx_node.op_type == 'GlobalAveragePool': + hk, wk = graph.get_hw(onnx_node.input[0]) + if hk == 1 and wk == 1: + # There is nothing to average, skip this node + continue + in_ch_list.append(graph.get_channel_count(onnx_node.input[0])) + out_ch_list.append(graph.get_channel_count(onnx_node.output[0])) + layer_list.append('AvgPool') + hk_list.append(hk) + wk_list.append(wk) + (hin, win) = graph.get_activation_size(onnx_node.input[0]) + hin_list.append(hin) + win_list.append(win) + h_str_list.append(1) + w_str_list.append(1) + h_pad_list.append(0) + w_pad_list.append(0) + opt_mm_fw_list.append(0) + opt_mm_wg_list.append(0) + opt_mm_ig_list.append(0) + data_type_list.append(graph.get_precision()) + data_layout_list.append('CHW') + data_list.append(([], [])) # kernels + sumnode_connections.append(0) + elif onnx_node.op_type == 'Conv': + in_ch_list.append(graph.get_channel_count(onnx_node.input[0])) + out_ch_list.append(graph.get_channel_count(onnx_node.output[0])) + if graph.is_pointwise(onnx_node.name): + ty = "PW" + elif graph.is_depthwise(onnx_node.name): + ty = "DW" + else: + ty = "conv2d" + layer_list.append(ty) + (hk, wk) = graph.get_kernel_size(onnx_node.name) + hk_list.append(hk) + wk_list.append(wk) + (hin, win) = graph.get_activation_size(onnx_node.input[0]) + hin_list.append(hin) + win_list.append(win) + (hstr, wstr) = 
graph.get_stride(onnx_node.name) + h_str_list.append(hstr) + w_str_list.append(wstr) + (hpad, wpad) = graph.get_pad(onnx_node.name) + h_pad_list.append(hpad) + w_pad_list.append(wpad) + opt_mm_fw_list.append(0) + opt_mm_wg_list.append(0) + opt_mm_ig_list.append(0) + data_type_list.append(graph.get_precision()) + # TODO: Read from file + # Note that this also determines the read position for in_ch_list and out_ch_list + data_layout_list.append('CHW') + weight_init = graph.get_init(onnx_node.input[1]) + try: + bias_init = graph.get_init(onnx_node.input[2]) + raise NotImplementedError("Biases are not implemented in trainlib") + except (KeyError, IndexError): + # Ignore missing bias + bias_init = [] + pass + data_list.append((weight_init, bias_init)) # kernels + sumnode_connections.append(0) + elif onnx_node.op_type == 'Clip': + # This does not handle ReLU6, as it is not supported by trainlib + layer_list.append('ReLU') + in_ch_list.append(graph.get_channel_count(onnx_node.input[0])) + out_ch_list.append(graph.get_channel_count(onnx_node.output[0])) + hk_list.append(1) + wk_list.append(1) + hin_list.append(1) + win_list.append(1) + h_str_list.append(1) + w_str_list.append(1) + h_pad_list.append(0) + w_pad_list.append(0) + opt_mm_fw_list.append(0) + opt_mm_wg_list.append(0) + opt_mm_ig_list.append(0) + data_layout_list.append('CHW') + data_type_list.append(graph.get_precision()) + data_list.append(([], [])) + sumnode_connections.append(0) + else: + raise NotImplementedError("Model format not supported.") + + data_dir = proj_folder+'data/' + if not os.path.exists(data_dir): + os.makedirs(data_dir) + for i, (weight_init, bias_init) in enumerate(data_list): + np.save(data_dir+f"l{i}w.npy", np.array(data_list[i][0], dtype=("float32" if data_type_list[i] == "FP32" else "float16"))) + np.save(data_dir+f"l{i}b.npy", np.array(data_list[i][1], dtype=("float32" if data_type_list[i] == "FP32" else "float16"))) + + print("Generating project at location "+proj_folder) + + # Check if Residual Connections are valid + sumnode_connections = composer.AdjustResConnList(sumnode_connections) + + composer.CheckResConn(layer_list, in_ch_list, out_ch_list, hin_list, win_list, sumnode_connections, update_layer_list) + + # Check if the network training fits L1 + memocc = composer.DNN_Size_Checker(layer_list, in_ch_list, out_ch_list, hk_list, wk_list, hin_list, win_list, + h_str_list, w_str_list, h_pad_list, w_pad_list, + data_type_list, update_layer_list, L1_SIZE_BYTES, USE_DMA, CONV2D_USE_IM2COL) + + print("DNN memory occupation: {} bytes of {} available L1 bytes ({}%).".format(memocc, L1_SIZE_BYTES, (memocc/L1_SIZE_BYTES)*100)) + + # Call DNN Composer on the user-provided graph + composer.DNN_Composer(proj_folder, project_name, + layer_list, in_ch_list, out_ch_list, hk_list, wk_list, + hin_list, win_list, h_str_list, w_str_list, h_pad_list, w_pad_list, + epochs, batch_size, learning_rate, optimizer, loss_fn, + NUM_CORES, data_type_list, data_list, update_layer_list, opt_mm_fw_list, opt_mm_wg_list, opt_mm_ig_list, sumnode_connections, + USE_DMA, PROFILE_SINGLE_LAYERS, SEPARATE_BACKWARD_STEPS, CONV2D_USE_IM2COL, PRINT_TRAIN_LOSS) + + print("PULP project generation successful!") else: @@ -150,7 +464,7 @@ layer_list, in_ch_list, out_ch_list, hk_list, wk_list, hin_list, win_list, h_str_list, w_str_list, h_pad_list, w_pad_list, epochs, batch_size, learning_rate, optimizer, loss_fn, - NUM_CORES, data_type_list, update_layer_list, opt_mm_fw_list, opt_mm_wg_list, opt_mm_ig_list, + NUM_CORES, data_type_list, data_list, 
update_layer_list, opt_mm_fw_list, opt_mm_wg_list, opt_mm_ig_list,
                      sumnode_connections, USE_DMA, PROFILE_SINGLE_LAYERS, SEPARATE_BACKWARD_STEPS, CONV2D_USE_IM2COL, PRINT_TRAIN_LOSS)

    print("PULP project generation successful!")

diff --git a/tools/TrainLib_Deployer/deployer_utils/DNN_Composer.py b/tools/TrainLib_Deployer/deployer_utils/DNN_Composer.py
index 0b4b9a82..ea36ac7e 100644
--- a/tools/TrainLib_Deployer/deployer_utils/DNN_Composer.py
+++ b/tools/TrainLib_Deployer/deployer_utils/DNN_Composer.py
@@ -1,5 +1,5 @@
'''
-Copyright (C) 2021-2022 ETH Zurich and University of Bologna
+Copyright (C) 2021-2024 ETH Zurich and University of Bologna

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
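# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the patch): how the ONNX fields consumed by
# the ONNXGraphParser / READ_MODEL_ARCH branch above (node attributes such as
# kernel_shape/strides/pads and the weight initializers) are reached with the
# onnx Python API. The toy model, tensor names and shapes below are assumptions
# made only for this example.
# ---------------------------------------------------------------------------
import numpy as np
import onnx
from onnx import TensorProto, helper, numpy_helper, shape_inference

# Build a one-node Conv model in memory.
w = np.random.randn(8, 3, 3, 3).astype(np.float32)
graph = helper.make_graph(
    [helper.make_node('Conv', inputs=['x', 'conv_w'], outputs=['y'], name='conv0',
                      kernel_shape=[3, 3], strides=[1, 1], pads=[1, 1, 1, 1])],
    'toy_graph',
    inputs=[helper.make_tensor_value_info('x', TensorProto.FLOAT, [1, 3, 16, 16])],
    outputs=[helper.make_tensor_value_info('y', TensorProto.FLOAT, [1, 8, 16, 16])],
    initializer=[numpy_helper.from_array(w, name='conv_w')])
model = shape_inference.infer_shapes(helper.make_model(graph))

# Read back the pieces the deployer needs: attributes, inferred shapes, weights.
node = model.graph.node[0]
kernel_shape = next(a for a in node.attribute if a.name == 'kernel_shape').ints
out_channels = model.graph.output[0].type.tensor_type.shape.dim[1].dim_value
weights = numpy_helper.to_array(model.graph.initializer[0])
print(node.op_type, list(kernel_shape), out_channels, weights.shape)
# -> Conv [3, 3] 8 (8, 3, 3, 3)
# np.save('l0w.npy', weights.astype(np.float32))  # export step mirrored by the patch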
@@ -15,7 +15,7 @@ ''' ''' -Authors: Davide Nadalini, Giacomo Saporetti +Authors: Davide Nadalini, Giacomo Saporetti, Cristian Cioflan, Axel Vanoni ''' import os @@ -364,7 +364,7 @@ def GenerateGM(proj_folder_path, project_name, layers_l, in_ch_l, out_ch_l, hk_l, wk_l, hin_l, win_l, h_str_l, w_str_l, h_pad_l, w_pad_l, epochs, batch_size, learning_rate, optimizer, loss_fn, - data_type_l, update_layer_l, sumnode_connections, USE_DMA): + data_type_l, data_list, update_layer_l, sumnode_connections, USE_DMA): # Check if GPU is available, else keep fake FP16 cuda_is_on = torch.cuda.is_available() @@ -569,6 +569,18 @@ def GenerateGM(proj_folder_path, project_name, f.write("net = DNN().to(device)\n") f.write("for p in net.parameters():\n") f.write("\tnn.init.normal_(p, mean=0.0, std=1.0)\n") + if (data_list): + f.write("from pathlib import Path\n") + f.write("basedir = Path(__file__).resolve().parent.parent\n") + for layer in range(len(layers_l)): + if data_type_l[layer] == 'FP16': + to = ".to(device).half()" + else: + to = ".to(device)" + f.write(f"net.l{layer}.weight = torch.nn.Parameter(torch.from_numpy(numpy.load(basedir / 'data/l{layer}w.npy')){to}, requires_grad=True)\n") + # TODO: uncomment once biases are implemented in trainlib + # f.write(f"net.l{layer}.bias = torch.nn.Parameter(torch.from_numpy(numpy.load(basedir / 'data/l{layer}b.npy')){to}, requires_grad=True)\n") + f.write("net.zero_grad()\n\n") # Freeze layers excluded from sparse update From 1508ae679d1b2878817654da812b640dea194c6e Mon Sep 17 00:00:00 2001 From: Cristian Cioflan Date: Thu, 20 Jun 2024 20:19:39 +0200 Subject: [PATCH 2/6] PATCH: aligning pseudo-sparse update approaches. --- tools/TrainLib_Deployer/TrainLib_Deployer.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tools/TrainLib_Deployer/TrainLib_Deployer.py b/tools/TrainLib_Deployer/TrainLib_Deployer.py index eacf670b..19511399 100644 --- a/tools/TrainLib_Deployer/TrainLib_Deployer.py +++ b/tools/TrainLib_Deployer/TrainLib_Deployer.py @@ -261,6 +261,8 @@ def get_precision(self): if args.start_at is not None: node_names = [n.name for n in graph.graph.node if n.op_type != 'Constant'] assert args.start_at in node_names, f"{args.start_at} is not a valid layer name. 
Layer names are: {node_names}"
+            # CIOFLANC: temporary reconciling pseudo-sparse update implementations
+            assert update_layer_list.index(1) == node_names.index(args.start_at)]

        for onnx_node in graph.graph.node:
            if not found_start:

From bd4ad56a5e5116f0bad50b18adfc35bfe25ee455 Mon Sep 17 00:00:00 2001
From: Cristian Cioflan
Date: Wed, 3 Jul 2024 18:06:20 +0200
Subject: [PATCH 3/6] Merged upstream fixes, aligned sparse update flavours

---
 tools/TrainLib_Deployer/TrainLib_Deployer.py  |  54 +-
 .../deployer_utils/DNN_Composer.py            |   2 +-
 .../deployer_utils/deployment_utils.py        |  10 +-
 .../deployment_utils_double_buffer.py         |  72 +-
 .../deployment_utils_single_buffer.py         | 971 +++++++-----------
 5 files changed, 420 insertions(+), 689 deletions(-)

diff --git a/tools/TrainLib_Deployer/TrainLib_Deployer.py b/tools/TrainLib_Deployer/TrainLib_Deployer.py
index 19511399..2e630d4a 100644
--- a/tools/TrainLib_Deployer/TrainLib_Deployer.py
+++ b/tools/TrainLib_Deployer/TrainLib_Deployer.py
@@ -41,6 +41,15 @@
'SGD' -> Stochastic Gradient Descent
"""

+import argparse
+import onnx
+import os
+
+from onnx import shape_inference, numpy_helper
+
+import numpy as np
+
+
import deployer_utils.DNN_Reader as reader
import deployer_utils.DNN_Composer as composer

@@ -53,13 +62,10 @@
    description='Generating C code for on-device training')

parser = argparse.ArgumentParser()
-parser._action_groups.pop()
-required = parser.add_argument_group('required arguments')
-optional = parser.add_argument_group('optional arguments')
-required.add_argument('--project_name', type=str, default="Test_CNN", help='Project name', required=True)
-required.add_argument('--project_path', type=str, default="./", help='Project path', required=True)
-optional.add_argument('--model_path', type=str, default=None, help='Pretrained model path')
-optional.add_argument('--start_at', type=str, default=None, help='At which node to start generating')
+parser.add_argument('--project_name', type=str, default="Test_CNN", help='Project name')
+parser.add_argument('--project_path', type=str, default="./", help='Project path')
+parser.add_argument('--model_path', type=str, default=None, help='Pretrained model path')
+parser.add_argument('--start_at', type=str, default=None, help='At which node to start generating')
args = parser.parse_args()

@@ -73,7 +79,7 @@
batch_size = 1 # BATCHING NOT IMPLEMENTED!!
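# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the patch): the simplified command-line
# interface from the hunk above, exercised with an explicit argv list. The
# values 'MyCNN', 'model.onnx' and 'Gemm' are made-up examples; the last line
# mirrors how the patch derives READ_MODEL_ARCH from --model_path.
# ---------------------------------------------------------------------------
import argparse

parser = argparse.ArgumentParser(
    prog='Deployer',
    description='Generating C code for on-device training')
parser.add_argument('--project_name', type=str, default="Test_CNN", help='Project name')
parser.add_argument('--project_path', type=str, default="./", help='Project path')
parser.add_argument('--model_path', type=str, default=None, help='Pretrained model path')
parser.add_argument('--start_at', type=str, default=None, help='At which node to start generating')

# Equivalent to: python TrainLib_Deployer.py --project_name MyCNN --model_path model.onnx --start_at Gemm
args = parser.parse_args(['--project_name', 'MyCNN',
                          '--model_path', 'model.onnx',
                          '--start_at', 'Gemm'])
read_model_arch = args.model_path is not None   # READ_MODEL_ARCH in the patch
print(args.project_name, read_model_arch)       # MyCNN True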
learning_rate = 0.001 optimizer = "SGD" # Name of PyTorch's optimizer -loss_fn = "MSELoss" # Name of PyTorch's loss function +loss_fn = "CrossEntropyLoss" # Name of PyTorch's loss function # ------- NETWORK GRAPH -------- # Manually define the list of the network (each layer in the list has its own properties in the relative index of each list) @@ -106,7 +112,7 @@ # Data layout list (CHW or HWC) data_layout_list = ['CHW', 'CHW', 'CHW', 'CHW', 'CHW', 'CHW', 'CHW', 'CHW', 'CHW', 'CHW', 'CHW', 'CHW'] # TO DO # Sparse Update -update_layer_list = [ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 ] # Set to 1 for each layer you want to update, 0 if you want to skip weight update +update_layer_list = [ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] # Set to 1 for each layer you want to update, 0 if you want to skip weight update # ----- END OF NETWORK GRAPH ----- @@ -115,7 +121,7 @@ # EXECUTION PROPERTIES NUM_CORES = 8 L1_SIZE_BYTES = 128*(2**10) -USE_DMA = 'DB' # choose whether to load all structures in L1 ('NO') or in L2 and use Single Buffer mode ('SB') or Double Buffer mode ('DB') +USE_DMA = 'NO' # choose whether to load all structures in L1 ('NO') or in L2 and use Single Buffer mode ('SB') or Double Buffer mode ('DB') # BACKWARD SETTINGS SEPARATE_BACKWARD_STEPS = True # If True, writes separate weight and input gradient in backward step # PROFILING OPTIONS @@ -126,7 +132,7 @@ PRINT_TRAIN_LOSS = True # Set to true if you want to print the train loss for each epoch # OTHER PROPERTIES # Select if to read the network from an external source -READ_MODEL_ARCH = False # NOT IMPLEMENTED!! +READ_MODEL_ARCH = args.model_path is not None # --------------------------- # --- END OF USER SETTING --- @@ -259,14 +265,16 @@ def get_precision(self): found_start = args.start_at is None if args.start_at is not None: - node_names = [n.name for n in graph.graph.node if n.op_type != 'Constant'] + node_names = [n.op_type for n in graph.graph.node if n.op_type != 'Constant'] assert args.start_at in node_names, f"{args.start_at} is not a valid layer name. 
Layer names are: {node_names}" # CIOFLANC: temporary reconciling pseudo-sparse update implementations - assert update_layer_list.index(1) == node_names.index(args.start_at)] + update_layer_list = [1] * (len(node_names) - node_names.index(args.start_at)) + # update_layer_list[node_names.index(args.start_at)] = 1 for onnx_node in graph.graph.node: + if not found_start: - if onnx_node.name != args.start_at: + if onnx_node.op_type != args.start_at: continue else: found_start = True @@ -306,16 +314,16 @@ def get_precision(self): in_ch_list.append(graph.get_channel_count(onnx_node.input[0])) out_ch_list.append(graph.get_channel_count(onnx_node.output[0])) layer_list.append('AvgPool') - (hk, wk) = graph.get_kernel_size(onnx_node.name) + (hk, wk) = graph.get_kernel_size(onnx_node.op_type) hk_list.append(hk) wk_list.append(wk) (hin, win) = graph.get_activation_size(onnx_node.input[0]) hin_list.append(hin) win_list.append(win) - (hstr, wstr) = graph.get_stride(onnx_node.name) + (hstr, wstr) = graph.get_stride(onnx_node.op_type) h_str_list.append(hstr) w_str_list.append(wstr) - (hpad, wpad) = graph.get_pad(onnx_node.name) + (hpad, wpad) = graph.get_pad(onnx_node.op_type) h_pad_list.append(hpad) w_pad_list.append(wpad) opt_mm_fw_list.append(0) @@ -352,23 +360,23 @@ def get_precision(self): elif onnx_node.op_type == 'Conv': in_ch_list.append(graph.get_channel_count(onnx_node.input[0])) out_ch_list.append(graph.get_channel_count(onnx_node.output[0])) - if graph.is_pointwise(onnx_node.name): + if graph.is_pointwise(onnx_node.op_type): ty = "PW" - elif graph.is_depthwise(onnx_node.name): + elif graph.is_depthwise(onnx_node.op_type): ty = "DW" else: ty = "conv2d" layer_list.append(ty) - (hk, wk) = graph.get_kernel_size(onnx_node.name) + (hk, wk) = graph.get_kernel_size(onnx_node.op_type) hk_list.append(hk) wk_list.append(wk) (hin, win) = graph.get_activation_size(onnx_node.input[0]) hin_list.append(hin) win_list.append(win) - (hstr, wstr) = graph.get_stride(onnx_node.name) + (hstr, wstr) = graph.get_stride(onnx_node.op_type) h_str_list.append(hstr) w_str_list.append(wstr) - (hpad, wpad) = graph.get_pad(onnx_node.name) + (hpad, wpad) = graph.get_pad(onnx_node.op_type) h_pad_list.append(hpad) w_pad_list.append(wpad) opt_mm_fw_list.append(0) @@ -471,4 +479,4 @@ def get_precision(self): print("PULP project generation successful!") - pass + pass \ No newline at end of file diff --git a/tools/TrainLib_Deployer/deployer_utils/DNN_Composer.py b/tools/TrainLib_Deployer/deployer_utils/DNN_Composer.py index ea36ac7e..1677178f 100644 --- a/tools/TrainLib_Deployer/deployer_utils/DNN_Composer.py +++ b/tools/TrainLib_Deployer/deployer_utils/DNN_Composer.py @@ -258,4 +258,4 @@ def DNN_Composer (proj_folder_path, project_name, else: print(f"[DNN_Composer]: Not supported argument for USE_DMA: '{USE_DMA}' given") - return + return \ No newline at end of file diff --git a/tools/TrainLib_Deployer/deployer_utils/deployment_utils.py b/tools/TrainLib_Deployer/deployer_utils/deployment_utils.py index 6ce9a60c..6c8abf21 100644 --- a/tools/TrainLib_Deployer/deployer_utils/deployment_utils.py +++ b/tools/TrainLib_Deployer/deployer_utils/deployment_utils.py @@ -244,8 +244,8 @@ def InitProject(proj_folder_path): utils_folder = proj_folder + 'utils/' trainlib_dest_folder = proj_folder + 'lib/' - os.mkdir(proj_folder) - os.mkdir(utils_folder) + os.makedirs(proj_folder, exist_ok = True) + os.makedirs(utils_folder, exist_ok = True) shutil.copy2('./deployer_utils/srcfiles/main.c', proj_folder) 
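# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the patch): the idempotent project-init
# pattern that the InitProject hunk above switches to. Unlike os.mkdir,
# os.makedirs(..., exist_ok=True) does not raise FileExistsError when the
# deployer is re-run on an existing project folder. Paths are examples only.
# ---------------------------------------------------------------------------
import os

def init_project(proj_folder):
    utils_folder = os.path.join(proj_folder, 'utils')
    os.makedirs(proj_folder, exist_ok=True)   # no error if the folder already exists
    os.makedirs(utils_folder, exist_ok=True)
    # shutil.copy2('./deployer_utils/srcfiles/main.c', proj_folder)  # prefab copy, as in InitProject
    return utils_folder

init_project('./Test_CNN')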
shutil.copy2('./deployer_utils/srcfiles/stats.h', proj_folder) @@ -283,7 +283,7 @@ def GenerateMakefile(proj_folder_path, project_name, layers_l, NUM_CORES, data_t f.write('MATMUL_TYPE_IG_L'+str(layer)+'?='+str(opt_mm_ig_list[layer])+' # Selects which optimized matmul to be used in IN GRAD (see mm_manager_list.txt or "MM_manager()" body to verify which one is called)' + '\n') f.write('# End of user settings\n\n') - f.write('NUM_MATMULS?=24 # Available standard matmuls in the library' + '\n') + f.write('NUM_MATMULS?=24 # Available standard matmuls in the library' + '\n') f.write('TRAIN_LIB=./lib\n') f.write('TRAIN_LIB_SRCS=$(TRAIN_LIB)/sources\n') f.write('APP_SRCS = main.c net.c\n\n') @@ -384,6 +384,7 @@ def GenerateGM(proj_folder_path, project_name, f.write("import torch.optim as optim\n") f.write("import dump_utils as dump\n") f.write("import math\n") + f.write("import numpy\n") f.write("\n") f.write("# Set device\n") @@ -1738,5 +1739,4 @@ def GenerateNet(proj_folder_path, project_name, f.close() - return - + return \ No newline at end of file diff --git a/tools/TrainLib_Deployer/deployer_utils/deployment_utils_double_buffer.py b/tools/TrainLib_Deployer/deployer_utils/deployment_utils_double_buffer.py index 900549f0..064aec55 100644 --- a/tools/TrainLib_Deployer/deployer_utils/deployment_utils_double_buffer.py +++ b/tools/TrainLib_Deployer/deployer_utils/deployment_utils_double_buffer.py @@ -78,8 +78,8 @@ def GenerateNet(proj_folder_path, project_name, layers_l, in_ch_l, out_ch_l, hk_l, wk_l, hin_l, win_l, h_str_l, w_str_l, h_pad_l, w_pad_l, epochs, batch_size, learning_rate, optimizer, loss_fn, - data_type_l, update_layer_l, sumnode_connections, MAX_LAYER_DIM, - PROFILE_SINGLE_LAYERS, SEPARATE_BACKWARD_STEPS, CONV2D_USE_IM2COL, PRINT_TRAIN_LOSS): + data_type_l, sumnode_connections, MAX_LAYER_DIM, + PROFILE_SINGLE_LAYERS, SEPARATE_BACKWARD_STEPS): data_type = data_type_l[0] @@ -153,7 +153,8 @@ def GenerateNet(proj_folder_path, project_name, f.write("\n// Define structures and pointers to data in L1 memory\n") if data_type == 'FP32': #f.write("PI_L1 float * D0, * d0, * W0, * w0, * D1, * d1, * W1, *w1;\n") - f.write("PI_L1 float BUFF[MAX_SIZE];\n") + # f.write("PI_L1 float BUFF[MAX_SIZE];\n") + f.write("PI_L1 float * BUFF;\n") f.write("PI_L1 struct blob d1_blob;\n") f.write("PI_L1 struct blob w1_blob;\n") f.write("PI_L1 struct blob d0_blob;\n") @@ -173,7 +174,8 @@ def GenerateNet(proj_folder_path, project_name, #f.write("PI_L1 float * t;\n") elif data_type == 'FP16': f.write("PI_L1 fp16 * D1, * d1, * W1, * w1, * D0, * d0, * W0, *w0;\n") - f.write("PI_L1 fp16 BUFF[MAX_SIZE];\n") + # f.write("PI_L1 fp16 BUFF[MAX_SIZE];\n") + f.write("PI_L1 fp16 * BUFF;\n") f.write("PI_L1 struct blob_fp16 d1_blob;\n") f.write("PI_L1 struct blob_fp16 w1_blob;\n") f.write("PI_L1 struct blob_fp16 d0_blob;\n") @@ -385,7 +387,7 @@ def GenerateNet(proj_folder_path, project_name, im2col_byte_length = 0 im2col_max_data_type = 'FP32' for layer in range(len(layers_l)): - if layers_l[layer] == 'conv2d' and CONV2D_USE_IM2COL == True: # or layers_l[layer] == 'DW': + if layers_l[layer] == 'conv2d': # or layers_l[layer] == 'DW': if data_type_l[layer] == 'FP32': im2col_byte_length = 4 elif data_type_l[layer] == 'FP16': @@ -423,13 +425,6 @@ def GenerateNet(proj_folder_path, project_name, else: print("[deployment_utils.GenerateNet] Invalid data type for im2col!!") exit() - # No im2col buffer - allocate_no_im2col = False - for layer in range(len(layers_l)): - if layers_l[layer] == 'conv2d' and CONV2D_USE_IM2COL == False: - 
allocate_no_im2col = True - if allocate_no_im2col == True: - f.write("PI_L1 float im2col_buffer[1];\n") # Write in grad transposition / blocktranspose buffer bt_flag = False @@ -440,10 +435,10 @@ def GenerateNet(proj_folder_path, project_name, for layer in range(len(layers_l)): # Check layer data layout data_layout = 'CHW' # Change to input list of data layouts - if ((layers_l[layer] == 'conv2d' and CONV2D_USE_IM2COL == True) or layers_l[layer] == 'PW') and layer == 0: + if (layers_l[layer] == 'conv2d' or layers_l[layer] == 'PW') and layer == 0: bt_flag = True bt_layer_index = 0 - elif ((layers_l[layer] == 'conv2d' and CONV2D_USE_IM2COL == True) or layers_l[layer] == 'PW') and layer > 0: + elif (layers_l[layer] == 'conv2d' or layers_l[layer] == 'PW') and layer > 0: bt_flag = True bt_mem = in_ch_l[layer] * hk_l[layer] * wk_l[layer] * out_ch_l[layer] if bt_mem > bt_max_memocc: @@ -484,13 +479,7 @@ def GenerateNet(proj_folder_path, project_name, else: print("[deployment_utils.GenerateNet] Invalid data type for pw transp buffer definition!\n") exit() - # No blocktranspose buffer - if (bt_flag == False): - print("No blockstranspose buffer detected\n") - f.write("PI_L1 float bt_buffer[1];\n") - # Define label buffer - f.write("PI_L1 float label_temp[Tout_C_l"+str(len(layers_l)-1)+"*Tout_H_l"+str(len(layers_l)-1)+"*Tout_W_l"+str(len(layers_l)-1)+"];\n") # Define tensors to backpropagate the output error f.write("\n// Define error propagation tensors\n") @@ -580,6 +569,10 @@ def GenerateNet(proj_folder_path, project_name, f.write("\n// DNN initialization function\n") f.write("void DNN_init()\n{\n") f.write("\n// Assign pointers in L1\n") + if (data_type == "FP32"): + f.write(" BUFF = (float *) pi_l1_malloc(NULL, MAX_SIZE*sizeof(float));\n") + elif (data_type == "FP16"): + f.write(" BUFF = (fp16 *) pi_l1_malloc(NULL, MAX_SIZE*sizeof(float));\n") f.write(" d0_blob.data = BUFF;\n") f.write(" d0_blob.diff = BUFF;\n") f.write(" w0_blob.data = BUFF;\n") @@ -824,16 +817,13 @@ def GenerateNet(proj_folder_path, project_name, skip_inputgrad = 0 # Write configuration templates if layers_l[layer] == 'linear': - f.write(ntemp.linear_config_template(layer, skip_inputgrad, data_type_l[layer], 1)) + f.write(ntemp.linear_config_template(layer, skip_inputgrad, data_type_l[layer])) elif layers_l[layer] == 'conv2d': - IM2COL_USEIT = 1 - if CONV2D_USE_IM2COL == False: - IM2COL_USEIT = 0 - f.write(ntemp.conv2d_config_template(layer, h_pad_l[layer], w_pad_l[layer], h_str_l[layer], w_str_l[layer], skip_inputgrad, data_type_l[layer], IM2COL_USEIT, 1)) + f.write(ntemp.conv2d_config_template(layer, h_pad_l[layer], w_pad_l[layer], h_str_l[layer], w_str_l[layer], skip_inputgrad, data_type_l[layer])) elif layers_l[layer] == 'PW': - f.write(ntemp.PW_config_template(layer, skip_inputgrad, data_type_l[layer], 1)) + f.write(ntemp.PW_config_template(layer, skip_inputgrad, data_type_l[layer])) elif layers_l[layer] == 'DW': - f.write(ntemp.DW_config_template(layer, h_pad_l[layer], w_pad_l[layer], h_str_l[layer], w_str_l[layer], skip_inputgrad, data_type_l[layer], 1)) + f.write(ntemp.DW_config_template(layer, h_pad_l[layer], w_pad_l[layer], h_str_l[layer], w_str_l[layer], skip_inputgrad, data_type_l[layer])) elif layers_l[layer] == 'ReLU': f.write(ntemp.ReLU_config_template(layer, data_type_l[layer])) elif layers_l[layer] == 'MaxPool': @@ -1246,25 +1236,7 @@ def GenerateNet(proj_folder_path, project_name, elif data_type_l[-1] == 'FP16': f.write(" pulp_MSELoss_fp16(&loss_args);\n") else: - print("[deployment_utils.GenerateNet]: Invalid 
loss type!") - exit() - f.write(f"\tstore((uint32_t) out.diff, (uint32_t) layer{len(layers_l)-1}_out.diff, {float_size}*OUT_SIZE);\n") - elif loss_fn == "CrossEntropyLoss": - float_size = 2 - if data_type_l[0] == 'FP32': - float_size = 4 - f.write("\tloss_args.output = &out;\n") - f.write("\tloss_args.target = out.diff;\n") - f.write("\tloss_args.wr_loss = &loss;\n") - f.write(f"\tload((uint32_t) LABEL, (uint32_t) out.diff, {float_size}*OUT_SIZE);\n") - f.write("\tpi_cl_dma_cmd_wait(cmd_load);\n") - - if data_type_l[-1] == 'FP32': - f.write(" pulp_CrossEntropyLoss(&loss_args);\n") - elif data_type_l[-1] == 'FP16': - f.write(" pulp_CrossEntropyLoss_fp16(&loss_args);\n") - else: - print("[deployment_utils.GenerateNet]: Invalid loss type!") + print("[deplyment_utils.GenerateNet]: Invalid loss type!") exit() f.write(f"\tstore((uint32_t) out.diff, (uint32_t) layer{len(layers_l)-1}_out.diff, {float_size}*OUT_SIZE);\n") else: @@ -1438,11 +1410,6 @@ def GenerateNet(proj_folder_path, project_name, f.write(" for (int epoch=0; epoch>> EPOCH %d: train_loss = %f (GM: %f)\\n\", epoch, loss, TRAIN_LOSS[epoch]);\n") - f.write(" /* Continue profiling */ pi_perf_start();\n") f.write(" backward();\n") f.write(" update_weights();\n") f.write(" }\n\n") @@ -1459,6 +1426,9 @@ def GenerateNet(proj_folder_path, project_name, f.write(" check_post_training_output();\n") f.write(" print_output();\n") + f.write(" // Free l1 buffer\n") + f.write(" pi_l1_free(NULL, BUFF, MAX_SIZE*sizeof(float));\n") + f.write("}\n") data_size = 0 diff --git a/tools/TrainLib_Deployer/deployer_utils/deployment_utils_single_buffer.py b/tools/TrainLib_Deployer/deployer_utils/deployment_utils_single_buffer.py index e5ee1769..163a601c 100644 --- a/tools/TrainLib_Deployer/deployer_utils/deployment_utils_single_buffer.py +++ b/tools/TrainLib_Deployer/deployer_utils/deployment_utils_single_buffer.py @@ -30,130 +30,79 @@ """ DNN Size Checker backend functions """ -def max_input_dim(layers_l, cin_l, hin_l, win_l, data_type_l, update_layer_l): - nbytes = 4 - nbytes_max = 4 +def max_input_dim(layers_l, cin_l, hin_l, win_l): RES = 0 for layer in range(len(layers_l)): - # Check data type - if data_type_l[layer] == 'FP32': - nbytes = 4 - elif data_type_l[layer] == 'FP16': - nbytes = 2 - temp = cin_l[layer]*hin_l[layer]*win_l[layer]*nbytes - # Check if the tensor needs to store gradients - if layer > 0 and update_layer_l[layer-1] == 1: - temp = temp * 2 - # Check if maximum is exceeded + temp = cin_l[layer]*hin_l[layer]*win_l[layer] if temp > RES: - nbytes_max = nbytes RES = temp - RES /= nbytes_max - - print(f"[TEST!!! 
max_input_dim] RES = {RES}") - - # Result returned in number of elements of the largest input tensor return RES -def max_wgt_dim(layers_l, cin_l, hin_l, win_l, cout_l, hk_l, wk_l, data_type_l, update_layer_l): - nbytes = 4 - nbytes_max = 4 +def max_wgt_dim(layers_l, cin_l, hin_l, win_l, cout_l, hk_l, wk_l): RES = 0 temp = 0 for layer in range(len(layers_l)): - # Check data type - if data_type_l[layer] == 'FP32': - nbytes = 4 - elif data_type_l[layer] == 'FP16': - nbytes = 2 - # Define size depending on layer type if layers_l[layer] == 'conv2d' : - temp = hk_l[layer]*wk_l[layer]*cin_l[layer]*cout_l[layer]*nbytes + temp = hk_l[layer]*wk_l[layer]*cin_l[layer]*cout_l[layer] if layers_l[layer] == 'PW': - temp = cin_l[layer]*cout_l[layer]*nbytes + temp = cin_l[layer]*cout_l[layer] if layers_l[layer] == 'DW': - temp = hk_l[layer]*wk_l[layer]*cin_l[layer]*nbytes + temp = hk_l[layer]*wk_l[layer]*cin_l[layer] if layers_l[layer] == 'linear' : - temp = cin_l[layer]*cout_l[layer]*nbytes + temp = cin_l[layer]*cout_l[layer] if layers_l[layer] == 'Sumnode': - temp = cin_l[layer]*hin_l[layer]*win_l[layer]*nbytes - # Check if tensor needs to store gradients - if update_layer_l[layer] == 1: - temp = temp * 2 - # Check if maximum is exceeded + temp = cin_l[layer]*hin_l[layer]*win_l[layer] if temp > RES: - nbytes_max = nbytes RES = temp - RES /= nbytes_max - - print(f"[TEST!!! max_wgt_dim] RES = {RES}") - - # Result returned in number of elements of the largest weight tensor return RES -def max_layer_dim (layers_l, cin_l, hin_l, win_l, cout_l, hk_l, wk_l, data, h_str, w_str, h_pad, w_pad, data_type_l, update_layer_l): - nbytes = 4 - nbytes_max = 4 +def max_layer_dim (layers_l, cin_l, hin_l, win_l, cout_l, hk_l, wk_l, data, h_str, w_str, h_pad, w_pad): RES = 0 - tmp_inp = 0 #input - tmp_wgt = 0 #wgt - tmp_out = 0 #output + temp1 = 0 #input + temp2 = 0 #wgt + temp3 = 0 #output tot = 0 max_layer = 0 for layer in range(len(layers_l)): - # Check data type - if data_type_l[layer] == 'FP32': - nbytes = 4 - elif data_type_l[layer] == 'FP16': - nbytes = 2 - # Define size depending on layer if layers_l[layer] == 'conv2d' : - tmp_wgt = hk_l[layer]*wk_l[layer]*cin_l[layer]*cout_l[layer]*nbytes + temp2 = hk_l[layer]*wk_l[layer]*cin_l[layer]*cout_l[layer] if layers_l[layer] == 'PW': - tmp_wgt = cin_l[layer]*cout_l[layer]*nbytes + temp2 = cin_l[layer]*cout_l[layer] if layers_l[layer] == 'DW': - tmp_wgt = hk_l[layer]*wk_l[layer]*cin_l[layer]*nbytes + temp2 = hk_l[layer]*wk_l[layer]*cin_l[layer] if layers_l[layer] == 'linear' : - tmp_wgt = cin_l[layer]*cout_l[layer]*nbytes + temp2 = cin_l[layer]*cout_l[layer] if layers_l[layer] == 'Sumnode': - tmp_wgt = cin_l[layer]*hin_l[layer]*win_l[layer]*nbytes + temp2 = cin_l[layer]*hin_l[layer]*win_l[layer] if layers_l[layer] == 'InstNorm': - tmp_wgt = 2*cin_l[layer]*nbytes + temp2 = 2*cin_l[layer] if layers_l[layer] in ['ReLU', 'Skipnode']: - tmp_wgt = 0 - # Check if tensor needs to store gradients - if update_layer_l[layer] == 1: - tmp_wgt = tmp_wgt * 2 + temp2 = 0 - tmp_inp = cin_l[layer]*hin_l[layer]*win_l[layer]*nbytes - # Check if the tensor needs to store gradients - if layer > 0 and update_layer_l[layer-1] == 1: - tmp_inp = tmp_inp * 2 + temp1 = cin_l[layer]*hin_l[layer]*win_l[layer] hout = int((hin_l[layer] - hk_l[layer] + 2*h_pad[layer])/h_str[layer] + 1) wout = int((win_l[layer] - wk_l[layer] + 2*w_pad[layer])/w_str[layer] + 1) if layers_l[layer] == 'linear': - tmp_out = cout_l[layer]*nbytes + temp3 = cout_l[layer] else: - tmp_out = cout_l[layer] * hout * wout * nbytes - # 
Check if the tensor needs to store gradients - if update_layer_l[layer] == 1: - tmp_out = tmp_out * 2 + temp3 = cout_l[layer] * hout * wout - tot = tmp_inp + tmp_wgt + tmp_out - print(f"Layer {layer} ({layers_l[layer]}): Input: {tmp_inp}, Coefficients: {tmp_wgt}, Output: {tmp_out}, Total: {tot} (data + gradients)") + tot = temp1 + temp2 + temp3 + print(f"Layer {layer} ({layers_l[layer]}): Input: {temp1}, Coefficients: {temp2}, Output: {temp3}, Total: {tot}") if tot > RES: - nbytes_max = nbytes RES = tot max_layer = layer + multiplier = 2 + if data == 'FP32': + multiplier = 4 + RES = 2*multiplier*RES #The 2 factor accounts for for both data and diff storage print(f"Max Layer size (including data and gradients): {RES} bytes @layer {max_layer}") - - # Result returned in bytes of the largest layer + size in bytes of the largest layer - return RES, nbytes_max + return RES """ @@ -166,8 +115,8 @@ def GenerateNet(proj_folder_path, project_name, layers_l, in_ch_l, out_ch_l, hk_l, wk_l, hin_l, win_l, h_str_l, w_str_l, h_pad_l, w_pad_l, epochs, batch_size, learning_rate, optimizer, loss_fn, - data_type_l, update_layer_l, sumnode_connections, MAX_LAYER_DIM, - PROFILE_SINGLE_LAYERS, SEPARATE_BACKWARD_STEPS, CONV2D_USE_IM2COL, PRINT_TRAIN_LOSS): + data_type_l, sumnode_connections, MAX_LAYER_DIM, + PROFILE_SINGLE_LAYERS, SEPARATE_BACKWARD_STEPS): data_type = data_type_l[0] @@ -203,8 +152,6 @@ def GenerateNet(proj_folder_path, project_name, f.write("void check_post_training_output();\n") f.write("\n// DMA managment functions\n") - f.write("void set_buffer_pointers(void * blob_in, void * blob_wgt, void * blob_out, int compute_in_grad);\n") - f.write("void set_buffer_pointers_fp16(void * blob_in, void * blob_wgt, void * blob_out, int compute_in_grad);\n") f.write("void load_input(void * src_blob, uint8_t data_diff_both);\n") f.write("void load_output(void * src_blob, uint8_t data_diff_both);\n") f.write("void load_coeff(void * src_blob, uint8_t data_diff_both);\n") @@ -219,20 +166,9 @@ def GenerateNet(proj_folder_path, project_name, f.write("void update_blob();\n") f.write("void reset_dim();\n") - f.write(f"\n// Max tensor and layer sizes\n") - f.write(f"#define MAX_IN_SIZE {int(max_input_dim(layers_l, in_ch_l, hin_l, win_l, data_type_l, update_layer_l))}\n") - f.write(f"#define MAX_WGT_SIZE {int(max_wgt_dim(layers_l, in_ch_l, hin_l, win_l, out_ch_l, hk_l, wk_l, data_type_l, update_layer_l))}\n") - f.write(f"#define MAX_SIZE {int(MAX_LAYER_DIM)}\n") - - f.write(f"\n// Single buffering constants\n") - f.write(f"#define SB_DMA_GRAD 0\n") - f.write(f"#define SB_DMA_DATA 1\n") - f.write(f"#define SB_DMA_BOTH 2\n") - - f.write(f"\n// Partial update constants\n") - f.write(f"#define PU_SKIP_IN_GRAD 0\n") - f.write(f"#define PU_COMP_IN_GRAD 1\n") - + f.write(f"#define MAX_IN_SIZE {max_input_dim(layers_l, in_ch_l, hin_l, win_l)}\n") + f.write(f"#define MAX_WGT_SIZE {max_wgt_dim(layers_l, in_ch_l, hin_l, win_l, out_ch_l, hk_l, wk_l)}\n") + f.write(f"#define MAX_SIZE {MAX_LAYER_DIM}\n") f.close() @@ -391,7 +327,7 @@ def GenerateNet(proj_folder_path, project_name, for layer in range(len(layers_l)): # Define FP32 tensors if data_type_l[layer] == 'FP32': - if layers_l[layer] in ['MaxPool', 'AvgPool', 'ReLU']: + if layers_l[layer] == 'MaxPool' or layers_l[layer] == 'AvgPool': f.write("PI_L2 float l"+str(layer)+"_ker[1];\n") elif layers_l[layer] == 'Skipnode' or layers_l[layer] == 'Sumnode': pass @@ -401,7 +337,7 @@ def GenerateNet(proj_folder_path, project_name, f.write("PI_L2 float 
l"+str(layer)+"_ker[Tin_C_l"+str(layer)+" * Tout_C_l"+str(layer)+" * Tker_H_l"+str(layer)+" * Tker_W_l"+str(layer)+"];\n") # Define FP16 tensors elif data_type_l[layer] == 'FP16': - if layers_l[layer] in ['MaxPool', 'AvgPool', 'ReLU']: + if layers_l[layer] == 'MaxPool' or layers_l[layer] == 'AvgPool': f.write("PI_L2 fp16 l"+str(layer)+"_ker[1];\n") elif layers_l[layer] == 'Skipnode' or layers_l[layer] == 'Sumnode': pass @@ -418,7 +354,7 @@ def GenerateNet(proj_folder_path, project_name, for layer in range(len(layers_l)): # Define FP32 tensors if data_type_l[layer] == 'FP32': - if layers_l[layer] in ['MaxPool', 'AvgPool', 'ReLU']: + if layers_l[layer] == 'MaxPool' or layers_l[layer] == 'AvgPool': f.write("PI_L2 float l"+str(layer)+"_ker_diff[1];\n") elif layers_l[layer] == 'Skipnode' or layers_l[layer] == 'Sumnode': pass @@ -428,7 +364,7 @@ def GenerateNet(proj_folder_path, project_name, f.write("PI_L2 float l"+str(layer)+"_ker_diff[Tin_C_l"+str(layer)+" * Tout_C_l"+str(layer)+" * Tker_H_l"+str(layer)+" * Tker_W_l"+str(layer)+"];\n") # Define FP16 tensors elif data_type_l[layer] == 'FP16': - if layers_l[layer] in ['MaxPool', 'AvgPool', 'ReLU']: + if layers_l[layer] == 'MaxPool' or layers_l[layer] == 'AvgPool': f.write("PI_L2 fp16 l"+str(layer)+"_ker_diff[1];\n") elif layers_l[layer] == 'Skipnode' or layers_l[layer] == 'Sumnode': pass @@ -473,7 +409,7 @@ def GenerateNet(proj_folder_path, project_name, im2col_byte_length = 0 im2col_max_data_type = 'FP32' for layer in range(len(layers_l)): - if layers_l[layer] == 'conv2d' and CONV2D_USE_IM2COL == True: # or layers_l[layer] == 'DW': + if layers_l[layer] == 'conv2d': # or layers_l[layer] == 'DW': if data_type_l[layer] == 'FP32': im2col_byte_length = 4 elif data_type_l[layer] == 'FP16': @@ -511,13 +447,6 @@ def GenerateNet(proj_folder_path, project_name, else: print("[deployment_utils.GenerateNet] Invalid data type for im2col!!") exit() - # No im2col buffer - allocate_no_im2col = False - for layer in range(len(layers_l)): - if layers_l[layer] == 'conv2d' and CONV2D_USE_IM2COL == False: - allocate_no_im2col = True - if allocate_no_im2col == True: - f.write("PI_L1 float im2col_buffer[1];\n") # Write in grad transposition / blocktranspose buffer bt_flag = False @@ -528,10 +457,10 @@ def GenerateNet(proj_folder_path, project_name, for layer in range(len(layers_l)): # Check layer data layout data_layout = 'CHW' # Change to input list of data layouts - if ((layers_l[layer] == 'conv2d' and CONV2D_USE_IM2COL == True) or layers_l[layer] == 'PW') and layer == 0: + if (layers_l[layer] == 'conv2d' or layers_l[layer] == 'PW') and layer == 0: bt_flag = True bt_layer_index = 0 - elif ((layers_l[layer] == 'conv2d' and CONV2D_USE_IM2COL == True) or layers_l[layer] == 'PW') and layer > 0: + elif (layers_l[layer] == 'conv2d' or layers_l[layer] == 'PW') and layer > 0: bt_flag = True bt_mem = in_ch_l[layer] * hk_l[layer] * wk_l[layer] * out_ch_l[layer] if bt_mem > bt_max_memocc: @@ -572,16 +501,7 @@ def GenerateNet(proj_folder_path, project_name, else: print("[deployment_utils.GenerateNet] Invalid data type for pw transp buffer definition!\n") exit() - # No blocktranspose buffer - if (bt_flag == False): - print("No blockstranspose buffer detected.\n") - f.write("PI_L1 float bt_buffer[1];\n") - # Define label buffer - if data_type_l[-1] == 'FP32': - f.write("PI_L1 float label_temp[Tout_C_l"+str(len(layers_l)-1)+"*Tout_H_l"+str(len(layers_l)-1)+"*Tout_W_l"+str(len(layers_l)-1)+"];\n") - elif data_type_l[-1] == 'FP16': - f.write("PI_L1 fp16 
label_temp[Tout_C_l"+str(len(layers_l)-1)+"*Tout_H_l"+str(len(layers_l)-1)+"*Tout_W_l"+str(len(layers_l)-1)+"];\n") # Define tensors to backpropagate the output error f.write("\n// Define error propagation tensors\n") @@ -604,48 +524,7 @@ def GenerateNet(proj_folder_path, project_name, exit() - # Normalization layer running stats - f.write("\n// Define running parameters for normalization layers (L2)\n") - for layer in range(len(layers_l)): - if layers_l[layer] == 'InstNorm': - if data_type_l[layer] == 'FP32': - f.write("PI_L2 float l"+str(layer)+"_running_mean[Tin_C_l"+str(layer)+"];\n") - f.write("PI_L2 float l"+str(layer)+"_running_var[Tin_C_l"+str(layer)+"];\n") - f.write("PI_L2 float l"+str(layer)+"_running_stdev[Tin_C_l"+str(layer)+"];\n") - elif data_type_l[layer] == 'FP16': - f.write("PI_L2 fp16 l"+str(layer)+"_running_mean[Tin_C_l"+str(layer)+"];\n") - f.write("PI_L2 fp16 l"+str(layer)+"_running_var[Tin_C_l"+str(layer)+"];\n") - f.write("PI_L2 fp16 l"+str(layer)+"_running_stdev[Tin_C_l"+str(layer)+"];\n") - # Define L1 buffer for Norm parameters - f.write("\n// L1 buffers for normalization layers\n") - norm_temp_buffer_present = False - max_size = 0 - max_num_bytes = 4 - for layer in range(len(layers_l)): - temp_size = 0 - if layers_l[layer] == 'InstNorm': - num_bytes = 4 - if data_type_l[layer] == 'FP16': - num_bytes = 2 - temp_size = in_ch_l[layer] * num_bytes # 3 * in_ch_l[layer] * num_bytes - if max_size < temp_size: - max_size = temp_size - max_num_bytes = num_bytes - norm_temp_buffer_present = True - if norm_temp_buffer_present: - if max_num_bytes == 4: - f.write("PI_L1 float running_mean_buffer["+str(int(max_size / max_num_bytes))+"];\n") - f.write("PI_L1 float running_var_buffer["+str(int(max_size / max_num_bytes))+"];\n") - f.write("PI_L1 float running_stdev_buffer["+str(int(max_size / max_num_bytes))+"];\n") - elif max_num_bytes == 2: - f.write("PI_L1 fp16 running_mean_buffer["+str(int(max_size / max_num_bytes))+"];\n") - f.write("PI_L1 fp16 running_var_buffer["+str(int(max_size / max_num_bytes))+"];\n") - f.write("PI_L1 fp16 running_stdev_buffer["+str(int(max_size / max_num_bytes))+"];\n") - else: - print("[deployment_utils_single_buffer.py/GenerateNet] Invalid data type for running stats!!") - exit() - - + # Define buffer for mixed precision propagation previous_type = data_type_l[0] is_mixed_precision = False @@ -712,54 +591,48 @@ def GenerateNet(proj_folder_path, project_name, f.write("\n// DNN initialization function\n") f.write("void DNN_init()\n{\n") f.write("\n// Assign pointers in L1\n") - f.write("\tIN_DATA = BUFF;\n") - f.write("\tIN_DIFF = BUFF;\n") - f.write("\tW_DATA = BUFF;\n") - f.write("\tW_DIFF = BUFF;\n") - f.write("\tOUT_DATA = BUFF;\n") - f.write("\tOUT_DIFF = BUFF;\n") - f.write("\tupdate_blob();\n") - f.write("\treset_arguments();\n\n") + f.write("IN_DATA = BUFF;\n") + f.write("IN_DIFF = BUFF;\n") + f.write("W_DATA = BUFF;\n") + f.write("W_DIFF = BUFF;\n") + f.write("OUT_DATA = BUFF;\n") + f.write("OUT_DIFF = BUFF;\n") + f.write("update_blob();\n") + f.write("reset_arguments();\n\n") for layer in range(len(layers_l)): if layer == 0: - f.write("\t// Layer "+str(layer)+"\n") - f.write("\tfor(int i=0; i 0 and layer < len(layers_l)-1: - f.write("\t// Layer "+str(layer)+"\n") + f.write(" // Layer "+str(layer)+"\n") if layers_l[layer] == 'DW': - f.write("\tfor(int i=0; i 0 and layer < len(layers_l)-1: # Hidden layers - f.write("\t// Layer "+str(layer)+"\n") - f.write("\tlayer"+str(layer)+"_in.data = l"+str(layer - previous_was_skip_data)+"_in;\n") - if 
layers_l[layer] != 'Skipnode': - if (layer - previous_was_skip) > 0: # Avoid assignement of l0_in_diff - f.write("\tlayer"+str(layer)+"_in.diff = l"+str(layer)+"_in_diff;\n") - else: - f.write(f"\tlayer{layer}_in.diff = l{sumnode_connections[layer]}_in_diff;\n") - f.write("\tlayer"+str(layer)+"_in.dim = Tin_C_l"+str(layer)+"*Tin_H_l"+str(layer)+"*Tin_W_l"+str(layer)+";\n") - f.write("\tlayer"+str(layer)+"_in.C = Tin_C_l"+str(layer)+";\n") - f.write("\tlayer"+str(layer)+"_in.H = Tin_H_l"+str(layer)+";\n") - f.write("\tlayer"+str(layer)+"_in.W = Tin_W_l"+str(layer)+";\n") - elif layer == len(layers_l)-1: # Last layer - f.write("\t// Layer "+str(layer)+"\n") - f.write("\tlayer"+str(layer)+"_in.data = l"+str(layer - previous_was_skip_data)+"_in;\n") - f.write("\tlayer"+str(layer)+"_in.diff = l"+str(layer + lookahead)+"_in_diff;\n") - f.write("\tlayer"+str(layer)+"_in.dim = Tin_C_l"+str(layer)+"*Tin_H_l"+str(layer)+"*Tin_W_l"+str(layer)+";\n") - f.write("\tlayer"+str(layer)+"_in.C = Tin_C_l"+str(layer)+";\n") - f.write("\tlayer"+str(layer)+"_in.H = Tin_H_l"+str(layer)+";\n") - f.write("\tlayer"+str(layer)+"_in.W = Tin_W_l"+str(layer)+";\n") - else: - print("[deployment_utils.GenerateNet]: Error in PULP layer initialization!") - exit() - - # WEIGHT BLOB - if len(layers_l) == 1: # DNN is 1 layer long - f.write("\tlayer"+str(layer)+"_wgt.data = l0_ker;\n") - f.write("\tlayer"+str(layer)+"_wgt.diff = l0_ker_diff;\n") + # DNN is 1 layer long + if len(layers_l) == 1: + f.write(" layer"+str(layer)+"_in.data = l0_in;\n") + f.write(" layer"+str(layer)+"_in.dim = Tin_C_l0*Tin_H_l0*Tin_W_l0;\n") + f.write(" layer"+str(layer)+"_in.C = Tin_C_l0;\n") + f.write(" layer"+str(layer)+"_in.H = Tin_H_l0;\n") + f.write(" layer"+str(layer)+"_in.W = Tin_W_l0;\n") + f.write(" layer"+str(layer)+"_wgt.data = l0_ker;\n") + f.write(" layer"+str(layer)+"_wgt.diff = l0_ker_diff;\n") if layers_l[layer] == 'DW': - f.write("\tlayer"+str(layer)+"_wgt.dim = Tin_C_l0*Tker_H_l0*Tker_W_l0;\n") + f.write(" layer"+str(layer)+"_wgt.dim = Tin_C_l0*Tker_H_l0*Tker_W_l0;\n") elif layers_l[layer] == 'InstNorm': - f.write("\tlayer"+str(layer)+"_wgt.dim = 2*Tin_C_l0;\n") - elif layers_l[layer] == 'ReLU': - f.write("\tlayer"+str(layer)+"_wgt.dim = 0;\n") + f.write(" layer"+str(layer)+"_wgt.dim = 2*Tin_C_l0;\n") else: - f.write("\tlayer"+str(layer)+"_wgt.dim = Tin_C_l0*Tout_C_l0*Tker_H_l0*Tker_W_l0;\n") - f.write("\tlayer"+str(layer)+"_wgt.C = Tin_C_l0;\n") - f.write("\tlayer"+str(layer)+"_wgt.H = Tker_H_l0;\n") - f.write("\tlayer"+str(layer)+"_wgt.W = Tker_W_l0;\n") - elif layer == 0: # First layer + f.write(" layer"+str(layer)+"_wgt.dim = Tin_C_l0*Tout_C_l0*Tker_H_l0*Tker_W_l0;\n") + f.write(" layer"+str(layer)+"_wgt.C = Tin_C_l0;\n") + f.write(" layer"+str(layer)+"_wgt.H = Tker_H_l0;\n") + f.write(" layer"+str(layer)+"_wgt.W = Tker_W_l0;\n") + f.write(" layer"+str(layer)+"_out.data = l0_out;\n") + f.write(" layer"+str(layer)+"_out.diff = l0_out_diff;\n") + f.write(" layer"+str(layer)+"_out.dim = Tout_C_l0*Tout_H_l0*Tout_W_l0;\n") + f.write(" layer"+str(layer)+"_out.C = Tout_C_l0;\n") + f.write(" layer"+str(layer)+"_out.H = Tout_H_l0;\n") + f.write(" layer"+str(layer)+"_out.W = Tout_W_l0;\n") + # First layer connection + elif layer == 0: + f.write(" // Layer "+str(layer)+"\n") if layers_l[0] != 'Skipnode': # Avoid weight assignment for Skip Connections - f.write("\tlayer"+str(layer)+"_wgt.data = l"+str(layer)+"_ker;\n") - f.write("\tlayer"+str(layer)+"_wgt.diff = l"+str(layer)+"_ker_diff;\n") + f.write(" layer"+str(layer)+"_in.data = 
l"+str(layer)+"_in;\n") + f.write(" layer"+str(layer)+"_in.dim = Tin_C_l"+str(layer)+"*Tin_H_l"+str(layer)+"*Tin_W_l"+str(layer)+";\n") + f.write(" layer"+str(layer)+"_in.C = Tin_C_l"+str(layer)+";\n") + f.write(" layer"+str(layer)+"_in.H = Tin_H_l"+str(layer)+";\n") + f.write(" layer"+str(layer)+"_in.W = Tin_W_l"+str(layer)+";\n") + f.write(" layer"+str(layer)+"_wgt.data = l"+str(layer)+"_ker;\n") + f.write(" layer"+str(layer)+"_wgt.diff = l"+str(layer)+"_ker_diff;\n") if layers_l[layer] == 'DW': - f.write("\tlayer"+str(layer)+"_wgt.dim = Tin_C_l"+str(layer)+"*Tker_H_l"+str(layer)+"*Tker_W_l"+str(layer)+";\n") + f.write(" layer"+str(layer)+"_wgt.dim = Tin_C_l"+str(layer)+"*Tker_H_l"+str(layer)+"*Tker_W_l"+str(layer)+";\n") elif layers_l[layer] == 'InstNorm': - f.write("\tlayer"+str(layer)+f"_wgt.dim = 2*Tin_C_l{layer};\n") - elif layers_l[layer] == 'ReLU': - f.write("\tlayer"+str(layer)+"_wgt.dim = 0;\n") + f.write(" layer"+str(layer)+f"_wgt.dim = 2*Tin_C_l{layer};\n") else: - f.write("\tlayer"+str(layer)+"_wgt.dim = Tin_C_l"+str(layer)+"*Tout_C_l"+str(layer)+"*Tker_H_l"+str(layer)+"*Tker_W_l"+str(layer)+";\n") - f.write("\tlayer"+str(layer)+"_wgt.C = Tin_C_l"+str(layer)+";\n") - f.write("\tlayer"+str(layer)+"_wgt.H = Tker_H_l"+str(layer)+";\n") - f.write("\tlayer"+str(layer)+"_wgt.W = Tker_W_l"+str(layer)+";\n") - elif layer > 0 and layer < len(layers_l)-1: # Hidden layers - if layers_l[layer] != 'Skipnode': # Avoid weight assignment for Skipnodes and out data assignement - if layers_l[layer] != 'Sumnode': # Different weight assignement for Sumnodes - f.write("\tlayer"+str(layer)+"_wgt.data = l"+str(layer)+"_ker;\n") - f.write("\tlayer"+str(layer)+"_wgt.diff = l"+str(layer)+"_ker_diff;\n") - if layers_l[layer] == 'DW': - f.write("\tlayer"+str(layer)+"_wgt.dim = Tin_C_l"+str(layer)+"*Tker_H_l"+str(layer)+"*Tker_W_l"+str(layer)+";\n") - elif layers_l[layer] == 'InstNorm': - f.write("\tlayer"+str(layer)+f"_wgt.dim = 2*Tin_C_l{layer};\n") - elif layers_l[layer] == 'ReLU': - f.write("\tlayer"+str(layer)+"_wgt.dim = 0;\n") - else: - f.write("\tlayer"+str(layer)+"_wgt.dim = Tin_C_l"+str(layer)+"*Tout_C_l"+str(layer)+"*Tker_H_l"+str(layer)+"*Tker_W_l"+str(layer)+";\n") - f.write("\tlayer"+str(layer)+"_wgt.C = Tin_C_l"+str(layer)+";\n") - f.write("\tlayer"+str(layer)+"_wgt.H = Tker_H_l"+str(layer)+";\n") - f.write("\tlayer"+str(layer)+"_wgt.W = Tker_W_l"+str(layer)+";\n") - else: - f.write("\tlayer"+str(layer)+"_wgt.data = layer"+str(sumnode_connections[layer])+"_out.data;\n") - f.write("\tlayer"+str(layer)+"_wgt.diff = layer"+str(sumnode_connections[layer])+"_out.diff;\n") - f.write("\tlayer"+str(layer)+"_wgt.C = layer"+str(sumnode_connections[layer])+"_out.C;\n") - f.write("\tlayer"+str(layer)+"_wgt.H = layer"+str(sumnode_connections[layer])+"_out.H;\n") - f.write("\tlayer"+str(layer)+"_wgt.W = layer"+str(sumnode_connections[layer])+"_out.W;\n") - f.write("\tlayer"+str(layer)+"_wgt.dim = layer"+str(sumnode_connections[layer])+"_out.C*layer"+str(sumnode_connections[layer])+"_out.H*layer"+str(sumnode_connections[layer])+"_out.W;\n") - elif layer == len(layers_l)-1: # Last layer - if layers_l[layer] != 'Sumnode': - f.write("\tlayer"+str(layer)+"_wgt.data = l"+str(layer)+"_ker;\n") - f.write("\tlayer"+str(layer)+"_wgt.diff = l"+str(layer)+"_ker_diff;\n") - if layers_l[layer] == 'DW': - f.write("\tlayer"+str(layer)+"_wgt.dim = Tin_C_l"+str(layer)+"*Tker_H_l"+str(layer)+"*Tker_W_l"+str(layer)+";\n") - elif layers_l[layer] == 'InstNorm': - f.write("\tlayer"+str(layer)+f"_wgt.dim = 
2*Tin_C_l{layer};\n") - elif layers_l[layer] == 'ReLU': - f.write("\tlayer"+str(layer)+"_wgt.dim = 0;\n") - else: - f.write("\tlayer"+str(layer)+"_wgt.dim = Tin_C_l"+str(layer)+"*Tout_C_l"+str(layer)+"*Tker_H_l"+str(layer)+"*Tker_W_l"+str(layer)+";\n") - f.write("\tlayer"+str(layer)+"_wgt.C = Tin_C_l"+str(layer)+";\n") - f.write("\tlayer"+str(layer)+"_wgt.H = Tker_H_l"+str(layer)+";\n") - f.write("\tlayer"+str(layer)+"_wgt.W = Tker_W_l"+str(layer)+";\n") - else: - f.write("\tlayer"+str(layer)+"_wgt.data = layer"+str(sumnode_connections[layer])+"_out.data;\n") - f.write("\tlayer"+str(layer)+"_wgt.diff = layer"+str(sumnode_connections[layer])+"_out.diff;\n") - f.write("\tlayer"+str(layer)+"_wgt.C = layer"+str(sumnode_connections[layer])+"_out.C;\n") - f.write("\tlayer"+str(layer)+"_wgt.H = layer"+str(sumnode_connections[layer])+"_out.H;\n") - f.write("\tlayer"+str(layer)+"_wgt.W = layer"+str(sumnode_connections[layer])+"_out.W;\n") - f.write("\tlayer"+str(layer)+"_wgt.dim = layer"+str(sumnode_connections[layer])+"_out.C*layer"+str(sumnode_connections[layer])+"_out.H*layer"+str(sumnode_connections[layer])+"_out.W;\n") - else: - print("[deployment_utils.GenerateNet]: Error in PULP layer initialization!") - exit() - - # OUTPUT BLOB - if len(layers_l) == 1: # DNN is 1 layer long - f.write("\tlayer"+str(layer)+"_out.data = l0_out;\n") - f.write("\tlayer"+str(layer)+"_out.diff = l0_out_diff;\n") - f.write("\tlayer"+str(layer)+"_out.dim = Tout_C_l0*Tout_H_l0*Tout_W_l0;\n") - f.write("\tlayer"+str(layer)+"_out.C = Tout_C_l0;\n") - f.write("\tlayer"+str(layer)+"_out.H = Tout_H_l0;\n") - f.write("\tlayer"+str(layer)+"_out.W = Tout_W_l0;\n") - elif layer == 0: # First layer - if layers_l[0] != 'Skipnode': + f.write(" layer"+str(layer)+"_wgt.dim = Tin_C_l"+str(layer)+"*Tout_C_l"+str(layer)+"*Tker_H_l"+str(layer)+"*Tker_W_l"+str(layer)+";\n") + f.write(" layer"+str(layer)+"_wgt.C = Tin_C_l"+str(layer)+";\n") + f.write(" layer"+str(layer)+"_wgt.H = Tker_H_l"+str(layer)+";\n") + f.write(" layer"+str(layer)+"_wgt.W = Tker_W_l"+str(layer)+";\n") # Assign to cast_buffer in case data type changes if data_type_l[layer] != data_type_l[layer+1]: - f.write("\tlayer"+str(layer)+"_out.data = ("+C_data_type+"*) cast_buffer;\n") - f.write("\tlayer"+str(layer)+"_out.diff = ("+C_data_type+"*) cast_buffer;\n") + f.write(" layer"+str(layer)+"_out.data = ("+C_data_type+"*) cast_buffer;\n") + f.write(" layer"+str(layer)+"_out.diff = ("+C_data_type+"*) cast_buffer;\n") else: - f.write("\tlayer"+str(layer)+"_out.data = l"+str(layer+1)+"_in;\n") + f.write(" layer"+str(layer)+"_out.data = l"+str(layer+1)+"_in;\n") if sumnode_connections[layer] < 0 or layers_l[layer] == 'Sumnode': - f.write("\tlayer"+str(layer)+"_out.diff = l"+str(layer + 1 + lookahead)+"_in_diff;\n") + f.write(" layer"+str(layer)+"_out.diff = l"+str(layer + 1 + lookahead)+"_in_diff;\n") else: - #f.write("\tlayer"+str(layer)+"_out.diff = l"+str(layer+1)+"_in_diff;\n") - f.write("\tlayer"+str(layer)+"_out.diff = l"+str(sumnode_connections[layer])+"_in_diff;\n") + f.write(" layer"+str(layer)+"_out.diff = l"+str(sumnode_connections[layer])+"_in_diff;\n") # End of assignment - f.write("\tlayer"+str(layer)+"_out.dim = Tout_C_l"+str(layer)+"*Tout_H_l"+str(layer)+"*Tout_W_l"+str(layer)+";\n") - f.write("\tlayer"+str(layer)+"_out.C = Tout_C_l"+str(layer)+";\n") - f.write("\tlayer"+str(layer)+"_out.H = Tout_H_l"+str(layer)+";\n") - f.write("\tlayer"+str(layer)+"_out.W = Tout_W_l"+str(layer)+";\n") + f.write(" layer"+str(layer)+"_out.dim = 
Tout_C_l"+str(layer)+"*Tout_H_l"+str(layer)+"*Tout_W_l"+str(layer)+";\n") + f.write(" layer"+str(layer)+"_out.C = Tout_C_l"+str(layer)+";\n") + f.write(" layer"+str(layer)+"_out.H = Tout_H_l"+str(layer)+";\n") + f.write(" layer"+str(layer)+"_out.W = Tout_W_l"+str(layer)+";\n") + else: + f.write(" layer"+str(layer)+"_in.data = l"+str(layer)+"_in;\n") + f.write(" layer"+str(layer)+"_in.dim = Tin_C_l"+str(layer)+"*Tin_H_l"+str(layer)+"*Tin_W_l"+str(layer)+";\n") + f.write(" layer"+str(layer)+"_in.C = Tin_C_l"+str(layer)+";\n") + f.write(" layer"+str(layer)+"_in.H = Tin_H_l"+str(layer)+";\n") + f.write(" layer"+str(layer)+"_in.W = Tin_W_l"+str(layer)+";\n") + f.write(" layer"+str(layer)+"_out.data = l"+str(layer)+"_in;\n") + f.write(" layer"+str(layer)+"_out.diff = l"+str(sumnode_connections[layer])+"_in_diff;\n") + f.write(" layer"+str(layer)+"_out.dim = Tout_C_l"+str(layer)+"*Tout_H_l"+str(layer)+"*Tout_W_l"+str(layer)+";\n") + f.write(" layer"+str(layer)+"_out.C = Tout_C_l"+str(layer)+";\n") + f.write(" layer"+str(layer)+"_out.H = Tout_H_l"+str(layer)+";\n") + f.write(" layer"+str(layer)+"_out.W = Tout_W_l"+str(layer)+";\n") + # Hidden layers + elif layer > 0 and layer < len(layers_l)-1: + f.write(" // Layer "+str(layer)+"\n") + f.write(" layer"+str(layer)+"_in.data = l"+str(layer - previous_was_skip)+"_in;\n") + if layers_l[layer] != 'Skipnode': + if (layer - previous_was_skip) > 0: # Avoid assignement of l0_in_diff + f.write(" layer"+str(layer)+"_in.diff = l"+str(layer)+"_in_diff;\n") else: - f.write("\tlayer"+str(layer)+"_out.data = l"+str(layer)+"_in;\n") - f.write("\tlayer"+str(layer)+"_out.diff = l"+str(sumnode_connections[layer])+"_in_diff;\n") - f.write("\tlayer"+str(layer)+"_out.dim = Tout_C_l"+str(layer)+"*Tout_H_l"+str(layer)+"*Tout_W_l"+str(layer)+";\n") - f.write("\tlayer"+str(layer)+"_out.C = Tout_C_l"+str(layer)+";\n") - f.write("\tlayer"+str(layer)+"_out.H = Tout_H_l"+str(layer)+";\n") - f.write("\tlayer"+str(layer)+"_out.W = Tout_W_l"+str(layer)+";\n") - elif layer > 0 and layer < len(layers_l)-1: # Hidden layers + f.write(f"\tlayer{layer}_in.diff = l{sumnode_connections[layer]}_in_diff;\n") + f.write(" layer"+str(layer)+"_in.dim = Tin_C_l"+str(layer)+"*Tin_H_l"+str(layer)+"*Tin_W_l"+str(layer)+";\n") + f.write(" layer"+str(layer)+"_in.C = Tin_C_l"+str(layer)+";\n") + f.write(" layer"+str(layer)+"_in.H = Tin_H_l"+str(layer)+";\n") + f.write(" layer"+str(layer)+"_in.W = Tin_W_l"+str(layer)+";\n") if layers_l[layer] != 'Skipnode': # Avoid weight assignment for Skipnodes and out data assignement + if layers_l[layer] != 'Sumnode': # Different weight assignement for Sumnodes + f.write(" layer"+str(layer)+"_wgt.data = l"+str(layer)+"_ker;\n") + f.write(" layer"+str(layer)+"_wgt.diff = l"+str(layer)+"_ker_diff;\n") + if layers_l[layer] == 'DW': + f.write(" layer"+str(layer)+"_wgt.dim = Tin_C_l"+str(layer)+"*Tker_H_l"+str(layer)+"*Tker_W_l"+str(layer)+";\n") + elif layers_l[layer] == 'InstNorm': + f.write(" layer"+str(layer)+f"_wgt.dim = 2*Tin_C_l{layer};\n") + else: + f.write(" layer"+str(layer)+"_wgt.dim = Tin_C_l"+str(layer)+"*Tout_C_l"+str(layer)+"*Tker_H_l"+str(layer)+"*Tker_W_l"+str(layer)+";\n") + f.write(" layer"+str(layer)+"_wgt.C = Tin_C_l"+str(layer)+";\n") + f.write(" layer"+str(layer)+"_wgt.H = Tker_H_l"+str(layer)+";\n") + f.write(" layer"+str(layer)+"_wgt.W = Tker_W_l"+str(layer)+";\n") + else: + f.write(" layer"+str(layer)+"_wgt.data = layer"+str(sumnode_connections[layer])+"_out.data;\n") + f.write(" layer"+str(layer)+"_wgt.diff = 
layer"+str(sumnode_connections[layer])+"_out.diff;\n") + f.write(" layer"+str(layer)+"_wgt.C = layer"+str(sumnode_connections[layer])+"_out.C;\n") + f.write(" layer"+str(layer)+"_wgt.H = layer"+str(sumnode_connections[layer])+"_out.H;\n") + f.write(" layer"+str(layer)+"_wgt.W = layer"+str(sumnode_connections[layer])+"_out.W;\n") + f.write(" layer"+str(layer)+"_wgt.dim = layer"+str(sumnode_connections[layer])+"_out.C*layer"+str(sumnode_connections[layer])+"_out.H*layer"+str(sumnode_connections[layer])+"_out.W;\n") # Assign to cast_buffer in case data type changes if data_type_l[layer] != data_type_l[layer+1]: - f.write("\tlayer"+str(layer)+"_out.data = ("+C_data_type+"*) cast_buffer;\n") - f.write("\tlayer"+str(layer)+"_out.diff = ("+C_data_type+"*) cast_buffer;\n") + f.write(" layer"+str(layer)+"_out.data = ("+C_data_type+"*) cast_buffer;\n") + f.write(" layer"+str(layer)+"_out.diff = ("+C_data_type+"*) cast_buffer;\n") else: - f.write("\tlayer"+str(layer)+"_out.data = l"+str(layer+1)+"_in;\n") + f.write(" layer"+str(layer)+"_out.data = l"+str(layer+1)+"_in;\n") if sumnode_connections[layer] == -1 or layers_l[layer] == 'Sumnode': - f.write("\tlayer"+str(layer)+"_out.diff = l"+str(layer+1+lookahead)+"_in_diff;\n") - else: - #f.write("\tlayer"+str(layer)+"_out.diff = l"+str(layer+1)+"_in_diff;\n") - f.write("\tlayer"+str(layer)+"_out.diff = l"+str(sumnode_connections[layer])+"_in_diff;\n") + f.write(" layer"+str(layer)+"_out.diff = l"+str(layer+1+lookahead)+"_in_diff;\n") + else: + f.write(" layer"+str(layer)+"_out.diff = l"+str(sumnode_connections[layer])+"_in_diff;\n") # End of assignment - f.write("\tlayer"+str(layer)+"_out.dim = Tout_C_l"+str(layer)+"*Tout_H_l"+str(layer)+"*Tout_W_l"+str(layer)+";\n") - f.write("\tlayer"+str(layer)+"_out.C = Tout_C_l"+str(layer)+";\n") - f.write("\tlayer"+str(layer)+"_out.H = Tout_H_l"+str(layer)+";\n") - f.write("\tlayer"+str(layer)+"_out.W = Tout_W_l"+str(layer)+";\n") + f.write(" layer"+str(layer)+"_out.dim = Tout_C_l"+str(layer)+"*Tout_H_l"+str(layer)+"*Tout_W_l"+str(layer)+";\n") + f.write(" layer"+str(layer)+"_out.C = Tout_C_l"+str(layer)+";\n") + f.write(" layer"+str(layer)+"_out.H = Tout_H_l"+str(layer)+";\n") + f.write(" layer"+str(layer)+"_out.W = Tout_W_l"+str(layer)+";\n") else: f.write(f"\tlayer{layer}_out = layer{layer}_in;\n") - elif layer == len(layers_l)-1: # Last layer - f.write("\tlayer"+str(layer)+"_out.data = l"+str(layer)+"_out;\n") - f.write("\tlayer"+str(layer)+"_out.diff = l"+str(layer)+"_out_diff;\n") - f.write("\tlayer"+str(layer)+"_out.dim = Tout_C_l"+str(layer)+"*Tout_H_l"+str(layer)+"*Tout_W_l"+str(layer)+";\n") - f.write("\tlayer"+str(layer)+"_out.C = Tout_C_l"+str(layer)+";\n") - f.write("\tlayer"+str(layer)+"_out.H = Tout_H_l"+str(layer)+";\n") - f.write("\tlayer"+str(layer)+"_out.W = Tout_W_l"+str(layer)+";\n") + # Last layer + elif layer == len(layers_l)-1: + f.write(" // Layer "+str(layer)+"\n") + f.write(" layer"+str(layer)+"_in.data = l"+str(layer - previous_was_skip)+"_in;\n") + f.write(" layer"+str(layer)+"_in.diff = l"+str(layer + lookahead)+"_in_diff;\n") + f.write(" layer"+str(layer)+"_in.dim = Tin_C_l"+str(layer)+"*Tin_H_l"+str(layer)+"*Tin_W_l"+str(layer)+";\n") + f.write(" layer"+str(layer)+"_in.C = Tin_C_l"+str(layer)+";\n") + f.write(" layer"+str(layer)+"_in.H = Tin_H_l"+str(layer)+";\n") + f.write(" layer"+str(layer)+"_in.W = Tin_W_l"+str(layer)+";\n") + if layers_l[layer] != 'Sumnode': + f.write(" layer"+str(layer)+"_wgt.data = l"+str(layer)+"_ker;\n") + f.write(" layer"+str(layer)+"_wgt.diff = 
l"+str(layer)+"_ker_diff;\n") + if layers_l[layer] == 'DW': + f.write(" layer"+str(layer)+"_wgt.dim = Tin_C_l"+str(layer)+"*Tker_H_l"+str(layer)+"*Tker_W_l"+str(layer)+";\n") + elif layers_l[layer] == 'InstNorm': + f.write(" layer"+str(layer)+f"_wgt.dim = 2*Tin_C_l{layer};\n") + else: + f.write(" layer"+str(layer)+"_wgt.dim = Tin_C_l"+str(layer)+"*Tout_C_l"+str(layer)+"*Tker_H_l"+str(layer)+"*Tker_W_l"+str(layer)+";\n") + f.write(" layer"+str(layer)+"_wgt.C = Tin_C_l"+str(layer)+";\n") + f.write(" layer"+str(layer)+"_wgt.H = Tker_H_l"+str(layer)+";\n") + f.write(" layer"+str(layer)+"_wgt.W = Tker_W_l"+str(layer)+";\n") + else: + f.write(" layer"+str(layer)+"_wgt.data = layer"+str(sumnode_connections[layer])+"_out.data;\n") + f.write(" layer"+str(layer)+"_wgt.diff = layer"+str(sumnode_connections[layer])+"_out.diff;\n") + f.write(" layer"+str(layer)+"_wgt.C = layer"+str(sumnode_connections[layer])+"_out.C;\n") + f.write(" layer"+str(layer)+"_wgt.H = layer"+str(sumnode_connections[layer])+"_out.H;\n") + f.write(" layer"+str(layer)+"_wgt.W = layer"+str(sumnode_connections[layer])+"_out.W;\n") + f.write(" layer"+str(layer)+"_wgt.dim = layer"+str(sumnode_connections[layer])+"_out.C*layer"+str(sumnode_connections[layer])+"_out.H*layer"+str(sumnode_connections[layer])+"_out.W;\n") + f.write(" layer"+str(layer)+"_out.data = l"+str(layer)+"_out;\n") + f.write(" layer"+str(layer)+"_out.diff = l"+str(layer)+"_out_diff;\n") + f.write(" layer"+str(layer)+"_out.dim = Tout_C_l"+str(layer)+"*Tout_H_l"+str(layer)+"*Tout_W_l"+str(layer)+";\n") + f.write(" layer"+str(layer)+"_out.C = Tout_C_l"+str(layer)+";\n") + f.write(" layer"+str(layer)+"_out.H = Tout_H_l"+str(layer)+";\n") + f.write(" layer"+str(layer)+"_out.W = Tout_W_l"+str(layer)+";\n") else: print("[deployment_utils.GenerateNet]: Error in PULP layer initialization!") exit() if sumnode_connections[layer] != -1 and layers_l[layer] != 'Sumnode': - if layers_l[layer] == 'Skipnode': - previous_was_skip_data += 1 - previous_was_skip_diff += 1 - else: - previous_was_skip_diff = 0 + previous_was_skip += 1 + else: - previous_was_skip_data = 0 - previous_was_skip_diff = 0 + previous_was_skip = 0 - - - f.write("\n\t// Configure layer structures\n") + f.write("\n // Configure layer structures\n") first_is_skip = False # Avoid calculation of gradient if the first Layer is a skipnode if sumnode_connections[0] != -1: first_is_skip = True previous_was_skip = 0 for layer in range(len(layers_l)): - f.write("\t// Layer "+str(layer)+"\n") + f.write(" // Layer "+str(layer)+"\n") if layer == 0: skip_inputgrad = 1 elif layer - previous_was_skip <= 0: # If the 0 layer is a Skipnode, then layer1's diff is the input gradient @@ -997,29 +834,26 @@ def GenerateNet(proj_folder_path, project_name, skip_inputgrad = 0 # Write configuration templates if layers_l[layer] == 'linear': - f.write(ntemp.linear_config_template(layer, skip_inputgrad, data_type_l[layer], 1)) + f.write(ntemp.linear_config_template(layer, skip_inputgrad, data_type_l[layer])) elif layers_l[layer] == 'conv2d': - IM2COL_USEIT = 1 - if CONV2D_USE_IM2COL == False: - IM2COL_USEIT = 0 - f.write(ntemp.conv2d_config_template(layer, h_pad_l[layer], w_pad_l[layer], h_str_l[layer], w_str_l[layer], skip_inputgrad, data_type_l[layer], IM2COL_USEIT, 1)) + f.write(ntemp.conv2d_config_template(layer, h_pad_l[layer], w_pad_l[layer], h_str_l[layer], w_str_l[layer], skip_inputgrad, data_type_l[layer])) elif layers_l[layer] == 'PW': - f.write(ntemp.PW_config_template(layer, skip_inputgrad, data_type_l[layer], 1)) + 
f.write(ntemp.PW_config_template(layer, skip_inputgrad, data_type_l[layer])) elif layers_l[layer] == 'DW': - f.write(ntemp.DW_config_template(layer, h_pad_l[layer], w_pad_l[layer], h_str_l[layer], w_str_l[layer], skip_inputgrad, data_type_l[layer], 1)) + f.write(ntemp.DW_config_template(layer, h_pad_l[layer], w_pad_l[layer], h_str_l[layer], w_str_l[layer], skip_inputgrad, data_type_l[layer])) elif layers_l[layer] == 'ReLU': f.write(ntemp.ReLU_config_template(layer, data_type_l[layer])) elif layers_l[layer] == 'MaxPool': - f.write("\t// Pooling layer (see next section)\n") + f.write(" // Pooling layer (see next section)\n") elif layers_l[layer] == 'AvgPool': - f.write("\t// Pooling layer (see next section)\n") + f.write(" // Pooling layer (see next section)\n") elif layers_l[layer] == 'Sumnode': f.write(ntemp.resconn_config_template(layer, sumnode_connections[layer], first_is_skip)) first_is_skip = False elif layers_l[layer] == 'Skipnode': pass elif layers_l[layer] == 'InstNorm': - f.write(ntemp.InstNorm_config_template(layer, skip_inputgrad, 1)) + f.write(ntemp.InstNorm_config_template(layer, skip_inputgrad)) else: print("[deployment_utils.GenerateNet] Undefined layer "+str(layer)+" (unable to write configuration structure)!!") if sumnode_connections[layer] != -1 and layers_l[layer] != 'Sumnode': @@ -1032,16 +866,16 @@ def GenerateNet(proj_folder_path, project_name, if (layers_l[layer] == 'AvgPool' or layers_l[layer] == 'MaxPool'): pooling_exist = True if pooling_exist: - f.write("\n\t// Connect blobs to pooling structures\n") + f.write("\n // Connect blobs to pooling structures\n") for layer in range(len(layers_l)): if (layers_l[layer] == 'AvgPool' or layers_l[layer] == 'MaxPool'): - f.write("\t// Layer "+str(layer)+"\n") - f.write("\tl"+str(layer)+"_args.input = &layer"+str(layer)+"_in;\n") - f.write("\tl"+str(layer)+"_args.output = &layer"+str(layer)+"_out;\n") - f.write("\tl"+str(layer)+"_args.Hker = Tker_H_l"+str(layer)+";\n") - f.write("\tl"+str(layer)+"_args.Wker = Tker_W_l"+str(layer)+";\n") - f.write("\tl"+str(layer)+"_args.Hstride = Tstr_H_l"+str(layer)+";\n") - f.write("\tl"+str(layer)+"_args.Wstride = Tstr_W_l"+str(layer)+";\n") + f.write(" // Layer "+str(layer)+"\n") + f.write(" l"+str(layer)+"_args.input = &layer"+str(layer)+"_in;\n") + f.write(" l"+str(layer)+"_args.output = &layer"+str(layer)+"_out;\n") + f.write(" l"+str(layer)+"_args.Hker = Tker_H_l"+str(layer)+";\n") + f.write(" l"+str(layer)+"_args.Wker = Tker_W_l"+str(layer)+";\n") + f.write(" l"+str(layer)+"_args.Hstride = Tstr_H_l"+str(layer)+";\n") + f.write(" l"+str(layer)+"_args.Wstride = Tstr_W_l"+str(layer)+";\n") f.write("}\n\n") @@ -1049,43 +883,30 @@ def GenerateNet(proj_folder_path, project_name, f.write("\n// Forward pass function\n") f.write("void forward()\n{\n") f.write("\treset_dim();\n") - f.write("\tset_buffer_pointers(&layer0_in, &layer0_wgt, &layer0_out, PU_SKIP_IN_GRAD);\n") - f.write("\tload_input(&layer0_in, SB_DMA_DATA);\n") + f.write("\tload_input(&layer0_in, 1);\n") # Profiling options: single layer or all if PROFILE_SINGLE_LAYERS == True: - f.write("\tprintf(\"\\nFORWARD PROFILING:\\n\\n\");\n") + f.write(" printf(\"\\nFORWARD PROFILING:\\n\\n\");\n") previous_was_skip = False for layer in range(len(layers_l)): # Profile layer by layer? 
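# Editorial sketch, not part of the patch: the integer passed as `data_diff_both` to the
# generated load_*/store_* helpers selects which halves of a blob the DMA transfer moves.
# The pairing of removed SB_DMA_* symbols with the added numeric literals in this patch
# implies the mapping below; the symbolic names are kept here only for readability and are
# otherwise an assumption.
SB_DMA_GRAD = 0   # transfer .diff only
SB_DMA_DATA = 1   # transfer .data only
SB_DMA_BOTH = 2   # transfer both .data and .diff (the generated C tests `data_diff_both > 1`)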
if PROFILE_SINGLE_LAYERS == True: - f.write("\tprintf(\"\\nLayer "+str(layer)+"\\n\");\n") - f.write("\t#ifdef PROF_NET\n") - f.write("\tSTART_STATS();\n") - f.write("\t#endif\n") + f.write(" printf(\"\\nLayer "+str(layer)+"\\n\");\n") + f.write(" #ifdef PROF_NET\n") + f.write(" START_STATS();\n") + f.write(" #endif\n") if layer > 0: - f.write("\n\treset_dim();\n") - f.write(f"\tset_buffer_pointers(&layer{layer}_in, &layer{layer}_wgt, &layer{layer}_out, PU_SKIP_IN_GRAD);\n") - f.write(f"\tload_input(&layer{layer}_in, SB_DMA_DATA);\n") + f.write("\treset_dim();\n") + f.write(f"\tload_input(&layer{layer}_in, 1);\n") if layers_l[layer] not in ['Skipnode', 'ReLU']: - f.write(f"\tload_coeff(&layer{layer}_wgt, SB_DMA_DATA);\n") + f.write(f"\tload_coeff(&layer{layer}_wgt, 1);\n") if layers_l[layer] not in ['Sumnode', 'InstNorm']: f.write(f"\tcopy_struct_param((unsigned int) &l{layer}_args, (unsigned int) &{layers_l[layer]}_args, sizeof({layers_l[layer]}_args));\n") - if layers_l[layer] == 'InstNorm': - num_bytes_load = 4 - if data_type_l[layer] == 'FP16': - num_bytes_load = 2 - f.write("\t// Load running stats\n") - f.write(f"\tpi_cl_dma_cmd((uint32_t) (l"+str(layer)+"_running_mean), (uint32_t) (running_mean_buffer), "+str(num_bytes_load)+"*Tin_C_l"+str(layer)+", PI_CL_DMA_DIR_EXT2LOC, cmd_load);\n") - f.write("\tpi_cl_dma_cmd_wait(cmd_load); \n") - f.write(f"\tpi_cl_dma_cmd((uint32_t) (l"+str(layer)+"_running_var), (uint32_t) (running_var_buffer), "+str(num_bytes_load)+"*Tin_C_l"+str(layer)+", PI_CL_DMA_DIR_EXT2LOC, cmd_load);\n") - f.write("\tpi_cl_dma_cmd_wait(cmd_load); \n") - f.write(f"\tpi_cl_dma_cmd((uint32_t) (l"+str(layer)+"_running_stdev), (uint32_t) (running_stdev_buffer), "+str(num_bytes_load)+"*Tin_C_l"+str(layer)+", PI_CL_DMA_DIR_EXT2LOC, cmd_load);\n") - f.write("\tpi_cl_dma_cmd_wait(cmd_load); \n") f.write(f"\tget_output_dim(&layer{layer}_out);\n") # Generate layer template if layers_l[layer] == 'linear': @@ -1112,20 +933,9 @@ def GenerateNet(proj_folder_path, project_name, print("[deployment_utils.GenerateNet]: PULP layer not implemented or wrapped in DNN Deployer!") exit() if layers_l[layer] != 'Skipnode': - f.write(f"\tstore_output(&layer{layer}_out, SB_DMA_DATA);\n") - if layers_l[layer] == 'InstNorm': - num_bytes_load = 4 - if data_type_l[layer] == 'FP16': - num_bytes_load = 2 - f.write("\t// Store computed running stats\n") - f.write(f"\tpi_cl_dma_cmd((uint32_t) (l"+str(layer)+"_running_mean), (uint32_t) (running_mean_buffer), "+str(num_bytes_load)+"*Tin_C_l"+str(layer)+", PI_CL_DMA_DIR_LOC2EXT, cmd_store);\n") - f.write("\tpi_cl_dma_cmd_wait(cmd_store); \n") - f.write(f"\tpi_cl_dma_cmd((uint32_t) (l"+str(layer)+"_running_var), (uint32_t) (running_var_buffer), "+str(num_bytes_load)+"*Tin_C_l"+str(layer)+", PI_CL_DMA_DIR_LOC2EXT, cmd_store);\n") - f.write("\tpi_cl_dma_cmd_wait(cmd_store); \n") - f.write(f"\tpi_cl_dma_cmd((uint32_t) (l"+str(layer)+"_running_stdev), (uint32_t) (running_stdev_buffer), "+str(num_bytes_load)+"*Tin_C_l"+str(layer)+", PI_CL_DMA_DIR_LOC2EXT, cmd_store);\n") - f.write("\tpi_cl_dma_cmd_wait(cmd_store); \n") + f.write(f"\tstore_output(&layer{layer}_out, 1);\n\n") else: - f.write(f"\tstore_input(&layer{layer}_out, SB_DMA_DATA);\n\n") + f.write(f"\tstore_input(&layer{layer}_out, 1);\n\n") # Insert casting operator for data type variation if layer < len(layers_l)-1 and data_type_l[layer] != data_type_l[layer+1]: if data_type_l[layer] == 'FP32' and data_type_l[layer+1] == 'FP16': @@ -1143,9 +953,9 @@ def GenerateNet(proj_folder_path, project_name, # Profile 
layer by layer? if PROFILE_SINGLE_LAYERS == True: - f.write("\t#ifdef PROF_NET\n") - f.write("\tSTOP_STATS();\n") - f.write("\t#endif\n\n") + f.write(" #ifdef PROF_NET\n") + f.write(" STOP_STATS();\n") + f.write(" #endif\n\n") f.write("}\n") @@ -1159,56 +969,47 @@ def GenerateNet(proj_folder_path, project_name, bytes_per_data = 4 elif data_type_l[-1] == 'FP16': bytes_per_data = 2 - f.write("\treset_dim();\n") - f.write("\tset_buffer_pointers(&layer"+str(len(layers_l)-1)+"_in, &layer"+str(len(layers_l)-1)+"_wgt, &layer"+str(len(layers_l)-1)+"_out, PU_SKIP_IN_GRAD);\n") - f.write("\tload_output(&layer"+str(len(layers_l)-1)+"_out, SB_DMA_DATA);\n") - f.write("\ttemp_blob.data = label_temp;\n") - f.write("\ttemp_blob.dim = output_blob.dim;\n") - f.write("\tcopy_struct_param((uint32_t) LABEL, (uint32_t) temp_blob.data, "+str(bytes_per_data)+"*temp_blob.dim);\n") - f.write("\tloss_args.output = &output_blob;\n") - f.write("\tloss_args.target = temp_blob.data;\n") - f.write("\tloss_args.wr_loss = &loss;\n") + f.write(" load_output(&layer"+str(len(layers_l)-1)+"_out, 1);\n") + f.write(" copy_struct_param((uint32_t) LABEL, (uint32_t) temp_blob.data, "+str(bytes_per_data)+"*output_blob.dim);\n") + f.write(" loss_args.output = &output_blob;\n") + f.write(" loss_args.target = temp_blob.data;\n") + f.write(" loss_args.wr_loss = &loss;\n") if data_type_l[-1] == 'FP32': - f.write("\tpulp_MSELoss_backward(&loss_args);\n") + f.write(" pulp_MSELoss_backward(&loss_args);\n") elif data_type_l[-1] == 'FP16': - f.write("\tpulp_MSELoss_backward_fp16(&loss_args);\n") - f.write("\tstore_output(&layer"+str(len(layers_l)-1)+"_out, SB_DMA_GRAD);\n") + f.write(" pulp_MSELoss_backward_fp16(&loss_args);\n") + f.write(" load_output(&layer"+str(len(layers_l)-1)+"_out, 0);\n") elif loss_fn == 'CrossEntropyLoss': if data_type_l[-1] == 'FP32': bytes_per_data = 4 elif data_type_l[-1] == 'FP16': bytes_per_data = 2 - f.write("\treset_dim();\n") - f.write("\tset_buffer_pointers(&layer"+str(len(layers_l)-1)+"_in, &layer"+str(len(layers_l)-1)+"_wgt, &layer"+str(len(layers_l)-1)+"_out, PU_SKIP_IN_GRAD);\n") - f.write("\tload_output(&layer"+str(len(layers_l)-1)+"_out, SB_DMA_DATA);\n") - f.write("\ttemp_blob.data = label_temp;\n") - f.write("\ttemp_blob.dim = output_blob.dim;\n") - f.write("\tcopy_struct_param((uint32_t) LABEL, (uint32_t) temp_blob.data, "+str(bytes_per_data)+"*temp_blob.dim);\n") - f.write("\tloss_args.output = &output_blob;\n") - f.write("\tloss_args.target = temp_blob.data;\n") - f.write("\tloss_args.wr_loss = &loss;\n") + f.write(" load_output(&layer"+str(len(layers_l)-1)+"_out, 1);\n") + f.write(" copy_struct_param((uint32_t) LABEL, (uint32_t) temp_blob.data, "+str(bytes_per_data)+"*output_blob.dim);\n") + f.write(" loss_args.output = &output_blob;\n") + f.write(" loss_args.target = temp_blob.data;\n") + f.write(" loss_args.wr_loss = &loss;\n") if data_type_l[-1] == 'FP32': - f.write("\tpulp_CrossEntropyLoss_backward(&loss_args);\n") + f.write(" pulp_CrossEntropyLoss_backward(&loss_args);\n") elif data_type_l[-1] == 'FP16': - f.write("\tpulp_CrossEntropyLoss_backward_fp16(&loss_args);\n") - f.write("\tstore_output(&layer"+str(len(layers_l)-1)+"_out, SB_DMA_GRAD);\n") + f.write(" pulp_CrossEntropyLoss_backward_fp16(&loss_args);\n") + f.write(" load_output(&layer"+str(len(layers_l)-1)+"_out, 0);\n") else: print("[deployment_utils.GenerateNet]: invalid loss function for backward!!") # Profiling options: single layer or all if PROFILE_SINGLE_LAYERS == True: - f.write("\tprintf(\"\\nBACKWARD PROFILING:\\n\\n\");\n") 
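# A minimal sketch (hypothetical helper, not part of the patch) of the per-layer pattern the
# backward generator below emits in single-buffer mode: walk the layers in reverse, DMA the
# layer's activations and weights into the shared L1 buffer, bring the output blob in with
# both data and gradients, run the layer's backward template, then store the weight and
# input gradients back out. Flag values follow the '+' lines of this patch.
def emit_backward_layer(f, lay, target_layer, layer_type):
    f.write("\n\treset_dim();\n")
    if layer_type != 'Sumnode':
        in_flag = 0 if layer_type == 'Skipnode' else 1           # Skipnodes only need .diff
        f.write(f"\tload_input(&layer{target_layer}_in, {in_flag});\n")
    if layer_type not in ['Sumnode', 'Skipnode', 'ReLU']:
        f.write(f"\tload_coeff(&layer{lay}_wgt, 1);\n")
        f.write(f"\tcopy_struct_param((unsigned int) &l{lay}_args, "
                f"(unsigned int) &{layer_type}_args, sizeof(l{lay}_args));\n")
    f.write(f"\tload_output(&layer{lay}_out, 2);\n")              # .data and .diff
    # ... layer-specific backward template is written here ...
    if layer_type not in ['Sumnode', 'Skipnode', 'ReLU']:
        f.write(f"\tstore_coeff(&layer{lay}_wgt, 0);\n")          # weight gradient out
    if lay > 0 and layer_type != 'Sumnode':
        f.write(f"\tstore_input(&layer{target_layer}_in, 0);\n")  # input gradient out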
- + f.write(" printf(\"\\nBACKWARD PROFILING:\\n\\n\");\n") for layer in range(len(layers_l)): lay = len(layers_l) - layer - 1 # Profile layer by layer? if PROFILE_SINGLE_LAYERS == True: - f.write("\tprintf(\"\\nLayer "+str(lay)+"\\n\");\n") - f.write("\t#ifdef PROF_NET\n") - f.write("\tSTART_STATS();\n") - f.write("\t#endif\n") + f.write(" printf(\"\\nLayer "+str(lay)+"\\n\");\n") + f.write(" #ifdef PROF_NET\n") + f.write(" START_STATS();\n") + f.write(" #endif\n") # Generate backward layer template is_skipderivation = False # Bool for Skipnode and layer after Skipnodes detection @@ -1231,37 +1032,21 @@ def GenerateNet(proj_folder_path, project_name, f.write("\n\treset_dim();\n") - if lay == 0: # Skip in grad for the first layer - f.write("\tset_buffer_pointers(&layer"+str(lay)+"_in, &layer"+str(lay)+"_wgt, &layer"+str(lay)+"_out, PU_SKIP_IN_GRAD);\n") - else: # Else, allocate memory for in grad - f.write("\tset_buffer_pointers(&layer"+str(lay)+"_in, &layer"+str(lay)+"_wgt, &layer"+str(lay)+"_out, PU_COMP_IN_GRAD);\n") if layers_l[lay] != 'Sumnode': if layers_l[lay] == 'Skipnode': - # FIXME: verify if PU_COMP_IN_GRAD is right and if there is necessity to verify partial update here - f.write(f"\tload_input(&layer{target_layer}_in, SB_DMA_GRAD);\n") + f.write(f"\tload_input(&layer{target_layer}_in, 0);\n") else: - f.write(f"\tload_input(&layer{target_layer}_in, SB_DMA_DATA);\n") + f.write(f"\tload_input(&layer{target_layer}_in, 1);\n") if layers_l[lay] != 'Sumnode' and layers_l[lay] != 'Skipnode' and layers_l[lay] != 'ReLU': - f.write(f"\tload_coeff(&layer{lay}_wgt, SB_DMA_DATA);\n") + f.write(f"\tload_coeff(&layer{lay}_wgt, 1);\n") - f.write(f"\tload_output(&layer{lay}_out, SB_DMA_BOTH);\n") + f.write(f"\tload_output(&layer{lay}_out, 2);\n") # Copy struct info if layers_l[lay] != 'Skipnode' and layers_l[lay] != 'Sumnode' and layers_l[lay] != 'ReLU': f.write(f"\tcopy_struct_param((unsigned int) &l{lay}_args, (unsigned int) &{layers_l[lay]}_args, sizeof(l{lay}_args));\n") - if layers_l[layer] == 'InstNorm': - num_bytes_load = 4 - if data_type_l[layer] == 'FP16': - num_bytes_load = 2 - f.write("\t// Load running stats\n") - f.write(f"\tpi_cl_dma_cmd((uint32_t) (l"+str(layer)+"_running_mean), (uint32_t) (running_mean_buffer), "+str(num_bytes_load)+"*Tin_C_l"+str(layer)+", PI_CL_DMA_DIR_EXT2LOC, cmd_load);\n") - f.write("\tpi_cl_dma_cmd_wait(cmd_load); \n") - f.write(f"\tpi_cl_dma_cmd((uint32_t) (l"+str(layer)+"_running_var), (uint32_t) (running_var_buffer), "+str(num_bytes_load)+"*Tin_C_l"+str(layer)+", PI_CL_DMA_DIR_EXT2LOC, cmd_load);\n") - f.write("\tpi_cl_dma_cmd_wait(cmd_load); \n") - f.write(f"\tpi_cl_dma_cmd((uint32_t) (l"+str(layer)+"_running_stdev), (uint32_t) (running_stdev_buffer), "+str(num_bytes_load)+"*Tin_C_l"+str(layer)+", PI_CL_DMA_DIR_EXT2LOC, cmd_load);\n") - f.write("\tpi_cl_dma_cmd_wait(cmd_load); \n") if layers_l[lay] == 'linear': f.write(ntemp.linear_template_BW(lay, data_type_l[lay], SEPARATE_BACKWARD_STEPS, FIRST_LAYER)) @@ -1281,9 +1066,9 @@ def GenerateNet(proj_folder_path, project_name, f.write(ntemp.residualconn_template_sum_BW(sumnode_connections[lay], data_type_l[lay], target_layer)) elif layers_l[lay] == 'Sumnode': #f.write(ntemp.residualconn_template_copy_BW(lay, data_type_l[lay])) - f.write(f"\tstore_output(&layer{lay}_in, SB_DMA_GRAD);\n") + f.write(f"\tstore_output(&layer{lay}_in, 0);\n") elif layers_l[lay] == 'InstNorm': - f.write(ntemp.InstNorm_template_BW(lay, data_type_l[lay], SEPARATE_BACKWARD_STEPS, FIRST_LAYER, update_layer_l[layer])) + 
f.write(ntemp.InstNorm_template_BW(lay, data_type_l[lay])) else: print("[deployment_utils.GenerateNet]: PULP layer not implemented or wrapped in DNN Deployer!") exit() @@ -1299,21 +1084,21 @@ def GenerateNet(proj_folder_path, project_name, if sumnode_connections[lay] != -1 and layers_l[lay] != 'Sumnode' and layers_l[lay] != 'Skipnode' and skip_in_grad==0: - f.write(f"\tload_output(&layer{target_layer}_in, SB_DMA_GRAD);\n") + f.write(f"\tload_output(&layer{target_layer}_in, 0);\n") f.write(ntemp.sum(lay, data_type_l[lay])) if layers_l[lay] != 'Sumnode' and layers_l[lay] != 'Skipnode' and layers_l[lay] != 'ReLU': - f.write(f"\tstore_coeff(&layer{lay}_wgt, SB_DMA_GRAD);\n") + f.write(f"\tstore_coeff(&layer{lay}_wgt, 0);\n") if lay > 0 and layers_l[lay] != 'Sumnode': - f.write(f"\tstore_input(&layer{target_layer}_in, SB_DMA_GRAD);\n") + f.write(f"\tstore_input(&layer{target_layer}_in, 0);\n") # Profile layer by layer? if PROFILE_SINGLE_LAYERS == True: - f.write("\t#ifdef PROF_NET\n") - f.write("\tSTOP_STATS();\n") - f.write("\t#endif\n\n") + f.write(" #ifdef PROF_NET\n") + f.write(" STOP_STATS();\n") + f.write(" #endif\n\n") f.write("}\n") @@ -1324,40 +1109,38 @@ def GenerateNet(proj_folder_path, project_name, float_size = 2 if data_type_l[0] == 'FP32': float_size = 4 - f.write("\tload_output(&layer"+str(len(layers_l)-1)+"_out, SB_DMA_DATA);\n") - f.write("\tloss_args.output = &output_blob;\n") - f.write("\tloss_args.target = output_blob.diff;\n") - f.write("\tloss_args.wr_loss = &loss;\n") - f.write(f"\tpi_cl_dma_cmd((uint32_t) (LABEL), (uint32_t) (output_blob.diff), {float_size}*OUT_SIZE, PI_CL_DMA_DIR_EXT2LOC , cmd_load);\n") - f.write("\tpi_cl_dma_cmd_wait(cmd_load);\n") + f.write(" loss_args.output = &output_blob;\n") + f.write(" loss_args.target = output_blob.diff;\n") + f.write(" loss_args.wr_loss = &loss;\n") + f.write(f" pi_cl_dma_cmd((uint32_t) (LABEL), (uint32_t) (output_blob.diff), {float_size}*OUT_SIZE, PI_CL_DMA_DIR_EXT2LOC , cmd_load);\n") + f.write(" pi_cl_dma_cmd_wait(cmd_load);\n") if data_type_l[-1] == 'FP32': - f.write("\tpulp_MSELoss(&loss_args);\n") + f.write(" pulp_MSELoss(&loss_args);\n") elif data_type_l[-1] == 'FP16': - f.write("\tpulp_MSELoss_fp16(&loss_args);\n") + f.write(" pulp_MSELoss_fp16(&loss_args);\n") else: - print("[deployment_utils.GenerateNet]: Invalid loss type!") + print("[deplyment_utils.GenerateNet]: Invalid loss type!") exit() - #f.write(f" store_output(&layer{len(layers_l)-1}_out, SB_DMA_BOTH);\n") + f.write(f" store_output(&layer{len(layers_l)-1}_out, 2);\n") elif loss_fn == "CrossEntropyLoss": float_size = 2 if data_type_l[0] == 'FP32': float_size = 4 - f.write("\tload_output(&layer"+str(len(layers_l)-1)+"_out, SB_DMA_DATA);\n") - f.write("\tloss_args.output = &output_blob;\n") - f.write("\tloss_args.target = output_blob.diff;\n") - f.write("\tloss_args.wr_loss = &loss;\n") - f.write(f"\tpi_cl_dma_cmd((uint32_t) (LABEL), (uint32_t) (output_blob.diff), {float_size}*OUT_SIZE, PI_CL_DMA_DIR_EXT2LOC , cmd_load);\n") - f.write("\tpi_cl_dma_cmd_wait(cmd_load);\n") + f.write(" loss_args.output = &output_blob;\n") + f.write(" loss_args.target = output_blob.diff;\n") + f.write(" loss_args.wr_loss = &loss;\n") + f.write(f" pi_cl_dma_cmd((uint32_t) (LABEL), (uint32_t) (output_blob.diff), {float_size}*OUT_SIZE, PI_CL_DMA_DIR_EXT2LOC , cmd_load);\n") + f.write(" pi_cl_dma_cmd_wait(cmd_load);\n") if data_type_l[-1] == 'FP32': - f.write("\tpulp_CrossEntropyLoss(&loss_args);\n") + f.write(" pulp_CrossEntropyLoss(&loss_args);\n") elif data_type_l[-1] == 'FP16': - 
f.write("\tpulp_CrossEntropyLoss_fp16(&loss_args);\n") + f.write(" pulp_CrossEntropyLoss_fp16(&loss_args);\n") else: print("[deplyment_utils.GenerateNet]: Invalid loss type!") exit() - #f.write(f"\tstore_output(&layer{len(layers_l)-1}_out, SB_DMA_BOTH);\n") + f.write(f" store_output(&layer{len(layers_l)-1}_out, 2);\n") else: print("[deployment_utils.GenerateNet]: Loss function not valid for PULP deployment!!") exit() @@ -1371,26 +1154,25 @@ def GenerateNet(proj_folder_path, project_name, for layer in range(len(layers_l)): if layers_l[layer] in ['linear', 'conv2d', 'DW', 'PW', 'InstNorm']: if data_type_l[layer] == 'FP32': - f.write("\tstruct optim_args opt_l"+str(layer)+";\n") + f.write(" struct optim_args opt_l"+str(layer)+";\n") elif data_type_l[layer] == 'FP16': - f.write("\tstruct optim_args_fp16 opt_l"+str(layer)+";\n") + f.write(" struct optim_args_fp16 opt_l"+str(layer)+";\n") else: print("[deployment_utils.GenerateNet]: Invalid data type for optimizer structure generation @layer{}!".format(layer)) - f.write("\topt_l"+str(layer)+".weights = &weight_blob;\n") - f.write("\topt_l"+str(layer)+".learning_rate = LEARNING_RATE;\n") - f.write("\tset_buffer_pointers(&layer"+str(layer)+"_in, &layer"+str(layer)+"_wgt, &layer"+str(layer)+"_out, PU_SKIP_IN_GRAD);") - f.write(f"\tload_coeff(&layer{layer}_wgt, SB_DMA_BOTH);\n") + f.write(" opt_l"+str(layer)+".weights = &weight_blob;\n") + f.write(" opt_l"+str(layer)+".learning_rate = LEARNING_RATE;\n") + f.write(f" load_coeff(&layer{layer}_wgt, 2);\n") if optimizer == "SGD": if data_type_l[layer] == 'FP32': - f.write("\tpi_cl_team_fork(NUM_CORES, pulp_gradient_descent_fp32, &opt_l"+str(layer)+");\n") + f.write(" pi_cl_team_fork(NUM_CORES, pulp_gradient_descent_fp32, &opt_l"+str(layer)+");\n") elif data_type_l[layer] == 'FP16': - f.write("\tpi_cl_team_fork(NUM_CORES, pulp_gradient_descent_fp16, &opt_l"+str(layer)+");\n") + f.write(" pi_cl_team_fork(NUM_CORES, pulp_gradient_descent_fp16, &opt_l"+str(layer)+");\n") else: print("[deployment_utils.GenerateNet]: Invalid data type for gradient descent @Layer{}!".format(layer)) else: print("[deployment_utils.GenerateNet]: Invalid optimizer for PULP deployment!!") exit() - f.write(f"\tstore_coeff(&layer{layer}_wgt, SB_DMA_DATA);\n\n") + f.write(f" store_coeff(&layer{layer}_wgt, 2);\n\n") f.write("}\n") @@ -1399,14 +1181,14 @@ def GenerateNet(proj_folder_path, project_name, f.write("\n// Function to print FW output\n") f.write("void print_output()\n{\n") output_index = len(layers_l) - 1 - f.write("\tprintf(\"\\nLayer "+str(output_index)+" output:\\n\");\n\n") - f.write("\tfor (int i=0; i 0)\n") - f.write("\t\tprintf(\"\\n*** UPDATED OUTPUT NOT MATCHING GOLDEN MODEL ***\\n\");\n") + f.write(" if (integrity_check > 0)\n") + f.write(" printf(\"\\n*** UPDATED OUTPUT NOT MATCHING GOLDEN MODEL ***\\n\");\n") f.write("}\n") @@ -1434,42 +1216,37 @@ def GenerateNet(proj_folder_path, project_name, f.write("\n// Call for a complete training step\n") f.write("void net_step()\n{\n") - f.write("\tprintf(\"Initializing network..\\n\");\n") - f.write("\tDNN_init();\n") + f.write(" printf(\"Initializing network..\\n\");\n") + f.write(" DNN_init();\n") - f.write("\tprintf(\"Testing DNN initialization forward..\");\n") - f.write("\tforward();\n") - f.write("\tprint_output();\n\n") + f.write(" printf(\"Testing DNN initialization forward..\");\n") + f.write(" forward();\n") + f.write(" print_output();\n\n") # Profile layer by layer? 
if PROFILE_SINGLE_LAYERS == False: - f.write("\t#ifdef PROF_NET\n") - f.write("\tINIT_STATS();\n PRE_START_STATS();\n START_STATS();\n") - f.write("\t#endif\n\n") - - f.write("\tfor (int epoch=0; epoch>> EPOCH %d: train_loss = %f (GM: %f)\\n\", epoch, loss, TRAIN_LOSS[epoch]);\n") - f.write("\t\t/* Continue profiling */ pi_perf_start();\n") - f.write("\t\tbackward();\n") - f.write("\t\tupdate_weights();\n") - f.write("\t}\n\n") + f.write(" #ifdef PROF_NET\n") + f.write(" INIT_STATS();\n PRE_START_STATS();\n START_STATS();\n") + f.write(" #endif\n\n") + + f.write(" for (int epoch=0; epochdiff), (uint32_t) (W_DIFF), {data_size}*b->dim, PI_CL_DMA_DIR_EXT2LOC , cmd_load);\n") - f.write("\tif (data_diff_both == SB_DMA_DATA) // Load only .data\n") + f.write("\tif (data_diff_both == 1) // Load only .data\n") f.write(f"\tpi_cl_dma_cmd((uint32_t) (b->data), (uint32_t) (W_DATA), {data_size}*b->dim, PI_CL_DMA_DIR_EXT2LOC , cmd_load);\n") - f.write("\tif (data_diff_both == SB_DMA_BOTH) { // Load both .data and .diff\n") + f.write("\tif (data_diff_both > 1) { // Load both .data and .diff\n") f.write(f"\tpi_cl_dma_cmd((uint32_t) (b->data), (uint32_t) (W_DATA), {data_size}*b->dim, PI_CL_DMA_DIR_EXT2LOC , cmd_load);\n") f.write("\tpi_cl_dma_cmd_wait(cmd_load);\n") f.write(f"\tpi_cl_dma_cmd((uint32_t) (b->diff), (uint32_t) (W_DIFF), {data_size}*b->dim, PI_CL_DMA_DIR_EXT2LOC , cmd_load);"+"}\n") @@ -1500,11 +1277,11 @@ def GenerateNet(proj_folder_path, project_name, f.write("\nvoid load_input(void * src_blob, uint8_t data_diff_both){\n") f.write(f"\tstruct blob{suffix} * b = (struct blob{suffix} *) src_blob;\n") f.write("\tget_input_dim(src_blob);\n") - f.write("\tif (data_diff_both == SB_DMA_GRAD) // Load only .diff\n") + f.write("\tif (data_diff_both == 0) // Load only .diff\n") f.write(f"\tpi_cl_dma_cmd((uint32_t) (b->diff), (uint32_t) (IN_DIFF), {data_size}*b->dim, PI_CL_DMA_DIR_EXT2LOC , cmd_load);\n") - f.write("\tif (data_diff_both == SB_DMA_DATA) // Load only .data\n") + f.write("\tif (data_diff_both == 1) // Load only .data\n") f.write(f"\tpi_cl_dma_cmd((uint32_t) (b->data), (uint32_t) (IN_DATA), {data_size}*b->dim, PI_CL_DMA_DIR_EXT2LOC , cmd_load);\n") - f.write("\tif (data_diff_both == SB_DMA_BOTH) { // Load both .data and .diff\n") + f.write("\tif (data_diff_both > 1) { // Load both .data and .diff\n") f.write(f"\tpi_cl_dma_cmd((uint32_t) (b->data), (uint32_t) (IN_DATA), {data_size}*b->dim, PI_CL_DMA_DIR_EXT2LOC , cmd_load);\n") f.write("\tpi_cl_dma_cmd_wait(cmd_load);\n") f.write(f"\tpi_cl_dma_cmd((uint32_t) (b->diff), (uint32_t) (IN_DIFF), {data_size}*b->dim, PI_CL_DMA_DIR_EXT2LOC , cmd_load);"+"}\n") @@ -1513,11 +1290,11 @@ def GenerateNet(proj_folder_path, project_name, f.write("\nvoid load_output(void * src_blob, uint8_t data_diff_both){\n") f.write(f"\tstruct blob{suffix} * b = (struct blob{suffix} *) src_blob;\n") f.write("\tget_output_dim(src_blob);\n") - f.write("\tif (data_diff_both == SB_DMA_GRAD) // Load only .diff\n") + f.write("\tif (data_diff_both == 0) // Load only .diff\n") f.write(f"\tpi_cl_dma_cmd((uint32_t) (b->diff), (uint32_t) (OUT_DIFF), {data_size}*b->dim, PI_CL_DMA_DIR_EXT2LOC , cmd_load);\n") - f.write("\tif (data_diff_both == SB_DMA_DATA) // Load only .data\n") + f.write("\tif (data_diff_both == 1) // Load only .data\n") f.write(f"\tpi_cl_dma_cmd((uint32_t) (b->data), (uint32_t) (OUT_DATA), {data_size}*b->dim, PI_CL_DMA_DIR_EXT2LOC , cmd_load);\n") - f.write("\tif (data_diff_both == SB_DMA_BOTH) { // Load both .data and .diff\n") + f.write("\tif (data_diff_both > 1) { 
// Load both .data and .diff\n") f.write(f"\tpi_cl_dma_cmd((uint32_t) (b->data), (uint32_t) (OUT_DATA), {data_size}*b->dim, PI_CL_DMA_DIR_EXT2LOC , cmd_load);\n") f.write("\tpi_cl_dma_cmd_wait(cmd_load);\n") f.write(f"\tpi_cl_dma_cmd((uint32_t) (b->diff), (uint32_t) (OUT_DIFF), {data_size}*b->dim, PI_CL_DMA_DIR_EXT2LOC , cmd_load);"+"}\n") @@ -1525,11 +1302,11 @@ def GenerateNet(proj_folder_path, project_name, f.write("\nvoid store_output(void * dest_blob, uint8_t data_diff_both){ \n") f.write(f"\tstruct blob{suffix} * b = (struct blob{suffix} *) dest_blob;\n") - f.write("\tif (data_diff_both == SB_DMA_GRAD) // Store only .diff\n") + f.write("\tif (data_diff_both == 0) // Store only .diff\n") f.write(f"\tpi_cl_dma_cmd((uint32_t) (b->diff), (uint32_t) (OUT_DIFF), {data_size}*b->dim, PI_CL_DMA_DIR_LOC2EXT , cmd_store);\n") - f.write("\tif (data_diff_both == SB_DMA_DATA) // Store only .data\n") + f.write("\tif (data_diff_both == 1) // Store only .data\n") f.write(f"\tpi_cl_dma_cmd((uint32_t) (b->data), (uint32_t) (OUT_DATA), {data_size}*b->dim, PI_CL_DMA_DIR_LOC2EXT , cmd_store);\n") - f.write("\tif (data_diff_both == SB_DMA_BOTH) { // Store both .data and .diff\n") + f.write("\tif (data_diff_both > 1) { // Store both .data and .diff\n") f.write(f"\tpi_cl_dma_cmd((uint32_t) (b->data), (uint32_t) (OUT_DATA), {data_size}*b->dim, PI_CL_DMA_DIR_LOC2EXT , cmd_store);\n") f.write("\tpi_cl_dma_cmd_wait(cmd_store);\n") f.write(f"\tpi_cl_dma_cmd((uint32_t) (b->diff), (uint32_t) (OUT_DIFF), {data_size}*b->dim, PI_CL_DMA_DIR_LOC2EXT , cmd_store);"+"}\n") @@ -1537,11 +1314,11 @@ def GenerateNet(proj_folder_path, project_name, f.write("\nvoid store_coeff(void * dest_blob, uint8_t data_diff_both){ \n") f.write(f"\tstruct blob{suffix} * b = (struct blob{suffix} *) dest_blob;\n") - f.write("\tif (data_diff_both == SB_DMA_GRAD) // Store only .diff\n") + f.write("\tif (data_diff_both == 0) // Store only .diff\n") f.write(f"\tpi_cl_dma_cmd((uint32_t) (b->diff), (uint32_t) (W_DIFF), {data_size}*b->dim, PI_CL_DMA_DIR_LOC2EXT , cmd_store);\n") - f.write("\tif (data_diff_both == SB_DMA_DATA) // Store only .data\n") + f.write("\tif (data_diff_both == 1) // Store only .data\n") f.write(f"\tpi_cl_dma_cmd((uint32_t) (b->data), (uint32_t) (W_DATA), {data_size}*b->dim, PI_CL_DMA_DIR_LOC2EXT , cmd_store);\n") - f.write("\tif (data_diff_both == SB_DMA_BOTH) { // Store both .data and .diff\n") + f.write("\tif (data_diff_both > 1) { // Store both .data and .diff\n") f.write(f"\tpi_cl_dma_cmd((uint32_t) (b->data), (uint32_t) (W_DATA), {data_size}*b->dim, PI_CL_DMA_DIR_LOC2EXT , cmd_store);\n") f.write("\tpi_cl_dma_cmd_wait(cmd_store);\n") f.write(f"\tpi_cl_dma_cmd((uint32_t) (b->diff), (uint32_t) (W_DIFF), {data_size}*b->dim, PI_CL_DMA_DIR_LOC2EXT , cmd_store);"+"}\n") @@ -1549,11 +1326,11 @@ def GenerateNet(proj_folder_path, project_name, f.write("\nvoid store_input(void * dest_blob, uint8_t data_diff_both){ \n") f.write(f"\tstruct blob{suffix} * b = (struct blob{suffix} *) dest_blob;\n") - f.write("\tif (data_diff_both == SB_DMA_GRAD) // Store only .diff\n") + f.write("\tif (data_diff_both == 0) // Store only .diff\n") f.write(f"\tpi_cl_dma_cmd((uint32_t) (b->diff), (uint32_t) (IN_DIFF), {data_size}*b->dim, PI_CL_DMA_DIR_LOC2EXT , cmd_store);\n") - f.write("\tif (data_diff_both == SB_DMA_DATA) // Store only .data\n") + f.write("\tif (data_diff_both == 1) // Store only .data\n") f.write(f"\tpi_cl_dma_cmd((uint32_t) (b->data), (uint32_t) (IN_DATA), {data_size}*b->dim, PI_CL_DMA_DIR_LOC2EXT , cmd_store);\n") - f.write("\tif 
(data_diff_both == SB_DMA_BOTH) { // Store both .data and .diff\n") + f.write("\tif (data_diff_both > 1) { // Store both .data and .diff\n") f.write(f"\tpi_cl_dma_cmd((uint32_t) (b->data), (uint32_t) (IN_DATA), {data_size}*b->dim, PI_CL_DMA_DIR_LOC2EXT , cmd_store);\n") f.write("\tpi_cl_dma_cmd_wait(cmd_store);\n") f.write(f"\tpi_cl_dma_cmd((uint32_t) (b->diff), (uint32_t) (IN_DIFF), {data_size}*b->dim, PI_CL_DMA_DIR_LOC2EXT , cmd_store);"+"}\n") @@ -1564,52 +1341,28 @@ def GenerateNet(proj_folder_path, project_name, f.write("\tinput_blob.C = src->C;\n") f.write("\tinput_blob.H = src->H;\n") f.write("\tinput_blob.W = src->W;\n") - f.write("\tinput_blob.dim = src->dim;}\n") - #f.write("\tIN_DIFF = BUFF + input_blob.dim;\n") - #f.write("\tW_DATA = BUFF + 2*input_blob.dim;\n") - #f.write("\tupdate_blob();}\n") + f.write("\tinput_blob.dim = src->dim;\n") + f.write("\tIN_DIFF = BUFF + input_blob.dim;\n") + f.write("\tW_DATA = BUFF + 2*input_blob.dim;\n") + f.write("\tupdate_blob();}\n") f.write("\nvoid get_output_dim(void * b){\n") f.write(f"\tstruct blob{suffix} * src = (struct blob{suffix} *) b;\n") f.write("\toutput_blob.C = src->C;\n") f.write("\toutput_blob.H = src->H;\n") f.write("\toutput_blob.W = src->W;\n") - f.write("\toutput_blob.dim = src->dim;}\n") - #f.write("\tOUT_DIFF = BUFF + 2*weight_blob.dim + 2*input_blob.dim + output_blob.dim;\n") - #f.write("\tupdate_blob();}\n") + f.write("\toutput_blob.dim = src->dim;\n") + f.write("\tOUT_DIFF = BUFF + 2*weight_blob.dim + 2*input_blob.dim + output_blob.dim;\n") + f.write("\tupdate_blob();}\n") f.write("\nvoid get_weight_dim(void * b){\n") f.write(f"\tstruct blob{suffix} * src = (struct blob{suffix} *) b;\n") f.write("\tweight_blob.C = src->C;\n") f.write("\tweight_blob.H = src->H;\n") f.write("\tweight_blob.W = src->W;\n") - f.write("\tweight_blob.dim = src->dim;}\n") - #f.write("\tW_DIFF = BUFF + weight_blob.dim + 2*input_blob.dim;\n") - #f.write("\tOUT_DATA = BUFF + 2*weight_blob.dim + 2*input_blob.dim;\n") - #f.write("\tupdate_blob();}\n") - - f.write("\nvoid set_buffer_pointers(void * blob_in, void * blob_wgt, void * blob_out, int compute_in_grad) {\n") - f.write("\tstruct blob * inp_b = (struct blob *) blob_in;\n") - f.write("\tstruct blob * wgt_b = (struct blob *) blob_wgt;\n") - f.write("\tstruct blob * out_b = (struct blob *) blob_out;\n") - f.write("\tIN_DATA = BUFF;\n") - f.write("\tIN_DIFF = IN_DATA + compute_in_grad * inp_b->dim;\n") - f.write("\tW_DATA = IN_DIFF + inp_b->dim;\n") - f.write("\tW_DIFF = W_DATA + wgt_b->dim;\n") - f.write("\tOUT_DATA = W_DIFF + wgt_b->dim;\n") - f.write("\tOUT_DIFF = OUT_DATA + out_b->dim;\n") - f.write("\tupdate_blob();}\n") - - f.write("\nvoid set_buffer_pointers_fp16(void * blob_in, void * blob_wgt, void * blob_out, int compute_in_grad) {\n") - f.write("\tstruct blob_fp16 * inp_b = (struct blob_fp16 *) blob_in;\n") - f.write("\tstruct blob_fp16 * wgt_b = (struct blob_fp16 *) blob_wgt;\n") - f.write("\tstruct blob_fp16 * out_b = (struct blob_fp16 *) blob_out;\n") - f.write("\tIN_DATA = BUFF;\n") - f.write("\tIN_DIFF = IN_DATA + compute_in_grad * inp_b->dim;\n") - f.write("\tW_DATA = IN_DIFF + inp_b->dim;\n") - f.write("\tW_DIFF = W_DATA + wgt_b->dim;\n") - f.write("\tOUT_DATA = W_DIFF + wgt_b->dim;\n") - f.write("\tOUT_DIFF = OUT_DATA + out_b->dim;\n") + f.write("\tweight_blob.dim = src->dim;\n") + f.write("\tW_DIFF = BUFF + weight_blob.dim + 2*input_blob.dim;\n") + f.write("\tOUT_DATA = BUFF + 2*weight_blob.dim + 2*input_blob.dim;\n") f.write("\tupdate_blob();}\n") f.write("\nvoid 
copy_struct_param(unsigned int from, unsigned int to, int size){\n") @@ -1620,26 +1373,26 @@ def GenerateNet(proj_folder_path, project_name, f.write("\tlinear_args.output = &output_blob;\n") f.write("\tlinear_args.input = &input_blob;\n") f.write("\tlinear_args.coeff = &weight_blob;\n") - # + f.write("\tconv2d_args.output = &output_blob;\n") f.write("\tconv2d_args.input = &input_blob;\n") f.write("\tconv2d_args.coeff = &weight_blob;\n") - # + f.write("\tPW_args.output = &output_blob;\n") f.write("\tPW_args.input = &input_blob;\n") f.write("\tPW_args.coeff = &weight_blob;\n") - # + f.write("\tDW_args.output = &output_blob;\n") f.write("\tDW_args.input = &input_blob;\n") f.write("\tDW_args.coeff = &weight_blob;\n") - # + f.write("\tact_args.output = &output_blob;\n") f.write("\tact_args.input = &input_blob;\n") - # + f.write("\tresconn_args.output = &output_blob;\n") f.write("\tresconn_args.lout = &input_blob;\n") f.write("\tresconn_args.skip = &weight_blob;\n") - # + f.write("\tInstNorm_args.output = &output_blob;\n") f.write("\tInstNorm_args.input = &input_blob;\n") f.write("\tInstNorm_args.coeff = &weight_blob;\n") From 99a235a414f508e2a860ccb3fe43afb8e116943b Mon Sep 17 00:00:00 2001 From: Cristian Cioflan Date: Wed, 3 Jul 2024 18:23:30 +0200 Subject: [PATCH 4/6] Reverted to MSELoss --- tools/TrainLib_Deployer/TrainLib_Deployer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/TrainLib_Deployer/TrainLib_Deployer.py b/tools/TrainLib_Deployer/TrainLib_Deployer.py index 2e630d4a..6498501d 100644 --- a/tools/TrainLib_Deployer/TrainLib_Deployer.py +++ b/tools/TrainLib_Deployer/TrainLib_Deployer.py @@ -79,7 +79,7 @@ batch_size = 1 # BATCHING NOT IMPLEMENTED!! learning_rate = 0.001 optimizer = "SGD" # Name of PyTorch's optimizer -loss_fn = "CrossEntropyLoss" # Name of PyTorch's loss function +loss_fn = "MSELoss" # Name of PyTorch's loss function # ------- NETWORK GRAPH -------- # Manually define the list of the network (each layer in the list has its own properties in the relative index of each list) From 01bc1858c951bc61c87bd4e46be2729fa566fd50 Mon Sep 17 00:00:00 2001 From: Cristian Cioflan Date: Wed, 3 Jul 2024 18:43:30 +0200 Subject: [PATCH 5/6] Fixed minor errors following main merger --- tools/TrainLib_Deployer/TrainLib_Deployer.py | 5 +++-- tools/TrainLib_Deployer/deployer_utils/deployment_utils.py | 2 +- .../deployer_utils/deployment_utils_single_buffer.py | 2 +- 3 files changed, 5 insertions(+), 4 deletions(-) diff --git a/tools/TrainLib_Deployer/TrainLib_Deployer.py b/tools/TrainLib_Deployer/TrainLib_Deployer.py index a517b4b9..e3dbbc72 100644 --- a/tools/TrainLib_Deployer/TrainLib_Deployer.py +++ b/tools/TrainLib_Deployer/TrainLib_Deployer.py @@ -123,7 +123,7 @@ # EXECUTION PROPERTIES NUM_CORES = 8 L1_SIZE_BYTES = 128*(2**10) -USE_DMA = 'SB' # choose whether to load all structures in L1 ('NO') or in L2 and use Single Buffer mode ('SB') or Double Buffer mode ('DB', CURRENTLY UNAVAILABLE) +USE_DMA = 'NO' # choose whether to load all structures in L1 ('NO') or in L2 and use Single Buffer mode ('SB') or Double Buffer mode ('DB', CURRENTLY UNAVAILABLE) # BACKWARD SETTINGS SEPARATE_BACKWARD_STEPS = True # If True, the tool writes separate weight and input gradient in the backward step # PROFILING OPTIONS @@ -437,7 +437,8 @@ def get_precision(self): # Check if the network training fits L1 memocc = composer.DNN_Size_Checker(layer_list, in_ch_list, out_ch_list, hk_list, wk_list, hin_list, win_list, h_str_list, w_str_list, h_pad_list, w_pad_list, - data_type_list, 
update_layer_list, L1_SIZE_BYTES, USE_DMA, CONV2D_USE_IM2COL) + data_type_list, bias_list, update_layer_list, + L1_SIZE_BYTES, USE_DMA, CONV2D_USE_IM2COL) print("DNN memory occupation: {} bytes of {} available L1 bytes ({}%).".format(memocc, L1_SIZE_BYTES, (memocc/L1_SIZE_BYTES)*100)) diff --git a/tools/TrainLib_Deployer/deployer_utils/deployment_utils.py b/tools/TrainLib_Deployer/deployer_utils/deployment_utils.py index 13a93e02..5f861c7b 100644 --- a/tools/TrainLib_Deployer/deployer_utils/deployment_utils.py +++ b/tools/TrainLib_Deployer/deployer_utils/deployment_utils.py @@ -576,7 +576,7 @@ def GenerateGM(proj_folder_path, project_name, f.write("net = DNN().to(device)\n") f.write("for p in net.parameters():\n") f.write("\tnn.init.normal_(p, mean=0.0, std=1.0)\n") - if (weight_list): + if (weight_l): f.write("from pathlib import Path\n") f.write("basedir = Path(__file__).resolve().parent.parent\n") for layer in range(len(layers_l)): diff --git a/tools/TrainLib_Deployer/deployer_utils/deployment_utils_single_buffer.py b/tools/TrainLib_Deployer/deployer_utils/deployment_utils_single_buffer.py index 26e8ffd5..6e8b3029 100644 --- a/tools/TrainLib_Deployer/deployer_utils/deployment_utils_single_buffer.py +++ b/tools/TrainLib_Deployer/deployer_utils/deployment_utils_single_buffer.py @@ -59,7 +59,7 @@ def max_wgt_dim(layers_l, cin_l, hin_l, win_l, cout_l, hk_l, wk_l): return RES - def max_bias_dim(layers_l, cout_l, bias_l, data_type_l, update_layer_l): +def max_bias_dim(layers_l, cout_l, bias_l, data_type_l, update_layer_l): nbytes = 4 nbytes_max = 4 RES = 0 From b7fdd952ce71edd30180ae2ef2e7d2c1760a5140 Mon Sep 17 00:00:00 2001 From: Cristian Cioflan Date: Wed, 3 Jul 2024 18:49:18 +0200 Subject: [PATCH 6/6] Fixed errors preparing upstream merger --- tools/TrainLib_Deployer/TrainLib_Deployer.py | 2 +- .../deployment_utils_single_buffer.py | 652 ++++++++++-------- 2 files changed, 372 insertions(+), 282 deletions(-) diff --git a/tools/TrainLib_Deployer/TrainLib_Deployer.py b/tools/TrainLib_Deployer/TrainLib_Deployer.py index e3dbbc72..e16305db 100644 --- a/tools/TrainLib_Deployer/TrainLib_Deployer.py +++ b/tools/TrainLib_Deployer/TrainLib_Deployer.py @@ -123,7 +123,7 @@ # EXECUTION PROPERTIES NUM_CORES = 8 L1_SIZE_BYTES = 128*(2**10) -USE_DMA = 'NO' # choose whether to load all structures in L1 ('NO') or in L2 and use Single Buffer mode ('SB') or Double Buffer mode ('DB', CURRENTLY UNAVAILABLE) +USE_DMA = 'SB' # choose whether to load all structures in L1 ('NO') or in L2 and use Single Buffer mode ('SB') or Double Buffer mode ('DB', CURRENTLY UNAVAILABLE) # BACKWARD SETTINGS SEPARATE_BACKWARD_STEPS = True # If True, the tool writes separate weight and input gradient in the backward step # PROFILING OPTIONS diff --git a/tools/TrainLib_Deployer/deployer_utils/deployment_utils_single_buffer.py b/tools/TrainLib_Deployer/deployer_utils/deployment_utils_single_buffer.py index 6e8b3029..ce4252de 100644 --- a/tools/TrainLib_Deployer/deployer_utils/deployment_utils_single_buffer.py +++ b/tools/TrainLib_Deployer/deployer_utils/deployment_utils_single_buffer.py @@ -30,32 +30,67 @@ """ DNN Size Checker backend functions """ -def max_input_dim(layers_l, cin_l, hin_l, win_l): +def max_input_dim(layers_l, cin_l, hin_l, win_l, data_type_l, update_layer_l): + nbytes = 4 + nbytes_max = 4 RES = 0 for layer in range(len(layers_l)): - temp = cin_l[layer]*hin_l[layer]*win_l[layer] + # Check data type + if data_type_l[layer] == 'FP32': + nbytes = 4 + elif data_type_l[layer] == 'FP16': + nbytes = 2 + temp = 
cin_l[layer]*hin_l[layer]*win_l[layer]*nbytes + # Check if the tensor needs to store gradients + if layer > 0 and update_layer_l[layer-1] == 1: + temp = temp * 2 + # Check if maximum is exceeded if temp > RES: + nbytes_max = nbytes RES = temp + RES /= nbytes_max + + print(f"[TEST!!! max_input_dim] RES = {RES}") + + # Result returned in number of elements of the largest input tensor return RES -def max_wgt_dim(layers_l, cin_l, hin_l, win_l, cout_l, hk_l, wk_l): +def max_wgt_dim(layers_l, cin_l, hin_l, win_l, cout_l, hk_l, wk_l, data_type_l, update_layer_l): + nbytes = 4 + nbytes_max = 4 RES = 0 temp = 0 for layer in range(len(layers_l)): + # Check data type + if data_type_l[layer] == 'FP32': + nbytes = 4 + elif data_type_l[layer] == 'FP16': + nbytes = 2 + # Define size depending on layer type if layers_l[layer] == 'conv2d' : - temp = hk_l[layer]*wk_l[layer]*cin_l[layer]*cout_l[layer] + temp = hk_l[layer]*wk_l[layer]*cin_l[layer]*cout_l[layer]*nbytes if layers_l[layer] == 'PW': - temp = cin_l[layer]*cout_l[layer] + temp = cin_l[layer]*cout_l[layer]*nbytes if layers_l[layer] == 'DW': - temp = hk_l[layer]*wk_l[layer]*cin_l[layer] + temp = hk_l[layer]*wk_l[layer]*cin_l[layer]*nbytes if layers_l[layer] == 'linear' : - temp = cin_l[layer]*cout_l[layer] + temp = cin_l[layer]*cout_l[layer]*nbytes if layers_l[layer] == 'Sumnode': - temp = cin_l[layer]*hin_l[layer]*win_l[layer] + temp = cin_l[layer]*hin_l[layer]*win_l[layer]*nbytes + # Check if tensor needs to store gradients + if update_layer_l[layer] == 1: + temp = temp * 2 + # Check if maximum is exceeded if temp > RES: + nbytes_max = nbytes RES = temp + RES /= nbytes_max + + print(f"[TEST!!! max_wgt_dim] RES = {RES}") + + # Result returned in number of elements of the largest weight tensor return RES @@ -92,6 +127,12 @@ def max_layer_dim (layers_l, cin_l, hin_l, win_l, cout_l, hk_l, wk_l, data, h_st tot = 0 max_layer = 0 for layer in range(len(layers_l)): + # Check data type + if data_type_l[layer] == 'FP32': + nbytes = 4 + elif data_type_l[layer] == 'FP16': + nbytes = 2 + # Define size depending on layer if layers_l[layer] == 'conv2d' : tmp_wgt = hk_l[layer]*wk_l[layer]*cin_l[layer]*cout_l[layer]*nbytes if bias_l[layer] == 1: @@ -105,32 +146,41 @@ def max_layer_dim (layers_l, cin_l, hin_l, win_l, cout_l, hk_l, wk_l, data, h_st if bias_l[layer] == 1: tmp_bias = cout_l[layer]*nbytes if layers_l[layer] == 'Sumnode': - temp2 = cin_l[layer]*hin_l[layer]*win_l[layer] + tmp_wgt = cin_l[layer]*hin_l[layer]*win_l[layer]*nbytes if layers_l[layer] == 'InstNorm': - temp2 = 2*cin_l[layer] + tmp_wgt = 2*cin_l[layer]*nbytes if layers_l[layer] in ['ReLU', 'Skipnode']: - temp2 = 0 + tmp_wgt = 0 + # Check if tensor needs to store gradients + if update_layer_l[layer] == 1: + tmp_wgt = tmp_wgt * 2 - temp1 = cin_l[layer]*hin_l[layer]*win_l[layer] + tmp_inp = cin_l[layer]*hin_l[layer]*win_l[layer]*nbytes + # Check if the tensor needs to store gradients + if layer > 0 and update_layer_l[layer-1] == 1: + tmp_inp = tmp_inp * 2 hout = int((hin_l[layer] - hk_l[layer] + 2*h_pad[layer])/h_str[layer] + 1) wout = int((win_l[layer] - wk_l[layer] + 2*w_pad[layer])/w_str[layer] + 1) if layers_l[layer] == 'linear': - temp3 = cout_l[layer] + tmp_out = cout_l[layer]*nbytes else: - temp3 = cout_l[layer] * hout * wout + tmp_out = cout_l[layer] * hout * wout * nbytes + # Check if the tensor needs to store gradients + if update_layer_l[layer] == 1: + tmp_out = tmp_out * 2 + tot = tmp_inp + tmp_wgt + tmp_out + tmp_bias print(f"Layer {layer} ({layers_l[layer]}): Input: {tmp_inp}, 
Coefficients: {tmp_wgt}, Biases: {tmp_bias}, Output: {tmp_out}, Total: {tot} (data + gradients + biases)") if tot > RES: + nbytes_max = nbytes RES = tot max_layer = layer - multiplier = 2 - if data == 'FP32': - multiplier = 4 - RES = 2*multiplier*RES #The 2 factor accounts for for both data and diff storage print(f"Max Layer size (including data and gradients): {RES} bytes @layer {max_layer}") - return RES + + # Result returned in bytes of the largest layer + size in bytes of the largest layer + return RES, nbytes_max """ @@ -143,7 +193,7 @@ def GenerateNet(proj_folder_path, project_name, layers_l, in_ch_l, out_ch_l, hk_l, wk_l, hin_l, win_l, h_str_l, w_str_l, h_pad_l, w_pad_l, epochs, batch_size, learning_rate, optimizer, loss_fn, - data_type_l, weights_l, bias_l, update_layer_l, sumnode_connections, MAX_LAYER_DIM, + data_type_l, bias_l, update_layer_l, sumnode_connections, MAX_LAYER_DIM, PROFILE_SINGLE_LAYERS, SEPARATE_BACKWARD_STEPS, CONV2D_USE_IM2COL, PRINT_TRAIN_LOSS): data_type = data_type_l[0] @@ -197,7 +247,7 @@ def GenerateNet(proj_folder_path, project_name, f.write("void reset_arguments();\n") f.write("void update_blob();\n") f.write("void reset_dim();\n") - + f.write(f"\n// Max tensor and layer sizes\n") f.write(f"#define MAX_IN_SIZE {int(max_input_dim(layers_l, in_ch_l, hin_l, win_l, data_type_l, update_layer_l))}\n") f.write(f"#define MAX_WGT_SIZE {int(max_wgt_dim(layers_l, in_ch_l, hin_l, win_l, out_ch_l, hk_l, wk_l, data_type_l, update_layer_l))}\n") @@ -212,7 +262,7 @@ def GenerateNet(proj_folder_path, project_name, f.write(f"\n// Partial update constants\n") f.write(f"#define PU_SKIP_IN_GRAD 0\n") f.write(f"#define PU_COMP_IN_GRAD 1\n") - + f.close() @@ -373,7 +423,7 @@ def GenerateNet(proj_folder_path, project_name, for layer in range(len(layers_l)): # Define FP32 tensors if data_type_l[layer] == 'FP32': - if layers_l[layer] == 'MaxPool' or layers_l[layer] == 'AvgPool': + if layers_l[layer] in ['MaxPool', 'AvgPool', 'ReLU']: f.write("PI_L2 float l"+str(layer)+"_ker[1];\n") f.write("PI_L2 float l"+str(layer)+"_bias[1];\n") elif layers_l[layer] == 'Skipnode' or layers_l[layer] == 'Sumnode': @@ -386,7 +436,7 @@ def GenerateNet(proj_folder_path, project_name, f.write("PI_L2 float l"+str(layer)+"_bias[Tout_C_l"+str(layer)+"];\n") # Define FP16 tensors elif data_type_l[layer] == 'FP16': - if layers_l[layer] == 'MaxPool' or layers_l[layer] == 'AvgPool': + if layers_l[layer] in ['MaxPool', 'AvgPool', 'ReLU']: f.write("PI_L2 fp16 l"+str(layer)+"_ker[1];\n") f.write("PI_L2 fp16 l"+str(layer)+"_bias[1];\n") elif layers_l[layer] == 'Skipnode' or layers_l[layer] == 'Sumnode': @@ -406,7 +456,7 @@ def GenerateNet(proj_folder_path, project_name, for layer in range(len(layers_l)): # Define FP32 tensors if data_type_l[layer] == 'FP32': - if layers_l[layer] == 'MaxPool' or layers_l[layer] == 'AvgPool': + if layers_l[layer] in ['MaxPool', 'AvgPool', 'ReLU']: f.write("PI_L2 float l"+str(layer)+"_ker_diff[1];\n") f.write("PI_L2 float l"+str(layer)+"_bias_diff[1];\n") elif layers_l[layer] == 'Skipnode' or layers_l[layer] == 'Sumnode': @@ -419,7 +469,7 @@ def GenerateNet(proj_folder_path, project_name, f.write("PI_L2 float l"+str(layer)+"_bias_diff[Tout_C_l"+str(layer)+"];\n") # Define FP16 tensors elif data_type_l[layer] == 'FP16': - if layers_l[layer] == 'MaxPool' or layers_l[layer] == 'AvgPool': + if layers_l[layer] in ['MaxPool', 'AvgPool', 'ReLU']: f.write("PI_L2 fp16 l"+str(layer)+"_ker_diff[1];\n") elif layers_l[layer] == 'Skipnode' or layers_l[layer] == 'Sumnode': pass @@ -466,7 +516,7 
     im2col_byte_length = 0
     im2col_max_data_type = 'FP32'
     for layer in range(len(layers_l)):
-        if layers_l[layer] == 'conv2d': # or layers_l[layer] == 'DW':
+        if layers_l[layer] == 'conv2d' and CONV2D_USE_IM2COL == True: # or layers_l[layer] == 'DW':
             if data_type_l[layer] == 'FP32':
                 im2col_byte_length = 4
             elif data_type_l[layer] == 'FP16':
@@ -504,7 +554,6 @@ def GenerateNet(proj_folder_path, project_name,
         else:
             print("[deployment_utils.GenerateNet] Invalid data type for im2col!!")
             exit()
-    
     # No im2col buffer
     allocate_no_im2col = False
     for layer in range(len(layers_l)):
@@ -522,10 +571,10 @@ def GenerateNet(proj_folder_path, project_name,
     for layer in range(len(layers_l)):
         # Check layer data layout
         data_layout = 'CHW' # Change to input list of data layouts
-        if (layers_l[layer] == 'conv2d' or layers_l[layer] == 'PW') and layer == 0:
+        if ((layers_l[layer] == 'conv2d' and CONV2D_USE_IM2COL == True) or layers_l[layer] == 'PW') and layer == 0:
             bt_flag = True
             bt_layer_index = 0
-        elif (layers_l[layer] == 'conv2d' or layers_l[layer] == 'PW') and layer > 0:
+        elif ((layers_l[layer] == 'conv2d' and CONV2D_USE_IM2COL == True) or layers_l[layer] == 'PW') and layer > 0:
             bt_flag = True
             bt_mem = in_ch_l[layer] * hk_l[layer] * wk_l[layer] * out_ch_l[layer]
             if bt_mem > bt_max_memocc:
@@ -566,7 +615,16 @@ def GenerateNet(proj_folder_path, project_name,
         else:
             print("[deployment_utils.GenerateNet] Invalid data type for pw transp buffer definition!\n")
             exit()
+    # No blocktranspose buffer
+    if (bt_flag == False):
+        print("No blocktranspose buffer detected.\n")
+        f.write("PI_L1 float bt_buffer[1];\n")
+    # Define label buffer
+    if data_type_l[-1] == 'FP32':
+        f.write("PI_L1 float label_temp[Tout_C_l"+str(len(layers_l)-1)+"*Tout_H_l"+str(len(layers_l)-1)+"*Tout_W_l"+str(len(layers_l)-1)+"];\n")
+    elif data_type_l[-1] == 'FP16':
+        f.write("PI_L1 fp16 label_temp[Tout_C_l"+str(len(layers_l)-1)+"*Tout_H_l"+str(len(layers_l)-1)+"*Tout_W_l"+str(len(layers_l)-1)+"];\n")

     # Define tensors to backpropagate the output error
     f.write("\n// Define error propagation tensors\n")
@@ -589,7 +647,48 @@ def GenerateNet(proj_folder_path, project_name,
             exit()


-    
+    # Normalization layer running stats
+    f.write("\n// Define running parameters for normalization layers (L2)\n")
+    for layer in range(len(layers_l)):
+        if layers_l[layer] == 'InstNorm':
+            if data_type_l[layer] == 'FP32':
+                f.write("PI_L2 float l"+str(layer)+"_running_mean[Tin_C_l"+str(layer)+"];\n")
+                f.write("PI_L2 float l"+str(layer)+"_running_var[Tin_C_l"+str(layer)+"];\n")
+                f.write("PI_L2 float l"+str(layer)+"_running_stdev[Tin_C_l"+str(layer)+"];\n")
+            elif data_type_l[layer] == 'FP16':
+                f.write("PI_L2 fp16 l"+str(layer)+"_running_mean[Tin_C_l"+str(layer)+"];\n")
+                f.write("PI_L2 fp16 l"+str(layer)+"_running_var[Tin_C_l"+str(layer)+"];\n")
+                f.write("PI_L2 fp16 l"+str(layer)+"_running_stdev[Tin_C_l"+str(layer)+"];\n")
+    # Define L1 buffer for Norm parameters
+    f.write("\n// L1 buffers for normalization layers\n")
+    norm_temp_buffer_present = False
+    max_size = 0
+    max_num_bytes = 4
+    for layer in range(len(layers_l)):
+        temp_size = 0
+        if layers_l[layer] == 'InstNorm':
+            num_bytes = 4
+            if data_type_l[layer] == 'FP16':
+                num_bytes = 2
+            temp_size = in_ch_l[layer] * num_bytes # 3 * in_ch_l[layer] * num_bytes
+            if max_size < temp_size:
+                max_size = temp_size
+                max_num_bytes = num_bytes
+            norm_temp_buffer_present = True
+    if norm_temp_buffer_present:
+        if max_num_bytes == 4:
+            f.write("PI_L1 float running_mean_buffer["+str(int(max_size / max_num_bytes))+"];\n")
+            f.write("PI_L1 float running_var_buffer["+str(int(max_size / max_num_bytes))+"];\n")
+            f.write("PI_L1 float running_stdev_buffer["+str(int(max_size / max_num_bytes))+"];\n")
+        elif max_num_bytes == 2:
+            f.write("PI_L1 fp16 running_mean_buffer["+str(int(max_size / max_num_bytes))+"];\n")
+            f.write("PI_L1 fp16 running_var_buffer["+str(int(max_size / max_num_bytes))+"];\n")
+            f.write("PI_L1 fp16 running_stdev_buffer["+str(int(max_size / max_num_bytes))+"];\n")
+        else:
+            print("[deployment_utils_single_buffer.py/GenerateNet] Invalid data type for running stats!!")
+            exit()
+
+
     # Define buffer for mixed precision propagation
     previous_type = data_type_l[0]
     is_mixed_precision = False
@@ -667,12 +766,11 @@ def GenerateNet(proj_folder_path, project_name,
     f.write("update_blob();\n")
     f.write("reset_arguments();\n\n")

-
     for layer in range(len(layers_l)):
         # First layer
         if layer == 0:
-            f.write(" // Layer "+str(layer)+"\n")
-            f.write(" for(int i=0; i