From 2d589e2c6609b3783865d92c0cd020ee4f826f5c Mon Sep 17 00:00:00 2001
From: Victor Jung <jungvi@iis.ee.ethz.ch>
Date: Wed, 4 Feb 2026 14:25:48 +0100
Subject: [PATCH 01/10] Remove legacy dma test + update CLI command to new
 interface + Fix paths in Deeploy 101 tutorial

---
 CONTRIBUTING.md                               |  2 +-
 .../PULPOpen/Templates/GEMMTemplate.py        |  2 +-
 DeeployTest/testDmas.py                       | 72 -------------------
 docs/install.md                               |  2 +-
 docs/tutorials/introduction.md                | 42 +++++------
 5 files changed, 24 insertions(+), 96 deletions(-)
 delete mode 100644 DeeployTest/testDmas.py

diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index 2285670366..8fee9fc1d7 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -33,7 +33,7 @@ Additionally, add the title and link to the pull request in the list of pull req
 [...]
 
 ### Removed
-- Remove the link to the precompiled LLVM 12 in the `testRunner` for Snitch and in the CI.
+- Remove the link to the precompiled LLVM 12 in the `deeployRunner` for Snitch and in the CI.
 [...]
 ```
 
diff --git a/Deeploy/Targets/PULPOpen/Templates/GEMMTemplate.py b/Deeploy/Targets/PULPOpen/Templates/GEMMTemplate.py
index fbe475b8df..bcc4b1fc91 100644
--- a/Deeploy/Targets/PULPOpen/Templates/GEMMTemplate.py
+++ b/Deeploy/Targets/PULPOpen/Templates/GEMMTemplate.py
@@ -50,7 +50,7 @@ def alignToContext(self, ctxt: NetworkContext,
 // LMACAN: In some edge cases sporadic errors happen if this loop is not added.
 // We believe this is due to missing bubbles in the pipeline that break operator forwarding.
 // Breaking test:
-//   `python testRunner_tiled_siracusa.py -t=Tests/Models/Transformer --defaultMemLevel=L3 --doublebuffer --l1=30000`
+//   `python deeployRunner_tiled_siracusa.py -t=Tests/Models/Transformer --defaultMemLevel=L3 --doublebuffer --l1=30000`
 #pragma unroll 1
 for(int k=0;k<3;k++){
   asm volatile("nop" ::);
diff --git a/DeeployTest/testDmas.py b/DeeployTest/testDmas.py
deleted file mode 100644
index df6926b48d..0000000000
--- a/DeeployTest/testDmas.py
+++ /dev/null
@@ -1,72 +0,0 @@
-# SPDX-FileCopyrightText: 2025 ETH Zurich and University of Bologna
-#
-# SPDX-License-Identifier: Apache-2.0
-
-import itertools
-import subprocess
-from typing import Tuple
-
-
-def test(dma: str, inputShape: Tuple[int, ...], tileShape: Tuple[int, ...], nodeCount: int, dataType: str,
-         doublebuffer: bool):
-    cfg_str = f"""
-    - input shape: {inputShape}
-    - tile shape: {tileShape}
-    - node count: {nodeCount}
-    - data type: {dataType}
-    - doublebuffering: {doublebuffer}
-    - dma: {dma}
-    """
-
-    print(f"test{dma}: Testing {dma} with followig configuration:" + cfg_str)
-
-    testRunnerMap = {
-        "MchanDma": "testRunner_siracusa_mchandma.py",
-        "L3Dma": "testRunner_siracusa_l3dma.py",
-        "SnitchDma": "testRunner_snitch_dma.py",
-    }
-
-    assert dma in testRunnerMap, f"{dma} missing its own testRunner mapping"
-
-    testRunner = testRunnerMap[dma]
-
-    cmd = [f"python {testRunner}", f"-t test{dma}", "-DNUM_CORES=8"]
-    cmd.append(f"--input-shape {' '.join(str(x) for x in inputShape)}")
-    cmd.append(f"--tile-shape {' '.join(str(x) for x in tileShape)}")
-    cmd.append(f"--node-count {nodeCount}")
-    cmd.append(f"--type {dataType}")
-    if doublebuffer:
-        cmd.append("--doublebuffer")
-
-    full_cmd = " ".join(cmd)
-
-    print(f"Running command:\n{full_cmd}\n")
-
-    try:
-        subprocess.run(full_cmd, shell = True, check = True)
-    except subprocess.CalledProcessError:
-        print(f"test{dma}: Failed test:" + cfg_str)
-        print(f"Rerun with command:\n{full_cmd}")
-        exit(-1)
-
-
-# input shape, tile shape, node count, data type
-test_shapes_and_more = [
-    ((10, 10), (10, 10), 1, "uint8_t"),
-    ((10, 10), (10, 4), 1, "uint8_t"),
-    ((10, 10), (10, 4), 1, "uint16_t"),
-    ((10, 10), (10, 4), 1, "uint32_t"),
-    ((10, 10), (3, 4), 1, "uint32_t"),
-    ((10, 10), (3, 4), 2, "uint32_t"),
-    ((10, 10, 10), (2, 3, 4), 1, "uint8_t"),
-    ((10, 10, 10, 10), (2, 3, 5, 4), 1, "uint8_t"),
-    ((10, 10, 10, 10), (2, 3, 5, 4), 1, "uint32_t"),
-    ((10, 10, 10, 10, 10), (2, 3, 5, 7, 4), 1, "uint8_t"),
-]
-
-is_doublebuffers = [True, False]
-dmas = ["MchanDma", "L3Dma", "SnitchDma"]
-
-for testShape, doublebuffer, dma in itertools.product(test_shapes_and_more, is_doublebuffers, dmas):
-    inputShape, tileShape, nodeCount, dataType = testShape
-    test(dma, inputShape, tileShape, nodeCount, dataType, doublebuffer)
diff --git a/docs/install.md b/docs/install.md
index ca85e1acea..65d7e64926 100644
--- a/docs/install.md
+++ b/docs/install.md
@@ -76,5 +76,5 @@ For example, you can run
 
 ```
 cd DeeployTest
-python testRunner_generic.py -t ./Tests/Kernels/Integer/Add/Regular
+python deeployRunner_generic.py -t Tests/Models/CNN_Linear1
 ```
diff --git a/docs/tutorials/introduction.md b/docs/tutorials/introduction.md
index 1e638bd8a8..7002954432 100644
--- a/docs/tutorials/introduction.md
+++ b/docs/tutorials/introduction.md
@@ -34,15 +34,15 @@ cd Deeploy
 pip install -e . --extra-index-url=https://pypi.ngc.nvidia.com
 ```
 
-From the `DeeployTest` folder, you can use the `testRunner` to compile ONNXs and execute the output code using the appropriate simulators.
+From the `DeeployTest` folder, you can use the `deeployRunner` to compile ONNXs and execute the output code using the appropriate simulators.
 
 To validate your installation, you can run a simple Add node on each platform:
 ```
-python testRunner_generic.py -t Tests/IntKernels/Add/Regular
-python testRunner_cortexm.py -t Tests/IntKernels/Add/Regular
-python testRunner_mempool.py -t Tests/IntKernels/Add/Regular
-python testRunner_snitch.py -t Tests/IntKernels/Add/Regular/
-python testRunner_siracusa.py -t Tests/IntKernels/Add/Regular --cores=8
+python deeployRunner_generic.py -t Tests/Kernels/Integer/Add/Regular
+python deeployRunner_cortexm.py -t Tests/Kernels/Integer/Add/Regular
+python deeployRunner_mempool.py -t Tests/Kernels/Integer/Add/Regular
+python deeployRunner_snitch.py -t Tests/Kernels/Integer/Add/Regular
+python deeployRunner_siracusa.py -t Tests/Kernels/Integer/Add/Regular --cores=8
 ```
 Once all these basic tests are passed, we can jump into the basics of Deeploy.
 
@@ -57,7 +57,7 @@ Hence, Deeploy's inputs are:
 
 Deeploy is shipped with a comprehensive testing framework conveniently named DeeployTest. This testing framework contains Test Runners for end-to-end testing of your network on a given platform. More specifically, a Test Runner compiles a given ONNX file, builds the project, feeds the inputs into the compiled neural network, and compares the output with the golden values to ensure correctness.
 
-If you followed this tutorial correctly, you already used Test Runners (e.g., `testRunner_siracusa.py`) to validate the Deeploy installation! We will dive into the details of the Test Runners CLI very soon, but first, let's look at the tools and libraries used downstream in Deeploy.
+If you followed this tutorial correctly, you already used Test Runners (e.g., `deeployRunner_siracusa.py`) to validate the Deeploy installation! We will dive into the details of the Test Runners CLI very soon, but first, let's look at the tools and libraries used downstream in Deeploy.
 
 The figure below gives an overview of the deployment stack. As you can see, there are several steps to take before actually running the application. For the build system (*e.g.,* the tool to organize compilation and linking), we use [CMake](https://cmake.org/). The default C compiler shipped with Deeploy is [LLVM 15](https://llvm.org/), but it supports GCC, given that you provide a local installation. To generate the Application Binary, we link the Network Code with the necessary Kernel Libraries and a Standard C Library (here [Picolibc](https://github.com/picolibc/picolibc)). Then, we feed this Application Binary to the appropriate simulator; from there, you can verify the correctness and benchmark the application.
 
@@ -67,9 +67,9 @@ The figure below gives an overview of the deployment stack. As you can see, ther
 
 You can visualize the ONNX graphs using [Netron](https://netron.app/). Either use the web interface or install the python package with `pip install netron`.
 
-> ✅ **Task:** Visualize the ONNX graph of the `IntKernels/Add/Regular`, `Models/MobileNetv2`, and `Others/Transformer`
+> ✅ **Task:** Visualize the ONNX graph of the `Tests/Kernels/Integer/Add/Regular`, `Tests/Models/MobileNetv2`, and `Tests/Models/Transformer`
 
-The ONNX graphs are in `DeeployTest/Tests/<TestName>/network.onnx`. The networks are increasing in complexity, `IntKernels/Add/Regular` is a single node network for unit testing, while `Models/MobileNetv2` is a simple sequential network mostly made of convolutions. Finally, the `Others/Transformer` network showcases a typical transformer block used in Encoder and Decoder networks. If you want to peek at a complex network, you can visualize `Models/microLlama/microLlama128`.
+The ONNX graphs are in `DeeployTest/Tests/<TestName>/network.onnx`. The networks are increasing in complexity, `IntKernels/Add/Regular` is a single node network for unit testing, while `Tests/Models/MobileNetv2` is a simple sequential network mostly made of convolutions. Finally, the `Tests/Models/Transformer` network showcases a typical transformer block used in Encoder and Decoder networks. If you want to peek at a complex network, you can visualize `Models/microLlama/microLlama128`.
 
 Now that we understand Deeploy's input, let's check the output-generated code!
 
@@ -77,15 +77,15 @@ Now that we understand Deeploy's input, let's check the output-generated code!
 
 The generated code is located in the following directory: `DeeployTest/TEST_<PlatformName>/Tests`, and the `Network.c` file is the interesting one.
 
-The generated code is trivial for the `IntKernels/Add/Regular` graph; we simply use the template for the `Add` node of the Generic platform. You can find the template declaration in `Deeploy/Targets/Generic/Templates/AddTemplate.py`.
+The generated code is trivial for the `Tests/Kernels/Integer/Add/Regular` graph; we simply use the template for the `Add` node of the Generic platform. You can find the template declaration in `Deeploy/Targets/Generic/Templates/AddTemplate.py`.
 
-Now, if you want to look at something a bit more complex, run `python testRunner_generic.py  -t ./Tests/Models/miniMobileNetv2` (from `DeeployTest`) and look at the generated code. There are two interesting points you can notice:
+Now, if you want to look at something a bit more complex, run `python deeployRunner_generic.py  -t ./Tests/Models/miniMobileNetv2` (from `DeeployTest`) and look at the generated code. There are two interesting points you can notice:
 - We hoist the constants at the top of the file.
-- In the `RunNetwork` function, we sequentially have node templates to execute the operands and malloc/free to manage the memory. You can open the ONNX graph of `Models/miniMobileNetv2` on the side to try to match the nodes of the graph with their generated code.
+- In the `RunNetwork` function, we sequentially have node templates to execute the operands and malloc/free to manage the memory. You can open the ONNX graph of `Tests/Models/miniMobileNetv2` on the side to try to match the nodes of the graph with their generated code.
 
 > ✅ **Task:** Visualize the effect of passes on the ONNX graph for the Siracusa platform.
 
-Deeploy applies passes on the ONNX graph to transform its topology and optimize its execution. Let's visualize the effect of the passes used in the Siracusa Platform. First, let's execute our `miniMobileNetv2` on Siracusa with `python testRunner_siracusa.py  -t ./Tests/Models/miniMobileNetv2`. You can find the original ONNX graph at `DeeployTest/Tests/Models/miniMobileNetv2/network.onnx`, and the transformed ONNX graph at `DeeployTest/TEST_SIRACUSA/Tests/Models/miniMobileNetv2/deeployStates/backend_post_binding.onnx`. Open both ONNX graphs side by side to compare them.
+Deeploy applies passes on the ONNX graph to transform its topology and optimize its execution. Let's visualize the effect of the passes used in the Siracusa Platform. First, let's execute our `miniMobileNetv2` on Siracusa with `python deeployRunner_siracusa.py  -t ./Tests/Models/miniMobileNetv2`. You can find the original ONNX graph at `Tests/Models/miniMobileNetv2/network.onnx`, and the transformed ONNX graph at `TEST_SIRACUSA/Tests/Models/miniMobileNetv2/deeployStates/backend_post_binding.onnx`. Open both ONNX graphs side by side to compare them.
 
 You can notice the effect of two passes on the graph:
 - One pass fuses the `Conv` and `RequantShift` nodes. This is a common technique named [Operator Fusion](https://medium.com/data-science/how-pytorch-2-0-accelerates-deep-learning-with-operator-fusion-and-cpu-gpu-code-generation-35132a85bd26) and used in many DNN compilers.
@@ -135,12 +135,12 @@ Now that you understand the hardware and the kind of workload we want to execute
 
 > ✅ **Task:** Measure and compare the runtime of the `microLlama128` model using 1 and 8 cores. Compute the speedup ratio; why is it not 8?
 
-*Hint:* `python testRunner_siracusa.py --help` will list and explain the available flags.
+*Hint:* `python deeployRunner_siracusa.py --help` will list and explain the available flags.
 
 <details>
  <summary><span style="font-weight: bold; font-size: 1.3em;">Solution</span></summary>
 
- > If you run `python testRunner_siracusa.py -t Tests/Models/microLlama/microLlama128 --cores=1` and then `python testRunner_siracusa.py -t Tests/Models/microLlama/microLlama128 --cores=8`, you should measure a runtime of ~16,1M cycles for 1 core and 3.1M cycles for 8 cores.
+ > If you run `python deeployRunner_siracusa.py -t Tests/Models/microLlama/microLlama128 --cores=1` and then `python deeployRunner_siracusa.py -t Tests/Models/microLlama/microLlama128 --cores=8`, you should measure a runtime of ~16.1M cycles for 1 core and 3.1M cycles for 8 cores.
  >
  > The speedup ratio is obtained via $\frac{\text{Runtime 1 cores}}{\text{Runtime 8 cores}} = 5.2$. Hence, using 8 cores instead of 1 leads to a 5.2 times speedup.
  >
@@ -149,22 +149,22 @@ Now that you understand the hardware and the kind of workload we want to execute
 
 ### Tiling Basics
 
-It's due time to talk about data movement now! We use all 8 cluster cores, which is great, but where do these cores fetch the data from? By default, when using `testRunner_siracusa.py`, all data is in L2; there is no tiling, and cores read and write data directly to/from L2. As the L2 memory is "further away" from the cluster, load/store takes several cycles, which is non-optimal.
+It's due time to talk about data movement now! We use all 8 cluster cores, which is great, but where do these cores fetch the data from? By default, when using `deeployRunner_siracusa.py`, all data is in L2; there is no tiling, and cores read and write data directly to/from L2. As the L2 memory is "further away" from the cluster, load/store takes several cycles, which is non-optimal.
 
 What we really want is to use the L1 memory, which provides 1 cycle latency load/store! But as the capacity is relatively small (256KB), we need to **tile our layers**. Tiling operands for an accelerator featuring only scratchpad memories is not trivial (unlike in architectures with data caches). For each layer, the compiler has to decide on tile size, a tiling schedule, a buffering strategy (single buffer, double buffer, etc...), and a memory allocation strategy. Then, the compiler must generate the code to configure and launch each transfer and place barriers accordingly to maximize concurrency.
 
 The good news is that Deeploy can already do that! So, let's generate and run some tiled code to see the impact of tiling on the runtime.
 
-> ✅ **Task:** Get familiar with the CLI arguments of `testRunner_tiled_siracusa.py`, then run `microLlama64_parallel` with different configurations. Find one "bad" and one "good" configuration, and explain why.
+> ✅ **Task:** Get familiar with the CLI arguments of `deeployRunner_tiled_siracusa.py`, then run `microLlama64_parallel` with different configurations. Find one "bad" and one "good" configuration, and explain why.
 
 *Hint:* Use the `--help` flag to list and explain the available flags.
 
 <details>
  <summary><span style="font-weight: bold; font-size: 1.3em;">Solution</span></summary>
 
- > Bad configuration: `python testRunner_tiled_siracusa.py -t Tests/Models/microLlama/microLlama64_parallel --cores=8 --l1 8000 --defaultMemLevel=L2` -> Runtime: 47.5 MCycles
+ > Bad configuration: `python deeployRunner_tiled_siracusa.py -t Tests/Models/microLlama/microLlama64_parallel --cores=8 --l1 8000 --defaultMemLevel=L2` -> Runtime: 47.5 MCycles
  >
- > Good configuration `python testRunner_tiled_siracusa.py -t Tests/Models/microLlama/microLlama64_parallel --cores=8 --l1 64000 --defaultMemLevel=L2`: -> Runtime: 35.3 MCycles
+ > Good configuration: `python deeployRunner_tiled_siracusa.py -t Tests/Models/microLlama/microLlama64_parallel --cores=8 --l1 64000 --defaultMemLevel=L2` -> Runtime: 35.3 MCycles
  >
  > Justification: As the size of the L1 memory gets smaller, tiles also get smaller and smaller. Smaller tiles usually mean that it's harder to keep the core properly utilized.
 
@@ -185,7 +185,7 @@ With this profiling trace, you can clearly measure the overhead of DMA transfers
 
 ### Using the NPU and the Neural Memory Subsystem (NMS)
 
-To use the NPU, you can use the `testRunner_tiled_siracusa_w_neureka.py`. The Linear layers will automatically be executed by the NPU. To enable the NMS, use the `--neureka-wmem` flag. When the NMS is enabled, the constant tensors used by the accelerator will be placed in the Weight Memory.
+To use the NPU, you can use the `deeployRunner_tiled_siracusa_w_neureka.py`. The Linear layers will automatically be executed by the NPU. To enable the NMS, use the `--neureka-wmem` flag. When the NMS is enabled, the constant tensors used by the accelerator will be placed in the Weight Memory.
 
 > ✅ **Task:** Execute Micro Llama in parallel and autoregressive mode using the NPU, derive the speedup at the model level and at the layer level compared to execution without NPU.
 
@@ -199,7 +199,7 @@ To use the NPU, you can use the `testRunner_tiled_siracusa_w_neureka.py`. The Li
  > The runtime in parallel mode with NPU is obtained with:
  >
  >`
- python testRunner_tiled_siracusa_w_neureka.py -t Tests/Models/microLlama/microLlama64_parallel --cores=8 --l1 64000 --defaultMemLevel=L2
+ python deeployRunner_tiled_siracusa_w_neureka.py -t Tests/Models/microLlama/microLlama64_parallel --cores=8 --l1 64000 --defaultMemLevel=L2
  `
  >
  > And returns 28.6 MCycles of runtime. The runtime without NPU was measured above and is 35.3 MCycles. Hence, the speedup is ~1.23 times.

From 783dec5d2b025b31ccdbf7692538be1568aab6c1 Mon Sep 17 00:00:00 2001
From: Victor Jung <jungvi@iis.ee.ethz.ch>
Date: Wed, 4 Feb 2026 14:30:31 +0100
Subject: [PATCH 02/10] Update CHANGELOG

---
 CHANGELOG.md | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index cc185c7459..568789f97e 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -5,7 +5,7 @@ This file contains the changelog for the Deeploy project. The changelog is divid
 
 
 ### List of Pull Requests
-- 
+- Update CLI interface Across Project, Fix Tutorial, and Remove Legacy Test [#157](https://github.com/pulp-platform/Deeploy/pull/157)
 
 ### Added
 - 
@@ -159,6 +159,8 @@ This file contains the changelog for the Deeploy project. The changelog is divid
 - Refactored DMA code generation (`SnitchDma`, `Mchan`) to correctly overlap transfers and compute in double-buffering mode
 - changed `_mapNode` to `_selectEngine` which reduces the responsibility of that function to, as the name states, just engine selection
 - Print kernel profiling information for all memory levels
+- Aligned CLI commands across the project
+- Fix test paths in Deeploy 101 tutorial
 
 ### Fixed
 - Update `install.md` to remove rust mention and fix test command.
@@ -190,6 +192,7 @@ This file contains the changelog for the Deeploy project. The changelog is divid
 ### Removed
 - Delete outdated and unused `.gitlab-ci.yml` file
 - dory_dma.c and dory_dma.h
+- `testDMA.py` was an old test; we now have `test_dmas.py` instead.
 
 ## Release v0.2.0 (2025-07-08) [#103](https://github.com/pulp-platform/Deeploy/pull/103)
 This release containing major architectural changes, new platform support, enhanced simulation workflows, floating-point kernel support, training infrastructure for CCT models, memory allocation strategies, and documentation improvements.

From cd11d17882347d567a2a2d9e2ef93146b481dcb9 Mon Sep 17 00:00:00 2001
From: Victor Jung <jungvi@iis.ee.ethz.ch>
Date: Wed, 4 Feb 2026 14:58:10 +0100
Subject: [PATCH 03/10] Update Deeploy 101 tutorial

---
 docs/tutorials/introduction.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/tutorials/introduction.md b/docs/tutorials/introduction.md
index 7002954432..8d7bf69393 100644
--- a/docs/tutorials/introduction.md
+++ b/docs/tutorials/introduction.md
@@ -69,7 +69,7 @@ You can visualize the ONNX graphs using [Netron](https://netron.app/). Either us
 
 > ✅ **Task:** Visualize the ONNX graph of the `Tests/Kernels/Integer/Add/Regular`, `Tests/Models/MobileNetv2`, and `Tests/Models/Transformer`
 
-The ONNX graphs are in `DeeployTest/Tests/<TestName>/network.onnx`. The networks are increasing in complexity, `IntKernels/Add/Regular` is a single node network for unit testing, while `Tests/Models/MobileNetv2` is a simple sequential network mostly made of convolutions. Finally, the `Tests/Models/Transformer` network showcases a typical transformer block used in Encoder and Decoder networks. If you want to peek at a complex network, you can visualize `Models/microLlama/microLlama128`.
+The ONNX graphs are in `DeeployTest/Tests/<TestName>/network.onnx`. The networks are increasing in complexity, `Tests/Kernels/Integer/Add/Regular` is a single node network for unit testing, while `Tests/Models/MobileNetv2` is a simple sequential network mostly made of convolutions. Finally, the `Tests/Models/Transformer` network showcases a typical transformer block used in Encoder and Decoder networks. If you want to peek at a complex network, you can visualize `Models/microLlama/microLlama128`.
 
 Now that we understand Deeploy's input, let's check the output-generated code!
 

From c382fc6231f2e3ded69c99d89e68c49a7014d862 Mon Sep 17 00:00:00 2001
From: Victor Jung <jungvi@iis.ee.ethz.ch>
Date: Fri, 6 Feb 2026 09:52:31 +0100
Subject: [PATCH 04/10] Update CHANGELOG

---
 CHANGELOG.md | 9 +++------
 1 file changed, 3 insertions(+), 6 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 568789f97e..1c8698b5ed 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -11,13 +11,13 @@ This file contains the changelog for the Deeploy project. The changelog is divid
 - 
 
 ### Changed
-- 
+- Aligned CLI commands across the project
 
 ### Fixed
-- 
+- Fix test paths in Deeploy 101 tutorial
 
 ### Removed
-- 
+- `testDmas.py`: legacy DMA test script, superseded by `test_dmas.py`.
 
 ## Release v0.2.1 (2026-02-05) [#158](https://github.com/pulp-platform/Deeploy/pull/158)
 
@@ -159,8 +159,6 @@ This file contains the changelog for the Deeploy project. The changelog is divid
 - Refactored DMA code generation (`SnitchDma`, `Mchan`) to correctly overlap transfers and compute in double-buffering mode
 - changed `_mapNode` to `_selectEngine` which reduces the responsibility of that function to, as the name states, just engine selection
 - Print kernel profiling information for all memory levels
-- Aligned CLI commands across the project
-- Fix test paths in Deeploy 101 tutorial
 
 ### Fixed
 - Update `install.md` to remove rust mention and fix test command.
@@ -192,7 +190,6 @@ This file contains the changelog for the Deeploy project. The changelog is divid
 ### Removed
 - Delete outdated and unused `.gitlab-ci.yml` file
 - dory_dma.c and dory_dma.h
-- `testDMA.py` was an old test; we now have `test_dmas.py` instead.
 
 ## Release v0.2.0 (2025-07-08) [#103](https://github.com/pulp-platform/Deeploy/pull/103)
 This release containing major architectural changes, new platform support, enhanced simulation workflows, floating-point kernel support, training infrastructure for CCT models, memory allocation strategies, and documentation improvements.

From 9a8eef6cfe5c5763e8446a3aa6b880544094c314 Mon Sep 17 00:00:00 2001
From: Victor Jung <jungvi@iis.ee.ethz.ch>
Date: Fri, 6 Feb 2026 09:53:10 +0100
Subject: [PATCH 05/10] Fix publish action

---
 .github/workflows/publish.yml | 3 ---
 Container/Dockerfile.deeploy  | 3 ---
 2 files changed, 6 deletions(-)

diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml
index 93c0147fd2..77b5584892 100644
--- a/.github/workflows/publish.yml
+++ b/.github/workflows/publish.yml
@@ -14,9 +14,6 @@ permissions:
   contents: read
   id-token: write
 
-env:
-  UV_EXTRA_INDEX_URL: https://pypi.ngc.nvidia.com
-
 jobs:
   publish-pypi:
     name: Publish to PyPI
diff --git a/Container/Dockerfile.deeploy b/Container/Dockerfile.deeploy
index 5266b84982..59dc2d6dda 100644
--- a/Container/Dockerfile.deeploy
+++ b/Container/Dockerfile.deeploy
@@ -87,9 +87,6 @@ WORKDIR /app
 
 COPY pyproject.toml ./
 
-# Add nvidia channel to the pip configuration
-RUN mkdir -p /etc && printf "[global]\nextra-index-url = https://pypi.ngc.nvidia.com\n" > /etc/pip.conf
-
 # Install dependencies
 RUN apt-get update && \
     apt-get install -y git-lfs \

From 7cc06f0d2b102584678af0becf8573d3ddfff50a Mon Sep 17 00:00:00 2001
From: Victor Jung <jungvi@iis.ee.ethz.ch>
Date: Fri, 6 Feb 2026 10:04:05 +0100
Subject: [PATCH 06/10] Add build-only action to run before preparing a new
 release

---
 .github/workflows/package-publish.yml | 88 +++++++++++++++++++++++++++
 .github/workflows/publish.yml         | 62 -------------------
 2 files changed, 88 insertions(+), 62 deletions(-)
 create mode 100644 .github/workflows/package-publish.yml
 delete mode 100644 .github/workflows/publish.yml

diff --git a/.github/workflows/package-publish.yml b/.github/workflows/package-publish.yml
new file mode 100644
index 0000000000..1584e89f42
--- /dev/null
+++ b/.github/workflows/package-publish.yml
@@ -0,0 +1,88 @@
+# SPDX-FileCopyrightText: 2025 ETH Zurich and University of Bologna
+#
+# SPDX-License-Identifier: Apache-2.0
+
+name: Package • Publish to PyPi
+
+on:
+  push:
+    tags:
+      - "v*"
+  workflow_dispatch:
+    inputs:
+      publish_target:
+        description: "Select what to do when manually triggering the workflow"
+        required: false
+        default: build-only
+        type: choice
+        options:
+          - build-only
+          - test-pypi
+
+permissions:
+  contents: read
+  id-token: write
+
+jobs:
+  build-package:
+    name: Build package artifacts
+    runs-on: ubuntu-latest
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v5
+
+      - name: Install uv
+        uses: astral-sh/setup-uv@v7
+
+      - name: Build artifacts
+        run: uv build
+
+      - name: Test wheel installation
+        run: uv run --isolated --no-project --with dist/*.whl python -c "import Deeploy"
+
+      - name: Test sdist installation
+        run: uv run --isolated --no-project --with dist/*.tar.gz python -c "import Deeploy"
+
+      - name: Upload build artifacts
+        uses: actions/upload-artifact@v4
+        with:
+          name: deeploy-dist
+          path: dist
+          if-no-files-found: error
+
+  publish-pypi:
+    name: Publish to PyPI
+    if: github.event_name == 'push'
+    needs: build-package
+    runs-on: ubuntu-latest
+    steps:
+      - name: Install uv
+        uses: astral-sh/setup-uv@v7
+
+      - name: Download build artifacts
+        uses: actions/download-artifact@v4
+        with:
+          name: deeploy-dist
+          path: dist
+
+      - name: Publish to PyPI
+        run: uv publish dist/*
+
+  publish-test-pypi:
+    name: Publish to Test PyPI
+    if: github.event_name == 'workflow_dispatch' && inputs.publish_target == 'test-pypi'
+    needs: build-package
+    runs-on: ubuntu-latest
+    steps:
+      - name: Install uv
+        uses: astral-sh/setup-uv@v7
+
+      - name: Download build artifacts
+        uses: actions/download-artifact@v4
+        with:
+          name: deeploy-dist
+          path: dist
+
+      - name: Publish to Test PyPI
+        run: uv publish dist/* --publish-url https://test.pypi.org/legacy/
+
diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml
deleted file mode 100644
index 77b5584892..0000000000
--- a/.github/workflows/publish.yml
+++ /dev/null
@@ -1,62 +0,0 @@
-# SPDX-FileCopyrightText: 2025 ETH Zurich and University of Bologna
-#
-# SPDX-License-Identifier: Apache-2.0
-
-name: Publish
-
-on:
-  push:
-    tags:
-      - "v*"
-  workflow_dispatch:
-
-permissions:
-  contents: read
-  id-token: write
-
-jobs:
-  publish-pypi:
-    name: Publish to PyPI
-    if: github.event_name == 'push'
-    runs-on: ubuntu-latest
-    steps:
-      - name: Checkout
-        uses: actions/checkout@v5
-
-      - name: Install uv
-        uses: astral-sh/setup-uv@v7
-
-      - name: Build artifacts
-        run: uv build
-
-      - name: Test wheel installation
-        run: uv run --isolated --no-project --with dist/*.whl python -c "import Deeploy"
-
-      - name: Test sdist installation
-        run: uv run --isolated --no-project --with dist/*.tar.gz python -c "import Deeploy"
-
-      - name: Publish to PyPI
-        run: uv publish
-
-  publish-test-pypi:
-    name: Publish to Test PyPI
-    if: github.event_name == 'workflow_dispatch'
-    runs-on: ubuntu-latest
-    steps:
-      - name: Checkout
-        uses: actions/checkout@v5
-
-      - name: Install uv
-        uses: astral-sh/setup-uv@v7
-
-      - name: Build artifacts
-        run: uv build
-
-      - name: Test wheel installation
-        run: uv run --isolated --no-project --with dist/*.whl python -c "import Deeploy"
-
-      - name: Test sdist installation
-        run: uv run --isolated --no-project --with dist/*.tar.gz python -c "import Deeploy"
-
-      - name: Publish to Test PyPI
-        run: uv publish --publish-url https://test.pypi.org/legacy/

From 6936b7b588bca3d570cb14ad15100e502fc0758d Mon Sep 17 00:00:00 2001
From: Victor Jung <jungvi@iis.ee.ethz.ch>
Date: Fri, 6 Feb 2026 10:22:53 +0100
Subject: [PATCH 07/10] Remove NVIDIA pyindex references

---
 .github/workflows/ci-lint.yml                      | 2 +-
 .github/workflows/infra-generate-documentation.yml | 2 +-
 docs/tutorials/introduction.md                     | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/.github/workflows/ci-lint.yml b/.github/workflows/ci-lint.yml
index 75163aafaf..5493758672 100644
--- a/.github/workflows/ci-lint.yml
+++ b/.github/workflows/ci-lint.yml
@@ -42,7 +42,7 @@ jobs:
       - name: Build Deeploy
         shell: bash
         run: |
-          pip install . --extra-index-url=https://pypi.ngc.nvidia.com
+          pip install .
           pip install -r requirements-dev.txt
       - name: Format Python
         shell: bash
diff --git a/.github/workflows/infra-generate-documentation.yml b/.github/workflows/infra-generate-documentation.yml
index 8b0ae2b5ce..de42648f63 100644
--- a/.github/workflows/infra-generate-documentation.yml
+++ b/.github/workflows/infra-generate-documentation.yml
@@ -25,7 +25,7 @@ jobs:
       - uses: actions/setup-python@v5
       - name: Install dependencies
         run: |
-          pip install . --extra-index-url=https://pypi.ngc.nvidia.com
+          pip install .
           pip install -r requirements-dev.txt
       - name: Sphinx build
         run: |
diff --git a/docs/tutorials/introduction.md b/docs/tutorials/introduction.md
index 8d7bf69393..490ba2c6aa 100644
--- a/docs/tutorials/introduction.md
+++ b/docs/tutorials/introduction.md
@@ -31,7 +31,7 @@ docker run -it --name deeploy_main -v $(pwd):/app/Deeploy ghcr.io/pulp-platform/
 Install Deeploy inside the container:
 ```
 cd Deeploy
-pip install -e . --extra-index-url=https://pypi.ngc.nvidia.com
+pip install -e .
 ```
 
 From the `DeeployTest` folder, you can use the `deeployRunner` to compile ONNXs and execute the output code using the appropriate simulators.

From 77d15aab1bb1663214c56550a1c7a3af474d9d8d Mon Sep 17 00:00:00 2001
From: Victor Jung <jungvi@iis.ee.ethz.ch>
Date: Fri, 6 Feb 2026 10:40:34 +0100
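Subject: [PATCH 08/10] Add dual tag deeploy docker publishing

Publish the merged Deeploy image manifest under an additional tag when a
git tag points at HEAD and `IS_TAG` is not `tag`. The checkout now uses
`fetch-depth: 0`, and the manifest tag list is assembled in a separate
"Prepare manifest tags" step instead of being written inline.
---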
 .github/workflows/docker-build-deeploy.yml | 35 ++++++++++++++++++++--
 1 file changed, 32 insertions(+), 3 deletions(-)

diff --git a/.github/workflows/docker-build-deeploy.yml b/.github/workflows/docker-build-deeploy.yml
index 9edb90f103..2755c12954 100644
--- a/.github/workflows/docker-build-deeploy.yml
+++ b/.github/workflows/docker-build-deeploy.yml
@@ -19,8 +19,11 @@ jobs:
     runs-on: ubuntu-latest
     outputs:
       docker_tag: ${{ steps.generate_tag.outputs.docker_tag }}
+      docker_release_tag: ${{ steps.detect_release_tag.outputs.release_tag }}
     steps:
       - uses: actions/checkout@v4
+        with:
+          fetch-depth: 0
 
       - name: Set up environment variables
         run: |
@@ -37,6 +40,16 @@ jobs:
             echo "docker_tag=${{ env.BRANCH_NAME }}" >> $GITHUB_OUTPUT
           fi
 
+      - name: Detect release tag on HEAD
+        id: detect_release_tag
+        run: |
+          TAG_ON_HEAD=$(git tag --points-at HEAD | head -n 1 || true)
+          if [[ -n "$TAG_ON_HEAD" && "$IS_TAG" != "tag" ]]; then
+            echo "release_tag=$TAG_ON_HEAD" >> $GITHUB_OUTPUT
+          else
+            echo "release_tag=" >> $GITHUB_OUTPUT
+          fi
+
   build-deeploy:
     name: Build Deploy Image
     needs: [prepare]
@@ -132,13 +145,29 @@ jobs:
         env:
           OWNER: "${{ github.repository_owner }}"
 
+      - name: Prepare manifest tags
+        id: manifest_tags
+        run: |
+          OWNER_LC_VALUE=${OWNER,,}
+          {
+            echo "tags<<'EOF'"
+            echo "ghcr.io/${OWNER_LC_VALUE}/deeploy:latest,"
+            echo "ghcr.io/${OWNER_LC_VALUE}/deeploy:${DOCKER_TAG},"
+            if [[ -n "${RELEASE_TAG}" ]]; then
+              echo "ghcr.io/${OWNER_LC_VALUE}/deeploy:${RELEASE_TAG},"
+            fi
+            echo "EOF"
+          } >> $GITHUB_OUTPUT
+        env:
+          OWNER: ${{ github.repository_owner }}
+          DOCKER_TAG: ${{ needs.prepare.outputs.docker_tag }}
+          RELEASE_TAG: ${{ needs.prepare.outputs.docker_release_tag }}
+
       - name: Merge Deeploy Images
         uses: Noelware/docker-manifest-action@v1
         with:
           inputs: |
             ghcr.io/${{ env.OWNER_LC }}/deeploy@${{ needs.build-deeploy.outputs.digest-amd64 }},
             ghcr.io/${{ env.OWNER_LC }}/deeploy@${{ needs.build-deeploy.outputs.digest-arm64 }}
-          tags: |
-            ghcr.io/${{ env.OWNER_LC }}/deeploy:latest,
-            ghcr.io/${{ env.OWNER_LC }}/deeploy:${{ needs.prepare.outputs.docker_tag }}
+          tags: ${{ steps.manifest_tags.outputs.tags }}
           push: true

From a56d4679b4cc416d6e26a4e6433a4dbdf3b4c0af Mon Sep 17 00:00:00 2001
From: Victor Jung <jungvi@iis.ee.ethz.ch>
Date: Fri, 6 Feb 2026 11:28:48 +0100
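Subject: [PATCH 09/10] Fix prepare manifest tags

Write the `tags` output with an unquoted `EOF` delimiter and stop
emitting a trailing comma after the last tag in the list.
---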
 .github/workflows/docker-build-deeploy.yml | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/.github/workflows/docker-build-deeploy.yml b/.github/workflows/docker-build-deeploy.yml
index 2755c12954..a4d0221e1f 100644
--- a/.github/workflows/docker-build-deeploy.yml
+++ b/.github/workflows/docker-build-deeploy.yml
@@ -150,11 +150,13 @@ jobs:
         run: |
           OWNER_LC_VALUE=${OWNER,,}
           {
-            echo "tags<<'EOF'"
+            echo "tags<<EOF"
             echo "ghcr.io/${OWNER_LC_VALUE}/deeploy:latest,"
-            echo "ghcr.io/${OWNER_LC_VALUE}/deeploy:${DOCKER_TAG},"
             if [[ -n "${RELEASE_TAG}" ]]; then
-              echo "ghcr.io/${OWNER_LC_VALUE}/deeploy:${RELEASE_TAG},"
+              echo "ghcr.io/${OWNER_LC_VALUE}/deeploy:${DOCKER_TAG},"
+              echo "ghcr.io/${OWNER_LC_VALUE}/deeploy:${RELEASE_TAG}"
+            else
+              echo "ghcr.io/${OWNER_LC_VALUE}/deeploy:${DOCKER_TAG}"
             fi
             echo "EOF"
           } >> $GITHUB_OUTPUT

From 44956374207557ca41b68d33955efd7571eeee71 Mon Sep 17 00:00:00 2001
From: Victor Jung <jungvi@iis.ee.ethz.ch>
Date: Fri, 6 Feb 2026 12:00:09 +0100
Subject: [PATCH 10/10] Format

Remove the trailing blank line at the end of
`.github/workflows/package-publish.yml`.
---
 .github/workflows/package-publish.yml | 1 -
 1 file changed, 1 deletion(-)

diff --git a/.github/workflows/package-publish.yml b/.github/workflows/package-publish.yml
index 1584e89f42..2bce85336b 100644
--- a/.github/workflows/package-publish.yml
+++ b/.github/workflows/package-publish.yml
@@ -85,4 +85,3 @@ jobs:
 
       - name: Publish to Test PyPI
         run: uv publish dist/* --publish-url https://test.pypi.org/legacy/
-