
Commit 18425fe

Fixed the comments. Added more tests
1 parent a67d4a6 commit 18425fe

5 files changed: +268 -21 lines changed


examples/dynamo/low_cpu_memory_compilation.py

Lines changed: 23 additions & 4 deletions
@@ -86,25 +86,44 @@ def forward(self, x):
 
 """
 You should be able to see two back-to-back TensorRT engines in the graph
+
 Graph Structure:
 
 Inputs: List[Tensor: (1, 1024, 224, 224)@float32]
 ...
-TRT Engine #1 - Submodule name: _run_on_acc_0
+TRT Engine #1 - Submodule name: _run_on_acc_0_resource_split_0
 Engine Inputs: List[Tensor: (1, 1024, 224, 224)@float32]
 Number of Operators in Engine: 9
 Engine Outputs: List[Tensor: (1, 1024, 112, 112)@float32]
 ...
-TRT Engine #2 - Submodule name: _run_on_acc_1
+TRT Engine #2 - Submodule name: _run_on_acc_0_resource_split_1
 Engine Inputs: List[Tensor: (1, 1024, 112, 112)@float32]
 Number of Operators in Engine: 3
 Engine Outputs: List[Tensor: (1, 10)@float32]
 ...
 Outputs: List[Tensor: (1, 10)@float32]
 
+------------------------- Aggregate Stats -------------------------
+
+Average Number of Operators per TRT Engine: 6.0
+Most Operators in a TRT Engine: 9
 
+********** Recommendations **********
+
+- For minimal graph segmentation, select min_block_size=9 which would generate 1 TRT engine(s)
+- For moderate graph segmentation, select min_block_size=6 which would generate 1 TRT engine(s)
+- The current level of graph segmentation is equivalent to selecting min_block_size=3 which generates 2 TRT engine(s)
 GraphModule(
-  (_run_on_acc_0): TorchTensorRTModule()
-  (_run_on_acc_1): TorchTensorRTModule()
+  (_run_on_acc_0_resource_split_0): TorchTensorRTModule()
+  (_run_on_acc_0_resource_split_1): TorchTensorRTModule()
+)
+
+
+
+def forward(self, x):
+    x, = fx_pytree.tree_flatten_spec(([x], {}), self._in_spec)
+    _run_on_acc_0_resource_split_0 = self._run_on_acc_0_resource_split_0(x); x = None
+    _run_on_acc_0_resource_split_1 = self._run_on_acc_0_resource_split_1(_run_on_acc_0_resource_split_0); _run_on_acc_0_resource_split_0 = None
+    return pytree.tree_unflatten((_run_on_acc_0_resource_split_1,), self._out_spec)
 )
 """

py/torch_tensorrt/dynamo/_compiler.py

Lines changed: 0 additions & 4 deletions
@@ -622,10 +622,6 @@ def compile(
             "'arg_inputs' and 'inputs' should not be used at the same time."
         )
 
-    assert (
-        cpu_memory_budget >= 2 * 1024 * 1024 * 1024
-    ), "CPU memory budget must be greater than 10GB"
-
     arg_inputs = inputs or arg_inputs
 
     if kwarg_inputs is None:

py/torch_tensorrt/dynamo/partitioning/_atomic_subgraphs.py

Lines changed: 3 additions & 2 deletions
@@ -1,3 +1,4 @@
+from collections import defaultdict
 from functools import lru_cache
 from typing import Any, Callable, Dict, List, Set, Tuple
 
@@ -135,7 +136,7 @@ def get_node_in_fusion_pattern(
         Key: node that appears in the fusion pattern
         Value: the list of nodes that should be fused together
     """
-    fusion_nodes = {}
+    fusion_nodes = defaultdict(set)
     for compiled_pattern_graph in get_compiled_atomic_subgraphs():
         subgraph_matcher = SubgraphMatcher(compiled_pattern_graph.graph)
         match_result = subgraph_matcher.match(graph)
@@ -149,7 +150,7 @@ def get_node_in_fusion_pattern(
                 and node not in match.placeholder_nodes
             }
             for node in fusion_group:
-                fusion_nodes[node] = fusion_group
+                fusion_nodes[node].update(fusion_group)
 
     return fusion_nodes
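
A standalone sketch of why the switch to defaultdict(set) with update matters: when a node participates in two overlapping fusion matches, plain dict assignment kept only the last group seen, while sets merge the groups. Node names here are placeholder strings, not the repository's FX nodes.

from collections import defaultdict

fusion_nodes = defaultdict(set)
group_a = {"conv", "bn"}
group_b = {"bn", "relu"}  # overlaps with group_a on "bn"

# Record each node's fusion group, merging instead of overwriting on overlap.
for node in group_a:
    fusion_nodes[node].update(group_a)
for node in group_b:
    fusion_nodes[node].update(group_b)

print(fusion_nodes["bn"])  # {'conv', 'bn', 'relu'} -- merged, not overwritten (order may vary)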

py/torch_tensorrt/dynamo/partitioning/_resource_partitioner.py

Lines changed: 2 additions & 9 deletions
@@ -46,7 +46,7 @@
 """
 
 import logging
-from typing import Dict, List, Tuple
+from typing import Dict, List, Set, Tuple
 
 import psutil
 import torch
@@ -92,7 +92,7 @@ def __init__(
 
         self._node_submodule_map: Dict[str, str] = {}
         self._return_tuple = False
-        self.fusion_patterns: Dict[torch.fx.Node, List[torch.fx.Node]] = {}
+        self.fusion_patterns: Dict[torch.fx.Node, Set[torch.fx.Node]] = {}
 
     def partition_graph(self) -> torch.fx.GraphModule:
         """Build the final partitioned `GraphModule` honoring memory constraints.
@@ -214,7 +214,6 @@ def break_subgraphs(
         # We throw an error if the remaining memory is almost empty compared to the model size.
         # i.e. if the remaining memory is 4G (budget is 1G) the model size is greater than 40G, we stop the compilation.
         sizes = self.size_of_subgraphs(subgraphs)
-        # subgraph_size_budget = 500*1024*1024
         if sum(sizes) > subgraph_size_budget * 40:
             raise ValueError(
                 "CPU memory budget or available memory is too small to compile the model. "
@@ -470,12 +469,6 @@ def validate_and_correct_subgraphs(
                 visited_nodes[subgraph.nodes[-1]] = i + 1
                 continue
 
-            elif not subgraph.is_acc:
-                # non-accelerated subgraphs should be put in the next subgraph
-                for node in subgraph.nodes:
-                    visited_nodes[subgraph.nodes[-1]] = i + 1
-                continue
-
             else:
                 to_remove_nodes = []
                 for j, node in enumerate(subgraph.nodes):
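
The context lines in the break_subgraphs hunk already show the safeguard; below is a standalone sketch of the same 40x check with simplified names. How subgraph_size_budget is derived elsewhere in the file (e.g. from psutil and the cpu_memory_budget) is not shown in this hunk and is assumed.

from typing import List


def check_model_fits(subgraph_sizes: List[int], subgraph_size_budget: int) -> None:
    # Abort when the total model size dwarfs the per-subgraph budget (40x),
    # since splitting subgraphs alone cannot keep peak CPU memory in reach.
    if sum(subgraph_sizes) > subgraph_size_budget * 40:
        raise ValueError(
            "CPU memory budget or available memory is too small to compile the model."
        )


# 768 MiB of subgraphs against a 1 GiB budget stays well under the 40x limit.
check_model_fits([512 * 1024**2, 256 * 1024**2], subgraph_size_budget=1 * 1024**3)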
