
Commit eed420a

resolve comments
1 parent 0987146 commit eed420a

8 files changed: +39 -29 lines changed

py/torch_tensorrt/dynamo/_engine_cache.py

+1 -1

@@ -118,7 +118,7 @@ def pack(
 input_specs (Sequence[Input]): input specs of TRT engine
 compilation_settings (CompilationSettings): compilation settings of TRT engine
 weight_name_map (Optional[Dict[Any, Any]]): weight name map for refitting
-requires_output_allocator (bool): whether the engine requires output allocator
+requires_output_allocator (bool): Boolean flag indicating if the converter creates operators which require an Output Allocator to run (e.g. data dependent operators)
 Returns:
 bytes: packed blob
 """

py/torch_tensorrt/dynamo/conversion/_ConversionContext.py

+1 -1

@@ -11,7 +11,7 @@ class ConversionContext:
 Args:
 net: TensorRT Network being built
 compilation_settings: Settings selected by the user for compilation
-requires_output_allocator: Whether the network requires output allocator
+requires_output_allocator: Boolean flag indicating if the converter creates operators which require an Output Allocator to run (e.g. data dependent operators)
 """

 net: TRTNetwork

py/torch_tensorrt/dynamo/conversion/_ConverterRegistry.py

+12 -6

@@ -80,7 +80,7 @@ class ConverterSupport:
 whether that node can be supported by its companion converter. Note that
 this function must not modify the node or its graph
 supports_dynamic_shapes: Boolean flag indicating if the converter has support for dynamic inputs.
-requires_output_allocator: Boolean flag indicating if the converter requires to run in output allocator.
+requires_output_allocator: Boolean flag indicating if the converter creates operators which require an Output Allocator to run (e.g. data dependent operators).
 """

 converter_implementation: ConverterImplSignature

@@ -215,7 +215,7 @@ def dynamo_tensorrt_converter(
 priority: Converter's level of priority relative to other converters with the
 same target
 supports_dynamic_shapes: Boolean flag indicating if the converter has support for dynamic shapes.
-requires_output_allocator: Boolean flag indicating if the converter requires to run in output allocator.
+requires_output_allocator: Boolean flag indicating if the converter creates operators which require an Output Allocator to run (e.g. data dependent operators).
 Returns:
 The converter being decorated
 """

@@ -410,7 +410,7 @@ def __getitem_without_validation__(
 def __getitem__(
     self, node: Node
 ) -> Tuple[
-    Any, CallingConvention, bool
+    Any, CallingConvention, Dict[str, bool]
 ]:  # TODO: Narrow to ConverterImplSignature this when we can remove FX converters
 """Get the first-found validated converter in any registry

@@ -468,7 +468,10 @@ def __getitem__(
 return (
     candidate.converter_implementation,
     calling_convention,
-    candidate.requires_output_allocator,
+    {
+        "supports_dynamic_shapes": candidate.supports_dynamic_shapes,
+        "requires_output_allocator": candidate.requires_output_allocator,
+    },
 )
 else:
 logger.debug(

@@ -481,7 +484,10 @@ def __getitem__(
 return (
     converters,
     calling_convention,
-    False,
+    {
+        "supports_dynamic_shapes": False,
+        "requires_output_allocator": False,
+    },
 )

 raise KeyError(

@@ -506,7 +512,7 @@ def get_unvalidated(
 def get(
     self, node: Node, value: Optional[ConverterImplSignature] = None
 ) -> Union[
-    Any, Tuple[Any, CallingConvention, bool]
+    Any, Tuple[Any, CallingConvention, Dict[str, bool]]
 ]:  # TODO: Narrow to ConverterImplSignature this when we can remove FX converters
 """Get validated converter for input node with a default return"""
 try:

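Note on the _ConverterRegistry.py change above: the third element of the converter packet is now a dict of capability flags rather than a bare bool, so a lookup reports both supports_dynamic_shapes and requires_output_allocator. A minimal sketch of how this looks from the caller side; the aten.nonzero target, the converter body, and the DYNAMO_CONVERTERS name are illustrative assumptions, not part of this commit:

# Sketch only: assumes dynamo_tensorrt_converter and a DYNAMO_CONVERTERS registry
# instance are exported by _ConverterRegistry.py; the target and impl are illustrative.
import torch
from torch.fx.node import Node
from torch_tensorrt.dynamo.conversion._ConverterRegistry import (
    DYNAMO_CONVERTERS,
    dynamo_tensorrt_converter,
)


@dynamo_tensorrt_converter(
    torch.ops.aten.nonzero.default,
    supports_dynamic_shapes=True,
    requires_output_allocator=True,  # output shape is data dependent
)
def example_nonzero_converter(ctx, target, args, kwargs, name):
    ...  # build the TRT layer(s) and return the output tensor(s)


def describe_converter(node: Node) -> None:
    # The registry lookup now returns (impl, calling_convention, capability dict).
    converter, calling_convention, converter_info = DYNAMO_CONVERTERS[node]
    print(converter_info["supports_dynamic_shapes"])
    print(converter_info["requires_output_allocator"])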
py/torch_tensorrt/dynamo/conversion/_TRTInterpreter.py

+4 -4

@@ -835,7 +835,7 @@ def call_module(
 f"Conversion of module of type {submod_type} not currently supported!"
 )

-converter, calling_convention, requires_output_allocator = converter_packet
+converter, calling_convention, _ = converter_packet

 assert self._cur_node_name is not None

@@ -852,8 +852,8 @@ def call_function(self, target: str, args: Any, kwargs: Any) -> Any:
 f"Conversion of function {torch.typename(target)} not currently supported!"
 )

-converter, calling_convention, requires_output_allocator = converter_packet
-if requires_output_allocator:
+converter, calling_convention, converter_info = converter_packet
+if converter_info.get("requires_output_allocator", False):
     self.ctx.requires_output_allocator = True
     _LOGGER.debug(f"{target} requires output allocator")

@@ -885,7 +885,7 @@ def call_method(self, target: str, args: Any, kwargs: Any) -> Any:
 raise UnsupportedOperatorException(
 f"Conversion of method {target} not currently supported!"
 )
-converter, calling_convention, requires_output_allocator = converter_packet
+converter, calling_convention, _ = converter_packet

 if calling_convention is CallingConvention.LEGACY:
 return converter(self.ctx.net, target, args, kwargs, self._cur_node_name)

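Note on the _TRTInterpreter.py change above: only call_function inspects the new capability dict, while call_module and call_method discard it, and the flag is read with dict.get so a missing key falls back to False (the registry's fallback packet also carries both keys set to False). A tiny self-contained sketch of that unpacking pattern, with stub values:

# Illustrative only: mirrors how the interpreter unpacks the new 3-tuple packet.
converter_packet = (
    lambda ctx, target, args, kwargs, name: None,  # converter implementation (stub)
    "CTX",                                         # calling convention (stub)
    {"supports_dynamic_shapes": True, "requires_output_allocator": True},
)

converter, calling_convention, converter_info = converter_packet
if converter_info.get("requires_output_allocator", False):
    # In the real interpreter this sets self.ctx.requires_output_allocator = True.
    print("This op will be executed through an Output Allocator at runtime")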
py/torch_tensorrt/dynamo/lowering/passes/remove_num_users_is_0_nodes.py

+3 -4

@@ -13,16 +13,15 @@ def remove_num_users_is_0_nodes(
 gm: torch.fx.GraphModule, settings: CompilationSettings
 ) -> torch.fx.GraphModule:
     """Remove ops that [num_users=0] in the graph"""
-    output_node = list(gm.graph.nodes)[-1]
+    nodes = list(gm.graph.nodes)
+    output_node = nodes[-1]

-    for node in gm.graph.nodes:
+    for node in nodes[::-1]:
         if (
             node != output_node
             and len(node.users) == 0
             and len(node.all_input_nodes) > 0
         ):
-            node_input = node.all_input_nodes[0]
-            node.replace_all_uses_with(node_input)
             gm.graph.erase_node(node)
     gm = clean_up_graph_after_modifications(gm)

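Note on the remove_num_users_is_0_nodes.py change above: erase_node refuses to delete a node that still has users, and a dead node keeps its own inputs alive, so walking the node list in reverse lets a whole dead chain disappear in one pass; the replace_all_uses_with call was unnecessary because a node with zero users has no uses to redirect. A standalone sketch under those assumptions (the toy module is made up, and the real pass additionally runs clean_up_graph_after_modifications):

# Standalone sketch of the revised pass behaviour on a toy graph.
import torch
import torch.fx


class M(torch.nn.Module):
    def forward(self, x):
        dead = torch.relu(x)     # never used by the output
        dead2 = torch.sin(dead)  # also dead, and keeps `dead` alive via its input
        return x + 1


gm = torch.fx.symbolic_trace(M())
nodes = list(gm.graph.nodes)
output_node = nodes[-1]

# Reverse order erases `dead2` first, which drops `dead` to zero users so it can
# be erased in the same pass; erase_node would raise if any users remained.
for node in nodes[::-1]:
    if (
        node != output_node
        and len(node.users) == 0
        and len(node.all_input_nodes) > 0
    ):
        gm.graph.erase_node(node)

gm.graph.lint()
gm.recompile()
print(gm.code)  # only `x + 1` and the output remain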
py/torch_tensorrt/dynamo/runtime/_PythonTorchTensorRTModule.py

+1 -1

@@ -141,7 +141,7 @@ def __init__(
 name (str): Name for module
 settings (torch_tensorrt.dynamo.CompilationSettings): Settings used to compile engine, assumes engine was built with default compilation settings if object not passed
 weight_name_map (dict): Mapping of engine weight name to state_dict weight name
-requires_output_allocator (bool): Whether the engine requires an output allocator
+requires_output_allocator (bool): Boolean flag indicating if the converter creates operators which require an Output Allocator to run (e.g. data dependent operators)

 Example:

py/torch_tensorrt/dynamo/runtime/_TorchTensorRTModule.py

+1 -1

@@ -98,7 +98,7 @@ def __init__(
 name (str): Name for module
 settings (torch_tensorrt.dynamo.CompilationSettings): Settings used to compile engine, assumes engine was built with default compilation settings if object not passed
 weight_name_map (dict): Mapping of engine weight name to state_dict weight name
-requires_output_allocator (bool): Whether the engine requires an output allocator
+requires_output_allocator (bool): Boolean flag indicating if the converter creates operators which require an Output Allocator to run (e.g. data dependent operators)

 Example:

py/torch_tensorrt/runtime/_cudagraphs.py

+16 -11

@@ -87,17 +87,22 @@ def __enter__(self) -> torch.nn.Module:
 elif "_run_on_gpu" in name:
     num_torch_module += 1

-if num_torch_module > 0 and not disable_cudagraphs:
-    # Set whole cudagraphs mode and returns wrapped module
-    _PY_RT_CUDAGRAPHS = CudaGraphsMode.WHOLE_GRAPH_CUDAGRAPHS
-    # Set new mode for C++
-    if torch_tensorrt.ENABLED_FEATURES.torch_tensorrt_runtime:
-        torch.ops.tensorrt.set_cudagraphs_mode(_PY_RT_CUDAGRAPHS)
-
-    logger.debug(
-        "Found pytorch subgraphs in module, wrapping module in CudaGraphsTorchTensorRTModule"
-    )
-    return CudaGraphsTorchTensorRTModule(self.compiled_module)
+if num_torch_module > 0:
+    if disable_cudagraphs:
+        raise RuntimeError(
+            "There are converters that require Output Allocator. Please disable CUDA Graphs."
+        )
+    else:
+        # Set whole cudagraphs mode and returns wrapped module
+        _PY_RT_CUDAGRAPHS = CudaGraphsMode.WHOLE_GRAPH_CUDAGRAPHS
+        # Set new mode for C++
+        if torch_tensorrt.ENABLED_FEATURES.torch_tensorrt_runtime:
+            torch.ops.tensorrt.set_cudagraphs_mode(_PY_RT_CUDAGRAPHS)
+
+        logger.debug(
+            "Found pytorch subgraphs in module, wrapping module in CudaGraphsTorchTensorRTModule"
+        )
+        return CudaGraphsTorchTensorRTModule(self.compiled_module)
 else:
     if num_trt_module > 0:
         logger.debug("No graph breaks detected, using runtime cudagraphs mode")

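Note on the _cudagraphs.py change above: when the compiled module contains Torch submodules (graph breaks) and disable_cudagraphs is set (presumably earlier in __enter__, when Output Allocator converters are detected), entering the CUDA Graphs context now fails loudly instead of silently wrapping the module. A hedged usage sketch; the toy ReLU model will not actually trip the guard, it only shows where the new RuntimeError would surface, and it assumes torch_tensorrt.runtime.enable_cudagraphs is the context manager whose __enter__ is patched here:

# Usage sketch under the assumptions stated above.
import torch
import torch_tensorrt

model = torch.nn.ReLU().eval().cuda()
inputs = [torch.randn(2, 3).cuda()]
trt_module = torch_tensorrt.compile(model, ir="dynamo", inputs=inputs)

try:
    with torch_tensorrt.runtime.enable_cudagraphs(trt_module) as cudagraphs_module:
        out = cudagraphs_module(*inputs)
except RuntimeError as err:
    # Raised by the new branch when Output Allocator converters make CUDA Graphs unusable.
    print(f"CUDA Graphs unavailable for this module: {err}")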