docsrc/user_guide/runtime.rst (+38)
@@ -92,3 +92,41 @@ Cudagraphs can accelerate certain models by reducing kernel overheads, as documented
In the current implementation, use of a new input shape (for instance in dynamic shape
cases) will cause the cudagraph to be re-recorded. Cudagraph recording is generally
not latency-intensive, and future improvements include caching cudagraphs for multiple input shapes.
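For reference, cudagraph mode is toggled through a runtime context manager. A minimal sketch, assuming an already-compiled ``trt_module`` and sample ``inputs`` (both illustrative names) and the module-wrapping form of ``torch_tensorrt.runtime.enable_cudagraphs``:

.. code-block:: python

    import torch
    import torch_tensorrt

    # Within the context, calls run through cudagraphs; the first call with a
    # new input shape records a graph, and same-shape calls replay it.
    with torch_tensorrt.runtime.enable_cudagraphs(trt_module) as cudagraphs_module:
        outputs = cudagraphs_module(*inputs)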
Dynamic Output Allocation Mode
------------------------------
Dynamic output allocation is a feature in Torch-TensorRT which allows the output buffer of TensorRT engines to be
dynamically allocated. This is useful for models with dynamic output shapes, especially ops with data-dependent shapes.
Without dynamic output allocation, the output buffer is statically allocated at the maximum size the op could possibly
require, which can lead to inefficient memory usage when the actual output is smaller than that maximum.
There are two scenarios in which dynamic output allocation is enabled:
1. When the model contains submodules that require a dynamic output allocator at runtime, users don't have to manually enable dynamic output allocation mode.

   To specify that a converter requires a dynamic output allocator, set the ``requires_output_allocator=True`` flag in its ``@dynamo_tensorrt_converter`` decorator, e.g.,
   .. code-block:: python

       @dynamo_tensorrt_converter(
           torch.ops.aten.nonzero.default,
           supports_dynamic_shapes=True,
           requires_output_allocator=True,
       )
       def aten_ops_nonzero(
           ctx: ConversionContext,
           target: Target,
           args: Tuple[Argument, ...],
           kwargs: Dict[str, Argument],
           name: str,
       ) -> Union[TRTTensor, Sequence[TRTTensor]]:
           ...
2. When users manually enable dynamic output allocation via the ``torch_tensorrt.runtime.enable_output_allocator`` context manager.
   .. code-block:: python

       # Enables Dynamic Output Allocation Mode, then resets the mode to its prior setting
       with torch_tensorrt.runtime.enable_output_allocator(trt_module):
           outputs = trt_module(inputs)
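To see the first scenario end to end, here is a hedged sketch: it compiles a tiny model built around ``torch.nonzero`` (a data-dependent-shape op) and runs it with no manual toggle. The module and variable names are illustrative, and it assumes a build in which ``aten.nonzero`` has a TensorRT converter registered with ``requires_output_allocator=True``:

.. code-block:: python

    import torch
    import torch_tensorrt

    class NonZero(torch.nn.Module):
        def forward(self, x):
            # The output shape depends on the values in x, not just its shape
            return torch.nonzero(x)

    model = NonZero().eval().cuda()
    inputs = [torch.randn(8, 8).cuda()]

    # Dynamic output allocation is enabled automatically at runtime because
    # the nonzero converter requires a dynamic output allocator.
    trt_module = torch_tensorrt.compile(model, ir="dynamo", inputs=inputs, min_block_size=1)
    outputs = trt_module(*inputs)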
# Whether the converter requires a dynamic output allocator to run (e.g. data dependent ops)
requires_output_allocator=True,
)

# %%
@@ -98,7 +99,7 @@ def forward(self, x):
#
# Finally there is the ``priority`` argument, which is an enum from the ``torch_tensorrt.dynamo.conversion.ConverterPriority`` class that defines the priority of the converter. The two options are ``HIGH`` and ``STANDARD``.
# Converters registered with ``STANDARD`` will be appended to the converter list for a given operation, while converters registered with ``HIGH`` will be prepended to the list.
- # Candidate converters are evalated for their suitablity in this priority order and the first converter that passes the validator is used.
+ # Candidate converters are evaluated for their suitability in this priority order and the first converter that passes the validator is used.
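As a sketch of how the ``priority`` argument might be used (hypothetical converter name and body; it assumes ``dynamo_tensorrt_converter`` accepts ``priority`` as the text above describes):

.. code-block:: python

    from torch_tensorrt.dynamo.conversion import ConverterPriority

    # Registered with HIGH priority, so this converter is prepended to the
    # converter list and evaluated before any STANDARD-priority converter
    # for aten.relu.
    @dynamo_tensorrt_converter(
        torch.ops.aten.relu.default,
        priority=ConverterPriority.HIGH,
    )
    def my_relu_converter(ctx, target, args, kwargs, name):
        ...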