[refactoring/test] Refactor some code and add test cases

bowang007 · bowang007 · commit 538137521c6d · 2025-02-20T03:56:27.000Z
This commit refactors some code, which fixes some bugs we had previously.
Test cases are added.
diff --git a/py/torch_tensorrt/dynamo/conversion/plugins/_generate_plugin.py b/py/torch_tensorrt/dynamo/conversion/plugins/_generate_plugin.py
@@ -33,11 +33,8 @@ def generate_plugin(plugin_name: str):
     # helper function that generates the required signature based on the torch operation
     def generate_signature(torch_op):
         schema = torch_op._schemas[""]
-        tensor_args = []
-        arg_list = []
 
-        args = []
-        kwargs = []
+        arg_list = []
 
         register_func_annotation = {}
         impl_func_annotation = {}
@@ -56,7 +53,6 @@ def generate_signature(torch_op):
             # - torch._C.ClassType
 
             if arg.type.isSubtypeOf(torch._C.TensorType.get()):
-                tensor_args.append(arg)
                 register_func_annotation[arg.name] = trtp.TensorDesc
                 impl_func_annotation[arg.name] = trtp.Tensor
             elif arg.type.isSubtypeOf(torch._C.FloatType.get()):
@@ -74,40 +70,32 @@ def generate_signature(torch_op):
             else:
                 raise ValueError("arg type is not handled")
 
-            if arg.default_value is None:
-                args.append(arg.name)
-            else:
-                kwargs.append(f"{arg.name} = {arg.default_value}")
-
         input_signature = ", ".join(arg_list)
+
         plugin_signature = f"def add_plugin_desc({input_signature}):"
-        args_input = ", ".join(args)
-        kwargs_input = ", ".join(kwargs)
 
         plugin_impl_arg_list = arg_list
         plugin_impl_arg_list.append("outputs")
         plugin_impl_arg_list.append("stream")
         plugin_impl_input = ", ".join(plugin_impl_arg_list)
-        plugin_impl_signagture = f"def add_plugin_impl({plugin_impl_input}):"
+        plugin_impl_signature = f"def add_plugin_impl({plugin_impl_input}):"
 
         register_func_annotation["return"] = Tuple[trtp.TensorDesc]
 
         impl_func_annotation["outputs"] = Tuple[trtp.Tensor]
         impl_func_annotation["stream"] = int
 
         return (
-            args_input,
-            kwargs_input,
+            input_signature,
             plugin_signature,
-            plugin_impl_signagture,
+            plugin_impl_signature,
             register_func_annotation,
             impl_func_annotation,
         )
 
     # Use the helper function to get the required signatures
     (
-        args_input,
-        kwargs_input,
+        input_signature,
         plugin_signature,
         plugin_impl_signature,
         register_func_annotation,
@@ -118,8 +106,11 @@ def _generic_plugin_desc(*args, **kwargs) -> Tuple[trtp.TensorDesc]:
         shape_env = ShapeEnv()
         fake_mode = FakeTensorMode(shape_env=shape_env)
         syms_args = []
-        for arg in args:
-            sample = {f"{i}": 5 for i in range(arg.ndim)}
+        tensor_args = [elem for elem in args if isinstance(elem, trtp.TensorDesc)]
+
+        for tensor_arg in tensor_args:
+
+            sample = {f"{i}": 5 for i in range(tensor_arg.ndim)}
             syms_arg = [
                 mksym(shape_env, v, LocalSource(k), DimDynamic.DYNAMIC)
                 for k, v in sample.items()
@@ -142,16 +133,16 @@ def _generic_plugin_desc(*args, **kwargs) -> Tuple[trtp.TensorDesc]:
                 tuple(input_node_expr), output.shape[i].node.expr, "math"
             )
 
-        out_desc = args[0].like()
+        out_desc = tensor_args[0].like()
         for i in range(out_desc.ndim):
-            input_shape_expr = [arg.shape_expr[i] for arg in args]
+            input_shape_expr = [tensor_arg.shape_expr[i] for tensor_arg in tensor_args]
             out_desc.shape_expr[i] = shape_calc_fns[i](*input_shape_expr)
 
         return (out_desc,)
 
     codegen_plugin = f"""
 {plugin_signature}
-    return _generic_plugin_desc({args_input}, {kwargs_input})
+    return _generic_plugin_desc({input_signature})
     """
 
     _LOGGER.warning(f"Plugin registration function: \n{codegen_plugin}")
@@ -160,26 +151,35 @@ def _generic_plugin_desc(*args, **kwargs) -> Tuple[trtp.TensorDesc]:
 
     globals()["_generic_plugin_desc"] = _generic_plugin_desc
 
-    plugin = FunctionType(plugin_code.co_consts[0], globals(), "plugin")
+    plugin = FunctionType(
+        plugin_code.co_consts[0],
+        globals(),
+        "plugin",
+    )
 
     # Function annotation is required for dynamic function to work in TensorRT.Plugin
     plugin.__annotations__ = register_func_annotation
 
     trtp.register(plugin_name)(plugin)
 
     def _generic_plugin_impl(outputs, stream, *args, **kwargs):
-        in_tensors = [torch.as_tensor(i, device="cuda") for i in args]
+        tensor_args = [elem for elem in args if isinstance(elem, trtp.Tensor)]
+        print(args)
+        non_tensor_args = [elem for elem in args if not isinstance(elem, trtp.Tensor)]
+        in_tensors = [torch.as_tensor(i, device="cuda") for i in tensor_args]
 
         dest_tensors = [torch.as_tensor(o, device="cuda") for o in outputs]
 
         stream = torch.cuda.ExternalStream(stream)
         with torch.cuda.stream(stream):
-            out_tensors = torch_op(*in_tensors, **kwargs)
+            out_tensors = torch_op(*in_tensors, *non_tensor_args, **kwargs)
+            if isinstance(out_tensors, torch.Tensor):
+                out_tensors = (out_tensors,)
             [d.copy_(o) for (d, o) in zip(dest_tensors, out_tensors)]
 
     plugin_impl_func = f"""
 {plugin_impl_signature}
-    _generic_plugin_impl(outputs, stream, {args_input}, {kwargs_input})
+    _generic_plugin_impl(outputs, stream, {input_signature})
     """
 
     _LOGGER.warning(f"Plugin implementation function: \n{plugin_impl_func}")
@@ -193,5 +193,3 @@ def _generic_plugin_impl(outputs, stream, *args, **kwargs):
     plugin_impl.__annotations__ = impl_func_annotation
 
     trtp.impl(plugin_name)(plugin_impl)
-
-    return plugin
diff --git a/py/torch_tensorrt/dynamo/conversion/plugins/_generate_plugin_converter.py b/py/torch_tensorrt/dynamo/conversion/plugins/_generate_plugin_converter.py
@@ -2,6 +2,7 @@
 from typing import Callable, Dict, Optional, Sequence, Tuple, Union
 
 import numpy as np
+import tensorrt as trt
 
 # Seems like a bug in TensorRT
 import tensorrt_bindings.plugin as trtp
@@ -18,8 +19,6 @@
 )
 from torch_tensorrt.dynamo.conversion.converter_utils import get_trt_tensor
 
-import tensorrt as trt
-
 _LOGGER: logging.Logger = logging.getLogger(__name__)
 
 
@@ -58,13 +57,25 @@ def custom_kernel_converter(
         # Assuming TensorRT preserves kwargs order like PyTorch does
         non_tensor_inputs = plugin.input_attrs
 
+        kwargs = {}
+
+        for arg in torch_schema.arguments:
+            if arg.default_value is not None:
+                kwargs[arg.name] = arg.default_value
+
         non_tensor_args = args[len(tensor_inputs) :]
         non_tensor_kwargs = dict(zip(list(non_tensor_inputs.keys()), non_tensor_args))
-        for k, v in non_tensor_kwargs.items():
+
+        for k, v in kwargs.items():
+            if k in non_tensor_kwargs:
+                kwargs[k] = non_tensor_kwargs[k]
+
+        for k, v in kwargs.items():
             if isinstance(v, torch.fx.immutable_collections.immutable_list):
-                non_tensor_kwargs[k] = np.array(v)
+                kwargs[k] = np.array(v)
+
 
-        layer = ctx.net.add_plugin(plugin(*itensor_args, **non_tensor_kwargs))
+        layer = ctx.net.add_plugin(plugin(*itensor_args, **kwargs))
         assert layer, f"{namespace}::{name} plugin layer was not able to be created"
         _LOGGER.debug(
             f"Adding generated plugin for {namespace}::{name} to tensorrt network"
@@ -91,7 +102,7 @@ def generate_plugin_converter(
     supports_dynamic_shapes: bool = False,
 ) -> DynamoConverterImplSignature:
     plugin_ns, plugin_name = plugin_id.split("::")
-    return _generate_plugin_converter(
+        return _generate_plugin_converter(
         plugin_ns,
         plugin_name,
         capability_validator=capability_validator,
diff --git a/tests/py/dynamo/conversion/test_automatic_plugin.py b/tests/py/dynamo/conversion/test_automatic_plugin.py
@@ -0,0 +1,91 @@
+from typing import Tuple
+
+import torch
+import torch.nn as nn
+import torch_tensorrt
+import triton
+import triton.language as tl
+from parameterized import parameterized
+from torch.testing._internal.common_utils import run_tests
+
+from .harness import DispatchTestCase
+
+
+@triton.jit
+def elementwise_mul_kernel(X, Y, Z, BLOCK_SIZE: tl.constexpr):
+    # Program ID determines the block of data each thread will process
+    pid = tl.program_id(0)
+    # Compute the range of elements that this thread block will work on
+    block_start = pid * BLOCK_SIZE
+    # Range of indices this thread will handle
+    offsets = block_start + tl.arange(0, BLOCK_SIZE)
+    # Load elements from the X and Y tensors
+    x_vals = tl.load(X + offsets)
+    y_vals = tl.load(Y + offsets)
+    # Perform the element-wise multiplication
+    z_vals = x_vals * y_vals
+    # Store the result in Z
+    tl.store(Z + offsets, z_vals)
+
+
+@torch.library.custom_op("torchtrt_ex::elementwise_mul", mutates_args=())  # type: ignore[misc]
+def elementwise_mul(X: torch.Tensor, Y: torch.Tensor) -> torch.Tensor:
+    # Ensure the tensors are on the GPU
+    assert X.is_cuda and Y.is_cuda, "Tensors must be on CUDA device."
+    assert X.shape == Y.shape, "Tensors must have the same shape."
+
+    # Create output tensor
+    Z = torch.empty_like(X)
+
+    # Define block size
+    BLOCK_SIZE = 1024
+
+    # Grid of programs
+    grid = lambda meta: (X.numel() // meta["BLOCK_SIZE"],)
+
+    # Launch the kernel
+    elementwise_mul_kernel[grid](X, Y, Z, BLOCK_SIZE=BLOCK_SIZE)
+
+    return Z
+
+
+@torch.library.register_fake("torchtrt_ex::elementwise_mul")
+def elementwise_mul(x: torch.Tensor, y: torch.Tensor) -> torch.Tensor:
+    return x
+
+
+torch_tensorrt.dynamo.conversion.plugins.custom_op(
+    "torchtrt_ex::elementwise_mul", supports_dynamic_shapes=True
+)
+
+
+class TestAutomaticPlugin(DispatchTestCase):
+    @parameterized.expand(
+        [
+            ((64, 64), torch.float),
+        ]
+    )
+    def test_mul_plugin_float(self, input_shape, dtype):
+        class elementwise_mul(nn.Module):
+            def forward(self, lhs, rhs):
+                return torch.ops.torchtrt_ex.elementwise_mul.default(lhs, rhs)
+
+        inputs = [
+            torch.randint(0, 5, input_shape, device="cuda", dtype=dtype),
+            torch.randint(0, 5, input_shape, device="cuda", dtype=dtype),
+        ]
+
+        self.run_test(elementwise_mul(), inputs)
+
+
+if __name__ == "__main__":
+    run_tests()
+
+# Example Usage
+# A = torch.full((64, 64), 2, device="cuda", dtype=torch.float)
+# B = torch.full((64, 64), 3, device="cuda", dtype=torch.float)
+
+# C = torch.ops.torchtrt_ex.elementwise_mul.default(A, B)
+
+# print("C (Multiplication):", C)
+# Expected: every element of C is 6.0 (2 * 3)
diff --git a/tests/py/dynamo/conversion/test_automatic_plugin_with_attrs.py b/tests/py/dynamo/conversion/test_automatic_plugin_with_attrs.py
@@ -0,0 +1,85 @@
+from typing import Tuple
+
+import torch
+import torch.nn as nn
+import torch_tensorrt
+import triton
+import triton.language as tl
+from parameterized import parameterized
+from torch.testing._internal.common_utils import run_tests
+
+from .harness import DispatchTestCase
+
+
+@triton.jit
+def elementwise_scale_mul_kernel(X, Y, Z, a, b, BLOCK_SIZE: tl.constexpr):
+    pid = tl.program_id(0)
+    # Compute the range of elements that this thread block will work on
+    block_start = pid * BLOCK_SIZE
+    # Range of indices this thread will handle
+    offsets = block_start + tl.arange(0, BLOCK_SIZE)
+    # Load elements from the X and Y tensors
+    x_vals = tl.load(X + offsets)
+    y_vals = tl.load(Y + offsets)
+    # Perform the element-wise multiplication
+    z_vals = x_vals * y_vals * a + b
+    # Store the result in Z
+    tl.store(Z + offsets, z_vals)
+
+
+@torch.library.custom_op("torchtrt_ex::elementwise_scale_mul", mutates_args=())  # type: ignore[misc]
+def elementwise_scale_mul(
+    X: torch.Tensor, Y: torch.Tensor, b: float = 0.2, a: int = 2
+) -> torch.Tensor:
+    # Ensure the tensors are on the GPU
+    assert X.is_cuda and Y.is_cuda, "Tensors must be on CUDA device."
+    assert X.shape == Y.shape, "Tensors must have the same shape."
+
+    # Create output tensor
+    Z = torch.empty_like(X)
+
+    # Define block size
+    BLOCK_SIZE = 1024
+
+    # Grid of programs
+    grid = lambda meta: (X.numel() // meta["BLOCK_SIZE"],)
+
+    # Launch the kernel with parameters a and b
+    elementwise_scale_mul_kernel[grid](X, Y, Z, a, b, BLOCK_SIZE=BLOCK_SIZE)
+
+    return Z
+
+
+@torch.library.register_fake("torchtrt_ex::elementwise_scale_mul")
+def _(x: torch.Tensor, y: torch.Tensor, b: float = 0.2, a: int = 2) -> torch.Tensor:
+    return x
+
+
+torch_tensorrt.dynamo.conversion.plugins.custom_op(
+    "torchtrt_ex::elementwise_scale_mul", supports_dynamic_shapes=True
+)
+
+
+class TestAutomaticPlugin(DispatchTestCase):
+    @parameterized.expand(
+        [
+            ((64, 64), torch.float),
+        ]
+    )
+    def test_scale_mul_plugin_float(self, input_shape, dtype):
+        class elementwise_scale_mul(nn.Module):
+            def forward(self, lhs, rhs):
+                return torch.ops.torchtrt_ex.elementwise_scale_mul.default(
+                    lhs, rhs, b=1, a=0
+                )
+
+        inputs = [
+            torch.randint(0, 5, input_shape, device="cuda", dtype=dtype),
+            torch.randint(0, 5, input_shape, device="cuda", dtype=dtype),
+        ]
+
+        self.run_test(elementwise_scale_mul(), inputs)
+
+
+if __name__ == "__main__":
+    run_tests()