
Commit f0d0042

coconutruben authored and pytorchmergebot committed
[inductor][ck] kBatch filtering with gen_ops (pytorch#148004)
Summary:

# Why

Not all choices of kBatch are valid; invalid values lead to a runtime error when CK checks the validity of the kernel arguments:
https://github.com/ROCm/composable_kernel/blob/c9bcfd755ed4d2102d76a6f545ac6e9a030d7d8e/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3_multi_d.hpp#L1020

# What

- move kBatch inside gen_ops to have more control over it and be able to filter it
- expand the filtering based on the C++ logic
- refactor the padding checks to be more readable

Test Plan:

```
buck2 run -c fbcode.re_gpu_tests=False mode/opt-amd-gpu fbcode//deeplearning/aot_inductor/benchmark/sampling:test_gemm_autotune_benchmark_AMD_block_0
```

- kBatch = 128: some filtering
- kBatch = 1: no filtering
- kBatch = 1738: all options filtered out

Reviewed By: henrylhtsang

Differential Revision: D70211442

Pull Request resolved: pytorch#148004

Approved by: https://github.com/ColinPeppler, https://github.com/tenpercent
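As a rough illustration of the behavior reported in the test plan, the sketch below pairs candidate ops with kBatch values and screens them with the split-K divisibility rule this change adds to `filter_op`. `FakeOp`, its field values, and `K` are made-up stand-ins for illustration, not values from the benchmark or the exact CK conditions.

```python
from collections import namedtuple

# Made-up stand-in for a CK op descriptor; only the field needed here.
FakeOp = namedtuple("FakeOp", ["name", "k_per_block"])


def keeps_op(K: int, op: FakeOp, k_batch: int) -> bool:
    # Without K padding, K must tile exactly into k_per_block blocks and
    # into kBatch * k_per_block chunks, or CK rejects the kernel arguments.
    return K % op.k_per_block == 0 and K % (k_batch * op.k_per_block) == 0


K = 8192  # illustrative static K dimension
ops = [FakeOp("op64", 64), FakeOp("op128", 128), FakeOp("op256", 256)]
for k_batch in (1, 128, 1738):
    kept = [op.name for op in ops if keeps_op(K, op, k_batch)]
    print(k_batch, kept)
# kBatch=1 keeps everything, kBatch=128 filters some ops,
# kBatch=1738 filters every op out.
```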
1 parent ce805a5 commit f0d0042

1 file changed (+134, -18):

torch/_inductor/codegen/rocm/ck_universal_gemm_template.py
```diff
@@ -1,7 +1,9 @@
 # mypy: allow-untyped-defs, disable-error-code="attr-defined, valid-type"
 import copy
 import logging
+import math
 import random
+from collections import namedtuple
 from typing import Optional
 
 import sympy
```
```diff
@@ -22,6 +24,30 @@
 
 log = logging.getLogger(__name__)
 
+# lightweight collection of information about a single op
+InductorROCmOp = namedtuple("InductorROCmOp", ["op", "kBatch"])
+
+padding_lookup = {
+    "M": {
+        "GemmSpecialization::MPadding": True,
+        "GemmSpecialization::MNPadding": True,
+        "GemmSpecialization::MKPadding": True,
+        "GemmSpecialization::MNKPadding": True,
+    },
+    "N": {
+        "GemmSpecialization::NPadding": True,
+        "GemmSpecialization::MNPadding": True,
+        "GemmSpecialization::NKPadding": True,
+        "GemmSpecialization::MNKPadding": True,
+    },
+    "K": {
+        "GemmSpecialization::KPadding": True,
+        "GemmSpecialization::MKPadding": True,
+        "GemmSpecialization::NKPadding": True,
+        "GemmSpecialization::MNKPadding": True,
+    },
+}
+
 
 def is_static_int(number):
     return isinstance(number, (int, sympy.Integer))
```
```diff
@@ -363,7 +389,14 @@ def inline_utils(self):
         )
         return res
 
-    def filter_op(self, op: "CKGemmOperation"):
+    def _has_padding(self, dimension, gemm_specialization):
+        # Get the relevant padding map for the given dimension
+        dimension_padding = padding_lookup.get(dimension, {})
+
+        # Check if the specialization is in the dimension's padding map
+        return dimension_padding.get(gemm_specialization, False)
+
+    def filter_op(self, op_info: InductorROCmOp):
         """
         Determines whether a given op definition is suitable for the current
         input / output of the operation that this template implements.
```
```diff
@@ -372,6 +405,7 @@ def filter_op(self, op: "CKGemmOperation"):
 
         Returns None if the op is not suitable, otherwise returns the op to be used.
         """
+        op, kBatch = op_info.op, op_info.kBatch
         metas = [T.get_layout() for T in [*self.input_nodes, self.output_node]]
         X_meta = metas[0]
         W_meta = metas[1]
```
```diff
@@ -398,26 +432,27 @@ def filter_op(self, op: "CKGemmOperation"):
         N = W_meta.size[-1]
 
         if is_static_int(M):
-            if not any(
-                m_padding in op.gemm_specialization
-                for m_padding in ["MPadding", "MNPadding", "MKPadding", "MNKPadding"]
-            ):
+            if not self._has_padding("M", op.gemm_specialization):
                 if M % op.m_per_block != 0:
                     return None
         if is_static_int(N):
-            if not any(
-                n_padding in op.gemm_specialization
-                for n_padding in ["NPadding", "MNPadding", "NKPadding", "MNKPadding"]
-            ):
+            if not self._has_padding("N", op.gemm_specialization):
                 if N % op.n_per_block != 0:
                     return None
         if is_static_int(K):
-            if not any(
-                k_padding in op.gemm_specialization
-                for k_padding in ["KPadding", "MKPadding", "NKPadding", "MNKPadding"]
-            ):
+            if not self._has_padding("K", op.gemm_specialization):
                 if K % op.k_per_block != 0:
                     return None
+                K_t = kBatch * op.k_per_block
+                if K % K_t != 0:
+                    return None
+            else:
+                # need another kBatch check here
+                lcm = abs(op.a_k1 * op.b_k1) // math.gcd(op.a_k1, op.b_k1)
+                K_t = kBatch * lcm
+                k_read_pad_splited = math.ceil(K / K_t) * lcm
+                if (k_read_pad_splited * (kBatch - 1)) >= K:
+                    return None
 
         a_contig_size = (
             K if op.a_layout == "Row" else M if op.a_layout == "Col" else None
```
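For the padded branch above, a quick worked example shows how an oversized kBatch is rejected because the trailing splits would read nothing but padding; the `a_k1`, `b_k1`, and `K` values below are made up purely to exercise the check.

```python
import math

a_k1, b_k1 = 8, 2  # made-up vector load sizes; lcm(8, 2) = 8
K = 100            # made-up K that needs padding


def padded_splitk_ok(k_batch: int) -> bool:
    lcm = abs(a_k1 * b_k1) // math.gcd(a_k1, b_k1)
    k_read_pad_splited = math.ceil(K / (k_batch * lcm)) * lcm
    # Reject if the first (kBatch - 1) splits already cover all of K,
    # i.e. at least one split would have no real work left.
    return k_read_pad_splited * (k_batch - 1) < K


print(padded_splitk_ok(4))  # True:  ceil(100/32) * 8 = 32; 32 * 3 = 96  < 100
print(padded_splitk_ok(8))  # False: ceil(100/64) * 8 = 16; 16 * 7 = 112 >= 100
```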
```diff
@@ -451,12 +486,83 @@ def filter_op(self, op: "CKGemmOperation"):
             != 0
         ):
             return None
-
+        if not self._check_num_k_loops(op, kBatch):
+            return None
         # TBD disable instances with invalid number of pipeline prefetch stages
         # It will avoid compiling a small percentage of unrunnable instances which fail the gemm argument check
 
         return op
 
+    def _check_num_k_loops(self, op, kBatch):
+        # Additional splitK scenario check
+        metas = [T.get_layout() for T in [*self.input_nodes]]
+        X_meta = metas[0]
+        W_meta = metas[1]
+        K = X_meta.size[-1]
+        if kBatch > 1:
+            if op.block_gemm_pipeline_version != "BlockGemmPipelineVersion::v1":
+                try:
+                    prefetch_stages = self._prefetch_stages(
+                        op,
+                        torch.empty((), dtype=X_meta.dtype).element_size(),
+                        torch.empty((), dtype=W_meta.dtype).element_size(),
+                        torch.cuda.get_device_properties(X_meta.device).warp_size,
+                    )
+                except Exception as e:
+                    log.debug(
+                        "Failed to prefetch_stages for %s with exception %s", op.name, e
+                    )
+                    # be conservative here and disable the op
+                    return False
+
+                K_t = op.k_per_block * kBatch
+                ak0 = (K + K_t - 1) // K_t * (op.k_per_block // op.a_k1)
+                num_k_loop = ak0 // (op.k_per_block // op.a_k1)
+                if num_k_loop <= prefetch_stages:
+                    log.debug(
+                        "Op %s is not compatible due to invalid number of pipeline prefetch stages. "
+                        "Parameters: kBatch=%s, block_gemm_pipeline_version=%s, prefetch_stages=%s, num_k_loop=%s",
+                        op.name(),
+                        kBatch,
+                        op.block_gemm_pipeline_version,
+                        prefetch_stages,
+                        num_k_loop,
+                    )
+                    return False
+
+        return True
+
+    # small helper to figure out the prefetch stages on AMD
+    def _prefetch_stages(self, op, a_dtype_size, b_dtype_size, warp_size: int = 64):
+        version_str = op.block_gemm_pipeline_version.split("::")[-1]
+        try:
+            version = int(version_str[1:])  # Assuming the format is always 'vX'
+        except ValueError as e:
+            raise ValueError(f"Invalid version string: {version_str}") from e
+        if version not in [1, 2, 3, 4, 5]:
+            raise ValueError(
+                f"unknown prefetch stages for {op.block_gemm_pipeline_version}"
+            )
+        # Define the mapping of versions to stages
+        version_to_stages = {1: 1, 3: 2, 4: 4, 5: 3}
+        # Get the stages for the given version
+        stages = version_to_stages.get(version, None)
+        if stages is None:
+            # This means we're at stage 2, and this requires computation
+            # See github.com/ROCm/composable_kernel/blob/d6a4605/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_v2.hpp#L143  # noqa: B950
+            wgp_per_cu = max(4 * warp_size // op.block_size, 1)
+            full_mem_band_prefetch_stages = math.ceil(
+                32768
+                / wgp_per_cu
+                / (
+                    (op.m_per_block * a_dtype_size + op.n_per_block * b_dtype_size)
+                    * op.k_per_block
+                )
+            )
+            stages = min(max(full_mem_band_prefetch_stages, 2), 8)
+
+        return stages
+
     def emit_ck_instance(self, op: "CKGemmOperation"):
         # The Jinja template for generating a C++ type alias *definition* for a Universal GEMM instance
         struct_name = (
```
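To see when the new `_check_num_k_loops` rejects an op, here is a small numeric sketch; the shapes are made up, and the two prefetch stages correspond to the `BlockGemmPipelineVersion::v3` entry of the `version_to_stages` table above.

```python
k_per_block, a_k1, K = 64, 8, 1024  # made-up op parameters and K dimension
prefetch_stages = 2                 # v3 in the version_to_stages mapping


def num_k_loops(k_batch: int) -> int:
    # Same arithmetic as _check_num_k_loops: how many K iterations remain
    # per split once K is divided across kBatch partitions.
    K_t = k_per_block * k_batch
    ak0 = (K + K_t - 1) // K_t * (k_per_block // a_k1)
    return ak0 // (k_per_block // a_k1)


print(num_k_loops(2))  # 8 loops  > 2 prefetch stages -> op is kept
print(num_k_loops(8))  # 2 loops <= 2 prefetch stages -> op is filtered out
```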
```diff
@@ -765,7 +871,7 @@ def _is_rcr_f16(self):
             and Y_layout == "Row"
         )
 
-    def gen_ops(self):
+    def gen_ops(self) -> list[InductorROCmOp]:
         """
         Creates a list of `CKGemmOperation` instances that match the GEMM operation this template represents.
         The instances are guaranteed to have the correct layout, dtype and dimension padding for the GEMM input arguments.
```
```diff
@@ -794,7 +900,17 @@ def gen_ops(self):
 
         assert generator is not None
 
-        filtered_instances = list(filter(lambda op: self.filter_op(op), generator()))
+        # NOTE(coconutruben): for now, we only support kBatch 1
+        # TODO(coconutruben): infer a better kBatch depending on the input shape
+        # TODO(coconutruben): allow users to provide a list of kBatches to sweep over
+        kBatches = [1]
+        rops = generator()
+        ops = [
+            InductorROCmOp(op=op, kBatch=kBatch) for op in rops for kBatch in kBatches
+        ]
+
+        filtered_instances = list(filter(lambda op: self.filter_op(op), ops))
+
         # NB: when using a fixed list order, most likely we will pick the subset of instances
         # which are very similar to each other. Randomizing the choice seems to solve this.
         random.seed(-11)
```
```diff
@@ -836,8 +952,8 @@ def add_ck_gemm_choices(
         for op in ops:
             template.maybe_append_choice(
                 choices,
-                op=op,
-                kBatch=1,
+                op=op.op,
+                kBatch=op.kBatch,
             )
 
     def size_args(self):
```
