
Commit a264d06

ROCm MX-FP8 Gemm
Ported the patch from pytorch#147553. Commented out a few lines to avoid a compilation error (see the TODO comments). Signed-off-by: Jagadish Krishnamoorthy <[email protected]>
1 parent 0a6e1d6 commit a264d06
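For context, a rough sketch of the call this patch targets, seen from the Python side. This is not part of the commit; the fp8-e4m3 data / e8m0 scale convention and the one-scale-per-32-elements-of-K layout are assumptions based on the VEC32_UE8M0 scale mode and the multiple-of-32 checks in the diffs below.

import torch

# Hypothetical MX-FP8 matmul on a gfx950 device (sketch, not committed code).
m, k, n = 128, 256, 64  # all dimensions multiples of 32, as the new checks require
a = torch.randn(m, k, device="cuda").to(torch.float8_e4m3fn)
b = torch.randn(n, k, device="cuda").to(torch.float8_e4m3fn).t()  # column-major, as blasLt expects

# e8m0 scales, one per 32-element block along K (assumed layout); a biased
# exponent byte of 127 encodes a scale of 2^0 = 1.0.
scale_a = torch.full((m, k // 32), 127, dtype=torch.uint8, device="cuda").view(torch.float8_e8m0fnu)
scale_b = torch.full((n, k // 32), 127, dtype=torch.uint8, device="cuda").view(torch.float8_e8m0fnu)

out = torch._scaled_mm(a, b, scale_a=scale_a, scale_b=scale_b, out_dtype=torch.bfloat16)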

File tree

6 files changed: +131 −8 lines

- aten/src/ATen/cuda/CUDABlas.cpp (+21 −2)
- aten/src/ATen/cuda/tunable/GemmHipblaslt.h (+20 −1)
- aten/src/ATen/cuda/tunable/GemmMxUtils.h (new, +29)
- aten/src/ATen/native/cuda/Blas.cpp (+50 −4)
- torch/testing/_internal/common_cuda.py (+8 −1)
- torch/utils/hipify/cuda_to_hip_mappings.py (+3)

aten/src/ATen/cuda/CUDABlas.cpp

+21 −2
@@ -1566,6 +1566,25 @@ void scaled_gemm(
       matmulDescA = HIPBLASLT_MATMUL_DESC_A_SCALE_POINTER_VEC_EXT;
       matmulDescB = HIPBLASLT_MATMUL_DESC_B_SCALE_POINTER_VEC_EXT;
     }
+    else if (mat1_scale_dtype == kFloat8_e8m0fnu && mat2_scale_dtype == kFloat8_e8m0fnu) {
+#if ROCM_VERSION >= 60500
+      if (at::cuda::tunable::IsGfx950Device()) {
+        // Validate matrix dimensions for MX format
+        TORCH_CHECK(at::cuda::tunable::ValidateMXFormatRequirements(m, n, k),
+                    "Matrix dimensions must be multiples of 32 for MX format. ",
+                    "Got m=", m, ", n=", n, ", k=", k);
+
+        // TODO: Check whether the MX scale block sizes need to be set
+        // explicitly for hipblaslt.
+        //constexpr int32_t block_size = 32;
+        //computeDesc.setAttribute(HIPBLASLT_MATMUL_DESC_A_SCALE_BLOCK_SIZE_ROWS_VEC_EXT, block_size);
+        //computeDesc.setAttribute(HIPBLASLT_MATMUL_DESC_A_SCALE_BLOCK_SIZE_COLS_VEC_EXT, block_size);
+        //computeDesc.setAttribute(HIPBLASLT_MATMUL_DESC_B_SCALE_BLOCK_SIZE_ROWS_VEC_EXT, block_size);
+        //computeDesc.setAttribute(HIPBLASLT_MATMUL_DESC_B_SCALE_BLOCK_SIZE_COLS_VEC_EXT, block_size);
+      }
+#endif
+    }
 #else
     // rowwise isn't supported using cublaslt or older hipblaslt
     TORCH_INTERNAL_ASSERT(use_rowwise == false, "rowwise scaled_gemm not supported with blaslt");
@@ -1603,11 +1622,11 @@ void scaled_gemm(
   }

   if (mat1_scale_dtype == kFloat8_e8m0fnu && mat2_scale_dtype == kFloat8_e8m0fnu) {
-#if CUDA_VERSION >= 12080
+#if (!defined(USE_ROCM) && CUDA_VERSION >= 12080) || (defined(USE_ROCM) && ROCM_VERSION >= 60500)
     computeDesc.setAttribute(CUBLASLT_MATMUL_DESC_A_SCALE_MODE, CUBLASLT_MATMUL_MATRIX_SCALE_VEC32_UE8M0);
     computeDesc.setAttribute(CUBLASLT_MATMUL_DESC_B_SCALE_MODE, CUBLASLT_MATMUL_MATRIX_SCALE_VEC32_UE8M0);
 #else
-    TORCH_CHECK(false, "scaled_gemm with `torch.float8_e8m0fnu` scales is only supported for CUDA 12.8 and above");
+    TORCH_CHECK(false, "scaled_gemm with `torch.float8_e8m0fnu` scales is only supported for CUDA 12.8 or ROCm 6.5 (with gfx950) and above");
 #endif // if CUDA_VERSION >= 12080
   } else if (mat1_scale_dtype == kFloat8_e4m3fn && mat2_scale_dtype == kFloat8_e4m3fn) {
 #if CUDA_VERSION >= 12080

aten/src/ATen/cuda/tunable/GemmHipblaslt.h

+20 −1
@@ -14,6 +14,8 @@
 #include <hipblaslt/hipblaslt.h>
 #include <hipblaslt/hipblaslt-ext.hpp>

+#include <ATen/cuda/tunable/GemmMxUtils.h>
+
 #define TORCH_HIPBLASLT_CHECK(EXPR)   \
   do {                                \
     hipblasStatus_t __err = EXPR;     \
@@ -513,7 +515,24 @@ class HipblasltGemmOp : public Callable<ParamsT> {
       if (mat1_scale_ptr && mat2_scale_ptr) {
 #ifdef HIPBLASLT_VEC_EXT
         if (GetUseRowwiseFromParams<CT>(params)) {
-          // swapped
+          // For MX-FP8 on gfx950
+#if ROCM_VERSION >= 60500
+          if (IsGfx950Device()) {
+            // Validate matrix dimensions for MX format
+            TORCH_CHECK(ValidateMXFormatRequirements(params->m, params->n, params->k),
+                        "Matrix dimensions must be multiples of 32 for MX format. ",
+                        "Got m=", params->m, ", n=", params->n, ", k=", params->k);
+
+            // TODO: Check whether the MX scale block sizes need to be set
+            // explicitly for hipblaslt.
+            //constexpr int32_t block_size = 32;
+            //matmul.setAttribute(HIPBLASLT_MATMUL_DESC_A_SCALE_BLOCK_SIZE_ROWS_VEC_EXT, block_size);
+            //matmul.setAttribute(HIPBLASLT_MATMUL_DESC_A_SCALE_BLOCK_SIZE_COLS_VEC_EXT, block_size);
+            //matmul.setAttribute(HIPBLASLT_MATMUL_DESC_B_SCALE_BLOCK_SIZE_ROWS_VEC_EXT, block_size);
+            //matmul.setAttribute(HIPBLASLT_MATMUL_DESC_B_SCALE_BLOCK_SIZE_COLS_VEC_EXT, block_size);
+          }
+#endif
+          // Set scale pointers (swapped as before)
           matmul.setAttribute(HIPBLASLT_MATMUL_DESC_A_SCALE_POINTER_VEC_EXT, mat2_scale_ptr);
           matmul.setAttribute(HIPBLASLT_MATMUL_DESC_B_SCALE_POINTER_VEC_EXT, mat1_scale_ptr);
         }
aten/src/ATen/cuda/tunable/GemmMxUtils.h (new file)

+29

@@ -0,0 +1,29 @@
+#pragma once
+
+#include <ATen/cuda/CUDAContext.h>
+#include <string>
+
+namespace at::cuda::tunable {
+
+#ifdef USE_ROCM
+static bool IsGfx950Device() {
+  // Single static check - only evaluated once
+  static bool is_gfx950 = []() {
+    auto device = at::cuda::current_device();
+    hipDeviceProp_t* prop = at::cuda::getDeviceProperties(device);
+    return (std::string(prop->gcnArchName) == "gfx950");
+  }();
+  return is_gfx950;
+}
+#endif
+
+// Helper function to validate MX format requirements
+static bool ValidateMXFormatRequirements(int64_t m, int64_t n, int64_t k) {
+  constexpr int32_t required_block_size = 32;
+  return (m % required_block_size == 0) &&
+         (n % required_block_size == 0) &&
+         (k % required_block_size == 0);
+}
+
+} // namespace at::cuda::tunable
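The helper caches the architecture check in a function-local static, so device properties are queried once per process, and the validator encodes the 32-element scale-block granularity of the MX format. A minimal Python-side sketch of the same rule (illustrative only; validate_mx_dims is not a PyTorch API):

def validate_mx_dims(m: int, n: int, k: int, block: int = 32) -> bool:
    # Every GEMM dimension must be divisible by the MX scale block size.
    return m % block == 0 and n % block == 0 and k % block == 0

assert validate_mx_dims(128, 64, 256)        # multiples of 32: accepted
assert not validate_mx_dims(100, 64, 256)    # 100 % 32 != 0: rejected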

aten/src/ATen/native/cuda/Blas.cpp

+50 −4
@@ -17,6 +17,7 @@
 #include <ATen/native/Resize.h>
 #include <c10/util/MaybeOwned.h>
 #include <ATen/native/cuda/RowwiseScaledMM.h>
+#include <ATen/cuda/tunable/GemmMxUtils.h>
 #include <ATen/native/cuda/ScaledGroupMM.h>
 #include <ATen/native/cuda/GroupMM.h>

@@ -89,7 +90,8 @@ c10::MaybeOwned<Tensor> inline prepare_matrix_for_cublas(const Tensor& tensor, b
   if ((tensor_strides[0] == 1) && (tensor_strides[1] >= std::max<int64_t>(1, tensor_sizes[0]))) {
     transpose_tensor = false;
     return resolve_conj_if_indicated(tensor, true);
-  } else if ((tensor_strides[1] == 1) && (tensor_strides[0] >= std::max<int64_t>(1, tensor_sizes[1]))) {
+  } else if ((tensor_strides[1] == 1) &&
+             (tensor_strides[0] >= std::max<int64_t>(1, tensor_sizes[1]))) {
     transpose_tensor = true;
     return resolve_conj_if_indicated(tensor, true);
   } else {
@@ -1104,6 +1106,7 @@ ScalingType get_scaling_type(

 } // namespace

+
 // Computes matrix multiply + bias while applying scaling to input and output matrices
 // Scales are only applicable when matrices are of Float8 type and assumed to be equal to 1.0 by default.
 // If output matrix type is 16 or 32-bit type, scale_result is not applied.
@@ -1226,17 +1229,37 @@ _scaled_mm_out_cuda(const Tensor& mat1, const Tensor& mat2,
   }
 #else
   if (scaling_choice == ScalingType::RowWise) {
-    // For ROCm, match behavior of f8f8bf16_rowwise type checking, for unit test purposes.
+    // For ROCm, match behavior of f8f8bf16_rowwise type checking
     Tensor b = mat2;
     if (_scaled_mm_is_fnuz()) {
       TORCH_CHECK(b.dtype() == at::kFloat8_e4m3fnuz);
     }
     else {
       TORCH_CHECK(b.dtype() == at::kFloat8_e4m3fn);
     }
-    // Until more than bf16 is supported.
+    // Until more than bf16 is supported
     TORCH_CHECK(out.scalar_type() == ScalarType::BFloat16,
-                "hipblaslt rowwise _scaled_mm only supports BFloat16 output but got ", out.scalar_type());
+                "hipblaslt rowwise _scaled_mm only supports BFloat16 output");
+  }
+  else if (scaling_choice == ScalingType::BlockWise) {
+    TORCH_CHECK(mat1.scalar_type() == at::kFloat8_e8m0fnu &&
+                mat2.scalar_type() == at::kFloat8_e8m0fnu,
+                "Block-wise scaling requires both matrices to be Float8_e8m0fnu type");
+
+#if ROCM_VERSION >= 60500
+    TORCH_CHECK(at::cuda::tunable::IsGfx950Device(),
+                "Block-wise scaling for Float8_e8m0fnu is only supported on gfx950");
+
+    TORCH_CHECK(mat1.size(0) % 32 == 0 && mat1.size(1) % 32 == 0 &&
+                mat2.size(0) % 32 == 0 && mat2.size(1) % 32 == 0,
+                "Matrix dimensions must be multiples of 32 for block-wise scaling");
+
+    TORCH_CHECK(out.scalar_type() == ScalarType::BFloat16 ||
+                out.scalar_type() == ScalarType::Half,
+                "Block-wise scaling only supports BFloat16 or Half output types");
+#else
+    TORCH_CHECK(false, "Block-wise scaling for Float8_e8m0fnu requires ROCm 6.5 or later");
+#endif
   }
 #endif

@@ -1315,10 +1338,12 @@ _scaled_mm_out_cuda(const Tensor& mat1, const Tensor& mat2,
       params.k = args.k;
       params.a = args.mata->data_ptr();
       params.a_scale_ptr = scale_a.data_ptr();
+      params.a_scale_dtype = scale_a.scalar_type();
       params.lda = args.lda;
       params.a_dtype = args.mata->scalar_type();
       params.b = args.matb->data_ptr();
       params.b_scale_ptr = scale_b.data_ptr();
+      params.b_scale_dtype = scale_b.scalar_type();
       params.ldb = args.ldb;
       params.b_dtype = args.matb->scalar_type();
       params.bias_ptr = bias ? bias->data_ptr(): nullptr;
@@ -1377,6 +1402,27 @@ _scaled_mm_out_cuda(const Tensor& mat1, const Tensor& mat2,
           scaling_choice == ScalingType::RowWise);
   }

+  // Add MX format validation for gfx950
+  if (scaling_choice == ScalingType::RowWise) {
+#ifdef USE_ROCM
+    if (at::cuda::tunable::IsGfx950Device()) {
+      // Validate matrix dimensions for MX format
+      TORCH_CHECK(at::cuda::tunable::ValidateMXFormatRequirements(mat1.size(0), mat2.size(1), mat1.size(1)),
+                  "For MX format on gfx950, matrix dimensions must be multiples of 32. ",
+                  "Got dimensions: ", mat1.sizes(), " x ", mat2.sizes());
+
+      // Validate data types for MX format
+      TORCH_CHECK(mat1.scalar_type() == at::kFloat8_e8m0fnu &&
+                  mat2.scalar_type() == at::kFloat8_e8m0fnu,
+                  "MX format requires Float8_e8m0fnu type for both input matrices");
+
+      TORCH_CHECK(out.scalar_type() == ScalarType::BFloat16 ||
+                  out.scalar_type() == ScalarType::Half,
+                  "MX format only supports BFloat16 or Half output types");
+    }
+#endif
+  }
+
   return out;
 }
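From Python, these TORCH_CHECKs surface as RuntimeError. A standalone sketch of the dimension check firing (the MX call convention here is assumed, as in the sketch near the top of this page):

import torch

m, k, n = 100, 256, 64  # m = 100 is not a multiple of 32
a = torch.randn(m, k, device="cuda").to(torch.float8_e4m3fn)
b = torch.randn(n, k, device="cuda").to(torch.float8_e4m3fn).t()
scale_a = torch.full((m, k // 32), 127, dtype=torch.uint8, device="cuda").view(torch.float8_e8m0fnu)
scale_b = torch.full((n, k // 32), 127, dtype=torch.uint8, device="cuda").view(torch.float8_e8m0fnu)

try:
    torch._scaled_mm(a, b, scale_a=scale_a, scale_b=scale_b, out_dtype=torch.bfloat16)
except RuntimeError as err:
    print(err)  # expected: "... must be multiples of 32 ..." from the new checks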

torch/testing/_internal/common_cuda.py

+8 −1
@@ -104,7 +104,14 @@ def evaluate_platform_supports_fp8():

 PLATFORM_SUPPORTS_FP8: bool = LazyVal(lambda: evaluate_platform_supports_fp8())

-PLATFORM_SUPPORTS_MX_GEMM: bool = LazyVal(lambda: TEST_CUDA and SM100OrLater)
+def _platform_supports_mx_gemm():
+    if TEST_CUDA:
+        return SM100OrLater
+    if TEST_WITH_ROCM:
+        return torch.cuda.get_device_properties(torch.cuda.current_device()).name.startswith('gfx950')
+    return False
+
+PLATFORM_SUPPORTS_MX_GEMM: bool = LazyVal(lambda: _platform_supports_mx_gemm())

 if TEST_NUMBA:
     try:
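Tests consume the flag with the usual skipIf gating; a sketch (the decorator pattern mirrors the existing FP8 tests, while the test class here is illustrative):

import unittest
from torch.testing._internal.common_cuda import PLATFORM_SUPPORTS_MX_GEMM

@unittest.skipIf(not PLATFORM_SUPPORTS_MX_GEMM,
                 "MX gemm requires SM100+ (CUDA) or gfx950 (ROCm)")
class TestMXGemm(unittest.TestCase):
    def test_mx_fp8_matmul(self):
        ...  # exercise torch._scaled_mm with e8m0 scales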

torch/utils/hipify/cuda_to_hip_mappings.py

+3
@@ -7339,6 +7339,9 @@
 ("CUBLASLT_MATMUL_DESC_D_SCALE_POINTER", ("HIPBLASLT_MATMUL_DESC_D_SCALE_POINTER", CONV_MATH_FUNC, API_BLAS)),
 ("CUBLASLT_MATMUL_DESC_AMAX_D_POINTER", ("HIPBLASLT_MATMUL_DESC_AMAX_D_POINTER", CONV_MATH_FUNC, API_BLAS)),
 ("CUBLASLT_MATMUL_DESC_BIAS_DATA_TYPE", ("HIPBLASLT_MATMUL_DESC_BIAS_DATA_TYPE", CONV_MATH_FUNC, API_BLAS)),
+("CUBLASLT_MATMUL_DESC_A_SCALE_MODE", ("HIPBLASLT_MATMUL_DESC_A_SCALE_MODE", CONV_MATH_FUNC, API_BLAS)),
+("CUBLASLT_MATMUL_DESC_B_SCALE_MODE", ("HIPBLASLT_MATMUL_DESC_B_SCALE_MODE", CONV_MATH_FUNC, API_BLAS)),
+("CUBLASLT_MATMUL_MATRIX_SCALE_VEC32_UE8M0", ("HIPBLASLT_MATMUL_MATRIX_SCALE_VEC32_UE8M0", CONV_MATH_FUNC, API_BLAS)),
 ("cublasLtMatrixLayout_t", ("hipblasLtMatrixLayout_t", CONV_MATH_FUNC, API_BLAS)),
 ("cublasLtMatrixLayoutOpaque_t", ("hipblasLtMatrixLayoutOpaque_t", CONV_MATH_FUNC, API_BLAS)),
 ("cublasLtMatrixLayoutAttribute_t", ("hipblasLtMatrixLayoutAttribute_t", CONV_MATH_FUNC, API_BLAS)),
