Commit 18fdd8c

enable mxfp8_cublas recipe in roofline script
Summary: Enables us to see roofline vs actual performance of this recipe.

Test Plan:

```
python benchmarks/float8/float8_roofline.py ~/local/tmp/20250305_test --mx_recipe_name mxfp8_cublas
```

Reviewers:

Subscribers:

Tasks:

Tags:

ghstack-source-id: 1e5a431
ghstack-comment-id: 2701761490
Pull Request resolved: #1843

1 parent c509315

File tree: 2 files changed (+14 lines, -3 lines)

benchmarks/float8/float8_roofline.py

Lines changed: 4 additions & 1 deletion
```diff
@@ -184,8 +184,11 @@ def get_gemm_times(
     elif float8_recipe_name in ("rowwise", "rowwise_with_gw_hp"):
         scale_a = torch.ones(M, 1, device=device)
         scale_b = torch.ones(1, N, device=device)
+    elif mx_recipe_name == "mxfp8_cublas":
+        scale_a = torch.ones(M, K // 32, device=device, dtype=torch.float8_e8m0fnu)
+        scale_b = torch.ones(N, K // 32, device=device, dtype=torch.float8_e8m0fnu)
     else:
-        assert False, "TODO add mx gemm here"
+        assert False, "TODO add cutlass mx gemm here"
 
     def do_matmul(A, B):
         return torch._scaled_mm(
```
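For context, in the mxfp8 recipes each operand carries one `float8_e8m0fnu` scale per 1x32 block along the reduction dimension K, which is why the new branch builds scales of shape `(M, K // 32)` and `(N, K // 32)`. Below is a minimal sketch of the kind of call `do_matmul` then issues, assuming a recent PyTorch build and a GPU whose cuBLAS supports mxfp8 scaled matmuls; the shapes and the `bfloat16` output dtype are illustrative choices, not taken from the script:

```python
import torch

# Illustrative shapes; the roofline script sweeps these.
M, K, N = 1024, 2048, 4096
device = "cuda"

# float8 operands; torch._scaled_mm expects the second operand
# transposed (column-major).
A = torch.randn(M, K, device=device).to(torch.float8_e4m3fn)
B = torch.randn(N, K, device=device).to(torch.float8_e4m3fn).t()

# One e8m0 scale per 1x32 block along K, matching the shapes in the diff.
scale_a = torch.ones(M, K // 32, device=device, dtype=torch.float8_e8m0fnu)
scale_b = torch.ones(N, K // 32, device=device, dtype=torch.float8_e8m0fnu)

out = torch._scaled_mm(A, B, scale_a, scale_b, out_dtype=torch.bfloat16)
```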

torchao/testing/float8/roofline_utils.py

Lines changed: 10 additions & 2 deletions
```diff
@@ -165,7 +165,11 @@ def get_tensor_memory_traffic_ovhd_s(
             assert False, "unsupported"
 
     else:
-        assert mx_recipe_name in ("mxfp8_emulated", "mxfp8_cutlass"), "unsupported"
+        assert mx_recipe_name in (
+            "mxfp8_emulated",
+            "mxfp8_cutlass",
+            "mxfp8_cublas",
+        ), "unsupported"
 
     if tensor_role == "weight":
         # x_bf16 = ...
@@ -219,7 +223,11 @@ def get_individual_gemm_time_sympy(
     num_writes = M * N
 
     if mx_recipe_name is not None:
-        assert mx_recipe_name in ("mxfp8_emulated", "mxfp8_cutlass"), "unsupported"
+        assert mx_recipe_name in (
+            "mxfp8_emulated",
+            "mxfp8_cutlass",
+            "mxfp8_cublas",
+        ), "unsupported"
         assert dtype in (torch.float8_e4m3fn, torch.float8_e5m2), "unsupported"
         # adjust reads for MX scaling
         block_size = 32
```
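The `block_size = 32` context line shows how the model accounts for scale traffic: every 32 float8 elements (one byte each) along K share a single one-byte e8m0 scale, so scales add roughly 1/32, about 3%, to operand reads. Here is a back-of-the-envelope sketch of that accounting; the arithmetic below is an illustration, not the exact sympy expressions from `roofline_utils.py`:

```python
# Rough per-gemm byte counts under MX block scaling; illustrative only.
M, K, N = 4096, 4096, 4096
block_size = 32

bytes_per_elem = 1   # float8_e4m3fn / float8_e5m2 operands
bytes_per_scale = 1  # float8_e8m0fnu scales

operand_reads = (M * K + K * N) * bytes_per_elem
scale_reads = (M * K + K * N) // block_size * bytes_per_scale
output_writes = M * N * 2  # assuming a bf16 output

print(f"extra reads from scales: {scale_reads / operand_reads:.2%}")  # ~3.12%
```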
