ROCm
diff --git a/‎CMakeLists.txt
Lines changed: 1 addition & 1 deletion b/‎CMakeLists.txt
Lines changed: 1 addition & 1 deletion
diff --git a/‎csrc/quantization/fp8/amd/hip_float8_impl.h
Lines changed: 1 addition & 2 deletions b/‎csrc/quantization/fp8/amd/hip_float8_impl.h
Lines changed: 1 addition & 2 deletions
diff --git a/‎csrc/rocm/attention.cu
Lines changed: 1 addition & 2 deletions b/‎csrc/rocm/attention.cu
Lines changed: 1 addition & 2 deletions
diff --git a/‎csrc/rocm/custom.cu
Lines changed: 18 additions & 0 deletions b/‎csrc/rocm/custom.cu
Lines changed: 18 additions & 0 deletions
@@ -34,7 +34,7 @@ set(PYTHON_SUPPORTED_VERSIONS "3.9" "3.10" "3.11" "3.12")
 set(CUDA_SUPPORTED_ARCHS "7.0;7.2;7.5;8.0;8.6;8.7;8.9;9.0")
 
 # Supported AMD GPU architectures.
-set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx940;gfx941;gfx942;gfx1030;gfx1100;gfx1101;gfx1200;gfx1201")
+set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx942;gfx1030;gfx1100;gfx1101;gfx1200;gfx1201")
 
 #
 # Supported/expected torch versions for CUDA/ROCm.
 
@@ -1,7 +1,6 @@
 #pragma once
 
-#if defined(__HIPCC__) && \
-    (defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__))
+#if defined(__HIPCC__) && defined(__gfx942__)
   #define __HIP__MI300__
 #endif
 
 
@@ -24,8 +24,7 @@
 #include "../attention/dtype_fp8.cuh"
 #include "../quantization/fp8/amd/quant_utils.cuh"
 
-#if defined(__HIPCC__) && (defined(__gfx90a__) || defined(__gfx940__) || \
-                           defined(__gfx941__) || defined(__gfx942__))
+#if defined(__HIPCC__) && (defined(__gfx90a__) || defined(__gfx942__))
   #define __HIP__MI300_MI250__
 #endif
 
 
@@ -48,6 +48,24 @@ void wvSpltK(at::Tensor& in_a, at::Tensor& in_b, at::Tensor& out_c,
            at::cuda::getCurrentCUDAStream(), CuCount);
 }
 
+void wvSpltKQ_(void* in_a, void* in_b, void* out_c, void* scale_a,
+               void* scale_b, const int M, const int K, const int Kp,
+               const int N, const int Otp_in, cudaStream_t stream,
+               const int CuCount);
+
+void wvSpltKQ(at::Tensor& in_a, at::Tensor& in_b, at::Tensor& out_c,
+              at::Tensor& scale_a, at::Tensor& scale_b, const int64_t N_in,
+              const int64_t Otp_in, const int64_t CuCount) {
+  auto M = in_a.size(0);
+  auto K = in_a.size(1);
+  auto Kp = in_a.stride(0);
+  int N = N_in;
+  int Otp = Otp_in;
+  wvSpltKQ_(in_a.data_ptr(), in_b.data_ptr(), out_c.data_ptr(),
+            scale_a.data_ptr(), scale_b.data_ptr(), M, K, Kp, N, Otp,
+            at::cuda::getCurrentCUDAStream(), CuCount);
+}
+
 void LLGemmZZ(void* in_a, void* in_b, void* out_c, const int M, const int K,
               cudaStream_t stream, const int solidx);
Original file line number	Diff line number	Diff line change
`@@ -34,7 +34,7 @@ set(PYTHON_SUPPORTED_VERSIONS "3.9" "3.10" "3.11" "3.12")`
`34`	`34`	`set(CUDA_SUPPORTED_ARCHS "7.0;7.2;7.5;8.0;8.6;8.7;8.9;9.0")`
`35`	`35`
`36`	`36`	`# Supported AMD GPU architectures.`
`37`		`-set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx940;gfx941;gfx942;gfx1030;gfx1100;gfx1101;gfx1200;gfx1201")`
	`37`	`+set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx942;gfx1030;gfx1100;gfx1101;gfx1200;gfx1201")`
`38`	`38`
`39`	`39`	`#`
`40`	`40`	`# Supported/expected torch versions for CUDA/ROCm.`