Skip to content

Commit e6e07ec

Browse files
apakbin authored and pytorchmergebot committed
[ROCm] code cleanup of architecture checks (pytorch#150473)
This PR replaces several calls to `at::cuda::getCurrentDeviceProperties()->gcnArchName` and `at::cuda::getDeviceProperties(device_index)->gcnArchName` when checking whether the GPU architecture is in a certain list.

Pull Request resolved: pytorch#150473
Approved by: https://github.com/jeffdaily, https://github.com/cyyever
1 parent 9e10601 commit e6e07ec

File tree

9 files changed

+21
-54
lines changed

9 files changed

+21
-54
lines changed

aten/src/ATen/Context.cpp

+3-3
Original file line numberDiff line numberDiff line change
@@ -340,7 +340,7 @@ at::BlasBackend Context::blasPreferredBackend() {
340340
#endif
341341
};
342342
for (auto index: c10::irange(detail::getCUDAHooks().deviceCount())) {
343-
if (!detail::getCUDAHooks().isGPUArch(index, archs)) {
343+
if (!detail::getCUDAHooks().isGPUArch(archs, index)) {
344344
return false;
345345
}
346346
}
@@ -366,7 +366,7 @@ at::BlasBackend Context::blasPreferredBackend() {
366366
#endif
367367
};
368368
for (auto index: c10::irange(detail::getCUDAHooks().deviceCount())) {
369-
if (!detail::getCUDAHooks().isGPUArch(index, archs)) {
369+
if (!detail::getCUDAHooks().isGPUArch(archs, index)) {
370370
TORCH_WARN_ONCE(
371371
"Attempting to use hipBLASLt on an unsupported architecture! "
372372
"Overriding blas backend to hipblas");
@@ -419,7 +419,7 @@ void Context::setROCmFAPreferredBackend(at::ROCmFABackend b) {
419419
"gfx90a", "gfx942"
420420
};
421421
for (auto index: c10::irange(detail::getCUDAHooks().deviceCount())) {
422-
if (!detail::getCUDAHooks().isGPUArch(index, archs)) {
422+
if (!detail::getCUDAHooks().isGPUArch(archs, index)) {
423423
TORCH_WARN_ONCE(
424424
"Attempting to use CK on an unsupported architecture! Cannot set backend to CK");
425425
return true;

aten/src/ATen/cuda/CUDABlas.cpp

+1-3
Original file line numberDiff line numberDiff line change
@@ -1085,9 +1085,7 @@ void gemm_internal<float>(CUDABLAS_GEMM_ARGTYPES(float))
10851085
}
10861086
#if defined(USE_ROCM) && !defined(_MSC_VER)
10871087
else if (at::globalContext().blasPreferredBackend() == BlasBackend::Ck) {
1088-
auto dprops = at::cuda::getCurrentDeviceProperties();
1089-
c10::string_view arch(dprops->gcnArchName);
1090-
if (arch == "gfx1100") { //no CK GEMM version for gfx1100
1088+
if (at::detail::getCUDAHooks().isGPUArch({"gfx1100"})) { //no CK GEMM version for gfx1100
10911089
gemm_internal_cublaslt<float>(CUDABLAS_GEMM_ARGS(float));
10921090
} else{
10931091
at::native::gemm_internal_ck<float>(CUDABLAS_GEMM_ARGS(float));

aten/src/ATen/cuda/CublasHandlePool.cpp

+1-3
Original file line numberDiff line numberDiff line change
@@ -124,9 +124,7 @@ size_t parseChosenWorkspaceSize() {
124124
val = getenv("ROCBLAS_WORKSPACE_CONFIG");
125125
}
126126
/* 32MiB default, 128MiB for MI300 */
127-
cudaDeviceProp* properties = at::cuda::getCurrentDeviceProperties();
128-
std::string device_arch = properties->gcnArchName;
129-
const bool gfx94 = device_arch.find("gfx94") != std::string::npos;
127+
const bool gfx94 = at::detail::getCUDAHooks().isGPUArch({"gfx94"});
130128
const size_t default_size = gfx94 ? 1024 * 128 * 1024 : 1024 * 32 * 1024;
131129
#else
132130
/* :4096:2:16:8 default, 32MiB for Hopper */

aten/src/ATen/cuda/detail/CUDAHooks.cpp

+8-2
Original file line numberDiff line numberDiff line change
@@ -448,8 +448,14 @@ DeviceIndex CUDAHooks::getCurrentDevice() const {
448448
}
449449

450450
#ifdef USE_ROCM
451-
bool CUDAHooks::isGPUArch(DeviceIndex device_index, const std::vector<std::string>& archs) const {
452-
hipDeviceProp_t* prop = at::cuda::getDeviceProperties(device_index);
451+
bool CUDAHooks::isGPUArch(const std::vector<std::string>& archs, DeviceIndex device_index) const {
452+
hipDeviceProp_t* prop;
453+
if (device_index == -1){
454+
prop = at::cuda::getCurrentDeviceProperties();
455+
} else {
456+
prop = at::cuda::getDeviceProperties(device_index);
457+
}
458+
453459
std::string device_arch = prop->gcnArchName;
454460
for (std::string arch : archs) {
455461
size_t substring = device_arch.find(arch);

aten/src/ATen/cuda/detail/CUDAHooks.h

+1-1
Original file line numberDiff line numberDiff line change
@@ -57,7 +57,7 @@ struct CUDAHooks : public at::CUDAHooksInterface {
5757
DeviceIndex getCurrentDevice() const override;
5858

5959
#ifdef USE_ROCM
60-
bool isGPUArch(DeviceIndex device_index, const std::vector<std::string>& archs) const override;
60+
bool isGPUArch(const std::vector<std::string>& archs, DeviceIndex device_index = -1) const override;
6161
#endif
6262
void deviceSynchronize(DeviceIndex device_index) const override;
6363
};

aten/src/ATen/detail/CUDAHooksInterface.h

+1-1
Original file line numberDiff line numberDiff line change
@@ -196,7 +196,7 @@ struct TORCH_API CUDAHooksInterface : AcceleratorHooksInterface {
196196
}
197197

198198
#ifdef USE_ROCM
199-
virtual bool isGPUArch(DeviceIndex /*device_index*/, const std::vector<std::string>& /*archs*/) const {
199+
virtual bool isGPUArch(const std::vector<std::string>& /*archs*/, DeviceIndex = -1 /*device_index*/) const {
200200
TORCH_CHECK(false, "Cannot check GPU arch without ATen_cuda library. ", CUDA_HELP);
201201
}
202202
#endif

aten/src/ATen/native/cuda/Blas.cpp

+4-28
Original file line numberDiff line numberDiff line change
@@ -265,8 +265,6 @@ static bool getDisableAddmmCudaLt() {
265265

266266
#ifdef USE_ROCM
267267
static bool isSupportedHipLtROCmArch(int index) {
268-
hipDeviceProp_t* prop = at::cuda::getDeviceProperties(index);
269-
std::string device_arch = prop->gcnArchName;
270268
static const std::vector<std::string> archs = {
271269
"gfx90a", "gfx942",
272270
#if ROCM_VERSION >= 60300
@@ -276,13 +274,7 @@ static bool isSupportedHipLtROCmArch(int index) {
276274
"gfx950"
277275
#endif
278276
};
279-
for (std::string arch : archs) {
280-
size_t substring = device_arch.find(arch);
281-
if (substring != std::string::npos) {
282-
return true;
283-
}
284-
}
285-
return false;
277+
return at::detail::getCUDAHooks().isGPUArch(archs, index);
286278
}
287279
#endif
288280

@@ -939,9 +931,7 @@ Tensor _int_mm_cuda(const Tensor& self, const Tensor& mat2) {
939931
}
940932

941933
static bool _scaled_mm_allowed_device() {
942-
auto dprops = at::cuda::getCurrentDeviceProperties();
943934
#ifdef USE_ROCM
944-
std::string device_arch = dprops->gcnArchName;
945935
static const std::vector<std::string> archs = {
946936
"gfx942",
947937
#if ROCM_VERSION >= 60300
@@ -951,30 +941,16 @@ static bool _scaled_mm_allowed_device() {
951941
"gfx950"
952942
#endif
953943
};
954-
for (std::string arch : archs) {
955-
size_t substring = device_arch.find(arch);
956-
if (substring != std::string::npos) {
957-
return true;
958-
}
959-
}
960-
return false;
944+
return at::detail::getCUDAHooks().isGPUArch(archs);
961945
#else
946+
auto dprops = at::cuda::getCurrentDeviceProperties();
962947
return dprops->major >= 9 || (dprops->major == 8 && dprops->minor == 9);
963948
#endif
964949
}
965950

966951
#ifdef USE_ROCM
967952
static bool _scaled_mm_is_fnuz() {
968-
auto dprops = at::cuda::getCurrentDeviceProperties();
969-
std::string device_arch = dprops->gcnArchName;
970-
static const std::vector<std::string> archs = {"gfx942"};
971-
for (std::string arch : archs) {
972-
size_t substring = device_arch.find(arch);
973-
if (substring != std::string::npos) {
974-
return true;
975-
}
976-
}
977-
return false;
953+
return at::detail::getCUDAHooks().isGPUArch({"gfx942"});
978954
}
979955
#endif
980956

aten/src/ATen/native/cuda/int4mm.cu

+1-10
Original file line numberDiff line numberDiff line change
@@ -135,16 +135,7 @@ template<typename T, uint32_t Rank>
135135
using VecT = T __attribute__((ext_vector_type(Rank)));
136136

137137
static bool isCDNA2orLater(int index) {
138-
hipDeviceProp_t* prop = at::cuda::getDeviceProperties(index);
139-
std::string device_arch = prop->gcnArchName;
140-
static const std::vector<std::string> archs = {"gfx90a", "gfx942"};
141-
for (std::string arch : archs) {
142-
size_t substring = device_arch.find(arch);
143-
if (substring != std::string::npos) {
144-
return true;
145-
}
146-
}
147-
return false;
138+
return at::detail::getCUDAHooks().isGPUArch({"gfx90a", "gfx942"}, index);
148139
}
149140

150141
#else

aten/src/ATen/native/hip/ck_gemm_half.hip

+1-3
Original file line numberDiff line numberDiff line change
@@ -598,9 +598,7 @@ void dispatch_half_gemm_wmma(CUDABLAS_GEMM_ARGTYPES(at::Half)) {
598598

599599
template <>
600600
void gemm_internal_ck<at::Half>(CUDABLAS_GEMM_ARGTYPES(at::Half)) {
601-
auto dprops = at::cuda::getCurrentDeviceProperties();
602-
c10::string_view arch(dprops->gcnArchName);
603-
if (arch == "gfx1100") {
601+
if (at::detail::getCUDAHooks().isGPUArch({"gfx1100"})) {
604602
dispatch_half_gemm_wmma(CUDABLAS_GEMM_ARGS(at::Half));
605603
} else{
606604
dispatch_half_gemm(CUDABLAS_GEMM_ARGS(at::Half));

0 commit comments

Comments (0)