@@ -3536,9 +3536,7 @@ static size_t g_scratch_offset = 0;
 
 static int g_device_count = -1;
 static int g_main_device = 0;
-#ifndef GGML_CUDA_FORCE_DMMV
 static int g_compute_capabilities[GGML_CUDA_MAX_DEVICES];
-#endif
 static float g_tensor_split[GGML_CUDA_MAX_DEVICES] = {0};
 
 static cublasHandle_t g_cublas_handles[GGML_CUDA_MAX_DEVICES] = {nullptr};
@@ -3561,9 +3559,7 @@ void ggml_init_cublas() {
         g_tensor_split[id] = total_vram;
         total_vram += prop.totalGlobalMem;
 
-#ifndef GGML_CUDA_FORCE_DMMV
         g_compute_capabilities[id] = 100*prop.major + 10*prop.minor;
-#endif
     }
     for (int id = 0; id < g_device_count; ++id) {
         g_tensor_split[id] /= total_vram;
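A note on the encoding above (an illustration, not part of the patch): 100*prop.major + 10*prop.minor packs a device's compute capability into a single integer, so capability 6.1 becomes 610 and 8.6 becomes 860. A standalone host-side sketch of the same formula:

// Standalone sketch: prints the same encoded value that
// ggml_init_cublas() stores in g_compute_capabilities[id].
#include <cstdio>
#include <cuda_runtime.h>

int main() {
    int count = 0;
    cudaGetDeviceCount(&count);
    for (int id = 0; id < count; ++id) {
        cudaDeviceProp prop;
        cudaGetDeviceProperties(&prop, id);
        // e.g. compute capability 6.1 -> 100*6 + 10*1 = 610
        printf("device %d: cc %d.%d -> %d\n",
               id, prop.major, prop.minor, 100*prop.major + 10*prop.minor);
    }
    return 0;
}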
@@ -3916,6 +3912,7 @@ inline void ggml_cuda_op_mul_mat_vec(
 
 #ifdef GGML_CUDA_FORCE_DMMV
     const bool use_mul_mat_vec_q = false;
+    (void) g_compute_capabilities[0];
 #else
     int id;
     CUDA_CHECK(cudaGetDevice(&id));
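The added (void) g_compute_capabilities[0]; is the usual discarded-expression idiom: now that the array is declared unconditionally (first hunk), a GGML_CUDA_FORCE_DMMV build would otherwise never read it and could trip unused-variable warnings. A minimal sketch of the pattern, with hypothetical names:

// Hypothetical illustration of the idiom; names are illustrative only.
static int g_table[4] = {1, 2, 3, 4};

int pick(int i) {
#ifdef FORCE_FALLBACK
    // This path never indexes g_table; evaluating and discarding one
    // element marks the array as used without affecting behavior.
    (void) g_table[0];
    return i;
#else
    return g_table[i & 3];
#endif
}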
@@ -4659,8 +4656,16 @@ void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_
     } else {
 #ifdef GGML_CUDA_CUBLAS
         const bool use_mul_mat_q = false;
+        (void) g_compute_capabilities[0];
 #else
-        const bool use_mul_mat_q = ggml_is_quantized(src0->type);
+        int min_compute_capability = 1000000;
+        for (int id = 0; id < g_device_count; ++id) {
+            if (min_compute_capability > g_compute_capabilities[id]) {
+                min_compute_capability = g_compute_capabilities[id];
+            }
+        }
+
+        const bool use_mul_mat_q = ggml_is_quantized(src0->type) && min_compute_capability >= MIN_CC_DP4A;
 #endif // GGML_CUDA_CUBLAS
         if (use_mul_mat_q) {
             ggml_cuda_op(src0, src1, dst, ggml_cuda_op_mul_mat_q, false, false);
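The new gate keys mul_mat_q off the weakest device: the loop reduces g_compute_capabilities[] to its minimum (1000000 is simply a sentinel larger than any encoded capability), and the quantized kernels are used only when every device meets MIN_CC_DP4A. That constant guards the dp4a instruction, a packed 4-way int8 dot product with 32-bit accumulation, which to my understanding first appears at compute capability 6.1, i.e. an encoded value of 610. A hedged standalone sketch of what the instruction computes on capable hardware:

// Standalone sketch (assumes MIN_CC_DP4A == 610): what dp4a computes.
// Build with e.g.: nvcc -arch=sm_61 dp4a_demo.cu   (file name is illustrative)
#include <cstdio>

__global__ void dot_int8(const int *a, const int *b, int *out) {
#if __CUDA_ARCH__ >= 610
    // Four packed int8 products accumulated into one int32, in one instruction.
    *out = __dp4a(*a, *b, 0);
#else
    // Pre-6.1 devices must unpack and multiply byte by byte instead,
    // which is the slow path this patch keeps them away from.
    int acc = 0;
    for (int i = 0; i < 4; ++i) {
        acc += (int)((signed char)(*a >> 8*i)) * (int)((signed char)(*b >> 8*i));
    }
    *out = acc;
#endif
}

int main() {
    // a packs {1,2,3,4}, b packs {5,6,7,8}; dot product is 70.
    int ha = 0x04030201, hb = 0x08070605, hout = 0;
    int *da, *db, *dout;
    cudaMalloc(&da, sizeof(int));
    cudaMalloc(&db, sizeof(int));
    cudaMalloc(&dout, sizeof(int));
    cudaMemcpy(da, &ha, sizeof(int), cudaMemcpyHostToDevice);
    cudaMemcpy(db, &hb, sizeof(int), cudaMemcpyHostToDevice);
    dot_int8<<<1, 1>>>(da, db, dout);
    cudaMemcpy(&hout, dout, sizeof(int), cudaMemcpyDeviceToHost);
    printf("dot = %d\n", hout); // expect 1*5 + 2*6 + 3*7 + 4*8 = 70
    return 0;
}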