Skip to content

Commit 305b304

Browse files
CUDA: fixed mmq build issues
1 parent 11f3ca0 commit 305b304

File tree

3 files changed

+19
-7
lines changed

3 files changed

+19
-7
lines changed

CMakeLists.txt

+6-2
Original file line numberDiff line numberDiff line change
@@ -277,10 +277,14 @@ if (LLAMA_CUBLAS)
277277
endif()
278278

279279
if (NOT DEFINED CMAKE_CUDA_ARCHITECTURES)
280+
# 52 == lowest CUDA 12 standard
281+
# 60 == f16 CUDA intrinsics
282+
# 61 == integer CUDA intrinsics
283+
# 70 == (assumed) compute capability at which unrolling a loop in mul_mat_q kernels is faster
280284
if (LLAMA_CUDA_DMMV_F16)
281-
set(CMAKE_CUDA_ARCHITECTURES "60;61") # needed for f16 CUDA intrinsics
285+
set(CMAKE_CUDA_ARCHITECTURES "60;61;70") # needed for f16 CUDA intrinsics
282286
else()
283-
set(CMAKE_CUDA_ARCHITECTURES "52;61") # lowest CUDA 12 standard + lowest for integer intrinsics
287+
set(CMAKE_CUDA_ARCHITECTURES "52;61;70") # lowest CUDA 12 standard + lowest for integer intrinsics
284288
endif()
285289
endif()
286290
message(STATUS "Using CUDA architectures: ${CMAKE_CUDA_ARCHITECTURES}")

examples/perplexity/CMakeLists.txt

+3
Original file line numberDiff line numberDiff line change
@@ -6,3 +6,6 @@ target_compile_features(${TARGET} PRIVATE cxx_std_11)
66
if(TARGET BUILD_INFO)
77
add_dependencies(${TARGET} BUILD_INFO)
88
endif()
9+
if(LLAMA_CUBLAS AND CMAKE_BUILD_TYPE STREQUAL "Release")
10+
add_compile_definitions(GGML_CUDA_CUBLAS) # DOES NOT WORK
11+
endif()

ggml-cuda.cu

+10-5
Original file line numberDiff line numberDiff line change
@@ -3536,9 +3536,7 @@ static size_t g_scratch_offset = 0;
35363536

35373537
static int g_device_count = -1;
35383538
static int g_main_device = 0;
3539-
#ifndef GGML_CUDA_FORCE_DMMV
35403539
static int g_compute_capabilities[GGML_CUDA_MAX_DEVICES];
3541-
#endif
35423540
static float g_tensor_split[GGML_CUDA_MAX_DEVICES] = {0};
35433541

35443542
static cublasHandle_t g_cublas_handles[GGML_CUDA_MAX_DEVICES] = {nullptr};
@@ -3561,9 +3559,7 @@ void ggml_init_cublas() {
35613559
g_tensor_split[id] = total_vram;
35623560
total_vram += prop.totalGlobalMem;
35633561

3564-
#ifndef GGML_CUDA_FORCE_DMMV
35653562
g_compute_capabilities[id] = 100*prop.major + 10*prop.minor;
3566-
#endif
35673563
}
35683564
for (int id = 0; id < g_device_count; ++id) {
35693565
g_tensor_split[id] /= total_vram;
@@ -3916,6 +3912,7 @@ inline void ggml_cuda_op_mul_mat_vec(
39163912

39173913
#ifdef GGML_CUDA_FORCE_DMMV
39183914
const bool use_mul_mat_vec_q = false;
3915+
(void) g_compute_capabilities[0];
39193916
#else
39203917
int id;
39213918
CUDA_CHECK(cudaGetDevice(&id));
@@ -4659,8 +4656,16 @@ void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_
46594656
} else {
46604657
#ifdef GGML_CUDA_CUBLAS
46614658
const bool use_mul_mat_q = false;
4659+
(void) g_compute_capabilities[0];
46624660
#else
4663-
const bool use_mul_mat_q = ggml_is_quantized(src0->type);
4661+
int min_compute_capability = 1000000;
4662+
for (int id = 0; id < g_device_count; ++id) {
4663+
if (min_compute_capability > g_compute_capabilities[id]) {
4664+
min_compute_capability = g_compute_capabilities[id];
4665+
}
4666+
}
4667+
4668+
const bool use_mul_mat_q = ggml_is_quantized(src0->type) && min_compute_capability >= MIN_CC_DP4A;
46644669
#endif // GGML_CUDA_CUBLAS
46654670
if (use_mul_mat_q) {
46664671
ggml_cuda_op(src0, src1, dst, ggml_cuda_op_mul_mat_q, false, false);

0 commit comments

Comments (0)