@@ -3536,9 +3536,7 @@ static size_t g_scratch_offset = 0;
 
 static int g_device_count = -1;
 static int g_main_device = 0;
-#ifndef GGML_CUDA_FORCE_DMMV
 static int g_compute_capabilities[GGML_CUDA_MAX_DEVICES];
-#endif
 static float g_tensor_split[GGML_CUDA_MAX_DEVICES] = {0};
 
 static cublasHandle_t g_cublas_handles[GGML_CUDA_MAX_DEVICES] = {nullptr};
@@ -3561,9 +3559,7 @@ void ggml_init_cublas() {
         g_tensor_split[id] = total_vram;
         total_vram += prop.totalGlobalMem;
 
-#ifndef GGML_CUDA_FORCE_DMMV
         g_compute_capabilities[id] = 100*prop.major + 10*prop.minor;
-#endif
     }
     for (int id = 0; id < g_device_count; ++id) {
         g_tensor_split[id] /= total_vram;
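A note on the encoding above (an illustration, not part of the patch): 100*prop.major + 10*prop.minor packs a device's compute capability into a single integer, so capability 6.1 becomes 610 and 8.6 becomes 860. A standalone host-side sketch of the same formula:

// Standalone sketch: prints the same encoded value that
// ggml_init_cublas() stores in g_compute_capabilities[id].
#include <cstdio>
#include <cuda_runtime.h>

int main() {
    int count = 0;
    cudaGetDeviceCount(&count);
    for (int id = 0; id < count; ++id) {
        cudaDeviceProp prop;
        cudaGetDeviceProperties(&prop, id);
        // e.g. compute capability 6.1 -> 100*6 + 10*1 = 610
        printf("device %d: cc %d.%d -> %d\n",
               id, prop.major, prop.minor, 100*prop.major + 10*prop.minor);
    }
    return 0;
}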
@@ -3916,6 +3912,7 @@ inline void ggml_cuda_op_mul_mat_vec(
 
 #ifdef GGML_CUDA_FORCE_DMMV
     const bool use_mul_mat_vec_q = false;
+    (void) g_compute_capabilities[0];
 #else
     int id;
     CUDA_CHECK(cudaGetDevice(&id));
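The added (void) g_compute_capabilities[0]; is the usual discarded-expression idiom: now that the array is declared unconditionally (first hunk), a GGML_CUDA_FORCE_DMMV build would otherwise never read it and could trip unused-variable warnings. A minimal sketch of the pattern, with hypothetical names:

// Hypothetical illustration of the idiom; names are illustrative only.
static int g_table[4] = {1, 2, 3, 4};

int pick(int i) {
#ifdef FORCE_FALLBACK
    // This path never indexes g_table; evaluating and discarding one
    // element marks the array as used without affecting behavior.
    (void) g_table[0];
    return i;
#else
    return g_table[i & 3];
#endif
}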
@@ -4659,8 +4656,16 @@ void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_
     } else {
 #ifdef GGML_CUDA_CUBLAS
         const bool use_mul_mat_q = false;
+        (void) g_compute_capabilities[0];
 #else
-        const bool use_mul_mat_q = ggml_is_quantized(src0->type);
+        int min_compute_capability = 1000000;
+        for (int id = 0; id < g_device_count; ++id) {
+            if (min_compute_capability > g_compute_capabilities[id]) {
+                min_compute_capability = g_compute_capabilities[id];
+            }
+        }
+
+        const bool use_mul_mat_q = ggml_is_quantized(src0->type) && min_compute_capability >= MIN_CC_DP4A;
 #endif // GGML_CUDA_CUBLAS
         if (use_mul_mat_q) {
             ggml_cuda_op(src0, src1, dst, ggml_cuda_op_mul_mat_q, false, false);
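The new gate keys mul_mat_q off the weakest device: the loop reduces g_compute_capabilities[] to its minimum (1000000 is simply a sentinel larger than any encoded capability), and the quantized kernels are used only when every device meets MIN_CC_DP4A. That constant guards the dp4a instruction, a packed 4-way int8 dot product with 32-bit accumulation, which to my understanding first appears at compute capability 6.1, i.e. an encoded value of 610. A hedged standalone sketch of what the instruction computes on capable hardware:

// Standalone sketch (assumes MIN_CC_DP4A == 610): what dp4a computes.
// Build with e.g.: nvcc -arch=sm_61 dp4a_demo.cu   (file name is illustrative)
#include <cstdio>

__global__ void dot_int8(const int *a, const int *b, int *out) {
#if __CUDA_ARCH__ >= 610
    // Four packed int8 products accumulated into one int32, in one instruction.
    *out = __dp4a(*a, *b, 0);
#else
    // Pre-6.1 devices must unpack and multiply byte by byte instead,
    // which is the slow path this patch keeps them away from.
    int acc = 0;
    for (int i = 0; i < 4; ++i) {
        acc += (int)((signed char)(*a >> 8*i)) * (int)((signed char)(*b >> 8*i));
    }
    *out = acc;
#endif
}

int main() {
    // a packs {1,2,3,4}, b packs {5,6,7,8}; dot product is 70.
    int ha = 0x04030201, hb = 0x08070605, hout = 0;
    int *da, *db, *dout;
    cudaMalloc(&da, sizeof(int));
    cudaMalloc(&db, sizeof(int));
    cudaMalloc(&dout, sizeof(int));
    cudaMemcpy(da, &ha, sizeof(int), cudaMemcpyHostToDevice);
    cudaMemcpy(db, &hb, sizeof(int), cudaMemcpyHostToDevice);
    dot_int8<<<1, 1>>>(da, db, dout);
    cudaMemcpy(&hout, dout, sizeof(int), cudaMemcpyDeviceToHost);
    printf("dot = %d\n", hout); // expect 1*5 + 2*6 + 3*7 + 4*8 = 70
    return 0;
}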