 #define CC_OFFSET_AMD 1000000
 #define CC_RDNA2      (CC_OFFSET_AMD + 1030)

-// define this if you want to always fallback to MMQ kernels and not use cuBLAS for matrix multiplication
-// on modern hardware, using cuBLAS is recommended as it utilizes F16 tensor cores which are very performant
-// for large computational tasks. the drawback is that this requires some extra amount of VRAM:
-// - 7B quantum model: +100-200 MB
-// - 13B quantum model: +200-400 MB
-//
-//#define GGML_CUDA_FORCE_MMQ
-
-// TODO: improve this to be correct for more hardware
-//       for example, currently fails for GeForce GTX 1660 which is TURING arch (> VOLTA) but does not have tensor cores
-//       probably other such cases, and not sure what happens on AMD hardware
-#if !defined(GGML_CUDA_FORCE_MMQ)
-#define CUDA_USE_TENSOR_CORES
-#endif
-
-// max batch size to use MMQ kernels when tensor cores are available
-#define MMQ_MAX_BATCH_SIZE 32
-
 #if defined(GGML_USE_HIPBLAS)
 #define __CUDA_ARCH__ 1300
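// Sketch (not part of the patch; helper name is illustrative): the block removed above wired a
// compile-time policy. By default CUDA_USE_TENSOR_CORES was defined and the dispatch code further
// down in this diff kept MMQ only for small batches; building with -DGGML_CUDA_FORCE_MMQ disabled
// that and always used the MMQ kernels. Condensed, the old rule was roughly:
static bool old_prefer_mmq(bool tensor_cores_available, int64_t n_batch) {
    // with tensor cores (Volta or newer), fall back to cuBLAS once the batch exceeds
    // MMQ_MAX_BATCH_SIZE (32) rows; otherwise stay on the quantized MMQ kernels
    return !(tensor_cores_available && n_batch > 32 /* MMQ_MAX_BATCH_SIZE */);
}
// This patch drops the compile-time switch in favour of the runtime flag g_mul_mat_q added below.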
@@ -488,6 +470,7 @@ static int g_device_count = -1;
 static int g_main_device = 0;
 static int g_compute_capabilities[GGML_CUDA_MAX_DEVICES];
 static float g_tensor_split[GGML_CUDA_MAX_DEVICES] = {0};
+static bool g_mul_mat_q = true;

 static void * g_scratch_buffer = nullptr;
 static size_t g_scratch_size = 0; // disabled by default
@@ -3580,15 +3563,9 @@ static __device__ __forceinline__ void mul_mat_q(
 #define MMQ_X_Q4_0_RDNA1 64
 #define MMQ_Y_Q4_0_RDNA1 64
 #define NWARPS_Q4_0_RDNA1 8
-#if defined(CUDA_USE_TENSOR_CORES)
-#define MMQ_X_Q4_0_AMPERE 4
-#define MMQ_Y_Q4_0_AMPERE 32
-#define NWARPS_Q4_0_AMPERE 4
-#else
 #define MMQ_X_Q4_0_AMPERE 64
 #define MMQ_Y_Q4_0_AMPERE 128
 #define NWARPS_Q4_0_AMPERE 4
-#endif
 #define MMQ_X_Q4_0_PASCAL 64
 #define MMQ_Y_Q4_0_PASCAL 64
 #define NWARPS_Q4_0_PASCAL 8
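// Abridged sketch (not part of the diff; helper name is illustrative) of how the launchers in
// this file consume these tile constants. The device's compute capability picks one tile shape
// at runtime; the same pattern repeats for every quantization type in the hunks that follow.
// RDNA branches and the actual kernel launch are omitted for brevity.
static void pick_q4_0_tile(int compute_capability, int * mmq_x, int * mmq_y, int * nwarps) {
    if (compute_capability >= CC_VOLTA) {            // Volta, Turing, Ampere and newer
        *mmq_x = MMQ_X_Q4_0_AMPERE; *mmq_y = MMQ_Y_Q4_0_AMPERE; *nwarps = NWARPS_Q4_0_AMPERE;
    } else if (compute_capability >= MIN_CC_DP4A) {  // Pascal
        *mmq_x = MMQ_X_Q4_0_PASCAL; *mmq_y = MMQ_Y_Q4_0_PASCAL; *nwarps = NWARPS_Q4_0_PASCAL;
    }
}
// With the CUDA_USE_TENSOR_CORES variant removed, Volta and newer always get the large 64x128
// tile rather than the small 4x32 tile that was paired with the cuBLAS/tensor-core path.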
@@ -3647,15 +3624,9 @@ template <bool need_check> static __global__ void
 #define MMQ_X_Q4_1_RDNA1 64
 #define MMQ_Y_Q4_1_RDNA1 64
 #define NWARPS_Q4_1_RDNA1 8
-#if defined(CUDA_USE_TENSOR_CORES)
-#define MMQ_X_Q4_1_AMPERE 4
-#define MMQ_Y_Q4_1_AMPERE 32
-#define NWARPS_Q4_1_AMPERE 4
-#else
 #define MMQ_X_Q4_1_AMPERE 64
 #define MMQ_Y_Q4_1_AMPERE 128
 #define NWARPS_Q4_1_AMPERE 4
-#endif
 #define MMQ_X_Q4_1_PASCAL 64
 #define MMQ_Y_Q4_1_PASCAL 64
 #define NWARPS_Q4_1_PASCAL 8
@@ -3716,15 +3687,9 @@ template <bool need_check> static __global__ void
 #define MMQ_X_Q5_0_RDNA1 64
 #define MMQ_Y_Q5_0_RDNA1 64
 #define NWARPS_Q5_0_RDNA1 8
-#if defined(CUDA_USE_TENSOR_CORES)
-#define MMQ_X_Q5_0_AMPERE 4
-#define MMQ_Y_Q5_0_AMPERE 32
-#define NWARPS_Q5_0_AMPERE 4
-#else
 #define MMQ_X_Q5_0_AMPERE 128
 #define MMQ_Y_Q5_0_AMPERE 64
 #define NWARPS_Q5_0_AMPERE 4
-#endif
 #define MMQ_X_Q5_0_PASCAL 64
 #define MMQ_Y_Q5_0_PASCAL 64
 #define NWARPS_Q5_0_PASCAL 8
@@ -3783,15 +3748,9 @@ template <bool need_check> static __global__ void
 #define MMQ_X_Q5_1_RDNA1 64
 #define MMQ_Y_Q5_1_RDNA1 64
 #define NWARPS_Q5_1_RDNA1 8
-#if defined(CUDA_USE_TENSOR_CORES)
-#define MMQ_X_Q5_1_AMPERE 4
-#define MMQ_Y_Q5_1_AMPERE 32
-#define NWARPS_Q5_1_AMPERE 4
-#else
 #define MMQ_X_Q5_1_AMPERE 128
 #define MMQ_Y_Q5_1_AMPERE 64
 #define NWARPS_Q5_1_AMPERE 4
-#endif
 #define MMQ_X_Q5_1_PASCAL 64
 #define MMQ_Y_Q5_1_PASCAL 64
 #define NWARPS_Q5_1_PASCAL 8
@@ -3850,15 +3809,9 @@ mul_mat_q5_1(
 #define MMQ_X_Q8_0_RDNA1 64
 #define MMQ_Y_Q8_0_RDNA1 64
 #define NWARPS_Q8_0_RDNA1 8
-#if defined(CUDA_USE_TENSOR_CORES)
-#define MMQ_X_Q8_0_AMPERE 4
-#define MMQ_Y_Q8_0_AMPERE 32
-#define NWARPS_Q8_0_AMPERE 4
-#else
 #define MMQ_X_Q8_0_AMPERE 128
 #define MMQ_Y_Q8_0_AMPERE 64
 #define NWARPS_Q8_0_AMPERE 4
-#endif
 #define MMQ_X_Q8_0_PASCAL 64
 #define MMQ_Y_Q8_0_PASCAL 64
 #define NWARPS_Q8_0_PASCAL 8
@@ -3917,15 +3870,9 @@ template <bool need_check> static __global__ void
 #define MMQ_X_Q2_K_RDNA1 128
 #define MMQ_Y_Q2_K_RDNA1 32
 #define NWARPS_Q2_K_RDNA1 8
-#if defined(CUDA_USE_TENSOR_CORES)
-#define MMQ_X_Q2_K_AMPERE 4
-#define MMQ_Y_Q2_K_AMPERE 32
-#define NWARPS_Q2_K_AMPERE 4
-#else
 #define MMQ_X_Q2_K_AMPERE 64
 #define MMQ_Y_Q2_K_AMPERE 128
 #define NWARPS_Q2_K_AMPERE 4
-#endif
 #define MMQ_X_Q2_K_PASCAL 64
 #define MMQ_Y_Q2_K_PASCAL 64
 #define NWARPS_Q2_K_PASCAL 8
@@ -3984,15 +3931,9 @@ mul_mat_q2_K(
 #define MMQ_X_Q3_K_RDNA1 32
 #define MMQ_Y_Q3_K_RDNA1 128
 #define NWARPS_Q3_K_RDNA1 8
-#if defined(CUDA_USE_TENSOR_CORES)
-#define MMQ_X_Q3_K_AMPERE 4
-#define MMQ_Y_Q3_K_AMPERE 32
-#define NWARPS_Q3_K_AMPERE 4
-#else
 #define MMQ_X_Q3_K_AMPERE 128
 #define MMQ_Y_Q3_K_AMPERE 128
 #define NWARPS_Q3_K_AMPERE 4
-#endif
 #define MMQ_X_Q3_K_PASCAL 64
 #define MMQ_Y_Q3_K_PASCAL 64
 #define NWARPS_Q3_K_PASCAL 8
@@ -4053,15 +3994,9 @@ template <bool need_check> static __global__ void
 #define MMQ_X_Q4_K_RDNA1 32
 #define MMQ_Y_Q4_K_RDNA1 64
 #define NWARPS_Q4_K_RDNA1 8
-#if defined(CUDA_USE_TENSOR_CORES)
-#define MMQ_X_Q4_K_AMPERE 4
-#define MMQ_Y_Q4_K_AMPERE 32
-#define NWARPS_Q4_K_AMPERE 4
-#else
 #define MMQ_X_Q4_K_AMPERE 64
 #define MMQ_Y_Q4_K_AMPERE 128
 #define NWARPS_Q4_K_AMPERE 4
-#endif
 #define MMQ_X_Q4_K_PASCAL 64
 #define MMQ_Y_Q4_K_PASCAL 64
 #define NWARPS_Q4_K_PASCAL 8
@@ -4122,15 +4057,9 @@ template <bool need_check> static __global__ void
 #define MMQ_X_Q5_K_RDNA1 32
 #define MMQ_Y_Q5_K_RDNA1 64
 #define NWARPS_Q5_K_RDNA1 8
-#if defined(CUDA_USE_TENSOR_CORES)
-#define MMQ_X_Q5_K_AMPERE 4
-#define MMQ_Y_Q5_K_AMPERE 32
-#define NWARPS_Q5_K_AMPERE 4
-#else
 #define MMQ_X_Q5_K_AMPERE 64
 #define MMQ_Y_Q5_K_AMPERE 128
 #define NWARPS_Q5_K_AMPERE 4
-#endif
 #define MMQ_X_Q5_K_PASCAL 64
 #define MMQ_Y_Q5_K_PASCAL 64
 #define NWARPS_Q5_K_PASCAL 8
@@ -4189,15 +4118,9 @@ mul_mat_q5_K(
 #define MMQ_X_Q6_K_RDNA1 32
 #define MMQ_Y_Q6_K_RDNA1 64
 #define NWARPS_Q6_K_RDNA1 8
-#if defined(CUDA_USE_TENSOR_CORES)
-#define MMQ_X_Q6_K_AMPERE 4
-#define MMQ_Y_Q6_K_AMPERE 32
-#define NWARPS_Q6_K_AMPERE 4
-#else
 #define MMQ_X_Q6_K_AMPERE 64
 #define MMQ_Y_Q6_K_AMPERE 64
 #define NWARPS_Q6_K_AMPERE 4
-#endif
 #define MMQ_X_Q6_K_PASCAL 64
 #define MMQ_Y_Q6_K_PASCAL 64
 #define NWARPS_Q6_K_PASCAL 8
@@ -5805,16 +5728,6 @@ void ggml_init_cublas() {
         CUDA_CHECK(cudaGetDeviceCount(&g_device_count));
         GGML_ASSERT(g_device_count <= GGML_CUDA_MAX_DEVICES);
         int64_t total_vram = 0;
-#if defined(GGML_CUDA_FORCE_MMQ)
-        fprintf(stderr, "%s: GGML_CUDA_FORCE_MMQ:   yes\n", __func__);
-#else
-        fprintf(stderr, "%s: GGML_CUDA_FORCE_MMQ:   no\n", __func__);
-#endif
-#if defined(CUDA_USE_TENSOR_CORES)
-        fprintf(stderr, "%s: CUDA_USE_TENSOR_CORES: yes\n", __func__);
-#else
-        fprintf(stderr, "%s: CUDA_USE_TENSOR_CORES: no\n", __func__);
-#endif
         fprintf(stderr, "%s: found %d " GGML_CUDA_NAME " devices:\n", __func__, g_device_count);
         for (int id = 0; id < g_device_count; ++id) {
             cudaDeviceProp prop;
@@ -6502,7 +6415,7 @@ inline void ggml_cuda_op_mul_mat_cublas(
             cublasSgemm(g_cublas_handles[id], CUBLAS_OP_T, CUBLAS_OP_N,
                     row_diff, src1_ncols, ne10,
                     &alpha, src0_ddf_i, ne00,
-                    src1_ddf_i, ne10,
+                            src1_ddf_i, ne10,
                     &beta,  dst_dd_i,   ldc));

        if (src0_as != 0) {
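// Reading aid (not part of the diff): the arguments above map onto the standard
//     cublasSgemm(handle, transa, transb, m, n, k, &alpha, A, lda, B, ldb, &beta, C, ldc)
// as m = row_diff, n = src1_ncols, k = ne10, A = src0_ddf_i with lda = ne00 (transposed),
// B = src1_ddf_i with ldb = ne10, and C = dst_dd_i with leading dimension ldc.
// The -/+ pair above appears to differ only in continuation-line alignment.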
@@ -7250,7 +7163,6 @@ __global__ void k_compute_batched_ptrs(
 static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
     GGML_ASSERT(!ggml_is_transposed(src0));
     GGML_ASSERT(!ggml_is_transposed(src1));
-
     GGML_ASSERT(src0->backend != GGML_BACKEND_GPU_SPLIT);
     GGML_ASSERT(src0->type == GGML_TYPE_F16);
     GGML_ASSERT(src1->type == GGML_TYPE_F32);
@@ -7388,24 +7300,17 @@ static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const
 }

 static void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
-    const bool all_on_device =
-        (src0->backend == GGML_BACKEND_GPU) &&
-        (src1->backend == GGML_BACKEND_GPU) &&
-        ( dst->backend == GGML_BACKEND_GPU);
+    bool all_on_device = (src0->backend == GGML_BACKEND_GPU || src0->backend == GGML_BACKEND_GPU_SPLIT) &&
+        src1->backend == GGML_BACKEND_GPU && dst->backend == GGML_BACKEND_GPU;

     int64_t min_compute_capability = INT_MAX;
     for (int64_t id = 0; id < g_device_count; ++id) {
-        if (min_compute_capability > g_compute_capabilities[id] && g_tensor_split[id] < (id + 1 < g_device_count ? g_tensor_split[id + 1] : 1.0f)) {
+        if (min_compute_capability > g_compute_capabilities[id]
+                && g_tensor_split[id] < (id + 1 < g_device_count ? g_tensor_split[id + 1] : 1.0f)) {
             min_compute_capability = g_compute_capabilities[id];
         }
     }

-#ifdef CUDA_USE_TENSOR_CORES
-    const bool use_tensor_cores = true;
-#else
-    const bool use_tensor_cores = false;
-#endif
-
     // debug helpers
     //printf("src0: %8d %8d %8d %8d\n", src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3]);
     //printf("      %8d %8d %8d %8d\n", src0->nb[0], src0->nb[1], src0->nb[2], src0->nb[3]);
@@ -7414,19 +7319,20 @@ static void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1
     //printf("src0 is contiguous %d, transposed %d, type = %s, name = %s\n", ggml_is_contiguous(src0), ggml_is_transposed(src0), ggml_type_name(src0->type), src0->name);
     //printf("src1 is contiguous %d, transposed %d, type = %s, name = %s\n", ggml_is_contiguous(src1), ggml_is_transposed(src1), ggml_type_name(src1->type), src1->name);

-    if (all_on_device && !use_tensor_cores && src0->type == GGML_TYPE_F16 && ggml_is_permuted(src0) && ggml_is_permuted(src1) && src1->ne[1] == 1) {
+    if (all_on_device && src0->type == GGML_TYPE_F16 && ggml_is_permuted(src0) && ggml_is_permuted(src1) && src1->ne[1] == 1) {
         // KQ single-batch
         ggml_cuda_mul_mat_vec_p021(src0, src1, dst);
-    } else if (all_on_device && !use_tensor_cores && src0->type == GGML_TYPE_F16 && !ggml_is_contiguous(src0) && !ggml_is_transposed(src1) && src1->ne[1] == 1) {
+    } else if (all_on_device && src0->type == GGML_TYPE_F16 && !ggml_is_contiguous(src0) && !ggml_is_transposed(src1) && src1->ne[1] == 1) {
         // KQV single-batch
         ggml_cuda_mul_mat_vec_nc(src0, src1, dst);
-    } else if (all_on_device && src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F32 && !ggml_is_transposed(src0) && !ggml_is_transposed(src1)) {
+    } else if (all_on_device && src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F32 && !ggml_is_transposed(src0) && !ggml_is_transposed(src1) && src1->ne[2]*src1->ne[3] > 1) {
         // KQ + KQV multi-batch
         ggml_cuda_mul_mat_mat_batched_cublas(src0, src1, dst);
     } else if (src0->type == GGML_TYPE_F32) {
         ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_mul_mat_cublas, false);
     } else if (ggml_is_quantized(src0->type) || src0->type == GGML_TYPE_F16) {
         if (src1->ne[1] == 1 && src0->ne[0] % GGML_CUDA_DMMV_X == 0) {
+
 #ifdef GGML_CUDA_FORCE_DMMV
             const bool use_mul_mat_vec_q = false;
 #else
@@ -7439,15 +7345,7 @@ static void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1
                 ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_dequantize_mul_mat_vec, false);
             }
         } else {
-            bool use_mul_mat_q = min_compute_capability >= MIN_CC_DP4A && ggml_is_quantized(src0->type);
-
-            // when tensor cores are available, use them for large batch size
-            // ref: https://github.com/ggerganov/llama.cpp/pull/3776
-            if (use_tensor_cores && min_compute_capability >= CC_VOLTA && src1->ne[1] > MMQ_MAX_BATCH_SIZE) {
-                use_mul_mat_q = false;
-            }
-
-            if (use_mul_mat_q) {
+            if (g_mul_mat_q && ggml_is_quantized(src0->type) && min_compute_capability >= MIN_CC_DP4A) {
                 ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_mul_mat_q, true);
             } else {
                 ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_mul_mat_cublas, false);
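// Worked example (illustrative numbers, not from the patch): a quantized Q4_0 weight matrix
// multiplied against a 512-token batch (src1->ne[1] == 512) on an RTX 3090 (compute capability
// 8.6, so min_compute_capability exceeds both CC_VOLTA and MIN_CC_DP4A).
//   before this change: use_tensor_cores was true and 512 > MMQ_MAX_BATCH_SIZE (32), so
//                       use_mul_mat_q was forced to false and the cuBLAS path ran;
//   after this change:  the branch reduces to the runtime flag, so with the default
//                       g_mul_mat_q == true the MMQ kernel runs, and callers that want cuBLAS
//                       for large batches must request it via ggml_cuda_set_mul_mat_q(false).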
@@ -7801,6 +7699,10 @@ void ggml_cuda_set_main_device(const int main_device) {
     }
 }

+void ggml_cuda_set_mul_mat_q(const bool mul_mat_q) {
+    g_mul_mat_q = mul_mat_q;
+}
+
 void ggml_cuda_set_scratch_size(const size_t scratch_size) {
     // this is a hack to not completely break llama.cpp when using multiple models or contexts simultaneously
     // it still won't always work as expected, but it's better than nothing
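// Minimal usage sketch for the restored setter (assumptions: the declaration is exported via
// ggml-cuda.h and the caller keeps a mul_mat_q flag among its parameters, as llama.cpp of this
// vintage did; neither is shown in this excerpt). Typically called once during context setup,
// before any graph is evaluated:
#ifdef GGML_USE_CUBLAS
    // true (the default): use the quantized MMQ kernels; false: route through cuBLAS instead
    ggml_cuda_set_mul_mat_q(params.mul_mat_q);
#endif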