
Commit ed2d4bb

Revert "cuda : improve text-generation and batched decoding performance (ggml-org#3776)"
This commit introduces a performance regression on my Tesla P40.
1 parent 5c23503 commit ed2d4bb

5 files changed (+18 -124 lines)

CMakeLists.txt

-7 lines

@@ -82,7 +82,6 @@ set(LLAMA_BLAS_VENDOR "Generic" CACHE STRING "llama: BLAS library vendor")
 option(LLAMA_CUBLAS "llama: use CUDA" OFF)
 #option(LLAMA_CUDA_CUBLAS "llama: use cuBLAS for prompt processing" OFF)
 option(LLAMA_CUDA_FORCE_DMMV "llama: use dmmv instead of mmvq CUDA kernels" OFF)
-option(LLAMA_CUDA_FORCE_MMQ "llama: use mmq kernels instead of cuBLAS" OFF)
 set(LLAMA_CUDA_DMMV_X "32" CACHE STRING "llama: x stride for dmmv CUDA kernels")
 set(LLAMA_CUDA_MMV_Y "1" CACHE STRING "llama: y block size for mmv CUDA kernels")
 option(LLAMA_CUDA_F16 "llama: use 16 bit floats for some calculations" OFF)
@@ -300,9 +299,6 @@ if (LLAMA_CUBLAS)
     if (LLAMA_CUDA_FORCE_DMMV)
         add_compile_definitions(GGML_CUDA_FORCE_DMMV)
     endif()
-    if (LLAMA_CUDA_FORCE_MMQ)
-        add_compile_definitions(GGML_CUDA_FORCE_MMQ)
-    endif()
     add_compile_definitions(GGML_CUDA_DMMV_X=${LLAMA_CUDA_DMMV_X})
     add_compile_definitions(GGML_CUDA_MMV_Y=${LLAMA_CUDA_MMV_Y})
     if (DEFINED LLAMA_CUDA_DMMV_Y)
@@ -403,9 +399,6 @@ if (LLAMA_HIPBLAS)
     if (LLAMA_CUDA_FORCE_DMMV)
         target_compile_definitions(ggml-rocm PRIVATE GGML_CUDA_FORCE_DMMV)
     endif()
-    if (LLAMA_CUDA_FORCE_MMQ)
-        target_compile_definitions(ggml-rocm PRIVATE GGML_CUDA_FORCE_MMQ)
-    endif()
     target_compile_definitions(ggml-rocm PRIVATE GGML_CUDA_DMMV_X=${LLAMA_CUDA_DMMV_X})
     target_compile_definitions(ggml-rocm PRIVATE GGML_CUDA_MMV_Y=${LLAMA_CUDA_MMV_Y})
     target_compile_definitions(ggml-rocm PRIVATE K_QUANTS_PER_ITERATION=${LLAMA_CUDA_KQUANTS_ITER})

Makefile

-3 lines

@@ -396,9 +396,6 @@ endif # CUDA_DOCKER_ARCH
 ifdef LLAMA_CUDA_FORCE_DMMV
   NVCCFLAGS += -DGGML_CUDA_FORCE_DMMV
 endif # LLAMA_CUDA_FORCE_DMMV
-ifdef LLAMA_CUDA_FORCE_MMQ
-  NVCCFLAGS += -DGGML_CUDA_FORCE_MMQ
-endif # LLAMA_CUDA_FORCE_MMQ
 ifdef LLAMA_CUDA_DMMV_X
   NVCCFLAGS += -DGGML_CUDA_DMMV_X=$(LLAMA_CUDA_DMMV_X)
 else

ggml-cuda.cu

+15 -113 lines

@@ -87,24 +87,6 @@
 #define CC_OFFSET_AMD 1000000
 #define CC_RDNA2 (CC_OFFSET_AMD + 1030)
 
-// define this if you want to always fallback to MMQ kernels and not use cuBLAS for matrix multiplication
-// on modern hardware, using cuBLAS is recommended as it utilizes F16 tensor cores which are very performant
-// for large computational tasks. the drawback is that this requires some extra amount of VRAM:
-// - 7B quantum model: +100-200 MB
-// - 13B quantum model: +200-400 MB
-//
-//#define GGML_CUDA_FORCE_MMQ
-
-// TODO: improve this to be correct for more hardware
-// for example, currently fails for GeForce GTX 1660 which is TURING arch (> VOLTA) but does not have tensor cores
-// probably other such cases, and not sure what happens on AMD hardware
-#if !defined(GGML_CUDA_FORCE_MMQ)
-#define CUDA_USE_TENSOR_CORES
-#endif
-
-// max batch size to use MMQ kernels when tensor cores are available
-#define MMQ_MAX_BATCH_SIZE 32
-
 #if defined(GGML_USE_HIPBLAS)
 #define __CUDA_ARCH__ 1300
 
@@ -488,6 +470,7 @@ static int g_device_count = -1;
 static int g_main_device = 0;
 static int g_compute_capabilities[GGML_CUDA_MAX_DEVICES];
 static float g_tensor_split[GGML_CUDA_MAX_DEVICES] = {0};
+static bool g_mul_mat_q = true;
 
 static void * g_scratch_buffer = nullptr;
 static size_t g_scratch_size = 0; // disabled by default
@@ -3580,15 +3563,9 @@ static __device__ __forceinline__ void mul_mat_q(
 #define MMQ_X_Q4_0_RDNA1 64
 #define MMQ_Y_Q4_0_RDNA1 64
 #define NWARPS_Q4_0_RDNA1 8
-#if defined(CUDA_USE_TENSOR_CORES)
-#define MMQ_X_Q4_0_AMPERE 4
-#define MMQ_Y_Q4_0_AMPERE 32
-#define NWARPS_Q4_0_AMPERE 4
-#else
 #define MMQ_X_Q4_0_AMPERE 64
 #define MMQ_Y_Q4_0_AMPERE 128
 #define NWARPS_Q4_0_AMPERE 4
-#endif
 #define MMQ_X_Q4_0_PASCAL 64
 #define MMQ_Y_Q4_0_PASCAL 64
 #define NWARPS_Q4_0_PASCAL 8
@@ -3647,15 +3624,9 @@ template <bool need_check> static __global__ void
 #define MMQ_X_Q4_1_RDNA1 64
 #define MMQ_Y_Q4_1_RDNA1 64
 #define NWARPS_Q4_1_RDNA1 8
-#if defined(CUDA_USE_TENSOR_CORES)
-#define MMQ_X_Q4_1_AMPERE 4
-#define MMQ_Y_Q4_1_AMPERE 32
-#define NWARPS_Q4_1_AMPERE 4
-#else
 #define MMQ_X_Q4_1_AMPERE 64
 #define MMQ_Y_Q4_1_AMPERE 128
 #define NWARPS_Q4_1_AMPERE 4
-#endif
 #define MMQ_X_Q4_1_PASCAL 64
 #define MMQ_Y_Q4_1_PASCAL 64
 #define NWARPS_Q4_1_PASCAL 8
@@ -3716,15 +3687,9 @@ template <bool need_check> static __global__ void
 #define MMQ_X_Q5_0_RDNA1 64
 #define MMQ_Y_Q5_0_RDNA1 64
 #define NWARPS_Q5_0_RDNA1 8
-#if defined(CUDA_USE_TENSOR_CORES)
-#define MMQ_X_Q5_0_AMPERE 4
-#define MMQ_Y_Q5_0_AMPERE 32
-#define NWARPS_Q5_0_AMPERE 4
-#else
 #define MMQ_X_Q5_0_AMPERE 128
 #define MMQ_Y_Q5_0_AMPERE 64
 #define NWARPS_Q5_0_AMPERE 4
-#endif
 #define MMQ_X_Q5_0_PASCAL 64
 #define MMQ_Y_Q5_0_PASCAL 64
 #define NWARPS_Q5_0_PASCAL 8
@@ -3783,15 +3748,9 @@ template <bool need_check> static __global__ void
 #define MMQ_X_Q5_1_RDNA1 64
 #define MMQ_Y_Q5_1_RDNA1 64
 #define NWARPS_Q5_1_RDNA1 8
-#if defined(CUDA_USE_TENSOR_CORES)
-#define MMQ_X_Q5_1_AMPERE 4
-#define MMQ_Y_Q5_1_AMPERE 32
-#define NWARPS_Q5_1_AMPERE 4
-#else
 #define MMQ_X_Q5_1_AMPERE 128
 #define MMQ_Y_Q5_1_AMPERE 64
 #define NWARPS_Q5_1_AMPERE 4
-#endif
 #define MMQ_X_Q5_1_PASCAL 64
 #define MMQ_Y_Q5_1_PASCAL 64
 #define NWARPS_Q5_1_PASCAL 8
@@ -3850,15 +3809,9 @@ mul_mat_q5_1(
 #define MMQ_X_Q8_0_RDNA1 64
 #define MMQ_Y_Q8_0_RDNA1 64
 #define NWARPS_Q8_0_RDNA1 8
-#if defined(CUDA_USE_TENSOR_CORES)
-#define MMQ_X_Q8_0_AMPERE 4
-#define MMQ_Y_Q8_0_AMPERE 32
-#define NWARPS_Q8_0_AMPERE 4
-#else
 #define MMQ_X_Q8_0_AMPERE 128
 #define MMQ_Y_Q8_0_AMPERE 64
 #define NWARPS_Q8_0_AMPERE 4
-#endif
 #define MMQ_X_Q8_0_PASCAL 64
 #define MMQ_Y_Q8_0_PASCAL 64
 #define NWARPS_Q8_0_PASCAL 8
@@ -3917,15 +3870,9 @@ template <bool need_check> static __global__ void
 #define MMQ_X_Q2_K_RDNA1 128
 #define MMQ_Y_Q2_K_RDNA1 32
 #define NWARPS_Q2_K_RDNA1 8
-#if defined(CUDA_USE_TENSOR_CORES)
-#define MMQ_X_Q2_K_AMPERE 4
-#define MMQ_Y_Q2_K_AMPERE 32
-#define NWARPS_Q2_K_AMPERE 4
-#else
 #define MMQ_X_Q2_K_AMPERE 64
 #define MMQ_Y_Q2_K_AMPERE 128
 #define NWARPS_Q2_K_AMPERE 4
-#endif
 #define MMQ_X_Q2_K_PASCAL 64
 #define MMQ_Y_Q2_K_PASCAL 64
 #define NWARPS_Q2_K_PASCAL 8
@@ -3984,15 +3931,9 @@ mul_mat_q2_K(
 #define MMQ_X_Q3_K_RDNA1 32
 #define MMQ_Y_Q3_K_RDNA1 128
 #define NWARPS_Q3_K_RDNA1 8
-#if defined(CUDA_USE_TENSOR_CORES)
-#define MMQ_X_Q3_K_AMPERE 4
-#define MMQ_Y_Q3_K_AMPERE 32
-#define NWARPS_Q3_K_AMPERE 4
-#else
 #define MMQ_X_Q3_K_AMPERE 128
 #define MMQ_Y_Q3_K_AMPERE 128
 #define NWARPS_Q3_K_AMPERE 4
-#endif
 #define MMQ_X_Q3_K_PASCAL 64
 #define MMQ_Y_Q3_K_PASCAL 64
 #define NWARPS_Q3_K_PASCAL 8
@@ -4053,15 +3994,9 @@ template <bool need_check> static __global__ void
 #define MMQ_X_Q4_K_RDNA1 32
 #define MMQ_Y_Q4_K_RDNA1 64
 #define NWARPS_Q4_K_RDNA1 8
-#if defined(CUDA_USE_TENSOR_CORES)
-#define MMQ_X_Q4_K_AMPERE 4
-#define MMQ_Y_Q4_K_AMPERE 32
-#define NWARPS_Q4_K_AMPERE 4
-#else
 #define MMQ_X_Q4_K_AMPERE 64
 #define MMQ_Y_Q4_K_AMPERE 128
 #define NWARPS_Q4_K_AMPERE 4
-#endif
 #define MMQ_X_Q4_K_PASCAL 64
 #define MMQ_Y_Q4_K_PASCAL 64
 #define NWARPS_Q4_K_PASCAL 8
@@ -4122,15 +4057,9 @@ template <bool need_check> static __global__ void
 #define MMQ_X_Q5_K_RDNA1 32
 #define MMQ_Y_Q5_K_RDNA1 64
 #define NWARPS_Q5_K_RDNA1 8
-#if defined(CUDA_USE_TENSOR_CORES)
-#define MMQ_X_Q5_K_AMPERE 4
-#define MMQ_Y_Q5_K_AMPERE 32
-#define NWARPS_Q5_K_AMPERE 4
-#else
 #define MMQ_X_Q5_K_AMPERE 64
 #define MMQ_Y_Q5_K_AMPERE 128
 #define NWARPS_Q5_K_AMPERE 4
-#endif
 #define MMQ_X_Q5_K_PASCAL 64
 #define MMQ_Y_Q5_K_PASCAL 64
 #define NWARPS_Q5_K_PASCAL 8
@@ -4189,15 +4118,9 @@ mul_mat_q5_K(
 #define MMQ_X_Q6_K_RDNA1 32
 #define MMQ_Y_Q6_K_RDNA1 64
 #define NWARPS_Q6_K_RDNA1 8
-#if defined(CUDA_USE_TENSOR_CORES)
-#define MMQ_X_Q6_K_AMPERE 4
-#define MMQ_Y_Q6_K_AMPERE 32
-#define NWARPS_Q6_K_AMPERE 4
-#else
 #define MMQ_X_Q6_K_AMPERE 64
 #define MMQ_Y_Q6_K_AMPERE 64
 #define NWARPS_Q6_K_AMPERE 4
-#endif
 #define MMQ_X_Q6_K_PASCAL 64
 #define MMQ_Y_Q6_K_PASCAL 64
 #define NWARPS_Q6_K_PASCAL 8
@@ -5805,16 +5728,6 @@ void ggml_init_cublas() {
     CUDA_CHECK(cudaGetDeviceCount(&g_device_count));
     GGML_ASSERT(g_device_count <= GGML_CUDA_MAX_DEVICES);
     int64_t total_vram = 0;
-#if defined(GGML_CUDA_FORCE_MMQ)
-    fprintf(stderr, "%s: GGML_CUDA_FORCE_MMQ: yes\n", __func__);
-#else
-    fprintf(stderr, "%s: GGML_CUDA_FORCE_MMQ: no\n", __func__);
-#endif
-#if defined(CUDA_USE_TENSOR_CORES)
-    fprintf(stderr, "%s: CUDA_USE_TENSOR_CORES: yes\n", __func__);
-#else
-    fprintf(stderr, "%s: CUDA_USE_TENSOR_CORES: no\n", __func__);
-#endif
     fprintf(stderr, "%s: found %d " GGML_CUDA_NAME " devices:\n", __func__, g_device_count);
     for (int id = 0; id < g_device_count; ++id) {
         cudaDeviceProp prop;
@@ -6502,7 +6415,7 @@ inline void ggml_cuda_op_mul_mat_cublas(
         cublasSgemm(g_cublas_handles[id], CUBLAS_OP_T, CUBLAS_OP_N,
                 row_diff, src1_ncols, ne10,
                 &alpha, src0_ddf_i, ne00,
-                src1_ddf_i, ne10,
+                        src1_ddf_i, ne10,
                 &beta, dst_dd_i, ldc));
 
         if (src0_as != 0) {
@@ -7250,7 +7163,6 @@ __global__ void k_compute_batched_ptrs(
 static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
     GGML_ASSERT(!ggml_is_transposed(src0));
     GGML_ASSERT(!ggml_is_transposed(src1));
-
     GGML_ASSERT(src0->backend != GGML_BACKEND_GPU_SPLIT);
     GGML_ASSERT(src0->type == GGML_TYPE_F16);
     GGML_ASSERT(src1->type == GGML_TYPE_F32);
@@ -7388,24 +7300,17 @@ static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const
 }
 
 static void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
-    const bool all_on_device =
-        (src0->backend == GGML_BACKEND_GPU) &&
-        (src1->backend == GGML_BACKEND_GPU) &&
-        ( dst->backend == GGML_BACKEND_GPU);
+    bool all_on_device = (src0->backend == GGML_BACKEND_GPU || src0->backend == GGML_BACKEND_GPU_SPLIT) &&
+        src1->backend == GGML_BACKEND_GPU && dst->backend == GGML_BACKEND_GPU;
 
     int64_t min_compute_capability = INT_MAX;
     for (int64_t id = 0; id < g_device_count; ++id) {
-        if (min_compute_capability > g_compute_capabilities[id] && g_tensor_split[id] < (id + 1 < g_device_count ? g_tensor_split[id + 1] : 1.0f)) {
+        if (min_compute_capability > g_compute_capabilities[id]
+                && g_tensor_split[id] < (id + 1 < g_device_count ? g_tensor_split[id + 1] : 1.0f)) {
            min_compute_capability = g_compute_capabilities[id];
        }
    }
 
-#ifdef CUDA_USE_TENSOR_CORES
-    const bool use_tensor_cores = true;
-#else
-    const bool use_tensor_cores = false;
-#endif
-
     // debug helpers
     //printf("src0: %8d %8d %8d %8d\n", src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3]);
     //printf(" %8d %8d %8d %8d\n", src0->nb[0], src0->nb[1], src0->nb[2], src0->nb[3]);
@@ -7414,19 +7319,20 @@ static void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1
     //printf("src0 is contiguous %d, transposed %d, type = %s, name = %s\n", ggml_is_contiguous(src0), ggml_is_transposed(src0), ggml_type_name(src0->type), src0->name);
     //printf("src1 is contiguous %d, transposed %d, type = %s, name = %s\n", ggml_is_contiguous(src1), ggml_is_transposed(src1), ggml_type_name(src1->type), src1->name);
 
-    if (all_on_device && !use_tensor_cores && src0->type == GGML_TYPE_F16 && ggml_is_permuted(src0) && ggml_is_permuted(src1) && src1->ne[1] == 1) {
+    if (all_on_device && src0->type == GGML_TYPE_F16 && ggml_is_permuted(src0) && ggml_is_permuted(src1) && src1->ne[1] == 1) {
         // KQ single-batch
         ggml_cuda_mul_mat_vec_p021(src0, src1, dst);
-    } else if (all_on_device && !use_tensor_cores && src0->type == GGML_TYPE_F16 && !ggml_is_contiguous(src0) && !ggml_is_transposed(src1) && src1->ne[1] == 1) {
+    } else if (all_on_device && src0->type == GGML_TYPE_F16 && !ggml_is_contiguous(src0) && !ggml_is_transposed(src1) && src1->ne[1] == 1) {
         // KQV single-batch
         ggml_cuda_mul_mat_vec_nc(src0, src1, dst);
-    } else if (all_on_device && src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F32 && !ggml_is_transposed(src0) && !ggml_is_transposed(src1)) {
+    } else if (all_on_device && src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F32 && !ggml_is_transposed(src0) && !ggml_is_transposed(src1) && src1->ne[2]*src1->ne[3] > 1) {
         // KQ + KQV multi-batch
         ggml_cuda_mul_mat_mat_batched_cublas(src0, src1, dst);
     } else if (src0->type == GGML_TYPE_F32) {
         ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_mul_mat_cublas, false);
     } else if (ggml_is_quantized(src0->type) || src0->type == GGML_TYPE_F16) {
         if (src1->ne[1] == 1 && src0->ne[0] % GGML_CUDA_DMMV_X == 0) {
+
 #ifdef GGML_CUDA_FORCE_DMMV
             const bool use_mul_mat_vec_q = false;
 #else
@@ -7439,15 +7345,7 @@ static void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1
             ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_dequantize_mul_mat_vec, false);
         }
     } else {
-        bool use_mul_mat_q = min_compute_capability >= MIN_CC_DP4A && ggml_is_quantized(src0->type);
-
-        // when tensor cores are available, use them for large batch size
-        // ref: https://github.com/ggerganov/llama.cpp/pull/3776
-        if (use_tensor_cores && min_compute_capability >= CC_VOLTA && src1->ne[1] > MMQ_MAX_BATCH_SIZE) {
-            use_mul_mat_q = false;
-        }
-
-        if (use_mul_mat_q) {
+        if (g_mul_mat_q && ggml_is_quantized(src0->type) && min_compute_capability >= MIN_CC_DP4A) {
             ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_mul_mat_q, true);
         } else {
             ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_mul_mat_cublas, false);
@@ -7801,6 +7699,10 @@ void ggml_cuda_set_main_device(const int main_device) {
     }
 }
 
+void ggml_cuda_set_mul_mat_q(const bool mul_mat_q) {
+    g_mul_mat_q = mul_mat_q;
+}
+
 void ggml_cuda_set_scratch_size(const size_t scratch_size) {
     // this is a hack to not completely break llama.cpp when using multiple models or contexts simultaneously
     // it still won't always work as expected, but it's better than nothing
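The functional core of the ggml-cuda.cu revert is the kernel-selection change in ggml_cuda_mul_mat: the compile-time tensor-core heuristic (skip MMQ for batches larger than MMQ_MAX_BATCH_SIZE) is removed, and the runtime flag g_mul_mat_q decides again. The sketch below is a condensed, self-contained C++ illustration of that difference only; it is not code from the repository, and the CC_VOLTA / MIN_CC_DP4A values are assumptions for illustration.

// Condensed sketch of the dispatch change reverted above, NOT the actual ggml-cuda.cu code.
// Before the revert: with CUDA_USE_TENSOR_CORES compiled in, batches larger than
// MMQ_MAX_BATCH_SIZE were routed to cuBLAS. After the revert: the runtime flag
// g_mul_mat_q (set via ggml_cuda_set_mul_mat_q) decides again, regardless of batch size.
#include <cstdio>

namespace before_revert {
    constexpr int MMQ_MAX_BATCH_SIZE = 32;  // from the removed #define
    constexpr int CC_VOLTA           = 700; // assumed value, for illustration only
    constexpr int MIN_CC_DP4A        = 610; // assumed value, for illustration only

    // use_tensor_cores stands in for the removed CUDA_USE_TENSOR_CORES compile-time flag
    const char * pick_kernel(bool src0_quantized, int min_cc, int batch_size, bool use_tensor_cores) {
        bool use_mul_mat_q = min_cc >= MIN_CC_DP4A && src0_quantized;
        if (use_tensor_cores && min_cc >= CC_VOLTA && batch_size > MMQ_MAX_BATCH_SIZE) {
            use_mul_mat_q = false; // large batches fall back to cuBLAS (F16 tensor cores)
        }
        return use_mul_mat_q ? "mul_mat_q" : "cublas";
    }
}

namespace after_revert {
    constexpr int MIN_CC_DP4A = 610;        // assumed value, for illustration only
    bool g_mul_mat_q = true;                // restored runtime switch

    const char * pick_kernel(bool src0_quantized, int min_cc, int /*batch_size*/) {
        return (g_mul_mat_q && src0_quantized && min_cc >= MIN_CC_DP4A) ? "mul_mat_q" : "cublas";
    }
}

int main() {
    // Quantized weights, Volta-or-newer GPU, prompt batch of 512 tokens:
    std::printf("before revert: %s\n", before_revert::pick_kernel(true, 700, 512, /*use_tensor_cores=*/true)); // cublas
    std::printf("after  revert: %s\n", after_revert::pick_kernel(true, 700, 512));                             // mul_mat_q
    return 0;
}

The sketch covers only the MMQ/cuBLAS choice; the revert also restores the all_on_device test and narrows the batched-cuBLAS path to src1->ne[2]*src1->ne[3] > 1, as visible in the hunks above.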

llama.cpp

+2 lines

@@ -5138,6 +5138,8 @@ static int llama_decode_internal(
         }
     }
 
+    ggml_cuda_set_mul_mat_q(cparams.mul_mat_q);
+
     // HACK: ggml-alloc may change the tensor backend when reusing a parent, so force output to be on the CPU here if needed
     if (!lctx.embedding.empty()) {
         embeddings->backend = GGML_BACKEND_CPU;

llama.h

+1 -1 lines

@@ -192,7 +192,7 @@ extern "C" {
         uint32_t yarn_orig_ctx; // YaRN original context size
 
         // Keep the booleans together to avoid misalignment during copy-by-value.
-        bool mul_mat_q;  // if true, use experimental mul_mat_q kernels (DEPRECATED - always true)
+        bool mul_mat_q;  // if true, use experimental mul_mat_q kernels
         bool f16_kv;     // use fp16 for KV cache, fp32 otherwise
         bool logits_all; // the llama_eval() call computes all logits, not just the last one
         bool embedding;  // embedding mode only
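With the flag honored at runtime again, callers control it through llama_context_params; per the llama.cpp hunk above, llama_decode_internal forwards it to ggml_cuda_set_mul_mat_q. Below is a minimal usage sketch, assuming the llama.h API of this period (separate llama_model_params / llama_context_params); the model path and layer count are placeholders.

// Minimal usage sketch, not part of the commit. Assumes the llama.h API of this era.
#include "llama.h"

int main() {
    llama_backend_init(false);                 // numa = false (signature of this period)

    llama_model_params mparams = llama_model_default_params();
    mparams.n_gpu_layers = 99;                 // offload all layers so the CUDA path is exercised

    llama_model * model = llama_load_model_from_file("model.gguf", mparams); // placeholder path
    if (model == nullptr) {
        return 1;
    }

    llama_context_params cparams = llama_context_default_params();
    cparams.mul_mat_q = true;                  // request the quantized mul_mat_q kernels instead of cuBLAS

    llama_context * ctx = llama_new_context_with_model(model, cparams);

    // ... build a batch and call llama_decode() as usual ...

    llama_free(ctx);
    llama_free_model(model);
    llama_backend_free();
    return 0;
}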
