Commit d896ebf

Adapted pull request 1862 from ikawrakow for ggllm.cpp (ggml-org#23)
Performance increase of 6-7% on K-type quants (40B model only)
Parent: f6ba918

File tree: 2 files changed (+45, -17 lines)
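
In short: the single per-device CUDA stream becomes a pool of GGML_CUDA_MAX_STREAMS compute streams plus dedicated src1-copy streams and events, so uploading the next input chunk can overlap with computation on earlier chunks. A minimal standalone sketch of the record/wait pattern the diff wires into ggml_cuda_op (illustrative kernel and names, not ggllm.cpp code):

    // Sketch: upload on a dedicated copy stream, record an event, and make the
    // compute stream wait on that event; the copy overlaps with compute work
    // already queued on stream_main, and the host never blocks.
    #include <cuda_runtime.h>

    __global__ void scale(const float * in, float * out, int n) {
        int i = blockIdx.x*blockDim.x + threadIdx.x;
        if (i < n) out[i] = 2.0f*in[i];
    }

    void queue_chunk(const float * host_in, float * dev_in, float * dev_out, int n,
                     cudaStream_t stream_main, cudaStream_t stream_copy, cudaEvent_t copy_done) {
        cudaMemcpyAsync(dev_in, host_in, n*sizeof(float), cudaMemcpyHostToDevice, stream_copy);
        cudaEventRecord(copy_done, stream_copy);
        // device-side dependency only: stream_main waits, the host keeps queuing
        cudaStreamWaitEvent(stream_main, copy_done, 0);
        scale<<<(n + 255)/256, 256, 0, stream_main>>>(dev_in, dev_out, n);
    }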

ggml-cuda.cu (+41, -16)

@@ -29,7 +29,7 @@ static_assert(sizeof(half) == sizeof(ggml_fp16_t), "wrong fp16 size");
     } \
 } while (0)
 
-#if CUDART_VERSION >= 12000
+#if CUDART_VERSION >= 12
 #define CUBLAS_CHECK(err) \
     do { \
         cublasStatus_t err_ = (err); \
@@ -1503,13 +1503,19 @@ static void * g_scratch_buffer = nullptr;
 static size_t g_scratch_size = 1024*1024*1024; // 1 GB by default
 static size_t g_scratch_offset = 0;
 
+#define GGML_CUDA_MAX_STREAMS 8 // Set this to 1 for reproducible matrix multiplication.
+#define GGML_CUDA_MAX_EVENTS 64
+
 // Note: tensor_split defines the breakpoints of tensors that can be split {0,0.5}
 static float g_tensor_split[GGML_CUDA_MAX_DEVICES] = {0};
 static GPUStatus g_system_gpu_status;
 
 static cublasHandle_t g_cublas_handles[GGML_CUDA_MAX_DEVICES] = {nullptr};
 
-static cudaStream_t g_cudaStreams_main[GGML_CUDA_MAX_DEVICES] = { nullptr };
+static cudaStream_t g_cudaStreams_main[GGML_CUDA_MAX_DEVICES][GGML_CUDA_MAX_STREAMS] = { nullptr };
+
+static cudaStream_t g_cudaStreams_memcpy_src1[GGML_CUDA_MAX_DEVICES][GGML_CUDA_MAX_STREAMS] = { nullptr };
+static cudaEvent_t g_cudaEvents_memcpy_src1[GGML_CUDA_MAX_DEVICES][GGML_CUDA_MAX_EVENTS] = { nullptr };
 
 // Todo verify: free and total memory reported by cudaMemGetInfo differs from gpu_z which also differs from hwinfo64.
 // Update the system status about available GPUs and memory usage
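
The handle arrays gain a second dimension: each device now owns a pool of streams, which later hunks index round-robin by the chunk counter i0. A sketch of that selection (hypothetical helper; the diff computes it inline). Setting GGML_CUDA_MAX_STREAMS to 1 maps every chunk to the same stream and so serializes the kernels, which is why the define is flagged as the reproducible configuration:

    // Hypothetical helper; the diff does this inline as
    // g_cudaStreams_main[id][i0 % GGML_CUDA_MAX_STREAMS].
    static cudaStream_t select_main_stream(int device, int64_t i0) {
        return g_cudaStreams_main[device][i0 % GGML_CUDA_MAX_STREAMS];
    }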
@@ -1636,16 +1642,29 @@ void ggml_init_cublas() {
         }
     }
     ggml_cuda_print_gpu_status(&g_system_gpu_status);
+    printf("Preparing CUDA for device(s): \n");
     for (int id = 0; id < g_system_gpu_status.num_devices; ++id) {
+        printf("[%d]", id);
         CUDA_CHECK(cudaSetDevice(id));
 
-        // create main stream
-        CUDA_CHECK(cudaStreamCreateWithFlags(&g_cudaStreams_main[id], cudaStreamNonBlocking));
+        // create streams
+        for (int i = 0; i < GGML_CUDA_MAX_STREAMS; ++i) {
+            CUDA_CHECK(cudaStreamCreateWithFlags(&g_cudaStreams_main[id][i], cudaStreamNonBlocking));
+            CUDA_CHECK(cudaStreamCreateWithFlags(&g_cudaStreams_memcpy_src1[id][i], cudaStreamNonBlocking));
+        }
+        printf(".");
+        // create events
+        for (int i = 0; i < GGML_CUDA_MAX_EVENTS; ++i) {
+            CUDA_CHECK(cudaEventCreateWithFlags(&g_cudaEvents_memcpy_src1[id][i], cudaEventDisableTiming));
+        }
+        printf(".");
 
         // create cublas handle
         CUBLAS_CHECK(cublasCreate(&g_cublas_handles[id]));
         CUBLAS_CHECK(cublasSetMathMode(g_cublas_handles[id], CUBLAS_TF32_TENSOR_OP_MATH));
+        printf(".");
     }
+    printf(" [done]\n");
     CUDA_CHECK(cudaSetDevice(currentDevice));
 
     // configure logging to stdout
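
Per device this creates GGML_CUDA_MAX_STREAMS compute streams, as many copy streams, and GGML_CUDA_MAX_EVENTS events. cudaStreamNonBlocking keeps the new streams from implicitly synchronizing with the legacy default stream, and cudaEventDisableTiming creates events without timestamp bookkeeping, the cheapest kind when an event is used purely for ordering.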
@@ -2124,12 +2143,6 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
     size_t src1_asf[GGML_CUDA_MAX_DEVICES] = {0};
     size_t dst_asf[GGML_CUDA_MAX_DEVICES] = {0};
 
-    // if multiple GPUs are used they need to wait for the main GPU to finish
-    if (split && g_system_gpu_status.num_devices > 1) {
-        CUDA_CHECK(cudaSetDevice(g_system_gpu_status.main_device_id));
-        CUDA_CHECK(cudaDeviceSynchronize());
-    }
-
     for (int id = 0; id < g_system_gpu_status.num_devices; ++id) {
         if (!split && id != g_system_gpu_status.main_device_id) {
             continue;
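
The coarse barrier, which parked every split operation until the main GPU went idle, is removed; the per-chunk events introduced below appear to supply the only ordering the streams actually need.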
@@ -2228,7 +2241,9 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
             }
             const int64_t i11 = i13*ne12 + i12;
 
-            cudaStream_t cudaStream_main = g_cudaStreams_main[id];
+            cudaStream_t cudaStream_main = g_cudaStreams_main[id][i0 % GGML_CUDA_MAX_STREAMS];
+            cudaStream_t cudaStream_memcpy_src1 = g_cudaStreams_memcpy_src1[id][i0 % GGML_CUDA_MAX_STREAMS];
+            cudaEvent_t cudaEvent_memcpy_src1 = g_cudaEvents_memcpy_src1[id][i0 % GGML_CUDA_MAX_EVENTS];
 
             // for split tensors the data begins at i0 == i0_offset_low
             char * src0_ddq_i = src0_ddq[id] + (i0 - i0_offset_low)*src0_stride*src0_ts/src0_bs;
@@ -2256,14 +2271,14 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
                 if (src1->backend == GGML_BACKEND_CPU) {
                     GGML_ASSERT(!flatten_rows || nrows0 == ggml_nrows(src1));
                     int64_t nrows1 = flatten_rows ? nrows0 : ne11;
-                    CUDA_CHECK(ggml_cuda_cpy_tensor_2d(src1_ddf_i, src1, i03, i02, 0, nrows1, cudaStream_main));
+                    CUDA_CHECK(ggml_cuda_cpy_tensor_2d(src1_ddf_i, src1, i03, i02, 0, nrows1, cudaStream_memcpy_src1));
                 } else if (src1->backend == GGML_BACKEND_GPU && src1_is_contiguous) {
                     if (id != g_system_gpu_status.main_device_id) {
                         GGML_ASSERT(!flatten_rows);
                         float * src1_ddf_i_source = (float *) src1_extra->data_device[g_system_gpu_status.main_device_id];
                         src1_ddf_i_source += i11*src1_stride;
                         CUDA_CHECK(cudaMemcpyAsync(src1_ddf_i, src1_ddf_i_source, src1_stride*sizeof(float),
-                                                   cudaMemcpyDeviceToDevice, cudaStream_main));
+                                                   cudaMemcpyDeviceToDevice, cudaStream_memcpy_src1));
                     }
                 } else if (src1_on_device && !src1_is_contiguous) {
                     GGML_ASSERT(!split);
@@ -2272,6 +2287,7 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
                     GGML_ASSERT(false);
                 }
             }
+            CUDA_CHECK(cudaEventRecord(cudaEvent_memcpy_src1, cudaStream_memcpy_src1));
 
             if (!src0_on_device || !src0_is_contiguous) {
                 if (src0_is_f32) {
@@ -2287,6 +2303,9 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
                 CUDA_CHECK(cudaGetLastError());
             }
 
+            // wait with main stream until src1 memcpy is done
+            CUDA_CHECK(cudaStreamWaitEvent(cudaStream_main, cudaEvent_memcpy_src1, 0));
+
             // do the computation
             op(src0, src1, dst, src0_ddq_i, src0_ddf_i, src1_ddf_i, dst_ddf_i, i02, i01_low, i01_high, i11, cudaStream_main);
 
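Note the placement: the event is recorded as soon as the src1 upload is queued, but the main stream waits only immediately before the compute op. The src0 copy or conversion issued on cudaStream_main in between can therefore run while the src1 memcpy is still in flight, and cudaStreamWaitEvent expresses the dependency on the device without blocking the host. This overlap is presumably the source of the reported 6-7% gain.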
@@ -2396,7 +2415,7 @@ void ggml_cuda_mul_mat_vec_p021(const ggml_tensor * src0, const ggml_tensor * sr
     const int64_t ne02 = src0->ne[2];
 
     CUDA_CHECK(cudaSetDevice(g_system_gpu_status.main_device_id));
-    cudaStream_t cudaStream_main = g_cudaStreams_main[g_system_gpu_status.main_device_id];
+    cudaStream_t cudaStream_main = g_cudaStreams_main[g_system_gpu_status.main_device_id][0];
 
     struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
     void * src0_ddq = src0_extra->data_device[g_system_gpu_status.main_device_id];
@@ -2408,6 +2427,8 @@ void ggml_cuda_mul_mat_vec_p021(const ggml_tensor * src0, const ggml_tensor * sr
     float * dst_ddf = (float *) dst_extra->data_device[g_system_gpu_status.main_device_id];
 
     ggml_mul_mat_p021_f16_f32_cuda(src0_ddq, src1_ddf, dst_ddf, ne00, ne01, ne02, cudaStream_main);
+
+    CUDA_CHECK(cudaDeviceSynchronize());
 }
 
 void ggml_cuda_mul_mat_vec_nc(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst){
@@ -2425,7 +2446,7 @@ void ggml_cuda_mul_mat_vec_nc(const ggml_tensor * src0, const ggml_tensor * src1
     const int64_t nb02 = src0->nb[2];
 
     CUDA_CHECK(cudaSetDevice(g_system_gpu_status.main_device_id));
-    cudaStream_t cudaStream_main = g_cudaStreams_main[g_system_gpu_status.main_device_id];
+    cudaStream_t cudaStream_main = g_cudaStreams_main[g_system_gpu_status.main_device_id][0];
 
     struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
     void * src0_ddq = src0_extra->data_device[g_system_gpu_status.main_device_id];
@@ -2440,6 +2461,8 @@ void ggml_cuda_mul_mat_vec_nc(const ggml_tensor * src0, const ggml_tensor * src1
     const int channel_stride_x = nb02 / sizeof(half);
 
     ggml_mul_mat_vec_nc_f16_f32_cuda(src0_ddq, src1_ddf, dst_ddf, ne00, ne01, row_stride_x, ne02, channel_stride_x, cudaStream_main);
+
+    CUDA_CHECK(cudaDeviceSynchronize());
 }
 
 void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
@@ -2495,7 +2518,7 @@ void ggml_cuda_cpy(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tens
     const int64_t nb12 = src1->nb[2];
 
     CUDA_CHECK(cudaSetDevice(g_system_gpu_status.main_device_id));
-    cudaStream_t cudaStream_main = g_cudaStreams_main[g_system_gpu_status.main_device_id];
+    cudaStream_t cudaStream_main = g_cudaStreams_main[g_system_gpu_status.main_device_id][0];
 
     const struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
     const struct ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra;
@@ -2513,6 +2536,8 @@ void ggml_cuda_cpy(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tens
         GGML_ASSERT(false);
     }
 
+    CUDA_CHECK(cudaDeviceSynchronize());
+
     (void) dst;
 }
 
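The standalone entry points (ggml_cuda_mul_mat_vec_p021, ggml_cuda_mul_mat_vec_nc, ggml_cuda_cpy) pin themselves to stream 0 of the pool and now end with cudaDeviceSynchronize(), which blocks the host until all streams on the device have drained, a conservative way to keep their results ordered now that sibling streams may carry in-flight work.
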
libfalcon.cpp (+4, -1)

@@ -1408,7 +1408,10 @@ static void falcon_model_load_internal(
     }
 #endif
 */
-
+    if (progress_callback) {
+        progress_callback(0.01f, progress_callback_user_data, "Loading weights");
+    }
+
     ml->load_all_data(progress_callback, progress_callback_user_data, use_mlock ? &lctx.model.mlock_mmap : NULL);
 
     if (progress_callback) {
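The libfalcon.cpp change only adds an early progress report before the weight data is loaded. Judging from this call site, the callback takes a completion fraction, an opaque user pointer, and a status string; a hypothetical consumer under that assumption (the real typedef lives in the project headers):

    // Hypothetical consumer; signature inferred from the
    // (0.01f, user_data, "Loading weights") call above.
    #include <cstdio>

    static void print_progress(float progress, void * user_data, const char * status) {
        (void) user_data; // unused in this sketch
        fprintf(stderr, "\r%s: %5.1f%%", status ? status : "", 100.0f*progress);
        if (progress >= 1.0f) {
            fputc('\n', stderr);
        }
    }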