@@ -29,7 +29,7 @@ static_assert(sizeof(half) == sizeof(ggml_fp16_t), "wrong fp16 size");
         } \
     } while (0)
 
-#if CUDART_VERSION >= 12
+#if CUDART_VERSION >= 12000
 #define CUBLAS_CHECK(err) \
     do { \
         cublasStatus_t err_ = (err); \
@@ -1503,13 +1503,19 @@ static void * g_scratch_buffer = nullptr;
 static size_t g_scratch_size = 1024*1024*1024; // 1 GB by default
 static size_t g_scratch_offset = 0;
 
+#define GGML_CUDA_MAX_STREAMS 8 // Set this to 1 for reproducible matrix multiplication.
+#define GGML_CUDA_MAX_EVENTS 64
+
 // Note: tensor_split defines the breakpoints of tensors that can be split {0,0.5}
 static float g_tensor_split[GGML_CUDA_MAX_DEVICES] = {0};
 static GPUStatus g_system_gpu_status;
 
 static cublasHandle_t g_cublas_handles[GGML_CUDA_MAX_DEVICES] = {nullptr};
 
-static cudaStream_t g_cudaStreams_main[GGML_CUDA_MAX_DEVICES] = { nullptr };
+static cudaStream_t g_cudaStreams_main[GGML_CUDA_MAX_DEVICES][GGML_CUDA_MAX_STREAMS] = { nullptr };
+
+static cudaStream_t g_cudaStreams_memcpy_src1[GGML_CUDA_MAX_DEVICES][GGML_CUDA_MAX_STREAMS] = { nullptr };
+static cudaEvent_t g_cudaEvents_memcpy_src1[GGML_CUDA_MAX_DEVICES][GGML_CUDA_MAX_EVENTS] = { nullptr };
 
 // Todo verify: free and total memory reported by cudaMemGetInfo differs from gpu_z which also differs from hwinfo64.
 // Update the system status about available GPUs and memory usage
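Note: the hunk above replaces the single per-device stream with fixed-size pools of main streams, src1-copy streams, and sync events for each device; later hunks pick an entry from each pool round-robin via a modulo on the loop index. A minimal standalone sketch of that pool-plus-modulo pattern (made-up names, not the patch's code):

#include <cuda_runtime.h>
#include <stdint.h>

#define MAX_STREAMS 8  // analogous to GGML_CUDA_MAX_STREAMS; 1 would serialize the work again

static cudaStream_t streams[MAX_STREAMS];

void init_streams(void) {
    for (int i = 0; i < MAX_STREAMS; ++i) {
        // non-blocking: these streams never implicitly sync with the legacy default stream
        cudaStreamCreateWithFlags(&streams[i], cudaStreamNonBlocking);
    }
}

cudaStream_t stream_for(int64_t i0) {
    // consecutive row blocks land on different streams, so their work can overlap
    return streams[i0 % MAX_STREAMS];
}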
@@ -1636,16 +1642,29 @@ void ggml_init_cublas() {
             }
         }
         ggml_cuda_print_gpu_status(&g_system_gpu_status);
+        printf("Preparing CUDA for device(s):\n");
         for (int id = 0; id < g_system_gpu_status.num_devices; ++id) {
+            printf(" [%d]", id);
             CUDA_CHECK(cudaSetDevice(id));
 
-            // create main stream
-            CUDA_CHECK(cudaStreamCreateWithFlags(&g_cudaStreams_main[id], cudaStreamNonBlocking));
+            // create streams
+            for (int i = 0; i < GGML_CUDA_MAX_STREAMS; ++i) {
+                CUDA_CHECK(cudaStreamCreateWithFlags(&g_cudaStreams_main[id][i], cudaStreamNonBlocking));
+                CUDA_CHECK(cudaStreamCreateWithFlags(&g_cudaStreams_memcpy_src1[id][i], cudaStreamNonBlocking));
+            }
+            printf(".");
+            // create events
+            for (int i = 0; i < GGML_CUDA_MAX_EVENTS; ++i) {
+                CUDA_CHECK(cudaEventCreateWithFlags(&g_cudaEvents_memcpy_src1[id][i], cudaEventDisableTiming));
+            }
+            printf(".");
 
             // create cublas handle
             CUBLAS_CHECK(cublasCreate(&g_cublas_handles[id]));
             CUBLAS_CHECK(cublasSetMathMode(g_cublas_handles[id], CUBLAS_TF32_TENSOR_OP_MATH));
+            printf(".");
         }
+        printf(" [done]\n");
         CUDA_CHECK(cudaSetDevice(currentDevice));
 
         // configure logging to stdout
@@ -2124,12 +2143,6 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
     size_t src1_asf[GGML_CUDA_MAX_DEVICES] = {0};
     size_t dst_asf[GGML_CUDA_MAX_DEVICES] = {0};
 
-    // if multiple GPUs are used they need to wait for the main GPU to finish
-    if (split && g_system_gpu_status.num_devices > 1) {
-        CUDA_CHECK(cudaSetDevice(g_system_gpu_status.main_device_id));
-        CUDA_CHECK(cudaDeviceSynchronize());
-    }
-
     for (int id = 0; id < g_system_gpu_status.num_devices; ++id) {
         if (!split && id != g_system_gpu_status.main_device_id) {
             continue;
@@ -2228,7 +2241,9 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
             }
             const int64_t i11 = i13*ne12 + i12;
 
-            cudaStream_t cudaStream_main = g_cudaStreams_main[id];
+            cudaStream_t cudaStream_main = g_cudaStreams_main[id][i0 % GGML_CUDA_MAX_STREAMS];
+            cudaStream_t cudaStream_memcpy_src1 = g_cudaStreams_memcpy_src1[id][i0 % GGML_CUDA_MAX_STREAMS];
+            cudaEvent_t cudaEvent_memcpy_src1 = g_cudaEvents_memcpy_src1[id][i0 % GGML_CUDA_MAX_EVENTS];
 
             // for split tensors the data begins at i0 == i0_offset_low
             char * src0_ddq_i = src0_ddq[id] + (i0 - i0_offset_low)*src0_stride*src0_ts/src0_bs;
@@ -2256,14 +2271,14 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
                 if (src1->backend == GGML_BACKEND_CPU) {
                     GGML_ASSERT(!flatten_rows || nrows0 == ggml_nrows(src1));
                     int64_t nrows1 = flatten_rows ? nrows0 : ne11;
-                    CUDA_CHECK(ggml_cuda_cpy_tensor_2d(src1_ddf_i, src1, i03, i02, 0, nrows1, cudaStream_main));
+                    CUDA_CHECK(ggml_cuda_cpy_tensor_2d(src1_ddf_i, src1, i03, i02, 0, nrows1, cudaStream_memcpy_src1));
                 } else if (src1->backend == GGML_BACKEND_GPU && src1_is_contiguous) {
                     if (id != g_system_gpu_status.main_device_id) {
                         GGML_ASSERT(!flatten_rows);
                         float * src1_ddf_i_source = (float *) src1_extra->data_device[g_system_gpu_status.main_device_id];
                         src1_ddf_i_source += i11*src1_stride;
                         CUDA_CHECK(cudaMemcpyAsync(src1_ddf_i, src1_ddf_i_source, src1_stride*sizeof(float),
-                                                   cudaMemcpyDeviceToDevice, cudaStream_main));
+                                                   cudaMemcpyDeviceToDevice, cudaStream_memcpy_src1));
                     }
                 } else if (src1_on_device && !src1_is_contiguous) {
                     GGML_ASSERT(!split);
@@ -2272,6 +2287,7 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
                     GGML_ASSERT(false);
                 }
             }
+            CUDA_CHECK(cudaEventRecord(cudaEvent_memcpy_src1, cudaStream_memcpy_src1));
 
             if (!src0_on_device || !src0_is_contiguous) {
                 if (src0_is_f32) {
@@ -2287,6 +2303,9 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
                 CUDA_CHECK(cudaGetLastError());
             }
 
+            // wait with main stream until src1 memcpy is done
+            CUDA_CHECK(cudaStreamWaitEvent(cudaStream_main, cudaEvent_memcpy_src1, 0));
+
             // do the computation
             op(src0, src1, dst, src0_ddq_i, src0_ddf_i, src1_ddf_i, dst_ddf_i, i02, i01_low, i01_high, i11, cudaStream_main);
 
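For reference, the ordering the hunks above set up: src1 is staged on a dedicated copy stream, an event records the end of that copy, and the main stream waits on the event before the op's kernel launches, so the copy can overlap with work on other streams without racing the compute. A self-contained sketch of that pattern under the same assumptions (names and kernel are illustrative, not the patch's code):

#include <cuda_runtime.h>

__global__ void scale2x(float * x, int n) {  // stand-in for the actual ggml op
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) x[i] *= 2.0f;
}

void copy_then_compute(float * x_dev, const float * x_host, int n) {
    // x_host should be pinned (cudaHostAlloc) for the async copy to actually overlap
    cudaStream_t copy_stream, main_stream;
    cudaEvent_t  copy_done;
    cudaStreamCreateWithFlags(&copy_stream, cudaStreamNonBlocking);
    cudaStreamCreateWithFlags(&main_stream, cudaStreamNonBlocking);
    cudaEventCreateWithFlags(&copy_done, cudaEventDisableTiming);  // sync-only event, no timing overhead

    cudaMemcpyAsync(x_dev, x_host, n * sizeof(float), cudaMemcpyHostToDevice, copy_stream);
    cudaEventRecord(copy_done, copy_stream);         // marks completion of the copy
    cudaStreamWaitEvent(main_stream, copy_done, 0);  // kernel below cannot start before the copy ends
    scale2x<<<(n + 255) / 256, 256, 0, main_stream>>>(x_dev, n);

    cudaStreamDestroy(copy_stream);  // async destroy; already-queued work still completes
    cudaStreamDestroy(main_stream);
    cudaEventDestroy(copy_done);
}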
@@ -2396,7 +2415,7 @@ void ggml_cuda_mul_mat_vec_p021(const ggml_tensor * src0, const ggml_tensor * sr
     const int64_t ne02 = src0->ne[2];
 
     CUDA_CHECK(cudaSetDevice(g_system_gpu_status.main_device_id));
-    cudaStream_t cudaStream_main = g_cudaStreams_main[g_system_gpu_status.main_device_id];
+    cudaStream_t cudaStream_main = g_cudaStreams_main[g_system_gpu_status.main_device_id][0];
 
     struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
     void * src0_ddq = src0_extra->data_device[g_system_gpu_status.main_device_id];
@@ -2408,6 +2427,8 @@ void ggml_cuda_mul_mat_vec_p021(const ggml_tensor * src0, const ggml_tensor * sr
     float * dst_ddf = (float *) dst_extra->data_device[g_system_gpu_status.main_device_id];
 
     ggml_mul_mat_p021_f16_f32_cuda(src0_ddq, src1_ddf, dst_ddf, ne00, ne01, ne02, cudaStream_main);
+
+    CUDA_CHECK(cudaDeviceSynchronize());
 }
 
 void ggml_cuda_mul_mat_vec_nc(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst){
@@ -2425,7 +2446,7 @@ void ggml_cuda_mul_mat_vec_nc(const ggml_tensor * src0, const ggml_tensor * src1
     const int64_t nb02 = src0->nb[2];
 
     CUDA_CHECK(cudaSetDevice(g_system_gpu_status.main_device_id));
-    cudaStream_t cudaStream_main = g_cudaStreams_main[g_system_gpu_status.main_device_id];
+    cudaStream_t cudaStream_main = g_cudaStreams_main[g_system_gpu_status.main_device_id][0];
 
     struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
     void * src0_ddq = src0_extra->data_device[g_system_gpu_status.main_device_id];
@@ -2440,6 +2461,8 @@ void ggml_cuda_mul_mat_vec_nc(const ggml_tensor * src0, const ggml_tensor * src1
     const int channel_stride_x = nb02 / sizeof(half);
 
     ggml_mul_mat_vec_nc_f16_f32_cuda(src0_ddq, src1_ddf, dst_ddf, ne00, ne01, row_stride_x, ne02, channel_stride_x, cudaStream_main);
+
+    CUDA_CHECK(cudaDeviceSynchronize());
 }
 
 void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
@@ -2495,7 +2518,7 @@ void ggml_cuda_cpy(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tens
     const int64_t nb12 = src1->nb[2];
 
     CUDA_CHECK(cudaSetDevice(g_system_gpu_status.main_device_id));
-    cudaStream_t cudaStream_main = g_cudaStreams_main[g_system_gpu_status.main_device_id];
+    cudaStream_t cudaStream_main = g_cudaStreams_main[g_system_gpu_status.main_device_id][0];
 
     const struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
     const struct ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra;
@@ -2513,6 +2536,8 @@ void ggml_cuda_cpy(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tens
         GGML_ASSERT(false);
     }
 
+    CUDA_CHECK(cudaDeviceSynchronize());
+
    (void) dst;
 }
 
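The single-tensor entry points above (mul_mat_vec_p021, mul_mat_vec_nc, cpy) now end with cudaDeviceSynchronize(), presumably because their work is queued on non-default streams and the caller must not reuse the buffers until everything on the device has drained. A tiny sketch of that pattern (illustrative names only, not the patch's code):

#include <cuda_runtime.h>

// queue a result copy on a non-default stream, then block the host until the device is idle
void finish_op(cudaStream_t stream, float * dst_host, const float * src_dev, size_t nbytes) {
    cudaMemcpyAsync(dst_host, src_dev, nbytes, cudaMemcpyDeviceToHost, stream);
    cudaDeviceSynchronize();  // waits for *all* streams on the current device, not just `stream`
}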