@@ -215,6 +215,11 @@ static_assert(sizeof(block_q6_K) == sizeof(ggml_fp16_t) + 13*QK_K/16, "wrong q6_
static_assert(K_QUANTS_PER_ITERATION == 1 || K_QUANTS_PER_ITERATION == 2, "K_QUANTS_PER_ITERATION must be 1 or 2");
#endif

+struct ggml_tensor_extra_gpu {
+    void * data_device[GGML_CUDA_MAX_DEVICES]; // 1 pointer for each device for split tensors
+    cudaEvent_t events[GGML_CUDA_MAX_DEVICES]; // events for synchronizing multiple GPUs
+};
+
static __global__ void add_f32(const float * x, const float * y, float * dst, const int k) {
    const int i = blockDim.x*blockIdx.x + threadIdx.x;
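Aside (not part of the diff): a minimal standalone sketch of how a per-tensor extra of this shape can be initialized, assuming a fixed device count and plain CUDA runtime calls only; ggml attaches the real struct via tensor->extra, and the helper and names below are hypothetical.

    #include <cuda_runtime.h>
    #include <cstring>
    #include <cstdio>
    #include <cstdlib>

    #define MAX_DEVICES 2                         // stand-in for GGML_CUDA_MAX_DEVICES

    struct tensor_extra_gpu {                     // mirrors the struct added above
        void *      data_device[MAX_DEVICES];     // one buffer pointer per device for split tensors
        cudaEvent_t events[MAX_DEVICES];          // one event per device for cross-GPU ordering
    };

    static void check(cudaError_t err) {
        if (err != cudaSuccess) { fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(err)); exit(1); }
    }

    static void extra_init(tensor_extra_gpu * extra, int device_count) {
        memset(extra, 0, sizeof(*extra));         // unused slots stay null so a free path can skip them
        for (int id = 0; id < device_count; ++id) {
            check(cudaSetDevice(id));
            // ordering-only events, no timestamps
            check(cudaEventCreateWithFlags(&extra->events[id], cudaEventDisableTiming));
        }
    }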
@@ -1996,7 +2001,6 @@ inline void ggml_cuda_op_add(
    } else {
        GGML_ASSERT(false);
    }
-    CUDA_CHECK(cudaGetLastError());

    (void) src1;
    (void) dst;
@@ -2028,7 +2032,6 @@ inline void ggml_cuda_op_mul(

        // compute
        mul_f32_cuda(src0_ddf_i01, src1_ddf_i01, dst_ddf_i01, ne00, ne10, cudaStream_main);
-        CUDA_CHECK(cudaGetLastError());
    }

    (void) dst;
@@ -2049,7 +2052,6 @@ inline void ggml_cuda_op_silu(

    // compute
    silu_f32_cuda(src0_ddf_i, dst_ddf_i, ne00*i01_diff, cudaStream_main);
-    CUDA_CHECK(cudaGetLastError());

    (void) src1;
    (void) dst;
@@ -2072,7 +2074,6 @@ inline void ggml_cuda_op_rms_norm(

    // compute
    rms_norm_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, cudaStream_main);
-    CUDA_CHECK(cudaGetLastError());

    (void) src1;
    (void) dst;
@@ -2151,7 +2152,6 @@ inline void ggml_cuda_op_dequantize_mul_mat_vec(
            GGML_ASSERT(false);
            break;
    }
-    CUDA_CHECK(cudaGetLastError());

#ifdef GGML_CUDA_DMMV_F16
    if (src1_convert_f16) {
@@ -2224,14 +2224,13 @@ inline void ggml_cuda_op_rope(
    const int n_ctx = ((int32_t *) src1->data)[3];
    GGML_ASSERT(mode == 0);

-    const float theta_scale = powf(10000.0, -2.0f/n_dims);
+    const float theta_scale = get_theta_scale(n_dims, n_past, n_ctx);
    const float p0 = ((mode & 1) == 0 ? n_past + i02 : i02);

-    const float p = n_ctx <= GGML_TRAINING_CTX ? p0 : p0 * GGML_TRAINING_CTX / n_ctx;
+    const float p = p0;

    // compute
    rope_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, p, theta_scale, cudaStream_main);
-    CUDA_CHECK(cudaGetLastError());

    (void) dst;
    (void) src0_ddq_i;
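For reference (not part of the diff): the removed line computed the standard RoPE scale, 10000^(-2/n_dims), so the rotation angle of dimension pair i at position p is p * 10000^(-2i/n_dims); get_theta_scale comes from this branch and its body is not shown in this hunk. A standalone sketch of that baseline:

    #include <cmath>
    #include <cstdio>

    int main() {
        const int   n_dims      = 128;                            // head dimension (example value)
        const float theta_scale = powf(10000.0f, -2.0f/n_dims);   // what the removed line computed
        const float p           = 42.0f;                          // token position (example value)

        float theta = p;                                          // angle of dimension pair 0
        for (int i = 0; i < n_dims/2; ++i) {
            printf("pair %3d: theta = %g\n", i, theta);
            theta *= theta_scale;                                 // geometric decay across dimension pairs
        }
        return 0;
    }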
@@ -2255,7 +2254,6 @@ inline void ggml_cuda_op_diag_mask_inf(

    // compute
    diag_mask_inf_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, ne01, n_past, cudaStream_main);
-    CUDA_CHECK(cudaGetLastError());

    (void) dst;
    (void) src0_ddq_i;
@@ -2277,7 +2275,6 @@ inline void ggml_cuda_op_soft_max(

    // compute
    soft_max_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, cudaStream_main);
-    CUDA_CHECK(cudaGetLastError());

    (void) src1;
    (void) dst;
@@ -2373,10 +2370,11 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
    size_t src1_asf[GGML_CUDA_MAX_DEVICES] = {0};
    size_t dst_asf[GGML_CUDA_MAX_DEVICES] = {0};

-    // if multiple GPUs are used they need to wait for the main GPU to finish
+    // if multiple devices are used they need to wait for the main device
+    // here an event is recorded that signifies that the main device has finished calculating the input data
    if (split && g_device_count > 1) {
        CUDA_CHECK(cudaSetDevice(g_main_device));
-        CUDA_CHECK(cudaDeviceSynchronize());
+        CUDA_CHECK(cudaEventRecord(src0_extra->events[g_main_device], g_cudaStreams_main[g_main_device]));
    }

    for (int id = 0; id < g_device_count; ++id) {
@@ -2402,6 +2400,12 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
        int64_t row_diff = row_high - row_low;

        cudaSetDevice(id);
+        cudaStream_t cudaStream_main = g_cudaStreams_main[id];
+
+        // wait for main GPU data if necessary
+        if (split && id != g_main_device) {
+            CUDA_CHECK(cudaStreamWaitEvent(cudaStream_main, src0_extra->events[g_main_device]));
+        }

        if (src0_on_device && src0_is_contiguous) {
            if (src0_is_f32) {
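Aside (not part of the diff): the two hunks above replace a blocking cudaDeviceSynchronize with an event that the main device records once the input is ready and that every other device's stream waits on, so the ordering is enqueued on the GPU instead of stalling the host. A standalone sketch of that pattern, with two streams on one device standing in for two GPUs and no ggml types:

    #include <cuda_runtime.h>
    #include <cstdio>
    #include <cstdlib>

    #define CHECK(call) do { cudaError_t e = (call); if (e != cudaSuccess) { \
        fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(e)); exit(1); } } while (0)

    __global__ void produce(float * x, int n) { int i = blockIdx.x*blockDim.x + threadIdx.x; if (i < n) x[i] = 1.0f; }
    __global__ void consume(float * x, int n) { int i = blockIdx.x*blockDim.x + threadIdx.x; if (i < n) x[i] += 1.0f; }

    int main() {
        const int n = 1 << 20;
        float * x;
        CHECK(cudaMalloc(&x, n*sizeof(float)));

        cudaStream_t stream_main, stream_other;
        cudaEvent_t  input_ready;
        CHECK(cudaStreamCreate(&stream_main));
        CHECK(cudaStreamCreate(&stream_other));
        CHECK(cudaEventCreateWithFlags(&input_ready, cudaEventDisableTiming));

        produce<<<(n + 255)/256, 256, 0, stream_main>>>(x, n);     // "main device" prepares the input
        CHECK(cudaEventRecord(input_ready, stream_main));          // signal: input is ready
        CHECK(cudaStreamWaitEvent(stream_other, input_ready, 0));  // consumer waits on the GPU, not on the host
        consume<<<(n + 255)/256, 256, 0, stream_other>>>(x, n);

        CHECK(cudaStreamSynchronize(stream_other));
        CHECK(cudaFree(x));
        return 0;
    }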
@@ -2477,8 +2481,6 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
                }
                const int64_t i11 = i13*ne12 + i12;

-                cudaStream_t cudaStream_main = g_cudaStreams_main[id];
-
                // for split tensors the data begins at i0 == i0_offset_low
                char  * src0_ddq_i = src0_ddq[id] + (i0 - i0_offset_low)*src0_stride*src0_ts/src0_bs;
                float * src0_ddf_i = src0_ddf[id] + (i0 - i0_offset_low)*src0_stride;
@@ -2538,6 +2540,7 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm

                // do the computation
                op(src0, src1, dst, src0_ddq_i, src0_ddf_i, src1_ddf_i, dst_ddf_i, i02, i01_low, i01_high, i11, cudaStream_main);
+                CUDA_CHECK(cudaGetLastError());

                // copy dst to host or other device if necessary
                if (!dst_on_device) {
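Aside (not part of the diff): kernel launches are asynchronous and do not return an error code directly; calling cudaGetLastError() right after the launch catches invalid launch configurations without synchronizing. Hoisting the check here, right after the op callback, covers every kernel the ops launch and replaces the per-op checks removed earlier in this diff. A standalone sketch of the pattern (the kernel and helper names are examples):

    #include <cuda_runtime.h>
    #include <cstdio>

    __global__ void dummy_kernel(float * dst, int n) {
        int i = blockIdx.x*blockDim.x + threadIdx.x;
        if (i < n) dst[i] = 0.0f;
    }

    static void launch_and_check(float * dst, int n, cudaStream_t stream) {
        dummy_kernel<<<(n + 255)/256, 256, 0, stream>>>(dst, n);
        // reports launch failures (bad grid/block dims, missing device code, ...)
        // for the most recent launch, without waiting for the kernel to finish
        cudaError_t err = cudaGetLastError();
        if (err != cudaSuccess) {
            fprintf(stderr, "launch failed: %s\n", cudaGetErrorString(err));
        }
    }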
@@ -2567,6 +2570,11 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
                        CUDA_CHECK(cudaMemcpyAsync(dhf_dst_i, dst_ddf_i, dst_stride*sizeof(float), kind, cudaStream_main));
                    }
                }
+
+                // signify to main device that other device is done
+                if (split && g_device_count > 1 && id != g_main_device) {
+                    CUDA_CHECK(cudaEventRecord(src0_extra->events[id], cudaStream_main));
+                }
            }
        }
    }
@@ -2578,7 +2586,6 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
        }

        CUDA_CHECK(cudaSetDevice(id));
-        CUDA_CHECK(cudaDeviceSynchronize());

        if (src0_asq[id] > 0) {
            ggml_cuda_pool_free(src0_ddq[id], src0_asq[id]);
@@ -2593,6 +2600,21 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
            ggml_cuda_pool_free(dst_ddf[id], dst_asf[id]);
        }
    }
+
+    // main device waits for all other devices to be finished
+    if (split && g_device_count > 1) {
+        CUDA_CHECK(cudaSetDevice(g_main_device));
+        for (int id = 0; id < g_device_count; ++id) {
+            if (id != g_main_device) {
+                CUDA_CHECK(cudaStreamWaitEvent(g_cudaStreams_main[g_main_device], src0_extra->events[id]));
+            }
+        }
+    }
+
+    if (dst->backend == GGML_BACKEND_CPU) {
+        CUDA_CHECK(cudaSetDevice(g_main_device));
+        CUDA_CHECK(cudaDeviceSynchronize());
+    }
}

void ggml_cuda_add(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
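Aside (not part of the diff): the added tail mirrors the earlier record/wait in the other direction: each secondary device records its event, the main stream waits on all of them, and the host itself now blocks only when dst lives on the CPU and must be readable immediately. A compact standalone sketch of that conditional host sync, again with two streams on one device standing in for two GPUs:

    #include <cuda_runtime.h>

    int main() {
        cudaStream_t stream_main, stream_peer;
        cudaEvent_t  peer_done;
        cudaStreamCreate(&stream_main);
        cudaStreamCreate(&stream_peer);
        cudaEventCreateWithFlags(&peer_done, cudaEventDisableTiming);

        // ... peer work would be enqueued on stream_peer here ...

        cudaEventRecord(peer_done, stream_peer);         // peer signals that its work is enqueued
        cudaStreamWaitEvent(stream_main, peer_done, 0);  // main stream orders itself after the peer

        const bool dst_on_cpu = true;                    // stands in for dst->backend == GGML_BACKEND_CPU
        if (dst_on_cpu) {
            cudaDeviceSynchronize();                     // block the host only when it must read the result now
        }

        cudaEventDestroy(peer_done);
        cudaStreamDestroy(stream_peer);
        cudaStreamDestroy(stream_main);
        return 0;
    }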
@@ -2832,6 +2854,10 @@ void ggml_cuda_transform_tensor(void * data, struct ggml_tensor * tensor) {
        cudaMemcpy(buf, buf_host, size, cudaMemcpyHostToDevice);

        extra->data_device[id] = buf;
+
+        if (backend == GGML_BACKEND_GPU_SPLIT) {
+            CUDA_CHECK(cudaEventCreateWithFlags(&extra->events[id], cudaEventDisableTiming));
+        }
    }

    tensor->extra = extra;
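Aside (not part of the diff): cudaEventDisableTiming creates an event that records no timestamp, which the CUDA runtime documents as the cheapest kind for pure ordering via cudaStreamWaitEvent; that is all these per-tensor events are used for. A tiny standalone sketch of the two flavors:

    #include <cuda_runtime.h>

    int main() {
        cudaEvent_t sync_only, timed;
        // ordering-only event: no timestamp, cheaper to record and wait on
        cudaEventCreateWithFlags(&sync_only, cudaEventDisableTiming);
        // default event: carries a timestamp, usable with cudaEventElapsedTime()
        cudaEventCreate(&timed);

        cudaEventDestroy(timed);
        cudaEventDestroy(sync_only);
        return 0;
    }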
@@ -2845,12 +2871,15 @@ void ggml_cuda_free_data(struct ggml_tensor * tensor) {
    ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra;

    for (int id = 0; id < g_device_count; ++id) {
-        if (extra->data_device[id] == nullptr) {
-            continue;
+        if (extra->data_device[id] != nullptr) {
+            CUDA_CHECK(cudaSetDevice(id));
+            CUDA_CHECK(cudaFree(extra->data_device[id]));
        }

-        CUDA_CHECK(cudaSetDevice(id));
-        CUDA_CHECK(cudaFree(extra->data_device[id]));
+        if (extra->events[id] != nullptr) {
+            CUDA_CHECK(cudaSetDevice(id));
+            CUDA_CHECK(cudaEventDestroy(extra->events[id]));
+        }
    }

    delete extra;