@@ -68,6 +68,10 @@
 #include "ggml-cuda.h"
 #include "ggml.h"
 
+#if defined(_MSC_VER)
+#pragma warning(disable: 4244 4267) // possible loss of data
+#endif
+
 static_assert(sizeof(half) == sizeof(ggml_fp16_t), "wrong fp16 size");
 
 #define CUDA_CHECK(err) \
@@ -1518,19 +1522,13 @@ static void * g_scratch_buffer = nullptr;
 static size_t g_scratch_size = 1024*1024*1024; // 1 GB by default
 static size_t g_scratch_offset = 0;
 
-#define GGML_CUDA_MAX_STREAMS 8 // Set this to 1 for reproducible matrix multiplication.
-#define GGML_CUDA_MAX_EVENTS 64
-
 static int g_device_count = -1;
 static int g_main_device = 0;
 static float g_tensor_split[GGML_CUDA_MAX_DEVICES] = {0};
 
 static cublasHandle_t g_cublas_handles[GGML_CUDA_MAX_DEVICES] = {nullptr};
 
-static cudaStream_t g_cudaStreams_main[GGML_CUDA_MAX_DEVICES][GGML_CUDA_MAX_STREAMS] = { nullptr };
-
-static cudaStream_t g_cudaStreams_memcpy_src1[GGML_CUDA_MAX_DEVICES][GGML_CUDA_MAX_STREAMS] = { nullptr };
-static cudaEvent_t g_cudaEvents_memcpy_src1[GGML_CUDA_MAX_DEVICES][GGML_CUDA_MAX_EVENTS] = { nullptr };
+static cudaStream_t g_cudaStreams_main[GGML_CUDA_MAX_DEVICES] = { nullptr };
 
 void ggml_init_cublas() {
     static bool initialized = false;
@@ -1554,15 +1552,8 @@ void ggml_init_cublas() {
         for (int id = 0; id < g_device_count; ++id) {
             CUDA_CHECK(cudaSetDevice(id));
 
-            // create streams
-            for (int i = 0; i < GGML_CUDA_MAX_STREAMS; ++i) {
-                CUDA_CHECK(cudaStreamCreateWithFlags(&g_cudaStreams_main[id][i], cudaStreamNonBlocking));
-                CUDA_CHECK(cudaStreamCreateWithFlags(&g_cudaStreams_memcpy_src1[id][i], cudaStreamNonBlocking));
-            }
-            // create events
-            for (int i = 0; i < GGML_CUDA_MAX_EVENTS; ++i) {
-                CUDA_CHECK(cudaEventCreateWithFlags(&g_cudaEvents_memcpy_src1[id][i], cudaEventDisableTiming));
-            }
+            // create main stream
+            CUDA_CHECK(cudaStreamCreateWithFlags(&g_cudaStreams_main[id], cudaStreamNonBlocking));
 
             // create cublas handle
             CUBLAS_CHECK(cublasCreate(&g_cublas_handles[id]));
@@ -2029,6 +2020,12 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
     size_t src1_asf[GGML_CUDA_MAX_DEVICES] = {0};
     size_t dst_asf[GGML_CUDA_MAX_DEVICES] = {0};
 
+    // if multiple GPUs are used they need to wait for the main GPU to finish
+    if (split && g_device_count > 1) {
+        CUDA_CHECK(cudaSetDevice(g_main_device));
+        CUDA_CHECK(cudaDeviceSynchronize());
+    }
+
     for (int id = 0; id < g_device_count; ++id) {
         if (!split && id != g_main_device) {
             continue;
@@ -2127,9 +2124,7 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
                 }
                 const int64_t i11 = i13*ne12 + i12;
 
-                cudaStream_t cudaStream_main = g_cudaStreams_main[id][i0 % GGML_CUDA_MAX_STREAMS];
-                cudaStream_t cudaStream_memcpy_src1 = g_cudaStreams_memcpy_src1[id][i0 % GGML_CUDA_MAX_STREAMS];
-                cudaEvent_t cudaEvent_memcpy_src1 = g_cudaEvents_memcpy_src1[id][i0 % GGML_CUDA_MAX_EVENTS];
+                cudaStream_t cudaStream_main = g_cudaStreams_main[id];
 
                 // for split tensors the data begins at i0 == i0_offset_low
                 char * src0_ddq_i = src0_ddq[id] + (i0 - i0_offset_low)*src0_stride*src0_ts/src0_bs;
@@ -2157,14 +2152,14 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
                     if (src1->backend == GGML_BACKEND_CPU) {
                         GGML_ASSERT(!flatten_rows || nrows0 == ggml_nrows(src1));
                         int64_t nrows1 = flatten_rows ? nrows0 : ne11;
-                        CUDA_CHECK(ggml_cuda_cpy_tensor_2d(src1_ddf_i, src1, i03, i02, 0, nrows1, cudaStream_memcpy_src1));
+                        CUDA_CHECK(ggml_cuda_cpy_tensor_2d(src1_ddf_i, src1, i03, i02, 0, nrows1, cudaStream_main));
                     } else if (src1->backend == GGML_BACKEND_GPU && src1_is_contiguous) {
                         if (id != g_main_device) {
                             GGML_ASSERT(!flatten_rows);
                             float * src1_ddf_i_source = (float *) src1_extra->data_device[g_main_device];
                             src1_ddf_i_source += i11*src1_stride;
                             CUDA_CHECK(cudaMemcpyAsync(src1_ddf_i, src1_ddf_i_source, src1_stride*sizeof(float),
-                                                       cudaMemcpyDeviceToDevice, cudaStream_memcpy_src1));
+                                                       cudaMemcpyDeviceToDevice, cudaStream_main));
                         }
                     } else if (src1_on_device && !src1_is_contiguous) {
                         GGML_ASSERT(!split);
@@ -2173,7 +2168,6 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
                         GGML_ASSERT(false);
                     }
                 }
-                CUDA_CHECK(cudaEventRecord(cudaEvent_memcpy_src1, cudaStream_memcpy_src1));
 
                 if (!src0_on_device || !src0_is_contiguous) {
                     if (src0_is_f32) {
@@ -2189,9 +2183,6 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
                     CUDA_CHECK(cudaGetLastError());
                 }
 
-                // wait with main stream until src1 memcpy is done
-                CUDA_CHECK(cudaStreamWaitEvent(cudaStream_main, cudaEvent_memcpy_src1, 0));
-
                 // do the computation
                 op(src0, src1, dst, src0_ddq_i, src0_ddf_i, src1_ddf_i, dst_ddf_i, i02, i01_low, i01_high, i11, cudaStream_main);
 
@@ -2229,8 +2220,13 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
 
     // wait until each device is finished, then free their buffers
     for (int id = 0; id < g_device_count; ++id) {
+        if (src0_asq[id] == 0 && src0_asf[id] == 0 && src1_asf[id] == 0 && dst_asf[id] == 0) {
+            continue;
+        }
+
         CUDA_CHECK(cudaSetDevice(id));
         CUDA_CHECK(cudaDeviceSynchronize());
+
         if (src0_asq[id] > 0) {
             ggml_cuda_pool_free(src0_ddq[id], src0_asq[id]);
         }
@@ -2296,7 +2292,7 @@ void ggml_cuda_mul_mat_vec_p021(const ggml_tensor * src0, const ggml_tensor * sr
     const int64_t ne02 = src0->ne[2];
 
     CUDA_CHECK(cudaSetDevice(g_main_device));
-    cudaStream_t cudaStream_main = g_cudaStreams_main[g_main_device][0];
+    cudaStream_t cudaStream_main = g_cudaStreams_main[g_main_device];
 
     struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
     void * src0_ddq = src0_extra->data_device[g_main_device];
@@ -2308,8 +2304,6 @@ void ggml_cuda_mul_mat_vec_p021(const ggml_tensor * src0, const ggml_tensor * sr
     float * dst_ddf = (float *) dst_extra->data_device[g_main_device];
 
     ggml_mul_mat_p021_f16_f32_cuda(src0_ddq, src1_ddf, dst_ddf, ne00, ne01, ne02, cudaStream_main);
-
-    CUDA_CHECK(cudaDeviceSynchronize());
 }
 
 void ggml_cuda_mul_mat_vec_nc(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst){
@@ -2327,7 +2321,7 @@ void ggml_cuda_mul_mat_vec_nc(const ggml_tensor * src0, const ggml_tensor * src1
     const int64_t nb02 = src0->nb[2];
 
     CUDA_CHECK(cudaSetDevice(g_main_device));
-    cudaStream_t cudaStream_main = g_cudaStreams_main[g_main_device][0];
+    cudaStream_t cudaStream_main = g_cudaStreams_main[g_main_device];
 
     struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
     void * src0_ddq = src0_extra->data_device[g_main_device];
@@ -2342,8 +2336,6 @@ void ggml_cuda_mul_mat_vec_nc(const ggml_tensor * src0, const ggml_tensor * src1
     const int channel_stride_x = nb02 / sizeof(half);
 
     ggml_mul_mat_vec_nc_f16_f32_cuda(src0_ddq, src1_ddf, dst_ddf, ne00, ne01, row_stride_x, ne02, channel_stride_x, cudaStream_main);
-
-    CUDA_CHECK(cudaDeviceSynchronize());
 }
 
 void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
@@ -2399,7 +2391,7 @@ void ggml_cuda_cpy(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tens
     const int64_t nb12 = src1->nb[2];
 
     CUDA_CHECK(cudaSetDevice(g_main_device));
-    cudaStream_t cudaStream_main = g_cudaStreams_main[g_main_device][0];
+    cudaStream_t cudaStream_main = g_cudaStreams_main[g_main_device];
 
     const struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
     const struct ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra;
@@ -2417,8 +2409,6 @@ void ggml_cuda_cpy(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tens
         GGML_ASSERT(false);
     }
 
-    CUDA_CHECK(cudaDeviceSynchronize());
-
     (void) dst;
 }
 