@@ -4288,58 +4288,55 @@ struct cuda_buffer {
4288
4288
4289
4289
// Cache of device buffers, indexed as [device id][slot]. ggml_cuda_pool_malloc
// scans the row of the current device and treats a slot with size == 0 as empty.
static cuda_buffer g_cuda_buffer_pool[GGML_CUDA_MAX_DEVICES][MAX_CUDA_BUFFERS];
4290
4290
// Flag handed to scoped_spin_lock in ggml_cuda_pool_malloc to guard the pool.
static std::atomic_flag g_cuda_pool_lock = ATOMIC_FLAG_INIT;
4291
// Read by ggml_cuda_pool_malloc: while false, a pool miss frees the largest cached
// buffer before allocating. Declared ahead of the pool code so it is in scope there.
+ static bool g_mul_mat_q = false ;
4291
4292
4292
4293
// Returns a device buffer of at least `size` bytes for the current CUDA device.
//
// The per-device pool row is scanned once:
//   * if one or more cached buffers are large enough, the smallest such buffer is
//     removed from the pool and returned (best fit);
//   * otherwise, unless g_mul_mat_q is set, the largest cached buffer is released
//     with cudaFree first so that a too-small buffer does not stay resident next
//     to the fresh allocation, and then a new buffer is allocated.
//
// Parameters:
//   size        - number of bytes the caller needs.
//   actual_size - out: real capacity of the returned buffer. This is the cached
//                 buffer's size on a pool hit, or the padded allocation size on a miss.
//
// Returns the device pointer. Every CUDA runtime call goes through CUDA_CHECK, so
// failures are handled by whatever CUDA_CHECK does in this file.
static void * ggml_cuda_pool_malloc(size_t size, size_t * actual_size) {
    scoped_spin_lock lock(g_cuda_pool_lock);

    int id;
    CUDA_CHECK(cudaGetDevice(&id));

    int    best_i     = -1;
    size_t best_size  = std::numeric_limits<size_t>::max(); // smallest cached buffer that fits the request
    int    worst_i    = -1;
    size_t worst_size = 0;                                  // largest cached buffer seen so far

    for (int i = 0; i < MAX_CUDA_BUFFERS; ++i) {
        const cuda_buffer & b = g_cuda_buffer_pool[id][i];
        if (b.size == 0) {
            continue; // empty slot
        }
        if (b.size >= size && b.size < best_size) {
            best_i    = i;
            best_size = b.size;
        }
        if (b.size > worst_size) {
            worst_i    = i;
            worst_size = b.size;
        }
    }

    if (best_i != -1) {
        // Pool hit: hand the buffer to the caller and clear its slot.
        cuda_buffer & b = g_cuda_buffer_pool[id][best_i];
        void * ptr = b.ptr;
        *actual_size = b.size;
        b.ptr  = nullptr;
        b.size = 0;
        return ptr;
    }

    if (worst_i != -1 && !g_mul_mat_q) {
        // Pool miss (non-mmq only): every cached buffer is too small. Release the
        // largest one to save device memory. The cudaFree result is checked like
        // every other runtime call here instead of being silently dropped.
        cuda_buffer & b = g_cuda_buffer_pool[id][worst_i];
        CUDA_CHECK(cudaFree(b.ptr));
        b.ptr  = nullptr;
        b.size = 0;
    }

    // Allocate with 2% headroom, rounded up to a multiple of 256 bytes, so that
    // slightly larger follow-up requests can reuse this buffer.
    void * ptr;
    size_t look_ahead_size = (size_t) (1.02 * size);
    look_ahead_size = 256 * ((look_ahead_size + 255)/256);
    CUDA_CHECK(cudaMalloc((void **) &ptr, look_ahead_size));
    *actual_size = look_ahead_size;

    return ptr;
}
4345
4342
@@ -4369,7 +4366,6 @@ static int g_device_count = -1;
4369
4366
// NOTE(review): presumably the index of the primary device; set/used outside this view - confirm.
static int g_main_device = 0 ;
4370
4367
// One entry per device slot (sized GGML_CUDA_MAX_DEVICES); populated outside this view.
static int g_compute_capabilities[GGML_CUDA_MAX_DEVICES];
4371
4368
// One entry per device slot, zero-initialized; NOTE(review): presumably the share of
// work assigned to each device - confirm against the code that fills it.
static float g_tensor_split[GGML_CUDA_MAX_DEVICES] = {0 };
4372
// Removed here: the same declaration is added above ggml_cuda_pool_malloc, which now reads it.
- static bool g_mul_mat_q = false ;
4373
4369
4374
4370
// One cuBLAS handle slot per device, initially null.
static cublasHandle_t g_cublas_handles[GGML_CUDA_MAX_DEVICES] = {nullptr };
4375
4371
0 commit comments