@@ -4293,45 +4293,53 @@ static void * ggml_cuda_pool_malloc(size_t size, size_t * actual_size) {
4293
4293
scoped_spin_lock lock (g_cuda_pool_lock);
4294
4294
int id;
4295
4295
CUDA_CHECK (cudaGetDevice (&id));
4296
-
4297
- int best_i = - 1 ;
4298
- size_t best_size = std::numeric_limits< size_t >:: max (); // smallest unused buffer that fits our needs
4299
- int worst_i = - 1 ;
4300
- size_t worst_size = 0 ; // largest unused buffer seen so far
4301
-
4296
+ # ifdef DEBUG_CUDA_MALLOC
4297
+ int nnz = 0 ;
4298
+ size_t max_size = 0 , tot_size = 0 ;
4299
+ # endif
4300
+ size_t best_diff = 1ull << 36 ;
4301
+ int ibest = - 1 ;
4302
4302
for (int i = 0 ; i < MAX_CUDA_BUFFERS; ++i) {
4303
4303
cuda_buffer& b = g_cuda_buffer_pool[id][i];
4304
- if (b.size > 0 && b.size >= size && b.size < best_size)
4305
- {
4306
- best_i = i;
4307
- best_size = b.size ;
4308
- }
4309
- if (b.size > 0 && b.size > worst_size)
4310
- {
4311
- worst_i = i;
4312
- worst_size = b.size ;
4304
+ if (b.ptr != nullptr ) {
4305
+ #ifdef DEBUG_CUDA_MALLOC
4306
+ ++nnz;
4307
+ tot_size += b.size ;
4308
+ if (b.size > max_size) max_size = b.size ;
4309
+ #endif
4310
+ if (b.size >= size) {
4311
+ size_t diff = b.size - size;
4312
+ if (diff < best_diff) {
4313
+ best_diff = diff;
4314
+ ibest = i;
4315
+ if (!best_diff) {
4316
+ void * ptr = b.ptr ;
4317
+ *actual_size = b.size ;
4318
+ b.ptr = nullptr ;
4319
+ b.size = 0 ;
4320
+ return ptr;
4321
+ }
4322
+ }
4323
+ }
4313
4324
}
4314
4325
}
4315
- if (best_i!=-1 ) // found the smallest buffer that fits our needs
4316
- {
4317
- cuda_buffer& b = g_cuda_buffer_pool[id][best_i];
4326
+ if (ibest >= 0 ) {
4327
+ cuda_buffer& b = g_cuda_buffer_pool[id][ibest];
4318
4328
void * ptr = b.ptr ;
4319
4329
*actual_size = b.size ;
4320
4330
b.ptr = nullptr ;
4321
4331
b.size = 0 ;
4322
4332
return ptr;
4323
4333
}
4324
- if (worst_i!=-1 ) // no buffer that fits our needs, resize largest one to save memory
4325
- {
4326
- cuda_buffer& b = g_cuda_buffer_pool[id][worst_i];
4327
- b.size = 0 ;
4328
- void * ptr = b.ptr ;
4329
- cudaFree (ptr);
4330
- b.ptr = ptr = nullptr ;
4331
- }
4334
+ #ifdef DEBUG_CUDA_MALLOC
4335
+ fprintf (stderr, " %s: %d buffers, max_size = %u MB, tot_size = %u MB, requested %u MB\n " , __func__, nnz,
4336
+ (uint32_t )(max_size/1024 /1024 ), (uint32_t )(tot_size/1024 /1024 ), (uint32_t )(size/1024 /1024 ));
4337
+ #endif
4332
4338
void * ptr;
4333
- CUDA_CHECK (cudaMalloc ((void **) &ptr, size));
4334
- *actual_size = size;
4339
+ size_t look_ahead_size = (size_t ) (1.05 * size);
4340
+ look_ahead_size = 256 * ((look_ahead_size + 255 )/256 );
4341
+ CUDA_CHECK (cudaMalloc ((void **) &ptr, look_ahead_size));
4342
+ *actual_size = look_ahead_size;
4335
4343
return ptr;
4336
4344
}
4337
4345
0 commit comments