Skip to content

Commit cae6a84

Browse files
committed
cuda free only for non mmq (+2 squashed commit)
Squashed commit: [3aca763] only cuda free for non mmq [e69a8c9] revert to pool alloc to try again
1 parent 9f16a4c commit cae6a84

File tree

2 files changed

+31
-35
lines changed

2 files changed

+31
-35
lines changed

ggml-cuda.cu

+30-34
Original file line numberDiff line numberDiff line change
@@ -4288,58 +4288,55 @@ struct cuda_buffer {
42884288

42894289
static cuda_buffer g_cuda_buffer_pool[GGML_CUDA_MAX_DEVICES][MAX_CUDA_BUFFERS];
42904290
static std::atomic_flag g_cuda_pool_lock = ATOMIC_FLAG_INIT;
4291+
static bool g_mul_mat_q = false;
42914292

42924293
static void * ggml_cuda_pool_malloc(size_t size, size_t * actual_size) {
42934294
scoped_spin_lock lock(g_cuda_pool_lock);
42944295
int id;
42954296
CUDA_CHECK(cudaGetDevice(&id));
4296-
#ifdef DEBUG_CUDA_MALLOC
4297-
int nnz = 0;
4298-
size_t max_size = 0, tot_size = 0;
4299-
#endif
4300-
size_t best_diff = 1ull << 36;
4301-
int ibest = -1;
4297+
4298+
int best_i = -1;
4299+
size_t best_size = std::numeric_limits<size_t>::max(); //smallest unused buffer that fits our needs
4300+
int worst_i = -1;
4301+
size_t worst_size = 0; //largest unused buffer seen so far
4302+
43024303
for (int i = 0; i < MAX_CUDA_BUFFERS; ++i) {
43034304
cuda_buffer& b = g_cuda_buffer_pool[id][i];
4304-
if (b.ptr != nullptr) {
4305-
#ifdef DEBUG_CUDA_MALLOC
4306-
++nnz;
4307-
tot_size += b.size;
4308-
if (b.size > max_size) max_size = b.size;
4309-
#endif
4310-
if (b.size >= size) {
4311-
size_t diff = b.size - size;
4312-
if (diff < best_diff) {
4313-
best_diff = diff;
4314-
ibest = i;
4315-
if (!best_diff) {
4316-
void * ptr = b.ptr;
4317-
*actual_size = b.size;
4318-
b.ptr = nullptr;
4319-
b.size = 0;
4320-
return ptr;
4321-
}
4322-
}
4323-
}
4305+
if (b.size > 0 && b.size >= size && b.size < best_size)
4306+
{
4307+
best_i = i;
4308+
best_size = b.size;
4309+
}
4310+
if (b.size > 0 && b.size > worst_size)
4311+
{
4312+
worst_i = i;
4313+
worst_size = b.size;
43244314
}
43254315
}
4326-
if (ibest >= 0) {
4327-
cuda_buffer& b = g_cuda_buffer_pool[id][ibest];
4316+
if(best_i!=-1) //found the smallest buffer that fits our needs
4317+
{
4318+
cuda_buffer& b = g_cuda_buffer_pool[id][best_i];
43284319
void * ptr = b.ptr;
43294320
*actual_size = b.size;
43304321
b.ptr = nullptr;
43314322
b.size = 0;
43324323
return ptr;
43334324
}
4334-
#ifdef DEBUG_CUDA_MALLOC
4335-
fprintf(stderr, "%s: %d buffers, max_size = %u MB, tot_size = %u MB, requested %u MB\n", __func__, nnz,
4336-
(uint32_t)(max_size/1024/1024), (uint32_t)(tot_size/1024/1024), (uint32_t)(size/1024/1024));
4337-
#endif
4325+
if(worst_i!=-1 && !g_mul_mat_q) //no buffer that fits our needs, resize largest one to save memory (non mmq only)
4326+
{
4327+
cuda_buffer& b = g_cuda_buffer_pool[id][worst_i];
4328+
b.size = 0;
4329+
void * ptr = b.ptr;
4330+
cudaFree(ptr);
4331+
b.ptr = ptr = nullptr;
4332+
}
43384333
void * ptr;
4339-
size_t look_ahead_size = (size_t) (1.05 * size);
4334+
4335+
size_t look_ahead_size = (size_t) (1.02 * size);
43404336
look_ahead_size = 256 * ((look_ahead_size + 255)/256);
43414337
CUDA_CHECK(cudaMalloc((void **) &ptr, look_ahead_size));
43424338
*actual_size = look_ahead_size;
4339+
43434340
return ptr;
43444341
}
43454342

@@ -4369,7 +4366,6 @@ static int g_device_count = -1;
43694366
static int g_main_device = 0;
43704367
static int g_compute_capabilities[GGML_CUDA_MAX_DEVICES];
43714368
static float g_tensor_split[GGML_CUDA_MAX_DEVICES] = {0};
4372-
static bool g_mul_mat_q = false;
43734369

43744370
static cublasHandle_t g_cublas_handles[GGML_CUDA_MAX_DEVICES] = {nullptr};
43754371

koboldcpp.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -303,7 +303,7 @@ def utfprint(str):
303303
maxhordelen = 256
304304
modelbusy = threading.Lock()
305305
defaultport = 5001
306-
KcppVersion = "1.39"
306+
KcppVersion = "1.39.1"
307307
showdebug = True
308308
showsamplerwarning = True
309309
showmaxctxwarning = True

0 commit comments

Comments
 (0)