Commit 5bf3953
cuda : improve cuda pool efficiency using virtual memory (#4606)
* cuda : improve cuda pool efficiency using virtual memory
* fix mixtral
* fix cmake build
* check for vmm support, disable for hip
  ggml-ci
* fix hip build
* clarify granularity
* move all caps to g_device_caps
* refactor error checking
* add cuda_pool_alloc, refactor most pool allocations
  ggml-ci
* fix hip build
* CUBLAS_TF32_TENSOR_OP_MATH is not a macro
* more hip crap
* llama : fix msvc warnings
* ggml : fix msvc warnings
* minor
* minor
* cuda : fallback to CPU on host buffer alloc fail
* Update ggml-cuda.cu
  Co-authored-by: Johannes Gäßler <[email protected]>
* Update ggml-cuda.cu
  Co-authored-by: Johannes Gäßler <[email protected]>
* ensure allocations are always aligned
* act_size -> actual_size

---------

Co-authored-by: Johannes Gäßler <[email protected]>
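For context, here is a minimal sketch of the virtual-memory pool technique this commit introduces: reserve a large virtual address range once through the CUDA driver API, then back it with physical memory on demand, so the pool can grow in place instead of freeing and reallocating (which would invalidate pointers already handed out). This is not the commit's actual code; names such as pool_grow and POOL_VMM_MAX_SIZE are illustrative, and error checking is omitted for brevity.

// sketch of an on-demand-growing device pool using the CUDA driver VMM API
#include <cuda.h>

#define POOL_VMM_MAX_SIZE (32ull << 30) /* 32 GiB of reserved address space */

static CUdeviceptr g_pool_addr = 0; /* start of the reserved virtual range */
static size_t      g_pool_size = 0; /* bytes currently backed by physical memory */

/* grow the physically-backed part of the pool to at least min_size bytes */
static void pool_grow(int device, size_t min_size) {
    CUmemAllocationProp prop = {0};
    prop.type          = CU_MEM_ALLOCATION_TYPE_PINNED;
    prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
    prop.location.id   = device;

    size_t granularity = 0;
    cuMemGetAllocationGranularity(&granularity, &prop, CU_MEM_ALLOC_GRANULARITY_MINIMUM);

    if (g_pool_addr == 0) {
        /* reserve the full virtual range up front; this costs no physical memory */
        cuMemAddressReserve(&g_pool_addr, POOL_VMM_MAX_SIZE, 0, 0, 0);
    }

    /* round the increment up to the allocation granularity */
    size_t inc = min_size - g_pool_size;
    inc = ((inc + granularity - 1) / granularity) * granularity;

    /* create a physical allocation and map it at the end of the backed range */
    CUmemGenericAllocationHandle handle;
    cuMemCreate(&handle, inc, &prop, 0);
    cuMemMap(g_pool_addr + g_pool_size, inc, 0, handle, 0);
    cuMemRelease(handle); /* the mapping keeps the memory alive */

    /* make the newly mapped range readable and writable from this device */
    CUmemAccessDesc access = {0};
    access.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
    access.location.id   = device;
    access.flags         = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;
    cuMemSetAccess(g_pool_addr + g_pool_size, inc, &access, 1);

    g_pool_size += inc;
}

Because existing mappings are never moved, buffers handed out before a grow remain valid, which is what makes this pool more efficient than a free-and-reallocate scheme.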
1 parent 708e179 commit 5bf3953

File tree: 8 files changed (+328, -208 lines)

CMakeLists.txt (+2)

@@ -302,6 +302,8 @@ if (LLAMA_CUBLAS)
         set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} CUDA::cudart CUDA::cublas CUDA::cublasLt)
     endif()
 
+    set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} CUDA::cuda_driver)
+
     if (NOT DEFINED CMAKE_CUDA_ARCHITECTURES)
         # 52 == lowest CUDA 12 standard
         # 60 == f16 CUDA intrinsics
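Linking CUDA::cuda_driver is needed because the new pool calls CUDA driver API entry points (cuMemCreate, cuMemMap, and friends) that live in libcuda rather than in the cudart runtime. Below is a hedged sketch of the per-device capability check implied by "check for vmm support" in the commit message; device_supports_vmm is an illustrative name, not the commit's.

// query whether a device supports the driver's virtual memory management API
#include <cuda.h>

static int device_supports_vmm(int ordinal) {
    CUdevice dev;
    int supported = 0;
    if (cuDeviceGet(&dev, ordinal) != CUDA_SUCCESS) {
        return 0;
    }
    cuDeviceGetAttribute(&supported, CU_DEVICE_ATTRIBUTE_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED, dev);
    return supported; /* 1 if cuMemCreate/cuMemMap pooling can be used */
}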

Makefile (+2, -4)

@@ -367,17 +367,15 @@ endif # LLAMA_BLIS
 
 ifdef LLAMA_CUBLAS
 	MK_CPPFLAGS += -DGGML_USE_CUBLAS -I/usr/local/cuda/include -I/opt/cuda/include -I$(CUDA_PATH)/targets/x86_64-linux/include -I/usr/local/cuda/targets/aarch64-linux/include
-	MK_LDFLAGS += -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L/usr/local/cuda/lib64 -L/opt/cuda/lib64 -L$(CUDA_PATH)/targets/x86_64-linux/lib -L/usr/local/cuda/targets/aarch64-linux/lib
+	MK_LDFLAGS += -lcuda -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L/usr/local/cuda/lib64 -L/opt/cuda/lib64 -L$(CUDA_PATH)/targets/x86_64-linux/lib -L/usr/local/cuda/targets/aarch64-linux/lib -L/usr/lib/wsl/lib
 	OBJS += ggml-cuda.o
 	MK_NVCCFLAGS = -use_fast_math
 ifndef JETSON_EOL_MODULE_DETECT
 	MK_NVCCFLAGS += --forward-unknown-to-host-compiler
 endif # JETSON_EOL_MODULE_DETECT
-
 ifdef LLAMA_DEBUG
 	MK_NVCCFLAGS += -lineinfo
-endif
-
+endif # LLAMA_DEBUG
 ifdef LLAMA_CUDA_NVCC
 	NVCC = $(LLAMA_CUDA_NVCC)
 else
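The -lcuda flag pulls in the same driver library on the Makefile side, and the added -L/usr/lib/wsl/lib covers WSL, where libcuda.so is installed under that path. As an illustrative probe (not part of the commit), the program below links only when the driver library is available; cuInit and cuDriverGetVersion resolve from libcuda (-lcuda), unlike cuda* runtime calls, which come from libcudart.

// build with: cc probe.c -lcuda   (probe.c is a hypothetical filename)
#include <cuda.h>
#include <stdio.h>

int main(void) {
    int version = 0;
    if (cuInit(0) == CUDA_SUCCESS && cuDriverGetVersion(&version) == CUDA_SUCCESS) {
        printf("CUDA driver version: %d.%d\n", version / 1000, (version % 1000) / 10);
    }
    return 0;
}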

ggml-backend.c (+6, -10)

@@ -297,7 +297,7 @@ static void ggml_backend_registry_init(void) {
 void ggml_backend_register(const char * name, ggml_backend_init_fn init_fn, ggml_backend_buffer_type_t default_buffer_type, void * user_data) {
     GGML_ASSERT(ggml_backend_registry_count < GGML_MAX_BACKENDS_REG);
 
-    int id = ggml_backend_registry_count;
+    size_t id = ggml_backend_registry_count;
 
     ggml_backend_registry[id] = (struct ggml_backend_reg) {
         /* .name = */ {0},
@@ -330,6 +330,8 @@ size_t ggml_backend_reg_find_by_name(const char * name) {
             return i;
         }
     }
+
+    // not found
     return SIZE_MAX;
 }
 
@@ -340,15 +342,15 @@ ggml_backend_t ggml_backend_reg_init_backend_from_str(const char * backend_str)
     const char * params = strchr(backend_str, ':');
     char backend_name[128];
     if (params == NULL) {
-        strcpy(backend_name, backend_str);
+        snprintf(backend_name, sizeof(backend_name), "%s", backend_str);
         params = "";
     } else {
-        strncpy(backend_name, backend_str, params - backend_str);
-        backend_name[params - backend_str] = '\0';
+        snprintf(backend_name, sizeof(backend_name), "%.*s", (int)(params - backend_str), backend_str);
         params++;
     }
 
     size_t backend_i = ggml_backend_reg_find_by_name(backend_name);
+
     if (backend_i == SIZE_MAX) {
         fprintf(stderr, "%s: backend %s not found\n", __func__, backend_name);
         return NULL;
@@ -396,18 +398,12 @@ static void ggml_backend_cpu_buffer_free_buffer(ggml_backend_buffer_t buffer) {
 }
 
 static void ggml_backend_cpu_buffer_set_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
-    GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor write out of bounds");
-    GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
-
     memcpy((char *)tensor->data + offset, data, size);
 
     GGML_UNUSED(buffer);
 }
 
 static void ggml_backend_cpu_buffer_get_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
-    GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor read out of bounds");
-    GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
-
     memcpy(data, (const char *)tensor->data + offset, size);
 
     GGML_UNUSED(buffer);
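As a side note on the string handling above, here is a small standalone illustration (not from the commit) of why the parsing switched from strcpy/strncpy to snprintf: snprintf never writes past the destination buffer and always NUL-terminates, even when the input is longer than backend_name, whereas strncpy leaves the buffer unterminated on truncation.

// demonstrates the bounded "name:params" split used by
// ggml_backend_reg_init_backend_from_str
#include <stdio.h>
#include <string.h>

int main(void) {
    char backend_name[128];
    const char * backend_str = "cuda:0"; /* "name:params" form */

    const char * params = strchr(backend_str, ':');
    if (params == NULL) {
        /* copies at most sizeof(backend_name) - 1 chars, then NUL-terminates */
        snprintf(backend_name, sizeof(backend_name), "%s", backend_str);
        params = "";
    } else {
        /* "%.*s" bounds the copy to the name part before ':' as well as the
           buffer size, avoiding strncpy's missing-terminator pitfall */
        snprintf(backend_name, sizeof(backend_name), "%.*s", (int)(params - backend_str), backend_str);
        params++;
    }

    printf("backend = '%s', params = '%s'\n", backend_name, params);
    /* prints: backend = 'cuda', params = '0' */
    return 0;
}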
