From 26235eda50119df1059b690d34585247464a7e2b Mon Sep 17 00:00:00 2001
From: "Y. Velkov"
Date: Wed, 15 May 2024 11:57:44 +0300
Subject: [PATCH 1/6] logging: output capture in cuda module

---
 ggml-cuda.cu | 105 ++++++++++++++++++++++++++++++++++++---------------
 ggml-cuda.h  |   1 +
 llama.cpp    |   2 +
 3 files changed, 78 insertions(+), 30 deletions(-)

diff --git a/ggml-cuda.cu b/ggml-cuda.cu
index 75a2ad480877d..c8061f3ae7404 100644
--- a/ggml-cuda.cu
+++ b/ggml-cuda.cu
@@ -43,19 +43,64 @@
 #include
 #include
 #include
+#include
+#include
 #include
 #include

 static_assert(sizeof(half) == sizeof(ggml_fp16_t), "wrong fp16 size");

+static void ggml_cuda_default_log_callback(enum ggml_log_level level, const char * msg, void * user_data) {
+    GGML_UNUSED(user_data);
+    if (level == GGML_LOG_LEVEL_WARN) {
+        fprintf(stderr, "warning: %s", msg);
+    } else if (level == GGML_LOG_LEVEL_ERROR) {
+        fprintf(stderr, "error: %s", msg);
+    } else {
+        fprintf(stderr, "%s", msg);
+    }
+}
+
+ggml_log_callback ggml_cuda_log_callback = ggml_cuda_default_log_callback;
+void * ggml_cuda_log_user_data = NULL;
+
+GGML_API void ggml_backend_cuda_log_set_callback(ggml_log_callback log_callback, void * user_data) {
+    ggml_cuda_log_callback = log_callback;
+    ggml_cuda_log_user_data = user_data;
+}
+
+#define GGML_CUDA_LOG_INFO(...) ggml_cuda_log(GGML_LOG_LEVEL_INFO, __VA_ARGS__)
+#define GGML_CUDA_LOG_WARN(...) ggml_cuda_log(GGML_LOG_LEVEL_WARN, __VA_ARGS__)
+#define GGML_CUDA_LOG_ERROR(...) ggml_cuda_log(GGML_LOG_LEVEL_ERROR, __VA_ARGS__)
+
+GGML_ATTRIBUTE_FORMAT(2, 3)
+static void ggml_cuda_log(enum ggml_log_level level, const char * format, ...) {
+    if (ggml_cuda_log_callback != NULL) {
+        va_list args;
+        va_start(args, format);
+        char buffer[128];
+        int len = vsnprintf(buffer, 128, format, args);
+        if (len < 128) {
+            ggml_cuda_log_callback(level, buffer, ggml_cuda_log_user_data);
+        } else {
+            std::string buffer2(len, '\0');
+            va_end(args);
+            va_start(args, format);
+            vsnprintf(&buffer2[0], len + 1, format, args);
+            ggml_cuda_log_callback(level, buffer2.c_str(), ggml_cuda_log_user_data);
+        }
+        va_end(args);
+    }
+}
+
 [[noreturn]]
 void ggml_cuda_error(const char * stmt, const char * func, const char * file, int line, const char * msg) {
     int id = -1; // in case cudaGetDevice fails
     cudaGetDevice(&id);

-    fprintf(stderr, "CUDA error: %s\n", msg);
-    fprintf(stderr, " current device: %d, in function %s at %s:%d\n", id, func, file, line);
-    fprintf(stderr, " %s\n", stmt);
+    GGML_CUDA_LOG_INFO("CUDA error: %s\n", msg);
+    GGML_CUDA_LOG_INFO(" current device: %d, in function %s at %s:%d\n", id, func, file, line);
+    GGML_CUDA_LOG_INFO(" %s\n", stmt);
     // abort with GGML_ASSERT to get a stack trace
     GGML_ASSERT(!"CUDA error");
 }
@@ -91,7 +136,7 @@ static ggml_cuda_device_info ggml_cuda_init() {

     cudaError_t err = cudaGetDeviceCount(&info.device_count);
     if (err != cudaSuccess) {
-        fprintf(stderr, "%s: failed to initialize " GGML_CUDA_NAME ": %s\n", __func__, cudaGetErrorString(err));
+        GGML_CUDA_LOG_ERROR("%s: failed to initialize " GGML_CUDA_NAME ": %s\n", __func__, cudaGetErrorString(err));
         return info;
     }

@@ -99,16 +144,16 @@ static ggml_cuda_device_info ggml_cuda_init() {
     int64_t total_vram = 0;

 #if defined(GGML_CUDA_FORCE_MMQ)
-    fprintf(stderr, "%s: GGML_CUDA_FORCE_MMQ: yes\n", __func__);
+    GGML_CUDA_LOG_INFO("%s: GGML_CUDA_FORCE_MMQ: yes\n", __func__);
 #else
-    fprintf(stderr, "%s: GGML_CUDA_FORCE_MMQ: no\n", __func__);
+    GGML_CUDA_LOG_INFO("%s: GGML_CUDA_FORCE_MMQ: no\n", __func__);
 #endif
 #if defined(CUDA_USE_TENSOR_CORES)
-    fprintf(stderr, "%s: CUDA_USE_TENSOR_CORES: yes\n", __func__);
+    GGML_CUDA_LOG_INFO("%s: CUDA_USE_TENSOR_CORES: yes\n", __func__);
 #else
-    fprintf(stderr, "%s: CUDA_USE_TENSOR_CORES: no\n", __func__);
+    GGML_CUDA_LOG_INFO("%s: CUDA_USE_TENSOR_CORES: no\n", __func__);
 #endif
-    fprintf(stderr, "%s: found %d " GGML_CUDA_NAME " devices:\n", __func__, info.device_count);
+    GGML_CUDA_LOG_INFO("%s: found %d " GGML_CUDA_NAME " devices:\n", __func__, info.device_count);

     for (int id = 0; id < info.device_count; ++id) {
         int device_vmm = 0;
@@ -129,7 +174,7 @@ static ggml_cuda_device_info ggml_cuda_init() {

         cudaDeviceProp prop;
         CUDA_CHECK(cudaGetDeviceProperties(&prop, id));
-        fprintf(stderr, " Device %d: %s, compute capability %d.%d, VMM: %s\n", id, prop.name, prop.major, prop.minor, device_vmm ? "yes" : "no");
+        GGML_CUDA_LOG_INFO(" Device %d: %s, compute capability %d.%d, VMM: %s\n", id, prop.name, prop.major, prop.minor, device_vmm ? "yes" : "no");

         info.default_tensor_split[id] = total_vram;
         total_vram += prop.totalGlobalMem;
@@ -235,8 +280,8 @@ struct ggml_cuda_pool_leg : public ggml_cuda_pool {
             *actual_size = look_ahead_size;
             pool_size += look_ahead_size;
 #ifdef DEBUG_CUDA_MALLOC
-            fprintf(stderr, "%s[%d]: %d buffers, max_size = %u MB, pool_size = %u MB, requested %u MB\n", __func__, device, nnz,
-                    (uint32_t)(max_size/1024/1024), (uint32_t)(pool_size/1024/1024), (uint32_t)(size/1024/1024));
+            GGML_CUDA_LOG_INFO("%s[%d]: %d buffers, max_size = %u MB, pool_size = %u MB, requested %u MB\n", __func__, device, nnz,
+                               (uint32_t)(max_size / 1024 / 1024), (uint32_t)(pool_size / 1024 / 1024), (uint32_t)(size / 1024 / 1024));
 #endif
             return ptr;
         }
@@ -250,7 +295,7 @@ struct ggml_cuda_pool_leg : public ggml_cuda_pool {
                 return;
             }
         }
-        fprintf(stderr, "WARNING: cuda buffer pool full, increase MAX_CUDA_BUFFERS\n");
+        GGML_CUDA_LOG_WARN("Cuda buffer pool full, increase MAX_CUDA_BUFFERS\n");
         ggml_cuda_set_device(device);
         CUDA_CHECK(cudaFree(ptr));
         pool_size -= size;
@@ -499,7 +544,7 @@ GGML_CALL static ggml_backend_buffer_t ggml_backend_cuda_buffer_type_alloc_buffe
     void * dev_ptr;
     cudaError_t err = cudaMalloc(&dev_ptr, size);
     if (err != cudaSuccess) {
-        fprintf(stderr, "%s: allocating %.2f MiB on device %d: cudaMalloc failed: %s\n", __func__, size/1024.0/1024.0, buft_ctx->device, cudaGetErrorString(err));
+        GGML_CUDA_LOG_ERROR("%s: allocating %.2f MiB on device %d: cudaMalloc failed: %s\n", __func__, size / 1024.0 / 1024.0, buft_ctx->device, cudaGetErrorString(err));
         return nullptr;
     }
@@ -1002,8 +1047,8 @@ static void * ggml_cuda_host_malloc(size_t size) {
     if (err != cudaSuccess) {
         // clear the error
         cudaGetLastError();
-        fprintf(stderr, "%s: warning: failed to allocate %.2f MiB of pinned memory: %s\n", __func__,
-                size/1024.0/1024.0, cudaGetErrorString(err));
+        GGML_CUDA_LOG_WARN("%s: failed to allocate %.2f MiB of pinned memory: %s\n", __func__,
+                           size / 1024.0 / 1024.0, cudaGetErrorString(err));
         return nullptr;
     }
@@ -2246,7 +2291,7 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
             break;
         case GGML_OP_MUL_MAT:
            if (dst->src[0]->ne[3] != dst->src[1]->ne[3]) {
-                fprintf(stderr, "%s: cannot compute %s: src0->ne[3] = %" PRId64 ", src1->ne[3] = %" PRId64 " - fallback to CPU\n", __func__, dst->name, dst->src[0]->ne[3], dst->src[1]->ne[3]);
+                GGML_CUDA_LOG_ERROR("%s: cannot compute %s: src0->ne[3] = %" PRId64 ", src1->ne[3] = %" PRId64 " - fallback to CPU\n", __func__, dst->name, dst->src[0]->ne[3], dst->src[1]->ne[3]);
                 return false;
             } else {
                 ggml_cuda_mul_mat(ctx, dst->src[0], dst->src[1], dst);
@@ -2300,7 +2345,7 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg

     cudaError_t err = cudaGetLastError();
     if (err != cudaSuccess) {
-        fprintf(stderr, "%s: %s failed\n", __func__, ggml_op_desc(dst));
+        GGML_CUDA_LOG_ERROR("%s: %s failed\n", __func__, ggml_op_desc(dst));
         CUDA_CHECK(err);
     }
@@ -2476,7 +2521,7 @@ GGML_CALL static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t
         if (ggml_cuda_info().devices[cuda_ctx->device].cc < CC_AMPERE) {
             cuda_ctx->cuda_graph->disable_due_to_gpu_arch = true;
 #ifndef NDEBUG
-            fprintf(stderr, "%s: disabling CUDA graphs due to GPU architecture\n", __func__);
+            GGML_CUDA_LOG_WARN("%s: disabling CUDA graphs due to GPU architecture\n", __func__);
 #endif
         }
     }
@@ -2523,14 +2568,14 @@ GGML_CALL static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t
             if (node->src[0] && ggml_backend_buffer_is_cuda_split(node->src[0]->buffer)) {
                 use_cuda_graph = false; // Split buffers are not supported by CUDA graph capture
 #ifndef NDEBUG
-                fprintf(stderr, "%s: disabling CUDA graphs due to split buffer\n", __func__);
+                GGML_CUDA_LOG_WARN("%s: disabling CUDA graphs due to split buffer\n", __func__);
 #endif
             }

             if (node->op == GGML_OP_MUL_MAT_ID) {
                 use_cuda_graph = false; // This node type is not supported by CUDA graph capture
 #ifndef NDEBUG
-                fprintf(stderr, "%s: disabling CUDA graphs due to mul_mat_id\n", __func__);
+                GGML_CUDA_LOG_WARN("%s: disabling CUDA graphs due to mul_mat_id\n", __func__);
 #endif
             }

@@ -2539,7 +2584,7 @@ GGML_CALL static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t
                 // Changes in batch size or context size can cause changes to the grid size of some kernels.
                 use_cuda_graph = false;
 #ifndef NDEBUG
-                fprintf(stderr, "%s: disabling CUDA graphs due to batch size > 1 [%s] [%ld %ld %ld %ld]\n", __func__, node->name, node->ne[0], node->ne[1], node->ne[2], node->ne[3]);
+                GGML_CUDA_LOG_WARN("%s: disabling CUDA graphs due to batch size > 1 [%s] [%ld %ld %ld %ld]\n", __func__, node->name, node->ne[0], node->ne[1], node->ne[2], node->ne[3]);
 #endif
             }
@@ -2567,7 +2612,7 @@ GGML_CALL static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t
             if (cuda_ctx->cuda_graph->number_consecutive_updates >= 4) {
                 cuda_ctx->cuda_graph->disable_due_to_too_many_updates = true;
 #ifndef NDEBUG
-                fprintf(stderr, "%s: disabling CUDA graphs due to too many consecutive updates\n", __func__);
+                GGML_CUDA_LOG_WARN("%s: disabling CUDA graphs due to too many consecutive updates\n", __func__);
 #endif
             }
         }
@@ -2605,7 +2650,7 @@ GGML_CALL static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t

                 bool ok = ggml_cuda_compute_forward(*cuda_ctx, node);
                 if (!ok) {
-                    fprintf(stderr, "%s: error: op not supported %s (%s)\n", __func__, node->name, ggml_op_name(node->op));
+                    GGML_CUDA_LOG_ERROR("%s: op not supported %s (%s)\n", __func__, node->name, ggml_op_name(node->op));
                 }
                 GGML_ASSERT(ok);
             }
@@ -2624,7 +2669,7 @@ GGML_CALL static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t
                     use_cuda_graph = false;
                     cuda_ctx->cuda_graph->disable_due_to_failed_graph_capture = true;
 #ifndef NDEBUG
-                    fprintf(stderr, "%s: disabling CUDA graphs due to failed graph capture\n", __func__);
+                    GGML_CUDA_LOG_WARN("%s: disabling CUDA graphs due to failed graph capture\n", __func__);
 #endif
                 } else {
                     graph_evaluated_or_captured = true; // CUDA graph has been captured
@@ -2691,7 +2736,7 @@ GGML_CALL static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t
         cudaError_t stat = cudaGraphExecUpdate(cuda_ctx->cuda_graph->instance, cuda_ctx->cuda_graph->graph, &result_info);
         if (stat == cudaErrorGraphExecUpdateFailure) {
 #ifndef NDEBUG
-            fprintf(stderr, "%s: CUDA graph update failed\n", __func__);
+            GGML_CUDA_LOG_ERROR("%s: CUDA graph update failed\n", __func__);
 #endif
             // The pre-existing graph exec cannot be updated due to violated constraints
             // so instead clear error and re-instantiate
@@ -2948,13 +2993,13 @@ static ggml_guid_t ggml_backend_cuda_guid() {

 GGML_CALL ggml_backend_t ggml_backend_cuda_init(int device) {
     if (device < 0 || device >= ggml_backend_cuda_get_device_count()) {
-        fprintf(stderr, "%s: error: invalid device %d\n", __func__, device);
+        GGML_CUDA_LOG_ERROR("%s: invalid device %d\n", __func__, device);
         return nullptr;
     }

     ggml_backend_cuda_context * ctx = new ggml_backend_cuda_context(device);
     if (ctx == nullptr) {
-        fprintf(stderr, "%s: error: failed to allocate context\n", __func__);
+        GGML_CUDA_LOG_ERROR("%s: failed to allocate context\n", __func__);
         return nullptr;
     }
@@ -2998,8 +3043,8 @@ GGML_CALL bool ggml_backend_cuda_register_host_buffer(void * buffer, size_t size
         // clear the error
         cudaGetLastError();

-        fprintf(stderr, "%s: warning: failed to register %.2f MiB of pinned memory: %s\n", __func__,
-                size/1024.0/1024.0, cudaGetErrorString(err));
+        GGML_CUDA_LOG_WARN(stderr, "%s: failed to register %.2f MiB of pinned memory: %s\n", __func__,
+                           size / 1024.0 / 1024.0, cudaGetErrorString(err));
         return false;
     }
     return true;
diff --git a/ggml-cuda.h b/ggml-cuda.h
index 5eb4af40f4d1f..d7903c666cebf 100644
--- a/ggml-cuda.h
+++ b/ggml-cuda.h
@@ -38,6 +38,7 @@ GGML_API GGML_CALL void ggml_backend_cuda_get_device_memory(int device, size_t *
 GGML_API GGML_CALL bool ggml_backend_cuda_register_host_buffer(void * buffer, size_t size);
 GGML_API GGML_CALL void ggml_backend_cuda_unregister_host_buffer(void * buffer);

+GGML_API void ggml_backend_cuda_log_set_callback(ggml_log_callback log_callback, void * user_data);
 #ifdef __cplusplus
 }
 #endif
diff --git a/llama.cpp b/llama.cpp
index adbcc07e20fc5..b25b5494f2ea6 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -18036,6 +18036,8 @@ void llama_log_set(ggml_log_callback log_callback, void * user_data) {
     g_state.log_callback_user_data = user_data;
 #ifdef GGML_USE_METAL
     ggml_backend_metal_log_set_callback(g_state.log_callback, g_state.log_callback_user_data);
+#elif GGML_USE_CUDA
+    ggml_backend_cuda_log_set_callback(g_state.log_callback, g_state.log_callback_user_data);
 #endif
 }

From 9f8d92d6900d4281b8f572264f2bff49875e5f77 Mon Sep 17 00:00:00 2001
From: "Y. Velkov"
Date: Wed, 15 May 2024 12:52:09 +0300
Subject: [PATCH 2/6] fix compile error

---
 ggml-cuda.cu | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ggml-cuda.cu b/ggml-cuda.cu
index c8061f3ae7404..c8e51f0aa8cf6 100644
--- a/ggml-cuda.cu
+++ b/ggml-cuda.cu
@@ -3043,7 +3043,7 @@ GGML_CALL bool ggml_backend_cuda_register_host_buffer(void * buffer, size_t size
         // clear the error
         cudaGetLastError();

-        GGML_CUDA_LOG_WARN(stderr, "%s: failed to register %.2f MiB of pinned memory: %s\n", __func__,
+        GGML_CUDA_LOG_WARN("%s: failed to register %.2f MiB of pinned memory: %s\n", __func__,
                            size / 1024.0 / 1024.0, cudaGetErrorString(err));
         return false;
     }

From 89b3236152d97e0361939091b467d4530226e9fc Mon Sep 17 00:00:00 2001
From: "Y. Velkov"
Date: Wed, 15 May 2024 13:29:50 +0300
Subject: [PATCH 3/6] fix: vsnprintf terminates with 0, string use not correct

---
 ggml-cuda.cu | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/ggml-cuda.cu b/ggml-cuda.cu
index c8e51f0aa8cf6..8e206f6ffba49 100644
--- a/ggml-cuda.cu
+++ b/ggml-cuda.cu
@@ -83,11 +83,11 @@ static void ggml_cuda_log(enum ggml_log_level level, const char * format, ...) {
         if (len < 128) {
             ggml_cuda_log_callback(level, buffer, ggml_cuda_log_user_data);
         } else {
-            std::string buffer2(len, '\0');
+            std::vector<char> buffer2(len + 1); // vsnprintf adds a null terminator
             va_end(args);
             va_start(args, format);
-            vsnprintf(&buffer2[0], len + 1, format, args);
-            ggml_cuda_log_callback(level, buffer2.c_str(), ggml_cuda_log_user_data);
+            vsnprintf(&buffer2[0], buffer2.size(), format, args);
+            ggml_cuda_log_callback(level, buffer2.data(), ggml_cuda_log_user_data);
         }
         va_end(args);
     }

From e02d3d0c29cd68f4d3ab7404047af1c4bc240fe3 Mon Sep 17 00:00:00 2001
From: "Y. Velkov"
Date: Thu, 16 May 2024 09:57:32 +0300
Subject: [PATCH 4/6] post review

---
 ggml-cuda.cu | 15 +++++----------
 llama.cpp    |  2 ++
 2 files changed, 7 insertions(+), 10 deletions(-)

diff --git a/ggml-cuda.cu b/ggml-cuda.cu
index 8e206f6ffba49..6e4e7ff7e7ca5 100644
--- a/ggml-cuda.cu
+++ b/ggml-cuda.cu
@@ -51,14 +51,9 @@ static_assert(sizeof(half) == sizeof(ggml_fp16_t), "wrong fp16 size");

 static void ggml_cuda_default_log_callback(enum ggml_log_level level, const char * msg, void * user_data) {
+    GGML_UNUSED(level);
     GGML_UNUSED(user_data);
-    if (level == GGML_LOG_LEVEL_WARN) {
-        fprintf(stderr, "warning: %s", msg);
-    } else if (level == GGML_LOG_LEVEL_ERROR) {
-        fprintf(stderr, "error: %s", msg);
-    } else {
-        fprintf(stderr, "%s", msg);
-    }
+    fprintf(stderr, "%s", msg);
 }

 ggml_log_callback ggml_cuda_log_callback = ggml_cuda_default_log_callback;
@@ -98,9 +93,9 @@ void ggml_cuda_error(const char * stmt, const char * func, const char * file, in
     int id = -1; // in case cudaGetDevice fails
     cudaGetDevice(&id);

-    GGML_CUDA_LOG_INFO("CUDA error: %s\n", msg);
-    GGML_CUDA_LOG_INFO(" current device: %d, in function %s at %s:%d\n", id, func, file, line);
-    GGML_CUDA_LOG_INFO(" %s\n", stmt);
+    GGML_CUDA_LOG_ERROR("CUDA error: %s\n", msg);
+    GGML_CUDA_LOG_ERROR(" current device: %d, in function %s at %s:%d\n", id, func, file, line);
+    GGML_CUDA_LOG_ERROR(" %s\n", stmt);
     // abort with GGML_ASSERT to get a stack trace
     GGML_ASSERT(!"CUDA error");
 }
diff --git a/llama.cpp b/llama.cpp
index b25b5494f2ea6..250bdd845979d 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -1778,6 +1778,8 @@ struct llama_state {
     llama_state() {
 #ifdef GGML_USE_METAL
         ggml_backend_metal_log_set_callback(log_callback, log_callback_user_data);
+#elif GGML_USE_CUDA
+        ggml_backend_cuda_log_set_callback(log_callback, log_callback_user_data);
 #endif
     }

From 564a650cc7ac0c4222e400004c5ab998b36fe681 Mon Sep 17 00:00:00 2001
From: fraxy-v <65565042+fraxy-v@users.noreply.github.com>
Date: Sat, 18 May 2024 09:42:38 +0300
Subject: [PATCH 5/6] Update llama.cpp

Co-authored-by: slaren
---
 llama.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llama.cpp b/llama.cpp
index 250bdd845979d..a16846bbdade8 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -1778,7 +1778,7 @@ struct llama_state {
     llama_state() {
 #ifdef GGML_USE_METAL
         ggml_backend_metal_log_set_callback(log_callback, log_callback_user_data);
-#elif GGML_USE_CUDA
+#elif defined(GGML_USE_CUDA)
         ggml_backend_cuda_log_set_callback(log_callback, log_callback_user_data);
 #endif
     }

From 7e4d3d7e2217eb4907840087ecfaf2e5fa823f9e Mon Sep 17 00:00:00 2001
From: fraxy-v <65565042+fraxy-v@users.noreply.github.com>
Date: Sat, 18 May 2024 09:42:53 +0300
Subject: [PATCH 6/6] Update llama.cpp

Co-authored-by: slaren
---
 llama.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llama.cpp b/llama.cpp
index a16846bbdade8..9c26e4f3d6bbf 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -18038,7 +18038,7 @@ void llama_log_set(ggml_log_callback log_callback, void * user_data) {
     g_state.log_callback_user_data = user_data;
 #ifdef GGML_USE_METAL
     ggml_backend_metal_log_set_callback(g_state.log_callback, g_state.log_callback_user_data);
-#elif GGML_USE_CUDA
+#elif defined(GGML_USE_CUDA)
     ggml_backend_cuda_log_set_callback(g_state.log_callback, g_state.log_callback_user_data);
 #endif
 }
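
Usage note (not part of the patches): the series exposes ggml_backend_cuda_log_set_callback() so a host application can capture the CUDA backend's output — device enumeration, warnings, CUDA errors — instead of letting it go to stderr. The sketch below is a minimal illustration under that assumption; my_log_sink and "cuda.log" are hypothetical names chosen for this example, not identifiers from the patches.

// Illustrative sketch only -- assumes the ggml_backend_cuda_log_set_callback()
// declaration added to ggml-cuda.h above; my_log_sink and "cuda.log" are
// hypothetical names used for this example.
#include <stdio.h>

#include "ggml.h"
#include "ggml-backend.h"
#include "ggml-cuda.h"

// Same signature as ggml_log_callback; the backend's messages already carry
// their own trailing newline, so none is added here.
static void my_log_sink(enum ggml_log_level level, const char * msg, void * user_data) {
    FILE * f = (FILE *) user_data;
    fprintf(f, "[%d] %s", (int) level, msg);
    fflush(f);
}

int main(void) {
    FILE * f = fopen("cuda.log", "w");
    if (f == NULL) {
        return 1;
    }

    // Route CUDA backend messages to the file instead of stderr.
    ggml_backend_cuda_log_set_callback(my_log_sink, f);

    // Messages emitted from here on (e.g. the device list printed during
    // CUDA backend initialization) go through my_log_sink.
    ggml_backend_t backend = ggml_backend_cuda_init(0 /* device */);

    if (backend != NULL) {
        // ... build and evaluate graphs on the backend ...
        ggml_backend_free(backend);
    }

    fclose(f);
    return 0;
}

Applications that go through llama.cpp do not need the explicit call: PATCH 1 and PATCH 4 hook the CUDA callback into llama_log_set() and the llama_state constructor, so an existing llama_log_set() call routes the CUDA messages to the same sink.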