 #include <mutex>
 #include <stdint.h>
 #include <stdio.h>
+#include <stdarg.h>
+#include <stdlib.h>
 #include <string>
 #include <vector>

 static_assert(sizeof(half) == sizeof(ggml_fp16_t), "wrong fp16 size");

+static void ggml_cuda_default_log_callback(enum ggml_log_level level, const char * msg, void * user_data) {
+    GGML_UNUSED(level);
+    GGML_UNUSED(user_data);
+    fprintf(stderr, "%s", msg);
+}
+
+ggml_log_callback ggml_cuda_log_callback = ggml_cuda_default_log_callback;
+void * ggml_cuda_log_user_data = NULL;
+
+GGML_API void ggml_backend_cuda_log_set_callback(ggml_log_callback log_callback, void * user_data) {
+    ggml_cuda_log_callback = log_callback;
+    ggml_cuda_log_user_data = user_data;
+}
+
+#define GGML_CUDA_LOG_INFO(...) ggml_cuda_log(GGML_LOG_LEVEL_INFO, __VA_ARGS__)
+#define GGML_CUDA_LOG_WARN(...) ggml_cuda_log(GGML_LOG_LEVEL_WARN, __VA_ARGS__)
+#define GGML_CUDA_LOG_ERROR(...) ggml_cuda_log(GGML_LOG_LEVEL_ERROR, __VA_ARGS__)
+
+GGML_ATTRIBUTE_FORMAT(2, 3)
+static void ggml_cuda_log(enum ggml_log_level level, const char * format, ...) {
+    if (ggml_cuda_log_callback != NULL) {
+        va_list args;
+        va_start(args, format);
+        char buffer[128];
+        int len = vsnprintf(buffer, 128, format, args);
+        if (len < 128) {
+            ggml_cuda_log_callback(level, buffer, ggml_cuda_log_user_data);
+        } else {
+            std::vector<char> buffer2(len + 1); // vsnprintf adds a null terminator
+            va_end(args);
+            va_start(args, format);
+            vsnprintf(&buffer2[0], buffer2.size(), format, args);
+            ggml_cuda_log_callback(level, buffer2.data(), ggml_cuda_log_user_data);
+        }
+        va_end(args);
+    }
+}
+
 [[noreturn]]
 void ggml_cuda_error(const char * stmt, const char * func, const char * file, int line, const char * msg) {
     int id = -1; // in case cudaGetDevice fails
     cudaGetDevice(&id);

-    fprintf(stderr, "CUDA error: %s\n", msg);
-    fprintf(stderr, "  current device: %d, in function %s at %s:%d\n", id, func, file, line);
-    fprintf(stderr, "  %s\n", stmt);
+    GGML_CUDA_LOG_ERROR("CUDA error: %s\n", msg);
+    GGML_CUDA_LOG_ERROR("  current device: %d, in function %s at %s:%d\n", id, func, file, line);
+    GGML_CUDA_LOG_ERROR("  %s\n", stmt);

     // abort with GGML_ASSERT to get a stack trace
     GGML_ASSERT(!"CUDA error");
 }
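The setter above is the one new public entry point in this change: anything linked against the backend can replace the default stderr sink. A minimal usage sketch follows; the sink function and log-file name are illustrative, and it assumes the `GGML_API` declaration of `ggml_backend_cuda_log_set_callback` is exported through a header such as `ggml-cuda.h` (not shown in this diff).

```cpp
// Sketch: route CUDA backend log messages to a file instead of stderr.
// Assumes ggml_backend_cuda_log_set_callback is declared in ggml-cuda.h;
// file_log_sink and the file name are illustrative.
#include <cstdio>
#include "ggml.h"
#include "ggml-cuda.h"

static void file_log_sink(enum ggml_log_level level, const char * msg, void * user_data) {
    // level is whatever GGML_CUDA_LOG_INFO/WARN/ERROR passed to ggml_cuda_log
    FILE * f = (FILE *) user_data;
    fprintf(f, "[%d] %s", (int) level, msg);
    fflush(f);
}

int main(void) {
    FILE * f = fopen("cuda-backend.log", "w");
    if (f != NULL) {
        ggml_backend_cuda_log_set_callback(file_log_sink, f);
    }
    // ... initialize the CUDA backend and run as usual;
    //     GGML_CUDA_LOG_* output now lands in cuda-backend.log ...
    return 0;
}
```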
@@ -91,24 +131,24 @@ static ggml_cuda_device_info ggml_cuda_init() {

     cudaError_t err = cudaGetDeviceCount(&info.device_count);
     if (err != cudaSuccess) {
-        fprintf(stderr, "%s: failed to initialize " GGML_CUDA_NAME ": %s\n", __func__, cudaGetErrorString(err));
+        GGML_CUDA_LOG_ERROR("%s: failed to initialize " GGML_CUDA_NAME ": %s\n", __func__, cudaGetErrorString(err));
         return info;
     }

     GGML_ASSERT(info.device_count <= GGML_CUDA_MAX_DEVICES);

     int64_t total_vram = 0;
 #if defined(GGML_CUDA_FORCE_MMQ)
-    fprintf(stderr, "%s: GGML_CUDA_FORCE_MMQ: yes\n", __func__);
+    GGML_CUDA_LOG_INFO("%s: GGML_CUDA_FORCE_MMQ: yes\n", __func__);
 #else
-    fprintf(stderr, "%s: GGML_CUDA_FORCE_MMQ: no\n", __func__);
+    GGML_CUDA_LOG_INFO("%s: GGML_CUDA_FORCE_MMQ: no\n", __func__);
 #endif
 #if defined(CUDA_USE_TENSOR_CORES)
-    fprintf(stderr, "%s: CUDA_USE_TENSOR_CORES: yes\n", __func__);
+    GGML_CUDA_LOG_INFO("%s: CUDA_USE_TENSOR_CORES: yes\n", __func__);
 #else
-    fprintf(stderr, "%s: CUDA_USE_TENSOR_CORES: no\n", __func__);
+    GGML_CUDA_LOG_INFO("%s: CUDA_USE_TENSOR_CORES: no\n", __func__);
 #endif
-    fprintf(stderr, "%s: found %d " GGML_CUDA_NAME " devices:\n", __func__, info.device_count);
+    GGML_CUDA_LOG_INFO("%s: found %d " GGML_CUDA_NAME " devices:\n", __func__, info.device_count);
     for (int id = 0; id < info.device_count; ++id) {
         int device_vmm = 0;

@@ -129,7 +169,7 @@ static ggml_cuda_device_info ggml_cuda_init() {

         cudaDeviceProp prop;
         CUDA_CHECK(cudaGetDeviceProperties(&prop, id));
-        fprintf(stderr, "  Device %d: %s, compute capability %d.%d, VMM: %s\n", id, prop.name, prop.major, prop.minor, device_vmm ? "yes" : "no");
+        GGML_CUDA_LOG_INFO("  Device %d: %s, compute capability %d.%d, VMM: %s\n", id, prop.name, prop.major, prop.minor, device_vmm ? "yes" : "no");

         info.default_tensor_split[id] = total_vram;
         total_vram += prop.totalGlobalMem;
@@ -235,8 +275,8 @@ struct ggml_cuda_pool_leg : public ggml_cuda_pool {
         *actual_size = look_ahead_size;
         pool_size += look_ahead_size;
 #ifdef DEBUG_CUDA_MALLOC
-        fprintf(stderr, "%s[%d]: %d buffers, max_size = %u MB, pool_size = %u MB, requested %u MB\n", __func__, device, nnz,
-                (uint32_t)(max_size/1024/1024), (uint32_t)(pool_size/1024/1024), (uint32_t)(size/1024/1024));
+        GGML_CUDA_LOG_INFO("%s[%d]: %d buffers, max_size = %u MB, pool_size = %u MB, requested %u MB\n", __func__, device, nnz,
+                           (uint32_t)(max_size / 1024 / 1024), (uint32_t)(pool_size / 1024 / 1024), (uint32_t)(size / 1024 / 1024));
 #endif
         return ptr;
     }
@@ -250,7 +290,7 @@ struct ggml_cuda_pool_leg : public ggml_cuda_pool {
                 return;
             }
         }
-        fprintf(stderr, "WARNING: cuda buffer pool full, increase MAX_CUDA_BUFFERS\n");
+        GGML_CUDA_LOG_WARN("Cuda buffer pool full, increase MAX_CUDA_BUFFERS\n");
         ggml_cuda_set_device(device);
         CUDA_CHECK(cudaFree(ptr));
         pool_size -= size;
@@ -499,7 +539,7 @@ GGML_CALL static ggml_backend_buffer_t ggml_backend_cuda_buffer_type_alloc_buffe
     void * dev_ptr;
     cudaError_t err = cudaMalloc(&dev_ptr, size);
     if (err != cudaSuccess) {
-        fprintf(stderr, "%s: allocating %.2f MiB on device %d: cudaMalloc failed: %s\n", __func__, size/1024.0/1024.0, buft_ctx->device, cudaGetErrorString(err));
+        GGML_CUDA_LOG_ERROR("%s: allocating %.2f MiB on device %d: cudaMalloc failed: %s\n", __func__, size / 1024.0 / 1024.0, buft_ctx->device, cudaGetErrorString(err));
         return nullptr;
     }

@@ -1002,8 +1042,8 @@ static void * ggml_cuda_host_malloc(size_t size) {
     if (err != cudaSuccess) {
         // clear the error
         cudaGetLastError();
-        fprintf(stderr, "%s: warning: failed to allocate %.2f MiB of pinned memory: %s\n", __func__,
-                size/1024.0/1024.0, cudaGetErrorString(err));
+        GGML_CUDA_LOG_WARN("%s: failed to allocate %.2f MiB of pinned memory: %s\n", __func__,
+                           size / 1024.0 / 1024.0, cudaGetErrorString(err));
         return nullptr;
     }

@@ -2246,7 +2286,7 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
             break;
         case GGML_OP_MUL_MAT:
             if (dst->src[0]->ne[3] != dst->src[1]->ne[3]) {
-                fprintf(stderr, "%s: cannot compute %s: src0->ne[3] = %" PRId64 ", src1->ne[3] = %" PRId64 " - fallback to CPU\n", __func__, dst->name, dst->src[0]->ne[3], dst->src[1]->ne[3]);
+                GGML_CUDA_LOG_ERROR("%s: cannot compute %s: src0->ne[3] = %" PRId64 ", src1->ne[3] = %" PRId64 " - fallback to CPU\n", __func__, dst->name, dst->src[0]->ne[3], dst->src[1]->ne[3]);
                 return false;
             } else {
                 ggml_cuda_mul_mat(ctx, dst->src[0], dst->src[1], dst);
@@ -2300,7 +2340,7 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg

     cudaError_t err = cudaGetLastError();
     if (err != cudaSuccess) {
-        fprintf(stderr, "%s: %s failed\n", __func__, ggml_op_desc(dst));
+        GGML_CUDA_LOG_ERROR("%s: %s failed\n", __func__, ggml_op_desc(dst));
         CUDA_CHECK(err);
     }

@@ -2476,7 +2516,7 @@ GGML_CALL static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t
         if (ggml_cuda_info().devices[cuda_ctx->device].cc < CC_AMPERE) {
             cuda_ctx->cuda_graph->disable_due_to_gpu_arch = true;
 #ifndef NDEBUG
-            fprintf(stderr, "%s: disabling CUDA graphs due to GPU architecture\n", __func__);
+            GGML_CUDA_LOG_WARN("%s: disabling CUDA graphs due to GPU architecture\n", __func__);
 #endif
         }
     }
@@ -2523,14 +2563,14 @@ GGML_CALL static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t
             if (node->src[0] && ggml_backend_buffer_is_cuda_split(node->src[0]->buffer)) {
                 use_cuda_graph = false; // Split buffers are not supported by CUDA graph capture
 #ifndef NDEBUG
-                fprintf(stderr, "%s: disabling CUDA graphs due to split buffer\n", __func__);
+                GGML_CUDA_LOG_WARN("%s: disabling CUDA graphs due to split buffer\n", __func__);
 #endif
             }

             if (node->op == GGML_OP_MUL_MAT_ID) {
                 use_cuda_graph = false; // This node type is not supported by CUDA graph capture
 #ifndef NDEBUG
-                fprintf(stderr, "%s: disabling CUDA graphs due to mul_mat_id\n", __func__);
+                GGML_CUDA_LOG_WARN("%s: disabling CUDA graphs due to mul_mat_id\n", __func__);
 #endif
             }

@@ -2539,7 +2579,7 @@ GGML_CALL static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t
                 // Changes in batch size or context size can cause changes to the grid size of some kernels.
                 use_cuda_graph = false;
 #ifndef NDEBUG
-                fprintf(stderr, "%s: disabling CUDA graphs due to batch size > 1 [%s] [%ld %ld %ld %ld]\n", __func__, node->name, node->ne[0], node->ne[1], node->ne[2], node->ne[3]);
+                GGML_CUDA_LOG_WARN("%s: disabling CUDA graphs due to batch size > 1 [%s] [%ld %ld %ld %ld]\n", __func__, node->name, node->ne[0], node->ne[1], node->ne[2], node->ne[3]);
 #endif
             }

@@ -2567,7 +2607,7 @@ GGML_CALL static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t
         if (cuda_ctx->cuda_graph->number_consecutive_updates >= 4) {
             cuda_ctx->cuda_graph->disable_due_to_too_many_updates = true;
 #ifndef NDEBUG
-            fprintf(stderr, "%s: disabling CUDA graphs due to too many consecutive updates\n", __func__);
+            GGML_CUDA_LOG_WARN("%s: disabling CUDA graphs due to too many consecutive updates\n", __func__);
 #endif
         }
     }
@@ -2605,7 +2645,7 @@ GGML_CALL static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t

             bool ok = ggml_cuda_compute_forward(*cuda_ctx, node);
             if (!ok) {
-                fprintf(stderr, "%s: error: op not supported %s (%s)\n", __func__, node->name, ggml_op_name(node->op));
+                GGML_CUDA_LOG_ERROR("%s: op not supported %s (%s)\n", __func__, node->name, ggml_op_name(node->op));
             }
             GGML_ASSERT(ok);
         }
@@ -2624,7 +2664,7 @@ GGML_CALL static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t
                 use_cuda_graph = false;
                 cuda_ctx->cuda_graph->disable_due_to_failed_graph_capture = true;
 #ifndef NDEBUG
-                fprintf(stderr, "%s: disabling CUDA graphs due to failed graph capture\n", __func__);
+                GGML_CUDA_LOG_WARN("%s: disabling CUDA graphs due to failed graph capture\n", __func__);
 #endif
             } else {
                 graph_evaluated_or_captured = true; // CUDA graph has been captured
@@ -2691,7 +2731,7 @@ GGML_CALL static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t
         cudaError_t stat = cudaGraphExecUpdate(cuda_ctx->cuda_graph->instance, cuda_ctx->cuda_graph->graph, &result_info);
         if (stat == cudaErrorGraphExecUpdateFailure) {
 #ifndef NDEBUG
-            fprintf(stderr, "%s: CUDA graph update failed\n", __func__);
+            GGML_CUDA_LOG_ERROR("%s: CUDA graph update failed\n", __func__);
 #endif
             // The pre-existing graph exec cannot be updated due to violated constraints
             // so instead clear error and re-instantiate
@@ -2948,13 +2988,13 @@ static ggml_guid_t ggml_backend_cuda_guid() {

 GGML_CALL ggml_backend_t ggml_backend_cuda_init(int device) {
     if (device < 0 || device >= ggml_backend_cuda_get_device_count()) {
-        fprintf(stderr, "%s: error: invalid device %d\n", __func__, device);
+        GGML_CUDA_LOG_ERROR("%s: invalid device %d\n", __func__, device);
         return nullptr;
     }

     ggml_backend_cuda_context * ctx = new ggml_backend_cuda_context(device);
     if (ctx == nullptr) {
-        fprintf(stderr, "%s: error: failed to allocate context\n", __func__);
+        GGML_CUDA_LOG_ERROR("%s: failed to allocate context\n", __func__);
         return nullptr;
     }

@@ -2998,8 +3038,8 @@ GGML_CALL bool ggml_backend_cuda_register_host_buffer(void * buffer, size_t size
         // clear the error
         cudaGetLastError();

-        fprintf(stderr, "%s: warning: failed to register %.2f MiB of pinned memory: %s\n", __func__,
-                size/1024.0/1024.0, cudaGetErrorString(err));
+        GGML_CUDA_LOG_WARN("%s: failed to register %.2f MiB of pinned memory: %s\n", __func__,
+                           size / 1024.0 / 1024.0, cudaGetErrorString(err));
         return false;
     }
     return true;
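Because `ggml_cuda_log` forwards the level chosen by the `GGML_CUDA_LOG_INFO`/`WARN`/`ERROR` macros, a callback can also filter rather than redirect. A hedged sketch of silencing informational chatter while keeping warnings and errors on stderr (the function name is illustrative; the `GGML_LOG_LEVEL_*` constants come from `ggml.h`):

```cpp
// Sketch: drop INFO-level messages (device enumeration, MMQ/tensor-core flags)
// and keep WARN/ERROR on stderr. quiet_log_sink is an illustrative name.
#include <cstdio>
#include "ggml.h"
#include "ggml-cuda.h"

static void quiet_log_sink(enum ggml_log_level level, const char * msg, void * user_data) {
    (void) user_data;
    if (level == GGML_LOG_LEVEL_INFO) {
        return; // suppress informational output
    }
    fprintf(stderr, "%s", msg);
}

// During startup:
//     ggml_backend_cuda_log_set_callback(quiet_log_sink, NULL);
```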