
Commit 2abce2c

ggerganov and olexiyb authored and committed
llama : fix data units (ggml-org#4101)
* llama : fix data units

ggml-ci

* Revert "llama : fix data units"

This reverts commit f5feac8.

* llama : disambiguate data units

ggml-ci
1 parent 078db1b · commit 2abce2c
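For context, the ambiguity this commit removes: every size reported in these files is computed by dividing byte counts by 1024, so the values are binary (IEC) mebibytes/gibibytes, not decimal SI megabytes/gigabytes, and the labels now say so. A minimal C++ sketch (not part of the patch; the sample size is made up) showing how far the two conventions drift apart:

#include <cstdio>

int main() {
    const double n_bytes = 7365960704.0;              // hypothetical ~7 GB file

    const double mb  = n_bytes / 1000.0 / 1000.0;     // SI megabytes: 10^6 bytes
    const double mib = n_bytes / 1024.0 / 1024.0;     // IEC mebibytes: 2^20 bytes, what this code computes

    printf("%.2f MB (SI) vs %.2f MiB (IEC)\n", mb, mib);  // 7365.96 MB vs 7024.73 MiB
    return 0;
}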

File tree

3 files changed: +25 −25 lines changed


ggml-cuda.cu  (+2 −2)

@@ -5840,7 +5840,7 @@ static void * ggml_cuda_pool_malloc(size_t size, size_t * actual_size) {
             return ptr;
         }
 #ifdef DEBUG_CUDA_MALLOC
-    fprintf(stderr, "%s: %d buffers, max_size = %u MB, tot_size = %u MB, requested %u MB\n", __func__, nnz,
+    fprintf(stderr, "%s: %d buffers, max_size = %u MiB, tot_size = %u MiB, requested %u MiB\n", __func__, nnz,
             (uint32_t)(max_size/1024/1024), (uint32_t)(tot_size/1024/1024), (uint32_t)(size/1024/1024));
 #endif
     void * ptr;
@@ -5978,7 +5978,7 @@ void * ggml_cuda_host_malloc(size_t size) {
         // The allocation error can be bypassed. A null ptr will assigned out of this function.
         // This can fixed the OOM error in WSL.
         cudaGetLastError();
-        fprintf(stderr, "WARNING: failed to allocate %.2f MB of pinned memory: %s\n",
+        fprintf(stderr, "WARNING: failed to allocate %.2f MiB of pinned memory: %s\n",
             size/1024.0/1024.0, cudaGetErrorString(err));
         return nullptr;
     }

ggml-metal.m  (+7 −7)

@@ -345,10 +345,10 @@ static void ggml_metal_log(enum ggml_log_level level, const char * format, ...){
         }
     }

-    GGML_METAL_LOG_INFO("%s: hasUnifiedMemory = %s\n", __func__, ctx->device.hasUnifiedMemory ? "true" : "false");
-    GGML_METAL_LOG_INFO("%s: recommendedMaxWorkingSetSize = %8.2f MB\n", __func__, ctx->device.recommendedMaxWorkingSetSize / 1024.0 / 1024.0);
+    GGML_METAL_LOG_INFO("%s: hasUnifiedMemory = %s\n", __func__, ctx->device.hasUnifiedMemory ? "true" : "false");
+    GGML_METAL_LOG_INFO("%s: recommendedMaxWorkingSetSize = %8.2f MiB\n", __func__, ctx->device.recommendedMaxWorkingSetSize / 1024.0 / 1024.0);
     if (ctx->device.maxTransferRate != 0) {
-        GGML_METAL_LOG_INFO("%s: maxTransferRate = %8.2f MB/s\n", __func__, ctx->device.maxTransferRate / 1024.0 / 1024.0);
+        GGML_METAL_LOG_INFO("%s: maxTransferRate = %8.2f MiB/s\n", __func__, ctx->device.maxTransferRate / 1024.0 / 1024.0);
     } else {
         GGML_METAL_LOG_INFO("%s: maxTransferRate = built-in GPU\n", __func__);
     }
@@ -541,11 +541,11 @@ bool ggml_metal_add_buffer(
         ctx->buffers[ctx->n_buffers].metal = [ctx->device newBufferWithBytesNoCopy:data length:size_aligned options:MTLResourceStorageModeShared deallocator:nil];

         if (ctx->buffers[ctx->n_buffers].metal == nil) {
-            GGML_METAL_LOG_ERROR("%s: error: failed to allocate '%-16s' buffer, size = %8.2f MB\n", __func__, name, size_aligned / 1024.0 / 1024.0);
+            GGML_METAL_LOG_ERROR("%s: error: failed to allocate '%-16s' buffer, size = %8.2f MiB\n", __func__, name, size_aligned / 1024.0 / 1024.0);
             return false;
         }

-        GGML_METAL_LOG_INFO("%s: allocated '%-16s' buffer, size = %8.2f MB", __func__, name, size_aligned / 1024.0 / 1024.0);
+        GGML_METAL_LOG_INFO("%s: allocated '%-16s' buffer, size = %8.2f MiB", __func__, name, size_aligned / 1024.0 / 1024.0);

         ++ctx->n_buffers;
     } else {
@@ -565,11 +565,11 @@ bool ggml_metal_add_buffer(
             ctx->buffers[ctx->n_buffers].metal = [ctx->device newBufferWithBytesNoCopy:(void *) ((uint8_t *) data + i) length:size_step_aligned options:MTLResourceStorageModeShared deallocator:nil];

             if (ctx->buffers[ctx->n_buffers].metal == nil) {
-                GGML_METAL_LOG_ERROR("%s: error: failed to allocate '%-16s' buffer, size = %8.2f MB\n", __func__, name, size_step_aligned / 1024.0 / 1024.0);
+                GGML_METAL_LOG_ERROR("%s: error: failed to allocate '%-16s' buffer, size = %8.2f MiB\n", __func__, name, size_step_aligned / 1024.0 / 1024.0);
                 return false;
             }

-            GGML_METAL_LOG_INFO("%s: allocated '%-16s' buffer, size = %8.2f MB, offs = %12ld", __func__, name, size_step_aligned / 1024.0 / 1024.0, i);
+            GGML_METAL_LOG_INFO("%s: allocated '%-16s' buffer, size = %8.2f MiB, offs = %12ld", __func__, name, size_step_aligned / 1024.0 / 1024.0, i);
             if (i + size_step < size) {
                 GGML_METAL_LOG_INFO("\n");
             }

llama.cpp  (+16 −16)

@@ -1087,9 +1087,9 @@ enum e_model {
     MODEL_70B,
 };

-static const size_t kB = 1024;
-static const size_t MB = 1024*kB;
-static const size_t GB = 1024*MB;
+static const size_t kiB = 1024;
+static const size_t MiB = 1024*kiB;
+static const size_t GiB = 1024*MiB;

 struct llama_hparams {
     bool vocab_only;
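The constants keep their 1024-based values; only the names change to the IEC spellings used in the log output. A self-contained C++ sketch (not taken from the patch; the buffer size is hypothetical) of how such constants compose with the MiB/GiB reporting pattern seen below in llm_load_print_meta:

#include <cstddef>
#include <cstdio>

// Same 1024-based definitions as in llama.cpp, reproduced for a standalone example.
static const size_t kiB = 1024;
static const size_t MiB = 1024*kiB;
static const size_t GiB = 1024*MiB;

int main() {
    const size_t n_bytes = 3*GiB + 512*MiB;  // hypothetical size, for illustration only

    // Report in MiB below one GiB, otherwise in GiB, mirroring the patched log line.
    if (n_bytes < GiB) {
        printf("size = %.2f MiB\n", n_bytes/1024.0/1024.0);
    } else {
        printf("size = %.2f GiB\n", n_bytes/1024.0/1024.0/1024.0);  // prints: size = 3.50 GiB
    }
    return 0;
}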
@@ -1488,7 +1488,7 @@ static bool llama_kv_cache_init(
             vram_kv_cache += ggml_nbytes(cache.k);
         }
         if (vram_kv_cache > 0) {
-            LLAMA_LOG_INFO("%s: VRAM kv self = %.2f MB\n", __func__, vram_kv_cache / 1024.0 / 1024.0);
+            LLAMA_LOG_INFO("%s: VRAM kv self = %.2f MiB\n", __func__, vram_kv_cache / 1024.0 / 1024.0);
         }
     }
 #endif
@@ -2543,8 +2543,8 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
     LLAMA_LOG_INFO("%s: model type = %s\n", __func__, llama_model_type_name(model.type));
     LLAMA_LOG_INFO("%s: model ftype = %s\n", __func__, llama_model_ftype_name(model.ftype).c_str());
     LLAMA_LOG_INFO("%s: model params = %.2f B\n", __func__, ml.n_elements*1e-9);
-    if (ml.n_bytes < GB) {
-        LLAMA_LOG_INFO("%s: model size = %.2f MiB (%.2f BPW) \n", __func__, ml.n_bytes/1024.0/1024.0, ml.n_bytes*8.0/ml.n_elements);
+    if (ml.n_bytes < GiB) {
+        LLAMA_LOG_INFO("%s: model size = %.2f MiB (%.2f BPW) \n", __func__, ml.n_bytes/1024.0/1024.0, ml.n_bytes*8.0/ml.n_elements);
     } else {
         LLAMA_LOG_INFO("%s: model size = %.2f GiB (%.2f BPW) \n", __func__, ml.n_bytes/1024.0/1024.0/1024.0, ml.n_bytes*8.0/ml.n_elements);
     }
@@ -2582,7 +2582,7 @@ static void llm_load_tensors(

     ml.calc_sizes(ctx_size, mmapped_size);

-    LLAMA_LOG_INFO("%s: ggml ctx size = %7.2f MB\n", __func__, ctx_size/1024.0/1024.0);
+    LLAMA_LOG_INFO("%s: ggml ctx size = %7.2f MiB\n", __func__, ctx_size/1024.0/1024.0);

     // create the ggml context
     {
@@ -3231,7 +3231,7 @@ static void llm_load_tensors(
             ctx_size +
             mmapped_size - vram_weights; // weights in VRAM not in memory

-        LLAMA_LOG_INFO("%s: mem required = %7.2f MB\n", __func__, mem_required / 1024.0 / 1024.0);
+        LLAMA_LOG_INFO("%s: mem required = %7.2f MiB\n", __func__, mem_required / 1024.0 / 1024.0);

 #if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
         const int n_gpu = std::min(n_gpu_layers, int(hparams.n_layer));
@@ -3250,7 +3250,7 @@ static void llm_load_tensors(
 #endif // GGML_USE_CUBLAS

         LLAMA_LOG_INFO("%s: offloaded %d/%d layers to GPU\n", __func__, std::min(n_gpu_layers, max_offloadable_layers), max_backend_supported_layers);
-        LLAMA_LOG_INFO("%s: VRAM used: %.2f MB\n", __func__, vram_weights / 1024.0 / 1024.0);
+        LLAMA_LOG_INFO("%s: VRAM used: %.2f MiB\n", __func__, vram_weights / 1024.0 / 1024.0);
 #else
         (void) n_gpu_layers;
 #endif // defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
@@ -7962,7 +7962,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
             workers.clear();
         }

-        LLAMA_LOG_INFO("size = %8.2f MB -> %8.2f MB | hist: ", ggml_nbytes(tensor)/1024.0/1024.0, new_size/1024.0/1024.0);
+        LLAMA_LOG_INFO("size = %8.2f MiB -> %8.2f MiB | hist: ", ggml_nbytes(tensor)/1024.0/1024.0, new_size/1024.0/1024.0);
         int64_t tot_count = 0;
         for (size_t i = 0; i < hist_cur.size(); i++) {
             hist_all[i] += hist_cur[i];
@@ -8502,7 +8502,7 @@ struct llama_context * llama_new_context_with_model(

     {
         const size_t memory_size = ggml_nbytes(ctx->kv_self.k) + ggml_nbytes(ctx->kv_self.v);
-        LLAMA_LOG_INFO("%s: kv self size = %7.2f MB\n", __func__, memory_size / 1024.0 / 1024.0);
+        LLAMA_LOG_INFO("%s: kv self size = %7.2f MiB\n", __func__, memory_size / 1024.0 / 1024.0);
     }

     // resized during inference
@@ -8547,7 +8547,7 @@ struct llama_context * llama_new_context_with_model(
         // measure memory requirements for the graph
         size_t alloc_size = ggml_allocr_alloc_graph(ctx->alloc, gf) + tensor_alignment;

-        LLAMA_LOG_INFO("%s: compute buffer total size = %.2f MB\n", __func__, (ctx->buf_compute.size + alloc_size) / 1024.0 / 1024.0);
+        LLAMA_LOG_INFO("%s: compute buffer total size = %.2f MiB\n", __func__, (ctx->buf_compute.size + alloc_size) / 1024.0 / 1024.0);

         // recreate allocator with exact memory requirements
         ggml_allocr_free(ctx->alloc);
@@ -8561,7 +8561,7 @@ struct llama_context * llama_new_context_with_model(
 #endif
 #ifdef GGML_USE_CUBLAS
         ggml_cuda_set_scratch_size(alloc_size);
-        LLAMA_LOG_INFO("%s: VRAM scratch buffer: %.2f MB\n", __func__, alloc_size / 1024.0 / 1024.0);
+        LLAMA_LOG_INFO("%s: VRAM scratch buffer: %.2f MiB\n", __func__, alloc_size / 1024.0 / 1024.0);

         // calculate total VRAM usage
         auto add_tensor = [](const ggml_tensor * t, size_t & size) {
@@ -8581,10 +8581,10 @@ struct llama_context * llama_new_context_with_model(
         size_t ctx_vram_size = alloc_size + kv_vram_size;
         size_t total_vram_size = model_vram_size + ctx_vram_size;

-        LLAMA_LOG_INFO("%s: total VRAM used: %.2f MB (model: %.2f MB, context: %.2f MB)\n", __func__,
+        LLAMA_LOG_INFO("%s: total VRAM used: %.2f MiB (model: %.2f MiB, context: %.2f MiB)\n", __func__,
             total_vram_size / 1024.0 / 1024.0,
             model_vram_size / 1024.0 / 1024.0,
-            ctx_vram_size / 1024.0 / 1024.0);
+            ctx_vram_size / 1024.0 / 1024.0);
 #endif
     }

@@ -8605,7 +8605,7 @@ struct llama_context * llama_new_context_with_model(

         const size_t max_size = ggml_get_max_tensor_size(ctx->model.ctx);

-        LLAMA_LOG_INFO("%s: max tensor size = %8.2f MB\n", __func__, max_size/1024.0/1024.0);
+        LLAMA_LOG_INFO("%s: max tensor size = %8.2f MiB\n", __func__, max_size/1024.0/1024.0);

 #define LLAMA_METAL_CHECK_BUF(result) \
     if (!(result)) { \
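One of the touched lines in llm_load_print_meta also reports bits per weight (BPW) as ml.n_bytes*8.0/ml.n_elements alongside the MiB/GiB size. A worked example with invented numbers, shown only to illustrate the arithmetic:

#include <cstdint>
#include <cstdio>

int main() {
    // Hypothetical quantized model: 7e9 parameters stored in ~3.8 GiB.
    const uint64_t n_elements = 7000000000ULL;
    const uint64_t n_bytes    = 4080218931ULL;

    const double bpw = n_bytes * 8.0 / n_elements;           // bits per weight
    const double gib = n_bytes / 1024.0 / 1024.0 / 1024.0;   // size in binary GiB

    printf("model size = %.2f GiB (%.2f BPW)\n", gib, bpw);  // model size = 3.80 GiB (4.66 BPW)
    return 0;
}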
