@@ -1087,9 +1087,9 @@ enum e_model {
     MODEL_70B,
 };
 
-static const size_t kB = 1024;
-static const size_t MB = 1024*kB;
-static const size_t GB = 1024*MB;
+static const size_t kiB = 1024;
+static const size_t MiB = 1024*kiB;
+static const size_t GiB = 1024*MiB;
 
 struct llama_hparams {
     bool vocab_only;
@@ -1488,7 +1488,7 @@ static bool llama_kv_cache_init(
             vram_kv_cache += ggml_nbytes(cache.k);
         }
         if (vram_kv_cache > 0) {
-            LLAMA_LOG_INFO("%s: VRAM kv self = %.2f MB\n", __func__, vram_kv_cache / 1024.0 / 1024.0);
+            LLAMA_LOG_INFO("%s: VRAM kv self = %.2f MiB\n", __func__, vram_kv_cache / 1024.0 / 1024.0);
         }
     }
 #endif
@@ -2543,8 +2543,8 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
     LLAMA_LOG_INFO("%s: model type = %s\n", __func__, llama_model_type_name(model.type));
     LLAMA_LOG_INFO("%s: model ftype = %s\n", __func__, llama_model_ftype_name(model.ftype).c_str());
     LLAMA_LOG_INFO("%s: model params = %.2f B\n", __func__, ml.n_elements*1e-9);
-    if (ml.n_bytes < GB) {
-        LLAMA_LOG_INFO("%s: model size = %.2f MiB (%.2f BPW) \n", __func__, ml.n_bytes/1024.0/1024.0, ml.n_bytes*8.0/ml.n_elements);
+    if (ml.n_bytes < GiB) {
+        LLAMA_LOG_INFO("%s: model size = %.2f MiB (%.2f BPW) \n", __func__, ml.n_bytes/1024.0/1024.0, ml.n_bytes*8.0/ml.n_elements);
     } else {
         LLAMA_LOG_INFO("%s: model size = %.2f GiB (%.2f BPW) \n", __func__, ml.n_bytes/1024.0/1024.0/1024.0, ml.n_bytes*8.0/ml.n_elements);
     }
@@ -2582,7 +2582,7 @@ static void llm_load_tensors(
 
     ml.calc_sizes(ctx_size, mmapped_size);
 
-    LLAMA_LOG_INFO("%s: ggml ctx size = %7.2f MB\n", __func__, ctx_size/1024.0/1024.0);
+    LLAMA_LOG_INFO("%s: ggml ctx size = %7.2f MiB\n", __func__, ctx_size/1024.0/1024.0);
 
     // create the ggml context
     {
@@ -3231,7 +3231,7 @@ static void llm_load_tensors(
             ctx_size +
             mmapped_size - vram_weights; // weights in VRAM not in memory
 
-        LLAMA_LOG_INFO("%s: mem required = %7.2f MB\n", __func__, mem_required / 1024.0 / 1024.0);
+        LLAMA_LOG_INFO("%s: mem required = %7.2f MiB\n", __func__, mem_required / 1024.0 / 1024.0);
 
 #if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
         const int n_gpu = std::min(n_gpu_layers, int(hparams.n_layer));
@@ -3250,7 +3250,7 @@ static void llm_load_tensors(
 #endif // GGML_USE_CUBLAS
 
         LLAMA_LOG_INFO("%s: offloaded %d/%d layers to GPU\n", __func__, std::min(n_gpu_layers, max_offloadable_layers), max_backend_supported_layers);
-        LLAMA_LOG_INFO("%s: VRAM used: %.2f MB\n", __func__, vram_weights / 1024.0 / 1024.0);
+        LLAMA_LOG_INFO("%s: VRAM used: %.2f MiB\n", __func__, vram_weights / 1024.0 / 1024.0);
 #else
         (void) n_gpu_layers;
 #endif // defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
@@ -7962,7 +7962,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
                 workers.clear();
             }
 
-            LLAMA_LOG_INFO("size = %8.2f MB -> %8.2f MB | hist: ", ggml_nbytes(tensor)/1024.0/1024.0, new_size/1024.0/1024.0);
+            LLAMA_LOG_INFO("size = %8.2f MiB -> %8.2f MiB | hist: ", ggml_nbytes(tensor)/1024.0/1024.0, new_size/1024.0/1024.0);
             int64_t tot_count = 0;
             for (size_t i = 0; i < hist_cur.size(); i++) {
                 hist_all[i] += hist_cur[i];
@@ -8502,7 +8502,7 @@ struct llama_context * llama_new_context_with_model(
 
         {
             const size_t memory_size = ggml_nbytes(ctx->kv_self.k) + ggml_nbytes(ctx->kv_self.v);
-            LLAMA_LOG_INFO("%s: kv self size = %7.2f MB\n", __func__, memory_size / 1024.0 / 1024.0);
+            LLAMA_LOG_INFO("%s: kv self size = %7.2f MiB\n", __func__, memory_size / 1024.0 / 1024.0);
         }
 
         // resized during inference
@@ -8547,7 +8547,7 @@ struct llama_context * llama_new_context_with_model(
             // measure memory requirements for the graph
             size_t alloc_size = ggml_allocr_alloc_graph(ctx->alloc, gf) + tensor_alignment;
 
-            LLAMA_LOG_INFO("%s: compute buffer total size = %.2f MB\n", __func__, (ctx->buf_compute.size + alloc_size) / 1024.0 / 1024.0);
+            LLAMA_LOG_INFO("%s: compute buffer total size = %.2f MiB\n", __func__, (ctx->buf_compute.size + alloc_size) / 1024.0 / 1024.0);
 
             // recreate allocator with exact memory requirements
             ggml_allocr_free(ctx->alloc);
@@ -8561,7 +8561,7 @@ struct llama_context * llama_new_context_with_model(
 #endif
 #ifdef GGML_USE_CUBLAS
             ggml_cuda_set_scratch_size(alloc_size);
-            LLAMA_LOG_INFO("%s: VRAM scratch buffer: %.2f MB\n", __func__, alloc_size / 1024.0 / 1024.0);
+            LLAMA_LOG_INFO("%s: VRAM scratch buffer: %.2f MiB\n", __func__, alloc_size / 1024.0 / 1024.0);
 
             // calculate total VRAM usage
             auto add_tensor = [](const ggml_tensor * t, size_t & size) {
@@ -8581,10 +8581,10 @@ struct llama_context * llama_new_context_with_model(
             size_t ctx_vram_size = alloc_size + kv_vram_size;
             size_t total_vram_size = model_vram_size + ctx_vram_size;
 
-            LLAMA_LOG_INFO("%s: total VRAM used: %.2f MB (model: %.2f MB, context: %.2f MB)\n", __func__,
+            LLAMA_LOG_INFO("%s: total VRAM used: %.2f MiB (model: %.2f MiB, context: %.2f MiB)\n", __func__,
                 total_vram_size / 1024.0 / 1024.0,
                 model_vram_size / 1024.0 / 1024.0,
-                ctx_vram_size / 1024.0 / 1024.0);
+                ctx_vram_size / 1024.0 / 1024.0);
 #endif
         }
 
@@ -8605,7 +8605,7 @@ struct llama_context * llama_new_context_with_model(
 
         const size_t max_size = ggml_get_max_tensor_size(ctx->model.ctx);
 
-        LLAMA_LOG_INFO("%s: max tensor size = %8.2f MB\n", __func__, max_size/1024.0/1024.0);
+        LLAMA_LOG_INFO("%s: max tensor size = %8.2f MiB\n", __func__, max_size/1024.0/1024.0);
 
 #define LLAMA_METAL_CHECK_BUF(result) \
         if (!(result)) { \
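
For context on the rename above, here is a small self-contained sketch (illustrative only, not code from llama.cpp) of how 1024-based kiB/MiB/GiB constants are typically used to choose between MiB and GiB when printing a size, in the same spirit as the llm_load_print_meta hunk:

// Standalone sketch: binary (1024-based) size units and formatting.
#include <cstdio>
#include <cstddef>

static const size_t kiB = 1024;
static const size_t MiB = 1024*kiB;
static const size_t GiB = 1024*MiB;

// Print a byte count in MiB below 1 GiB, otherwise in GiB.
static void print_size(size_t n_bytes) {
    if (n_bytes < GiB) {
        std::printf("size = %.2f MiB\n", n_bytes / (double) MiB);
    } else {
        std::printf("size = %.2f GiB\n", n_bytes / (double) GiB);
    }
}

int main() {
    print_size(700 * MiB); // prints "size = 700.00 MiB"
    print_size(3 * GiB);   // prints "size = 3.00 GiB"
    return 0;
}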