Commit 2d77050

llama : remove mtest (ggml-org#3177)

* Remove mtest
* remove from common/common.h and examples/main/main.cpp
1 parent 98311c4 commit 2d77050

File tree: 5 files changed, +3 -26 lines

common/common.cpp (-4)

@@ -434,8 +434,6 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
 #endif // GGML_USE_CUBLAS
         } else if (arg == "--no-mmap") {
             params.use_mmap = false;
-        } else if (arg == "--mtest") {
-            params.mem_test = true;
         } else if (arg == "--numa") {
             params.numa = true;
         } else if (arg == "--export") {
@@ -687,7 +685,6 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
     printf(" Not recommended since this is both slower and uses more VRAM.\n");
 #endif // GGML_USE_CUBLAS
 #endif
-    printf(" --mtest compute maximum memory usage\n");
     printf(" --export export the computation graph to 'llama.ggml'\n");
     printf(" --verbose-prompt print prompt before generation\n");
     fprintf(stderr, " --simple-io use basic IO for better compatibility in subprocesses and limited consoles\n");
@@ -1225,7 +1222,6 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l
     fprintf(stream, "mlock: %s # default: false\n", params.use_mlock ? "true" : "false");
     fprintf(stream, "model: %s # default: models/7B/ggml-model.bin\n", params.model.c_str());
     fprintf(stream, "model_draft: %s # default:\n", params.model_draft.c_str());
-    fprintf(stream, "mtest: %s # default: false\n", params.mem_test ? "true" : "false");
     fprintf(stream, "multiline_input: %s # default: false\n", params.multiline_input ? "true" : "false");
     fprintf(stream, "n_gpu_layers: %d # default: -1\n", params.n_gpu_layers);
     fprintf(stream, "n_predict: %d # default: -1 (unlimited)\n", params.n_predict);

common/common.h (-1)

@@ -110,7 +110,6 @@ struct gpt_params {
     bool perplexity = false; // compute perplexity over the prompt
     bool use_mmap = true; // use mmap for faster loads
     bool use_mlock = false; // use mlock to keep model in memory
-    bool mem_test = false; // compute maximum memory usage
     bool numa = false; // attempt optimizations that help on some NUMA systems
     bool export_cgraph = false; // export the computation graph
     bool verbose_prompt = false; // print prompt tokens before generation

examples/main/README.md (+2 -3)

@@ -144,7 +144,7 @@ The `--ctx-size` option allows you to set the size of the prompt context used by
 
 Some fine-tuned models have extened the context length by scaling RoPE. For example, if the original pretrained model have a context length (max sequence length) of 4096 (4k) and the fine-tuned model have 32k. That is a scaling factor of 8, and should work by setting the above `--ctx-size` to 32768 (32k) and `--rope-scale` to 8.
 
-- `--rope-scale N`: Where N is the linear scaling factor used by the fine-tuned model.
+- `--rope-scale N`: Where N is the linear scaling factor used by the fine-tuned model.
 
 ### Keep Prompt
 
@@ -274,7 +274,7 @@ These options help improve the performance and memory usage of the LLaMA models.
 
 ### NUMA support
 
-- `--numa`: Attempt optimizations that help on some systems with non-uniform memory access. This currently consists of pinning an equal proportion of the threads to the cores on each NUMA node, and disabling prefetch and readahead for mmap. The latter causes mapped pages to be faulted in on first access instead of all at once, and in combination with pinning threads to NUMA nodes, more of the pages end up on the NUMA node where they are used. Note that if the model is already in the system page cache, for example because of a previous run without this option, this will have little effect unless you drop the page cache first. This can be done by rebooting the system or on Linux by writing '3' to '/proc/sys/vm/drop\_caches' as root.
+- `--numa`: Attempt optimizations that help on some systems with non-uniform memory access. This currently consists of pinning an equal proportion of the threads to the cores on each NUMA node, and disabling prefetch and readahead for mmap. The latter causes mapped pages to be faulted in on first access instead of all at once, and in combination with pinning threads to NUMA nodes, more of the pages end up on the NUMA node where they are used. Note that if the model is already in the system page cache, for example because of a previous run without this option, this will have little effect unless you drop the page cache first. This can be done by rebooting the system or on Linux by writing '3' to '/proc/sys/vm/drop_caches' as root.
 
 ### Memory Float 32
 
@@ -302,7 +302,6 @@ These options provide extra functionality and customization when running the LLa
 
 - `-h, --help`: Display a help message showing all available options and their default values. This is particularly useful for checking the latest options and default values, as they can change frequently, and the information in this document may become outdated.
 - `--verbose-prompt`: Print the prompt before generating text.
-- `--mtest`: Test the model's functionality by running a series of tests to ensure it's working properly.
 - `-ngl N, --n-gpu-layers N`: When compiled with appropriate support (currently CLBlast or cuBLAS), this option allows offloading some layers to the GPU for computation. Generally results in increased performance.
 - `-mg i, --main-gpu i`: When using multiple GPUs this option controls which GPU is used for small tensors for which the overhead of splitting the computation across all GPUs is not worthwhile. The GPU in question will use slightly more VRAM to store a scratch buffer for temporary results. By default GPU 0 is used. Requires cuBLAS.
 - `-ts SPLIT, --tensor-split SPLIT`: When using multiple GPUs this option controls how large tensors should be split across all GPUs. `SPLIT` is a comma-separated list of non-negative values that assigns the proportion of data that each GPU should get in order. For example, "3,2" will assign 60% of the data to GPU 0 and 40% to GPU 1. By default the data is split in proportion to VRAM but this may not be optimal for performance. Requires cuBLAS.

examples/main/main.cpp (-17)

@@ -198,23 +198,6 @@ int main(int argc, char ** argv) {
             params.n_threads, std::thread::hardware_concurrency(), llama_print_system_info());
     }
 
-    // determine the maximum memory usage needed to do inference for the given n_batch and n_ctx parameters
-    // uncomment the "used_mem" line in llama.cpp to see the results
-    if (params.mem_test) {
-        {
-            LOG_TEE("%s: testing memory usage for n_batch = %d, n_ctx = %d\n", __func__, params.n_batch, params.n_ctx);
-
-            const std::vector<llama_token> tmp(params.n_batch, llama_token_bos(ctx));
-            llama_eval(ctx, tmp.data(), tmp.size(), params.n_ctx, params.n_threads);
-        }
-
-        llama_print_timings(ctx);
-        llama_free(ctx);
-        llama_free_model(model);
-
-        return 0;
-    }
-
     // export the cgraph and exit
     if (params.export_cgraph) {
        llama_eval_export(ctx, "llama.ggml");
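
Note: the removed `--mtest` path amounted to a one-shot worst-case evaluation: it ran a full batch of BOS tokens at the largest `n_past` (`n_ctx`) so that scratch buffers and the KV cache reached their peak, then printed timings and exited. The sketch below restates that logic as a standalone helper, using only the API calls visible in the diff above. It assumes the `llama_eval(ctx, tokens, n_tokens, n_past, n_threads)` signature as of this commit; the function name `run_memory_test` and the use of `fprintf` instead of `LOG_TEE` are ours, not upstream code.

// Sketch of the removed --mtest probe, reconstructed from the diff above.
// Assumes the llama.cpp C API at the time of this commit; run_memory_test
// is an illustrative name, not an upstream function.
#include <cstdio>
#include <vector>

#include "llama.h"

static int run_memory_test(llama_context * ctx, llama_model * model,
                           int n_batch, int n_ctx, int n_threads) {
    fprintf(stderr, "testing memory usage for n_batch = %d, n_ctx = %d\n", n_batch, n_ctx);

    // Evaluate one full batch of BOS tokens at n_past = n_ctx: this builds the
    // largest compute graph the chosen settings allow, so scratch and KV-cache
    // usage hit their maximum in a single call.
    const std::vector<llama_token> tmp(n_batch, llama_token_bos(ctx));
    llama_eval(ctx, tmp.data(), tmp.size(), n_ctx, n_threads);

    // The original block printed timings, freed the context and model, and
    // exited immediately after the probe.
    llama_print_timings(ctx);
    llama_free(ctx);
    llama_free_model(model);
    return 0;
}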

run_with_preset.py (+1 -1)

@@ -13,7 +13,7 @@
     "hellaswag-tasks", "ignore-eos", "in-prefix", "in-prefix-bos", "in-suffix", "instruct",
     "interactive", "interactive-first", "keep", "logdir", "logit-bias", "lora", "lora-base",
     "low-vram", "main-gpu", "memory-f32", "mirostat", "mirostat-ent", "mirostat-lr", "mlock",
-    "model", "mtest", "multiline-input", "n-gpu-layers", "n-predict", "no-mmap", "no-mul-mat-q",
+    "model", "multiline-input", "n-gpu-layers", "n-predict", "no-mmap", "no-mul-mat-q",
     "np-penalize-nl", "numa", "ppl-output-type", "ppl-stride", "presence-penalty", "prompt",
     "prompt-cache", "prompt-cache-all", "prompt-cache-ro", "random-prompt", "repeat-last-n",
     "repeat-penalty", "reverse-prompt", "rope-freq-base", "rope-freq-scale", "rope-scale", "seed",
