Direct I/O and Transparent HugePages #7420

Open · wants to merge 1 commit into base: master

13 changes: 13 additions & 0 deletions common/common.cpp
@@ -1638,6 +1638,17 @@ std::vector<llama_arg> gpt_params_parser_init(gpt_params & params, llama_example
params.use_mmap = false;
}
));
add_opt(llama_arg(
{"--direct-io"},
"use direct I/O (potentially faster uncached loading, fewer pageouts, no page cache pollution)",
[](gpt_params & params) {
if (llama_supports_direct_io()) {
params.use_direct_io = true;
} else {
fprintf(stderr, "warning: direct I/O is not available, --direct-io option will be ignored\n");
}
}
));
add_opt(llama_arg(
{"--numa"}, "TYPE",
"attempt optimizations that help on some NUMA systems\n"
@@ -2742,6 +2753,7 @@ struct llama_model_params llama_model_params_from_gpt_params(const gpt_params &
mparams.split_mode = params.split_mode;
mparams.tensor_split = params.tensor_split;
mparams.use_mmap = params.use_mmap;
mparams.use_direct_io = params.use_direct_io;
mparams.use_mlock = params.use_mlock;
mparams.check_tensors = params.check_tensors;
if (params.kv_overrides.empty()) {
@@ -3780,6 +3792,7 @@ void yaml_dump_non_result_info(FILE * stream, const gpt_params & params, const l
fprintf(stream, "n_predict: %d # default: -1 (unlimited)\n", params.n_predict);
fprintf(stream, "n_probs: %d # only used by server binary, default: 0\n", sparams.n_probs);
fprintf(stream, "no_mmap: %s # default: false\n", !params.use_mmap ? "true" : "false");
fprintf(stream, "direct-io: %s # default: false\n", params.use_direct_io ? "true" : "false");
fprintf(stream, "penalize_nl: %s # default: false\n", sparams.penalize_nl ? "true" : "false");
fprintf(stream, "ppl_output_type: %d # default: 0\n", params.ppl_output_type);
fprintf(stream, "ppl_stride: %d # default: 0\n", params.ppl_stride);
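The option handler above only enables `use_direct_io` when `llama_supports_direct_io()` reports support; the declaration is added to `include/llama.h` further down, but its implementation is not part of this excerpt. As a rough, hypothetical sketch of what such a platform probe could look like (an assumption, not the PR's actual code), assuming availability is decided purely by whether the OS offers an uncached-read primitive:

```cpp
// Hypothetical sketch only -- not the implementation from this PR.
// A capability probe like llama_supports_direct_io() could simply report
// whether the platform has any uncached-read primitive at all.
bool direct_io_supported_sketch() {
#if defined(__linux__)
    return true;   // open(..., O_DIRECT)
#elif defined(__APPLE__)
    return true;   // fcntl(fd, F_NOCACHE, 1)
#elif defined(_WIN32)
    return true;   // CreateFile(..., FILE_FLAG_NO_BUFFERING, ...)
#else
    return false;  // unknown platform: fall back to buffered reads / mmap
#endif
}
```
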
1 change: 1 addition & 0 deletions common/common.h
@@ -208,6 +208,7 @@ struct gpt_params {
bool input_prefix_bos = false; // prefix BOS to user inputs, preceding input_prefix
bool logits_all = false; // return logits for all tokens in the batch
bool use_mmap = true; // use mmap for faster loads
bool use_direct_io = false; // use direct I/O
bool use_mlock = false; // use mlock to keep model in memory
bool verbose_prompt = false; // print prompt tokens before generation
bool display_prompt = true; // print prompt before generation
1 change: 1 addition & 0 deletions examples/llama-bench/README.md
@@ -43,6 +43,7 @@ options:
-nkvo, --no-kv-offload <0|1> (default: 0)
-fa, --flash-attn <0|1> (default: 0)
-mmp, --mmap <0|1> (default: 1)
-dio, --direct-io <0|1> (default: 0)
--numa <distribute|isolate|numactl> (default: disabled)
-embd, --embeddings <0|1> (default: 0)
-ts, --tensor-split <ts0/ts1/..> (default: 0)
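The `-dio` flag follows llama-bench's usual convention: each option takes a comma-separated list of values, and the benchmark runs every combination (the nested loops in `get_cmd_params_instances` below include `use_direct_io`). For example, `llama-bench -m model.gguf -mmp 0,1 -dio 0,1` (with `model.gguf` as a placeholder path) would time all four mmap/direct-I/O combinations side by side.
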
32 changes: 29 additions & 3 deletions examples/llama-bench/llama-bench.cpp
@@ -243,6 +243,7 @@ struct cmd_params {
std::vector<bool> flash_attn;
std::vector<std::vector<float>> tensor_split;
std::vector<bool> use_mmap;
std::vector<bool> use_direct_io;
std::vector<bool> embeddings;
ggml_numa_strategy numa;
int reps;
@@ -275,6 +276,7 @@ static const cmd_params cmd_params_defaults = {
/* flash_attn */ {false},
/* tensor_split */ {std::vector<float>(llama_max_devices(), 0.0f)},
/* use_mmap */ {true},
/* use_direct_io */ {false},
/* embeddings */ {false},
/* numa */ GGML_NUMA_STRATEGY_DISABLED,
/* reps */ 5,
@@ -312,6 +314,7 @@ static void print_usage(int /* argc */, char ** argv) {
printf(" -nkvo, --no-kv-offload <0|1> (default: %s)\n", join(cmd_params_defaults.no_kv_offload, ",").c_str());
printf(" -fa, --flash-attn <0|1> (default: %s)\n", join(cmd_params_defaults.flash_attn, ",").c_str());
printf(" -mmp, --mmap <0|1> (default: %s)\n", join(cmd_params_defaults.use_mmap, ",").c_str());
printf(" -dio, --direct-io <0|1> (default: %s)\n", join(cmd_params_defaults.use_direct_io, ",").c_str());
printf(" --numa <distribute|isolate|numactl> (default: disabled)\n");
printf(" -embd, --embeddings <0|1> (default: %s)\n", join(cmd_params_defaults.embeddings, ",").c_str());
printf(" -ts, --tensor-split <ts0/ts1/..> (default: 0)\n");
@@ -559,6 +562,13 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
}
auto p = string_split<bool>(argv[i], split_delim);
params.use_mmap.insert(params.use_mmap.end(), p.begin(), p.end());
} else if (arg == "-dio" || arg == "--direct-io") {
if (++i >= argc) {
invalid_param = true;
break;
}
auto p = string_split<bool>(argv[i], split_delim);
params.use_direct_io.insert(params.use_direct_io.end(), p.begin(), p.end());
} else if (arg == "-embd" || arg == "--embeddings") {
if (++i >= argc) {
invalid_param = true;
@@ -650,6 +660,7 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
if (params.flash_attn.empty()) { params.flash_attn = cmd_params_defaults.flash_attn; }
if (params.tensor_split.empty()) { params.tensor_split = cmd_params_defaults.tensor_split; }
if (params.use_mmap.empty()) { params.use_mmap = cmd_params_defaults.use_mmap; }
if (params.use_direct_io.empty()){ params.use_direct_io = cmd_params_defaults.use_direct_io; }
if (params.embeddings.empty()) { params.embeddings = cmd_params_defaults.embeddings; }
if (params.n_threads.empty()) { params.n_threads = cmd_params_defaults.n_threads; }
if (params.cpu_mask.empty()) { params.cpu_mask = cmd_params_defaults.cpu_mask; }
@@ -679,6 +690,7 @@ struct cmd_params_instance {
bool flash_attn;
std::vector<float> tensor_split;
bool use_mmap;
bool use_direct_io;
bool embeddings;

llama_model_params to_llama_mparams() const {
@@ -692,6 +704,7 @@
mparams.main_gpu = main_gpu;
mparams.tensor_split = tensor_split.data();
mparams.use_mmap = use_mmap;
mparams.use_direct_io = use_direct_io;

return mparams;
}
@@ -703,6 +716,7 @@
split_mode == other.split_mode &&
main_gpu == other.main_gpu &&
use_mmap == other.use_mmap &&
use_direct_io == other.use_direct_io &&
tensor_split == other.tensor_split;
}

@@ -733,6 +747,7 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
for (const auto & mg : params.main_gpu)
for (const auto & ts : params.tensor_split)
for (const auto & mmp : params.use_mmap)
for (const auto & dio : params.use_direct_io)
for (const auto & embd : params.embeddings)
for (const auto & nb : params.n_batch)
for (const auto & nub : params.n_ubatch)
@@ -768,6 +783,7 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
/* .flash_attn = */ fa,
/* .tensor_split = */ ts,
/* .use_mmap = */ mmp,
/* .use_direct_io= */ dio,
/* .embeddings = */ embd,
};
instances.push_back(instance);
@@ -797,6 +813,7 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
/* .flash_attn = */ fa,
/* .tensor_split = */ ts,
/* .use_mmap = */ mmp,
/* .use_direct_io= */ dio,
/* .embeddings = */ embd,
};
instances.push_back(instance);
@@ -826,6 +843,7 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
/* .flash_attn = */ fa,
/* .tensor_split = */ ts,
/* .use_mmap = */ mmp,
/* .use_direct_io= */ dio,
/* .embeddings = */ embd,
};
instances.push_back(instance);
@@ -867,6 +885,7 @@ struct test {
bool flash_attn;
std::vector<float> tensor_split;
bool use_mmap;
bool use_direct_io;
bool embeddings;
int n_prompt;
int n_gen;
@@ -896,6 +915,7 @@ struct test {
flash_attn = inst.flash_attn;
tensor_split = inst.tensor_split;
use_mmap = inst.use_mmap;
use_direct_io = inst.use_direct_io;
embeddings = inst.embeddings;
n_prompt = inst.n_prompt;
n_gen = inst.n_gen;
@@ -967,7 +987,7 @@ struct test {
"type_k", "type_v",
"n_gpu_layers", "split_mode",
"main_gpu", "no_kv_offload", "flash_attn",
"tensor_split", "use_mmap", "embeddings",
"tensor_split", "use_mmap", "use_direct_io", "embeddings",
"n_prompt", "n_gen", "test_time",
"avg_ns", "stddev_ns",
"avg_ts", "stddev_ts",
@@ -989,7 +1009,7 @@
if (field == "cuda" || field == "vulkan" || field == "kompute" || field == "metal" ||
field == "gpu_blas" || field == "blas" || field == "sycl" ||field == "f16_kv" || field == "no_kv_offload" ||
field == "cpu_strict" ||
field == "flash_attn" || field == "use_mmap" || field == "embeddings") {
field == "flash_attn" || field == "use_mmap" || field == "use_direct_io" || field == "embeddings") {
return BOOL;
}
if (field == "avg_ts" || field == "stddev_ts") {
@@ -1025,7 +1045,7 @@ struct test {
ggml_type_name(type_k), ggml_type_name(type_v),
std::to_string(n_gpu_layers), split_mode_str(split_mode),
std::to_string(main_gpu), std::to_string(no_kv_offload), std::to_string(flash_attn),
tensor_split_str, std::to_string(use_mmap), std::to_string(embeddings),
tensor_split_str, std::to_string(use_mmap), std::to_string(use_direct_io), std::to_string(embeddings),
std::to_string(n_prompt), std::to_string(n_gen), test_time,
std::to_string(avg_ns()), std::to_string(stdev_ns()),
std::to_string(avg_ts()), std::to_string(stdev_ts())
@@ -1241,6 +1261,9 @@ struct markdown_printer : public printer {
if (field == "use_mmap") {
return "mmap";
}
if (field == "use_direct_io") {
return "direct_io";
}
if (field == "embeddings") {
return "embd";
}
@@ -1302,6 +1325,9 @@ struct markdown_printer : public printer {
if (params.use_mmap.size() > 1 || params.use_mmap != cmd_params_defaults.use_mmap) {
fields.emplace_back("use_mmap");
}
if (params.use_direct_io.size() > 1 || params.use_direct_io != cmd_params_defaults.use_direct_io) {
fields.emplace_back("use_direct_io");
}
if (params.embeddings.size() > 1 || params.embeddings != cmd_params_defaults.embeddings) {
fields.emplace_back("embeddings");
}
4 changes: 4 additions & 0 deletions examples/main/README.md
@@ -274,6 +274,10 @@ These options help improve the performance and memory usage of the LLaMA models.

- `--no-mmap`: Do not memory-map the model. By default, models are mapped into memory, which allows the system to load only the necessary parts of the model as needed. However, if the model is larger than your total amount of RAM or if your system is low on available memory, using mmap might increase the risk of pageouts, negatively impacting performance. Disabling mmap results in slower load times but may reduce pageouts if you're not using `--mlock`. Note that if the model is larger than the total amount of RAM, turning off mmap would prevent the model from loading at all.

### Direct I/O

- `--direct-io`: Use direct I/O. Potentially faster uncached loading, fewer pageouts, no page cache pollution. You may benefit from this option if you load a model for the first time (or after some time), load several different models consecutively, or simply want to keep the page cache clean. The faster your storage device is, the greater the gain you can expect. The effect may be greater on Linux due to Transparent HugePage support.
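
For readers unfamiliar with the mechanism, here is a standalone illustration (not code from this PR) of what uncached loading means on Linux and why Transparent HugePages can help: the file is opened with `O_DIRECT` so reads bypass the page cache, and the destination buffer is allocated 2 MiB-aligned and advised for THP backing. The path `model.gguf` and the sizes are placeholders.

```cpp
// Illustration only -- not code from this PR.
#ifndef _GNU_SOURCE
#define _GNU_SOURCE            // exposes O_DIRECT
#endif
#include <fcntl.h>
#include <sys/mman.h>
#include <unistd.h>
#include <cstdio>
#include <cstdlib>

int main() {
    const size_t alignment = 2 * 1024 * 1024;   // 2 MiB: huge-page sized, satisfies O_DIRECT block alignment
    const size_t chunk     = 16 * 1024 * 1024;  // lengths and offsets must also stay block-aligned

    int fd = open("model.gguf", O_RDONLY | O_DIRECT);   // placeholder file name
    if (fd < 0) { perror("open"); return 1; }

    void * buf = nullptr;
    if (posix_memalign(&buf, alignment, chunk) != 0) { close(fd); return 1; }
    madvise(buf, chunk, MADV_HUGEPAGE);          // best-effort request for THP backing

    ssize_t n = pread(fd, buf, chunk, 0);        // data goes straight to buf, page cache untouched
    if (n < 0) { perror("pread"); } else { printf("read %zd bytes uncached\n", n); }

    free(buf);
    close(fd);
    return 0;
}
```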

### NUMA support

- `--numa distribute`: Pin an equal proportion of the threads to the cores on each NUMA node. This will spread the load amongst all cores on the system, utilizing all memory channels at the expense of potentially requiring memory to travel over the slow links between nodes.
1 change: 1 addition & 0 deletions examples/server/README.md
@@ -97,6 +97,7 @@ The project is under active development, and we are [looking for feedback and co
| `-nocb, --no-cont-batching` | disable continuous batching<br/>(env: LLAMA_ARG_NO_CONT_BATCHING) |
| `--mlock` | force system to keep model in RAM rather than swapping or compressing |
| `--no-mmap` | do not memory-map model (slower load but may reduce pageouts if not using mlock) |
| `--direct-io` | use direct I/O (potentially faster uncached loading, fewer pageouts, no page cache pollution) |
| `--numa TYPE` | attempt optimizations that help on some NUMA systems<br/>- distribute: spread execution evenly over all nodes<br/>- isolate: only spawn threads on CPUs on the node that execution started on<br/>- numactl: use the CPU map provided by numactl<br/>if run without this previously, it is recommended to drop the system page cache before using this<br/>see https://github.com/ggerganov/llama.cpp/issues/1437 |
| `-ngl, --gpu-layers N` | number of layers to store in VRAM<br/>(env: LLAMA_ARG_N_GPU_LAYERS) |
| `-sm, --split-mode {none,layer,row}` | how to split the model across multiple GPUs, one of:<br/>- none: use one GPU only<br/>- layer (default): split layers and KV across GPUs<br/>- row: split rows across GPUs |
2 changes: 2 additions & 0 deletions include/llama.h
@@ -303,6 +303,7 @@ extern "C" {
// Keep the booleans together to avoid misalignment during copy-by-value.
bool vocab_only; // only load the vocabulary, no weights
bool use_mmap; // use mmap if possible
bool use_direct_io; // use direct I/O if possible
bool use_mlock; // force system to keep model in RAM
bool check_tensors; // validate model tensor data
};
@@ -429,6 +430,7 @@ extern "C" {
LLAMA_API size_t llama_max_devices(void);

LLAMA_API bool llama_supports_mmap (void);
LLAMA_API bool llama_supports_direct_io (void);
LLAMA_API bool llama_supports_mlock (void);
LLAMA_API bool llama_supports_gpu_offload(void);

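Taken together, the new struct field and capability check let API consumers opt in the same way the CLI does. A minimal sketch (hypothetical, not part of this PR) assuming the existing entry points `llama_backend_init`, `llama_model_default_params` and `llama_load_model_from_file`:

```cpp
#include "llama.h"
#include <cstdio>

int main(int argc, char ** argv) {
    if (argc < 2) {
        fprintf(stderr, "usage: %s <model.gguf>\n", argv[0]);
        return 1;
    }

    llama_backend_init();

    llama_model_params mparams = llama_model_default_params();
    if (llama_supports_direct_io()) {
        mparams.use_direct_io = true;   // new field introduced by this PR
    }

    llama_model * model = llama_load_model_from_file(argv[1], mparams);
    if (model == NULL) {
        fprintf(stderr, "failed to load model\n");
        llama_backend_free();
        return 1;
    }

    // ... create a context and run inference as usual ...

    llama_free_model(model);
    llama_backend_free();
    return 0;
}
```
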
4 changes: 2 additions & 2 deletions scripts/run-with-preset.py
@@ -12,7 +12,7 @@

CLI_ARGS_LLAMA_CLI_PERPLEXITY = [
"batch-size", "cfg-negative-prompt", "cfg-scale", "chunks", "color", "ctx-size", "escape",
"export", "file", "frequency-penalty", "grammar", "grammar-file", "hellaswag",
"direct-io", "export", "file", "frequency-penalty", "grammar", "grammar-file", "hellaswag",
"hellaswag-tasks", "ignore-eos", "in-prefix", "in-prefix-bos", "in-suffix",
"interactive", "interactive-first", "keep", "logdir", "logit-bias", "lora", "lora-base",
"low-vram", "main-gpu", "memory-f32", "mirostat", "mirostat-ent", "mirostat-lr", "mlock",
@@ -30,7 +30,7 @@
]

CLI_ARGS_LLAMA_SERVER = [
"alias", "batch-size", "ctx-size", "embedding", "host", "memory-f32", "lora", "lora-base",
"alias", "batch-size", "ctx-size", "direct-io", "embedding", "host", "memory-f32", "lora", "lora-base",
"low-vram", "main-gpu", "mlock", "model", "n-gpu-layers", "n-probs", "no-mmap", "no-mul-mat-q",
"numa", "path", "port", "rope-freq-base", "timeout", "rope-freq-scale", "tensor-split",
"threads", "verbose"