
Commit c6814f6

Direct I/O and Transparent HugePages
`--direct-io` for bypassing the page cache (and using Transparent HugePages on Linux). Up to 3-6x faster uncached loading, fewer pageouts, no page cache pollution.
1 parent 917dc8c commit c6814f6
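
For readers unfamiliar with the two mechanisms the commit message refers to, the sketch below shows the generic Linux pattern: `open(2)` with `O_DIRECT` to bypass the page cache, a read into a suitably aligned buffer, and `madvise(MADV_HUGEPAGE)` to request Transparent HugePages. This is only an illustration of the OS-level primitives under stated assumptions (file name, sizes, and alignment are placeholders); it is not the loader code introduced by this commit, which is not among the hunks shown here.

```cpp
// Generic Linux sketch of the primitives behind --direct-io:
// O_DIRECT reads that bypass the page cache, plus an madvise() hint
// requesting Transparent HugePages for the destination buffer.
// Illustrative only -- not the llama.cpp loader implementation.
#ifndef _GNU_SOURCE
#define _GNU_SOURCE 1   // O_DIRECT needs this with glibc
#endif
#include <fcntl.h>
#include <sys/mman.h>
#include <unistd.h>
#include <cstdio>
#include <cstdlib>

int main() {
    const size_t alignment = 4096;    // typical logical block size; device dependent
    const size_t size      = 1 << 20; // read 1 MiB, a multiple of the alignment

    // O_DIRECT: data goes straight from the device into our buffer,
    // without populating (or evicting anything from) the page cache.
    int fd = open("model.gguf", O_RDONLY | O_DIRECT);
    if (fd < 0) { perror("open"); return 1; }

    // With O_DIRECT the buffer, offset and length must all be suitably aligned.
    void * buf = nullptr;
    if (posix_memalign(&buf, alignment, size) != 0) { close(fd); return 1; }

    // Best-effort request for Transparent HugePages on the buffer.
    madvise(buf, size, MADV_HUGEPAGE);

    ssize_t n = pread(fd, buf, size, 0);
    if (n < 0) { perror("pread"); }

    free(buf);
    close(fd);
    return 0;
}
```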

10 files changed, +295 -30 lines


common/common.cpp (+9)

@@ -1072,6 +1072,10 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
         params.use_mmap = false;
         return true;
     }
+    if (arg == "--direct-io") {
+        params.use_direct_io = true;
+        return true;
+    }
     if (arg == "--numa") {
         if (++i >= argc) {
             invalid_param = true;
@@ -1544,6 +1548,9 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
     if (llama_supports_mmap()) {
         printf(" --no-mmap do not memory-map model (slower load but may reduce pageouts if not using mlock)\n");
     }
+    if (llama_supports_direct_io()) {
+        printf(" --direct-io use direct I/O (potentially faster uncached loading, fewer pageouts, no page cache pollution)\n");
+    }
     printf(" --numa TYPE attempt optimizations that help on some NUMA systems\n");
     printf(" - distribute: spread execution evenly over all nodes\n");
     printf(" - isolate: only spawn threads on CPUs on the node that execution started on\n");
@@ -1844,6 +1851,7 @@ struct llama_model_params llama_model_params_from_gpt_params(const gpt_params &
     mparams.split_mode = params.split_mode;
     mparams.tensor_split = params.tensor_split;
     mparams.use_mmap = params.use_mmap;
+    mparams.use_direct_io = params.use_direct_io;
     mparams.use_mlock = params.use_mlock;
     mparams.check_tensors = params.check_tensors;
     if (params.kv_overrides.empty()) {
@@ -2706,6 +2714,7 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l
     fprintf(stream, "n_predict: %d # default: -1 (unlimited)\n", params.n_predict);
     fprintf(stream, "n_probs: %d # only used by server binary, default: 0\n", sparams.n_probs);
     fprintf(stream, "no_mmap: %s # default: false\n", !params.use_mmap ? "true" : "false");
+    fprintf(stream, "direct-io: %s # default: false\n", params.use_direct_io ? "true" : "false");
     fprintf(stream, "penalize_nl: %s # default: false\n", sparams.penalize_nl ? "true" : "false");
     fprintf(stream, "ppl_output_type: %d # default: 0\n", params.ppl_output_type);
     fprintf(stream, "ppl_stride: %d # default: 0\n", params.ppl_stride);

common/common.h (+1)

@@ -160,6 +160,7 @@ struct gpt_params {
     bool instruct = false; // instruction mode (used for Alpaca models)
     bool logits_all = false; // return logits for all tokens in the batch
     bool use_mmap = true; // use mmap for faster loads
+    bool use_direct_io = false; // use direct I/O
     bool use_mlock = false; // use mlock to keep model in memory
     bool verbose_prompt = false; // print prompt tokens before generation
     bool display_prompt = true; // print prompt before generation

examples/llama-bench/README.md (+1)

@@ -38,6 +38,7 @@ options:
   -nkvo, --no-kv-offload <0|1> (default: 0)
   -fa, --flash-attn <0|1> (default: 0)
   -mmp, --mmap <0|1> (default: 1)
+  -dio, --direct-io <0|1> (default: 0)
   --numa <distribute|isolate|numactl> (default: disabled)
   -embd, --embeddings <0|1> (default: 0)
   -ts, --tensor-split <ts0/ts1/..> (default: 0)
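
With the new switch, llama-bench can run the same test with direct I/O disabled and enabled in a single invocation, for example `llama-bench -m model.gguf -dio 0,1` (illustrative; the model path is a placeholder). As with the other list-valued options, each comma-separated value produces its own benchmark instance.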

examples/llama-bench/llama-bench.cpp (+29 -3)

@@ -184,6 +184,7 @@ struct cmd_params {
     std::vector<bool> flash_attn;
     std::vector<std::vector<float>> tensor_split;
     std::vector<bool> use_mmap;
+    std::vector<bool> use_direct_io;
     std::vector<bool> embeddings;
     ggml_numa_strategy numa;
     int reps;
@@ -208,6 +209,7 @@ static const cmd_params cmd_params_defaults = {
     /* flash_attn */ {false},
     /* tensor_split */ {std::vector<float>(llama_max_devices(), 0.0f)},
     /* use_mmap */ {true},
+    /* use_direct_io */ {false},
     /* embeddings */ {false},
     /* numa */ GGML_NUMA_STRATEGY_DISABLED,
     /* reps */ 5,
@@ -235,6 +237,7 @@ static void print_usage(int /* argc */, char ** argv) {
     printf(" -nkvo, --no-kv-offload <0|1> (default: %s)\n", join(cmd_params_defaults.no_kv_offload, ",").c_str());
     printf(" -fa, --flash-attn <0|1> (default: %s)\n", join(cmd_params_defaults.flash_attn, ",").c_str());
     printf(" -mmp, --mmap <0|1> (default: %s)\n", join(cmd_params_defaults.use_mmap, ",").c_str());
+    printf(" -dio, --direct-io <0|1> (default: %s)\n", join(cmd_params_defaults.use_direct_io, ",").c_str());
     printf(" --numa <distribute|isolate|numactl> (default: disabled)\n");
     printf(" -embd, --embeddings <0|1> (default: %s)\n", join(cmd_params_defaults.embeddings, ",").c_str());
     printf(" -ts, --tensor-split <ts0/ts1/..> (default: 0)\n");
@@ -444,6 +447,13 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
             }
             auto p = split<bool>(argv[i], split_delim);
             params.use_mmap.insert(params.use_mmap.end(), p.begin(), p.end());
+        } else if (arg == "-dio" || arg == "--direct-io") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            auto p = split<bool>(argv[i], split_delim);
+            params.use_direct_io.insert(params.use_direct_io.end(), p.begin(), p.end());
         } else if (arg == "-embd" || arg == "--embeddings") {
             if (++i >= argc) {
                 invalid_param = true;
@@ -525,6 +535,7 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
     if (params.flash_attn.empty()) { params.flash_attn = cmd_params_defaults.flash_attn; }
     if (params.tensor_split.empty()) { params.tensor_split = cmd_params_defaults.tensor_split; }
     if (params.use_mmap.empty()) { params.use_mmap = cmd_params_defaults.use_mmap; }
+    if (params.use_direct_io.empty()){ params.use_direct_io = cmd_params_defaults.use_direct_io; }
     if (params.embeddings.empty()) { params.embeddings = cmd_params_defaults.embeddings; }
     if (params.n_threads.empty()) { params.n_threads = cmd_params_defaults.n_threads; }

@@ -547,6 +558,7 @@ struct cmd_params_instance {
     bool flash_attn;
     std::vector<float> tensor_split;
     bool use_mmap;
+    bool use_direct_io;
     bool embeddings;

     llama_model_params to_llama_mparams() const {
@@ -557,6 +569,7 @@ struct cmd_params_instance {
         mparams.main_gpu = main_gpu;
         mparams.tensor_split = tensor_split.data();
         mparams.use_mmap = use_mmap;
+        mparams.use_direct_io = use_direct_io;

         return mparams;
     }
@@ -567,6 +580,7 @@ struct cmd_params_instance {
                split_mode == other.split_mode &&
                main_gpu == other.main_gpu &&
                use_mmap == other.use_mmap &&
+               use_direct_io == other.use_direct_io &&
                tensor_split == other.tensor_split;
     }

@@ -596,6 +610,7 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
     for (const auto & mg : params.main_gpu)
     for (const auto & ts : params.tensor_split)
     for (const auto & mmp : params.use_mmap)
+    for (const auto & dio : params.use_direct_io)
     for (const auto & embd : params.embeddings)
     for (const auto & nb : params.n_batch)
     for (const auto & nub : params.n_ubatch)
@@ -624,6 +639,7 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
             /* .flash_attn = */ fa,
             /* .tensor_split = */ ts,
             /* .use_mmap = */ mmp,
+            /* .use_direct_io= */ dio,
             /* .embeddings = */ embd,
         };
         instances.push_back(instance);
@@ -649,6 +665,7 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
             /* .flash_attn = */ fa,
             /* .tensor_split = */ ts,
             /* .use_mmap = */ mmp,
+            /* .use_direct_io= */ dio,
             /* .embeddings = */ embd,
         };
         instances.push_back(instance);
@@ -674,6 +691,7 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
             /* .flash_attn = */ fa,
             /* .tensor_split = */ ts,
             /* .use_mmap = */ mmp,
+            /* .use_direct_io= */ dio,
             /* .embeddings = */ embd,
         };
         instances.push_back(instance);
@@ -712,6 +730,7 @@ struct test {
     bool flash_attn;
     std::vector<float> tensor_split;
     bool use_mmap;
+    bool use_direct_io;
     bool embeddings;
     int n_prompt;
     int n_gen;
@@ -737,6 +756,7 @@ struct test {
         flash_attn = inst.flash_attn;
         tensor_split = inst.tensor_split;
         use_mmap = inst.use_mmap;
+        use_direct_io = inst.use_direct_io;
         embeddings = inst.embeddings;
         n_prompt = inst.n_prompt;
         n_gen = inst.n_gen;
@@ -810,7 +830,7 @@ struct test {
         "n_threads", "type_k", "type_v",
         "n_gpu_layers", "split_mode",
         "main_gpu", "no_kv_offload", "flash_attn",
-        "tensor_split", "use_mmap", "embeddings",
+        "tensor_split", "use_mmap", "use_direct_io", "embeddings",
         "n_prompt", "n_gen", "test_time",
         "avg_ns", "stddev_ns",
         "avg_ts", "stddev_ts"
@@ -831,7 +851,7 @@ struct test {
     }
     if (field == "cuda" || field == "opencl" || field == "vulkan" || field == "kompute" || field == "metal" ||
         field == "gpu_blas" || field == "blas" || field == "sycl" ||field == "f16_kv" || field == "no_kv_offload" ||
-        field == "flash_attn" || field == "use_mmap" || field == "embeddings") {
+        field == "flash_attn" || field == "use_mmap" || field == "use_direct_io" || field == "embeddings") {
         return BOOL;
     }
     if (field == "avg_ts" || field == "stddev_ts") {
@@ -866,7 +886,7 @@ struct test {
         std::to_string(n_threads), ggml_type_name(type_k), ggml_type_name(type_v),
         std::to_string(n_gpu_layers), split_mode_str(split_mode),
         std::to_string(main_gpu), std::to_string(no_kv_offload), std::to_string(flash_attn),
-        tensor_split_str, std::to_string(use_mmap), std::to_string(embeddings),
+        tensor_split_str, std::to_string(use_mmap), std::to_string(use_direct_io), std::to_string(embeddings),
         std::to_string(n_prompt), std::to_string(n_gen), test_time,
         std::to_string(avg_ns()), std::to_string(stdev_ns()),
         std::to_string(avg_ts()), std::to_string(stdev_ts())
@@ -1042,6 +1062,9 @@ struct markdown_printer : public printer {
         if (field == "use_mmap") {
             return "mmap";
         }
+        if (field == "use_direct_io") {
+            return "direct_io";
+        }
         if (field == "embeddings") {
             return "embd";
         }
@@ -1094,6 +1117,9 @@ struct markdown_printer : public printer {
         if (params.use_mmap.size() > 1 || params.use_mmap != cmd_params_defaults.use_mmap) {
             fields.emplace_back("use_mmap");
         }
+        if (params.use_direct_io.size() > 1 || params.use_direct_io != cmd_params_defaults.use_direct_io) {
+            fields.emplace_back("use_direct_io");
+        }
         if (params.embeddings.size() > 1 || params.embeddings != cmd_params_defaults.embeddings) {
             fields.emplace_back("embeddings");
         }

examples/main/README.md (+4)

@@ -282,6 +282,10 @@ These options help improve the performance and memory usage of the LLaMA models.
 
 - `--no-mmap`: Do not memory-map the model. By default, models are mapped into memory, which allows the system to load only the necessary parts of the model as needed. However, if the model is larger than your total amount of RAM or if your system is low on available memory, using mmap might increase the risk of pageouts, negatively impacting performance. Disabling mmap results in slower load times but may reduce pageouts if you're not using `--mlock`. Note that if the model is larger than the total amount of RAM, turning off mmap would prevent the model from loading at all.
 
+### Direct I/O
+
+- `--direct-io`: Use direct I/O. Potentially faster uncached loading, fewer pageouts, no page cache pollution. You may benefit from this option if you load a model for the first time (or after some time), load several different models consecutively, or simply want to keep the page cache clean. The faster your storage device is, the greater the gain you can expect. The effect may be greater on Linux due to Transparent HugePage support.
+
 ### NUMA support
 
 - `--numa distribute`: Pin an equal proportion of the threads to the cores on each NUMA node. This will spread the load amongst all cores on the system, utilitizing all memory channels at the expense of potentially requiring memory to travel over the slow links between nodes.
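
Since the commit wires the new flag through to `llama_model_params`, the same behaviour is also available programmatically. Below is a minimal sketch of opting in through the C API, mirroring what `--direct-io` does in `common.cpp`; `use_direct_io` and `llama_supports_direct_io()` come from this commit, while the load/teardown function names assume the API of roughly this commit's vintage and may differ in other llama.cpp versions.

```cpp
// Minimal sketch: enable direct I/O when loading a model through the
// llama.cpp C API. Mirrors the --direct-io CLI flag; check llama.h in
// your tree for the exact load/free function names.
#include "llama.h"
#include <cstdio>

int main(int argc, char ** argv) {
    if (argc < 2) {
        fprintf(stderr, "usage: %s <model.gguf>\n", argv[0]);
        return 1;
    }

    llama_backend_init();

    llama_model_params mparams = llama_model_default_params();
    if (llama_supports_direct_io()) {
        mparams.use_direct_io = true; // bypass the page cache while loading tensors
    }

    llama_model * model = llama_load_model_from_file(argv[1], mparams);
    if (model == NULL) {
        fprintf(stderr, "failed to load model\n");
        llama_backend_free();
        return 1;
    }

    // ... create a context and run inference as usual ...

    llama_free_model(model);
    llama_backend_free();
    return 0;
}
```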

examples/server/README.md (+1)

@@ -34,6 +34,7 @@ The project is under active development, and we are [looking for feedback and co
 - `-ub N`, `--ubatch-size N`: Physical maximum batch size. Default: `512`
 - `--mlock`: Lock the model in memory, preventing it from being swapped out when memory-mapped.
 - `--no-mmap`: Do not memory-map the model. By default, models are mapped into memory, which allows the system to load only the necessary parts of the model as needed.
+- `--direct-io`: Use direct I/O. Potentially faster uncached loading, fewer pageouts, no page cache pollution.
 - `--numa STRATEGY`: Attempt one of the below optimization strategies that may help on some NUMA systems
 - `--numa distribute`: Spread execution evenly over all nodes
 - `--numa isolate`: Only spawn threads on CPUs on the node that execution started on
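
Like the other load-time switches, the flag is simply appended to the server command line, e.g. `./server -m model.gguf --direct-io` (illustrative invocation; the model path is a placeholder). It affects only how the model file is read at startup, not request handling.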

examples/server/server.cpp (+5)

@@ -2352,6 +2352,9 @@ static void server_print_usage(const char * argv0, const gpt_params & params, co
     if (llama_supports_mmap()) {
         printf(" --no-mmap do not memory-map model (slower load but may reduce pageouts if not using mlock)\n");
     }
+    if (llama_supports_direct_io()) {
+        printf(" --direct-io use direct I/O (potentially faster uncached loading, fewer pageouts, no page cache pollution)\n");
+    }
     printf(" --numa TYPE attempt optimizations that help on some NUMA systems\n");
     printf(" - distribute: spread execution evenly over all nodes\n");
     printf(" - isolate: only spawn threads on CPUs on the node that execution started on\n");
@@ -2754,6 +2757,8 @@ static void server_params_parse(int argc, char ** argv, server_params & sparams,
         params.use_mlock = true;
     } else if (arg == "--no-mmap") {
         params.use_mmap = false;
+    } else if (arg == "--direct-io") {
+        params.use_direct_io = true;
     } else if (arg == "--numa") {
         if (++i >= argc) {
             invalid_param = true;
