Direct I/O and Transparent HugePages #7420

Open · wants to merge 1 commit into base: master

13 changes: 13 additions & 0 deletions common/common.cpp
@@ -1638,6 +1638,17 @@ std::vector<llama_arg> gpt_params_parser_init(gpt_params & params, llama_example
params.use_mmap = false;
}
));
add_opt(llama_arg(
{"--direct-io"},
"use direct I/O (potentially faster uncached loading, fewer pageouts, no page cache pollution)",
[](gpt_params & params) {
if (llama_supports_direct_io()) {
params.use_direct_io = true;
} else {
fprintf(stderr, "warning: direct I/O is not available, --direct-io option will be ignored\n");
}
}
));
add_opt(llama_arg(
{"--numa"}, "TYPE",
"attempt optimizations that help on some NUMA systems\n"
@@ -2742,6 +2753,7 @@ struct llama_model_params llama_model_params_from_gpt_params(const gpt_params &
mparams.split_mode = params.split_mode;
mparams.tensor_split = params.tensor_split;
mparams.use_mmap = params.use_mmap;
mparams.use_direct_io = params.use_direct_io;
mparams.use_mlock = params.use_mlock;
mparams.check_tensors = params.check_tensors;
if (params.kv_overrides.empty()) {
@@ -3780,6 +3792,7 @@ void yaml_dump_non_result_info(FILE * stream, const gpt_params & params, const l
fprintf(stream, "n_predict: %d # default: -1 (unlimited)\n", params.n_predict);
fprintf(stream, "n_probs: %d # only used by server binary, default: 0\n", sparams.n_probs);
fprintf(stream, "no_mmap: %s # default: false\n", !params.use_mmap ? "true" : "false");
fprintf(stream, "direct-io: %s # default: false\n", params.use_direct_io ? "true" : "false");
fprintf(stream, "penalize_nl: %s # default: false\n", sparams.penalize_nl ? "true" : "false");
fprintf(stream, "ppl_output_type: %d # default: 0\n", params.ppl_output_type);
fprintf(stream, "ppl_stride: %d # default: 0\n", params.ppl_stride);
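The option handler above only enables `use_direct_io` when `llama_supports_direct_io()` reports support; the declaration is added to `include/llama.h` further down, but its implementation is not part of this excerpt. As a rough, hypothetical sketch of what such a platform probe could look like (an assumption, not the PR's actual code), assuming availability is decided purely by whether the OS offers an uncached-read primitive:

```cpp
// Hypothetical sketch only -- not the implementation from this PR.
// A capability probe like llama_supports_direct_io() could simply report
// whether the platform has any uncached-read primitive at all.
bool direct_io_supported_sketch() {
#if defined(__linux__)
    return true;   // open(..., O_DIRECT)
#elif defined(__APPLE__)
    return true;   // fcntl(fd, F_NOCACHE, 1)
#elif defined(_WIN32)
    return true;   // CreateFile(..., FILE_FLAG_NO_BUFFERING, ...)
#else
    return false;  // unknown platform: fall back to buffered reads / mmap
#endif
}
```
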
1 change: 1 addition & 0 deletions common/common.h
@@ -208,6 +208,7 @@ struct gpt_params {
bool input_prefix_bos = false; // prefix BOS to user inputs, preceding input_prefix
bool logits_all = false; // return logits for all tokens in the batch
bool use_mmap = true; // use mmap for faster loads
bool use_direct_io = false; // use direct I/O
bool use_mlock = false; // use mlock to keep model in memory
bool verbose_prompt = false; // print prompt tokens before generation
bool display_prompt = true; // print prompt before generation
1 change: 1 addition & 0 deletions examples/llama-bench/README.md
@@ -43,6 +43,7 @@ options:
-nkvo, --no-kv-offload <0|1> (default: 0)
-fa, --flash-attn <0|1> (default: 0)
-mmp, --mmap <0|1> (default: 1)
-dio, --direct-io <0|1> (default: 0)
--numa <distribute|isolate|numactl> (default: disabled)
-embd, --embeddings <0|1> (default: 0)
-ts, --tensor-split <ts0/ts1/..> (default: 0)
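The `-dio` flag follows llama-bench's usual convention: each option takes a comma-separated list of values, and the benchmark runs every combination (the nested loops in `get_cmd_params_instances` below include `use_direct_io`). For example, `llama-bench -m model.gguf -mmp 0,1 -dio 0,1` (with `model.gguf` as a placeholder path) would time all four mmap/direct-I/O combinations side by side.
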
32 changes: 29 additions & 3 deletions examples/llama-bench/llama-bench.cpp
@@ -243,6 +243,7 @@ struct cmd_params {
std::vector<bool> flash_attn;
std::vector<std::vector<float>> tensor_split;
std::vector<bool> use_mmap;
std::vector<bool> use_direct_io;
std::vector<bool> embeddings;
ggml_numa_strategy numa;
int reps;
@@ -275,6 +276,7 @@ static const cmd_params cmd_params_defaults = {
/* flash_attn */ {false},
/* tensor_split */ {std::vector<float>(llama_max_devices(), 0.0f)},
/* use_mmap */ {true},
/* use_direct_io */ {false},
/* embeddings */ {false},
/* numa */ GGML_NUMA_STRATEGY_DISABLED,
/* reps */ 5,
@@ -312,6 +314,7 @@ static void print_usage(int /* argc */, char ** argv) {
printf(" -nkvo, --no-kv-offload <0|1> (default: %s)\n", join(cmd_params_defaults.no_kv_offload, ",").c_str());
printf(" -fa, --flash-attn <0|1> (default: %s)\n", join(cmd_params_defaults.flash_attn, ",").c_str());
printf(" -mmp, --mmap <0|1> (default: %s)\n", join(cmd_params_defaults.use_mmap, ",").c_str());
printf(" -dio, --direct-io <0|1> (default: %s)\n", join(cmd_params_defaults.use_direct_io, ",").c_str());
printf(" --numa <distribute|isolate|numactl> (default: disabled)\n");
printf(" -embd, --embeddings <0|1> (default: %s)\n", join(cmd_params_defaults.embeddings, ",").c_str());
printf(" -ts, --tensor-split <ts0/ts1/..> (default: 0)\n");
@@ -559,6 +562,13 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
}
auto p = string_split<bool>(argv[i], split_delim);
params.use_mmap.insert(params.use_mmap.end(), p.begin(), p.end());
} else if (arg == "-dio" || arg == "--direct-io") {
if (++i >= argc) {
invalid_param = true;
break;
}
auto p = string_split<bool>(argv[i], split_delim);
params.use_direct_io.insert(params.use_direct_io.end(), p.begin(), p.end());
} else if (arg == "-embd" || arg == "--embeddings") {
if (++i >= argc) {
invalid_param = true;
@@ -650,6 +660,7 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
if (params.flash_attn.empty()) { params.flash_attn = cmd_params_defaults.flash_attn; }
if (params.tensor_split.empty()) { params.tensor_split = cmd_params_defaults.tensor_split; }
if (params.use_mmap.empty()) { params.use_mmap = cmd_params_defaults.use_mmap; }
if (params.use_direct_io.empty()){ params.use_direct_io = cmd_params_defaults.use_direct_io; }
if (params.embeddings.empty()) { params.embeddings = cmd_params_defaults.embeddings; }
if (params.n_threads.empty()) { params.n_threads = cmd_params_defaults.n_threads; }
if (params.cpu_mask.empty()) { params.cpu_mask = cmd_params_defaults.cpu_mask; }
@@ -679,6 +690,7 @@ struct cmd_params_instance {
bool flash_attn;
std::vector<float> tensor_split;
bool use_mmap;
bool use_direct_io;
bool embeddings;

llama_model_params to_llama_mparams() const {
@@ -692,6 +704,7 @@
mparams.main_gpu = main_gpu;
mparams.tensor_split = tensor_split.data();
mparams.use_mmap = use_mmap;
mparams.use_direct_io = use_direct_io;

return mparams;
}
@@ -703,6 +716,7 @@
split_mode == other.split_mode &&
main_gpu == other.main_gpu &&
use_mmap == other.use_mmap &&
use_direct_io == other.use_direct_io &&
tensor_split == other.tensor_split;
}

@@ -733,6 +747,7 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
for (const auto & mg : params.main_gpu)
for (const auto & ts : params.tensor_split)
for (const auto & mmp : params.use_mmap)
for (const auto & dio : params.use_direct_io)
for (const auto & embd : params.embeddings)
for (const auto & nb : params.n_batch)
for (const auto & nub : params.n_ubatch)
@@ -768,6 +783,7 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
/* .flash_attn = */ fa,
/* .tensor_split = */ ts,
/* .use_mmap = */ mmp,
/* .use_direct_io= */ dio,
/* .embeddings = */ embd,
};
instances.push_back(instance);
@@ -797,6 +813,7 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
/* .flash_attn = */ fa,
/* .tensor_split = */ ts,
/* .use_mmap = */ mmp,
/* .use_direct_io= */ dio,
/* .embeddings = */ embd,
};
instances.push_back(instance);
@@ -826,6 +843,7 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
/* .flash_attn = */ fa,
/* .tensor_split = */ ts,
/* .use_mmap = */ mmp,
/* .use_direct_io= */ dio,
/* .embeddings = */ embd,
};
instances.push_back(instance);
@@ -867,6 +885,7 @@ struct test {
bool flash_attn;
std::vector<float> tensor_split;
bool use_mmap;
bool use_direct_io;
bool embeddings;
int n_prompt;
int n_gen;
@@ -896,6 +915,7 @@ struct test {
flash_attn = inst.flash_attn;
tensor_split = inst.tensor_split;
use_mmap = inst.use_mmap;
use_direct_io = inst.use_direct_io;
embeddings = inst.embeddings;
n_prompt = inst.n_prompt;
n_gen = inst.n_gen;
@@ -967,7 +987,7 @@ struct test {
"type_k", "type_v",
"n_gpu_layers", "split_mode",
"main_gpu", "no_kv_offload", "flash_attn",
"tensor_split", "use_mmap", "embeddings",
"tensor_split", "use_mmap", "use_direct_io", "embeddings",
"n_prompt", "n_gen", "test_time",
"avg_ns", "stddev_ns",
"avg_ts", "stddev_ts",
@@ -989,7 +1009,7 @@
if (field == "cuda" || field == "vulkan" || field == "kompute" || field == "metal" ||
field == "gpu_blas" || field == "blas" || field == "sycl" ||field == "f16_kv" || field == "no_kv_offload" ||
field == "cpu_strict" ||
field == "flash_attn" || field == "use_mmap" || field == "embeddings") {
field == "flash_attn" || field == "use_mmap" || field == "use_direct_io" || field == "embeddings") {
return BOOL;
}
if (field == "avg_ts" || field == "stddev_ts") {
@@ -1025,7 +1045,7 @@ struct test {
ggml_type_name(type_k), ggml_type_name(type_v),
std::to_string(n_gpu_layers), split_mode_str(split_mode),
std::to_string(main_gpu), std::to_string(no_kv_offload), std::to_string(flash_attn),
tensor_split_str, std::to_string(use_mmap), std::to_string(embeddings),
tensor_split_str, std::to_string(use_mmap), std::to_string(use_direct_io), std::to_string(embeddings),
std::to_string(n_prompt), std::to_string(n_gen), test_time,
std::to_string(avg_ns()), std::to_string(stdev_ns()),
std::to_string(avg_ts()), std::to_string(stdev_ts())
@@ -1241,6 +1261,9 @@ struct markdown_printer : public printer {
if (field == "use_mmap") {
return "mmap";
}
if (field == "use_direct_io") {
return "direct_io";
}
if (field == "embeddings") {
return "embd";
}
@@ -1302,6 +1325,9 @@ struct markdown_printer : public printer {
if (params.use_mmap.size() > 1 || params.use_mmap != cmd_params_defaults.use_mmap) {
fields.emplace_back("use_mmap");
}
if (params.use_direct_io.size() > 1 || params.use_direct_io != cmd_params_defaults.use_direct_io) {
fields.emplace_back("use_direct_io");
}
if (params.embeddings.size() > 1 || params.embeddings != cmd_params_defaults.embeddings) {
fields.emplace_back("embeddings");
}
4 changes: 4 additions & 0 deletions examples/main/README.md
@@ -274,6 +274,10 @@ These options help improve the performance and memory usage of the LLaMA models.

- `--no-mmap`: Do not memory-map the model. By default, models are mapped into memory, which allows the system to load only the necessary parts of the model as needed. However, if the model is larger than your total amount of RAM or if your system is low on available memory, using mmap might increase the risk of pageouts, negatively impacting performance. Disabling mmap results in slower load times but may reduce pageouts if you're not using `--mlock`. Note that if the model is larger than the total amount of RAM, turning off mmap would prevent the model from loading at all.

### Direct I/O

- `--direct-io`: Use direct I/O. Potentially faster uncached loading, fewer pageouts, no page cache pollution. You may benefit from this option if you load a model for the first time (or after some time), load several different models consecutively, or simply want to keep the page cache clean. The faster your storage device is, the greater the gain you can expect. The effect may be greater on Linux due to Transparent HugePage support.
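
For readers unfamiliar with the mechanism, here is a standalone illustration (not code from this PR) of what uncached loading means on Linux and why Transparent HugePages can help: the file is opened with `O_DIRECT` so reads bypass the page cache, and the destination buffer is allocated 2 MiB-aligned and advised for THP backing. The path `model.gguf` and the sizes are placeholders.

```cpp
// Illustration only -- not code from this PR.
#ifndef _GNU_SOURCE
#define _GNU_SOURCE            // exposes O_DIRECT
#endif
#include <fcntl.h>
#include <sys/mman.h>
#include <unistd.h>
#include <cstdio>
#include <cstdlib>

int main() {
    const size_t alignment = 2 * 1024 * 1024;   // 2 MiB: huge-page sized, satisfies O_DIRECT block alignment
    const size_t chunk     = 16 * 1024 * 1024;  // lengths and offsets must also stay block-aligned

    int fd = open("model.gguf", O_RDONLY | O_DIRECT);   // placeholder file name
    if (fd < 0) { perror("open"); return 1; }

    void * buf = nullptr;
    if (posix_memalign(&buf, alignment, chunk) != 0) { close(fd); return 1; }
    madvise(buf, chunk, MADV_HUGEPAGE);          // best-effort request for THP backing

    ssize_t n = pread(fd, buf, chunk, 0);        // data goes straight to buf, page cache untouched
    if (n < 0) { perror("pread"); } else { printf("read %zd bytes uncached\n", n); }

    free(buf);
    close(fd);
    return 0;
}
```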

### NUMA support

- `--numa distribute`: Pin an equal proportion of the threads to the cores on each NUMA node. This will spread the load amongst all cores on the system, utilizing all memory channels at the expense of potentially requiring memory to travel over the slow links between nodes.
1 change: 1 addition & 0 deletions examples/server/README.md
@@ -97,6 +97,7 @@ The project is under active development, and we are [looking for feedback and co
| `-nocb, --no-cont-batching` | disable continuous batching<br/>(env: LLAMA_ARG_NO_CONT_BATCHING) |
| `--mlock` | force system to keep model in RAM rather than swapping or compressing |
| `--no-mmap` | do not memory-map model (slower load but may reduce pageouts if not using mlock) |
| `--direct-io` | use direct I/O (potentially faster uncached loading, fewer pageouts, no page cache pollution) |
| `--numa TYPE` | attempt optimizations that help on some NUMA systems<br/>- distribute: spread execution evenly over all nodes<br/>- isolate: only spawn threads on CPUs on the node that execution started on<br/>- numactl: use the CPU map provided by numactl<br/>if run without this previously, it is recommended to drop the system page cache before using this<br/>see https://github.com/ggerganov/llama.cpp/issues/1437 |
| `-ngl, --gpu-layers N` | number of layers to store in VRAM<br/>(env: LLAMA_ARG_N_GPU_LAYERS) |
| `-sm, --split-mode {none,layer,row}` | how to split the model across multiple GPUs, one of:<br/>- none: use one GPU only<br/>- layer (default): split layers and KV across GPUs<br/>- row: split rows across GPUs |
2 changes: 2 additions & 0 deletions include/llama.h
@@ -303,6 +303,7 @@ extern "C" {
// Keep the booleans together to avoid misalignment during copy-by-value.
bool vocab_only; // only load the vocabulary, no weights
bool use_mmap; // use mmap if possible
bool use_direct_io; // use direct I/O if possible
bool use_mlock; // force system to keep model in RAM
bool check_tensors; // validate model tensor data
};
@@ -429,6 +430,7 @@ extern "C" {
LLAMA_API size_t llama_max_devices(void);

LLAMA_API bool llama_supports_mmap (void);
LLAMA_API bool llama_supports_direct_io (void);
LLAMA_API bool llama_supports_mlock (void);
LLAMA_API bool llama_supports_gpu_offload(void);

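Taken together, the new struct field and capability check let API consumers opt in the same way the CLI does. A minimal sketch (hypothetical, not part of this PR) assuming the existing entry points `llama_backend_init`, `llama_model_default_params` and `llama_load_model_from_file`:

```cpp
#include "llama.h"
#include <cstdio>

int main(int argc, char ** argv) {
    if (argc < 2) {
        fprintf(stderr, "usage: %s <model.gguf>\n", argv[0]);
        return 1;
    }

    llama_backend_init();

    llama_model_params mparams = llama_model_default_params();
    if (llama_supports_direct_io()) {
        mparams.use_direct_io = true;   // new field introduced by this PR
    }

    llama_model * model = llama_load_model_from_file(argv[1], mparams);
    if (model == NULL) {
        fprintf(stderr, "failed to load model\n");
        llama_backend_free();
        return 1;
    }

    // ... create a context and run inference as usual ...

    llama_free_model(model);
    llama_backend_free();
    return 0;
}
```
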
4 changes: 2 additions & 2 deletions scripts/run-with-preset.py
@@ -12,7 +12,7 @@

CLI_ARGS_LLAMA_CLI_PERPLEXITY = [
"batch-size", "cfg-negative-prompt", "cfg-scale", "chunks", "color", "ctx-size", "escape",
"export", "file", "frequency-penalty", "grammar", "grammar-file", "hellaswag",
"direct-io", "export", "file", "frequency-penalty", "grammar", "grammar-file", "hellaswag",
"hellaswag-tasks", "ignore-eos", "in-prefix", "in-prefix-bos", "in-suffix",
"interactive", "interactive-first", "keep", "logdir", "logit-bias", "lora", "lora-base",
"low-vram", "main-gpu", "memory-f32", "mirostat", "mirostat-ent", "mirostat-lr", "mlock",
@@ -30,7 +30,7 @@
]

CLI_ARGS_LLAMA_SERVER = [
"alias", "batch-size", "ctx-size", "embedding", "host", "memory-f32", "lora", "lora-base",
"alias", "batch-size", "ctx-size", "direct-io", "embedding", "host", "memory-f32", "lora", "lora-base",
"low-vram", "main-gpu", "mlock", "model", "n-gpu-layers", "n-probs", "no-mmap", "no-mul-mat-q",
"numa", "path", "port", "rope-freq-base", "timeout", "rope-freq-scale", "tensor-split",
"threads", "verbose"