server: docs: --no-mul-mat-q,-nommq

phymbert · phymbert · commit b4b0d53355a6 · 2024-02-28T18:02:55.000+01:00
diff --git a/examples/server/README.md b/examples/server/README.md
@@ -27,6 +27,7 @@ The project is under active development, and we are [looking for feedback and co
 - `-b N`, `--batch-size N`: Set the batch size for prompt processing. Default: `512`.
 - `--memory-f32`: Use 32-bit floats instead of 16-bit floats for memory key+value. Not recommended.
 - `--mlock`: Lock the model in memory, preventing it from being swapped out when memory-mapped.
+- `--no-mul-mat-q`, `-nommq`: Disable mul_mat_q kernels.
 - `--no-mmap`: Do not memory-map the model. By default, models are mapped into memory, which allows the system to load only the necessary parts of the model as needed.
 - `--numa STRATEGY`: Attempt one of the below optimization strategies  that help on some NUMA systems
 - `--numa distribute`: Spread execution evenly over all nodes
diff --git a/examples/server/server.cpp b/examples/server/server.cpp
@@ -2085,6 +2085,7 @@ static void server_print_usage(const char *argv0, const gpt_params &params,
     {
         printf("  --no-mmap                 do not memory-map model (slower load but may reduce pageouts if not using mlock)\n");
     }
+    printf("  --no-mul-mat-q,-nommq     Disable mul_mat_q kernels\n");
     printf("  --numa TYPE               attempt optimizations that help on some NUMA systems\n");
     printf("                              - distribute: spread execution evenly over all nodes\n");
     printf("                              - isolate: only spawn threads on CPUs on the node that execution started on\n");

Original file line number	Diff line number	Diff line change
`@@ -2085,6 +2085,7 @@ static void server_print_usage(const char *argv0, const gpt_params &params,`
`2085`	`2085`	`{`
`2086`	`2086`	`printf(" --no-mmap do not memory-map model (slower load but may reduce pageouts if not using mlock)\n");`
`2087`	`2087`	`}`
	`2088`	`+ printf(" --no-mul-mat-q,-nommq Disable mul_mat_q kernels\n");`
`2088`	`2089`	`printf(" --numa TYPE attempt optimizations that help on some NUMA systems\n");`
`2089`	`2090`	`printf(" - distribute: spread execution evenly over all nodes\n");`
`2090`	`2091`	`printf(" - isolate: only spawn threads on CPUs on the node that execution started on\n");`