Skip to content

Commit f8e9f11

Browse files
committed
common : add -dkvc arg for enabling kv cache dumps
1 parent 5df7d06 commit f8e9f11

File tree

4 files changed: +14 −5 lines

common/common.cpp

+4
Original file line numberDiff line numberDiff line change
@@ -496,6 +496,8 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
             params.chatml = true;
         } else if (arg == "--infill") {
             params.infill = true;
+        } else if (arg == "-dkvc" || arg == "--dump-kv-cache") {
+            params.dump_kv_cache = true;
         } else if (arg == "--multiline-input") {
             params.multiline_input = true;
         } else if (arg == "--simple-io") {
@@ -836,6 +838,8 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
 #endif // GGML_USE_CUBLAS
 #endif
     printf("  --verbose-prompt      print prompt before generation\n");
+    printf("  -dkvc, --dump-kv-cache\n");
+    printf("                        verbose print of the KV cache\n");
     printf("  --simple-io           use basic IO for better compatibility in subprocesses and limited consoles\n");
     printf("  --lora FNAME          apply LoRA adapter (implies --no-mmap)\n");
     printf("  --lora-scaled FNAME S apply LoRA adapter with user defined scaling S (implies --no-mmap)\n");

common/common.h

+1
Original file line numberDiff line numberDiff line change
@@ -122,6 +122,7 @@ struct gpt_params {
     bool numa            = false; // attempt optimizations that help on some NUMA systems
     bool verbose_prompt  = false; // print prompt tokens before generation
     bool infill          = false; // use infill mode
+    bool dump_kv_cache   = false; // dump the KV cache contents for debugging purposes

     // multimodal models (see examples/llava)
     std::string mmproj = ""; // path to multimodal projector

examples/parallel/parallel.cpp

+6-2
Original file line numberDiff line numberDiff line change
@@ -113,6 +113,8 @@ int main(int argc, char ** argv) {
     // insert new requests as soon as the previous one is done
     const bool cont_batching = params.cont_batching;

+    const bool dump_kv_cache = params.dump_kv_cache;
+
 #ifndef LOG_DISABLE_LOGS
     log_set_target(log_filename_generator("parallel", "log"));
     LOG_TEE("Log start\n");
@@ -203,8 +205,10 @@ int main(int argc, char ** argv) {
     LOG_TEE("Processing requests ...\n\n");

     while (true) {
-        llama_kv_cache_view_update(ctx, &kvc_view);
-        dump_kv_cache_view_seqs(kvc_view, 40);
+        if (dump_kv_cache) {
+            llama_kv_cache_view_update(ctx, &kvc_view);
+            dump_kv_cache_view_seqs(kvc_view, 40);
+        }

         llama_batch_clear(batch);

llama.h

+3-3
Original file line numberDiff line numberDiff line change
@@ -400,13 +400,13 @@ extern "C" {
         llama_seq_id * cells_sequences;
     };

-    // Create an empty KV cache view.
+    // Create an empty KV cache view. (use only for debugging purposes)
     LLAMA_API struct llama_kv_cache_view llama_kv_cache_view_init(const struct llama_context * ctx, int32_t n_max_seq);

-    // Free a KV cache view.
+    // Free a KV cache view. (use only for debugging purposes)
     LLAMA_API void llama_kv_cache_view_free(struct llama_kv_cache_view * view);

-    // Update the KV cache view structure with the current state of the KV cache.
+    // Update the KV cache view structure with the current state of the KV cache. (use only for debugging purposes)
     LLAMA_API void llama_kv_cache_view_update(const struct llama_context * ctx, struct llama_kv_cache_view * view);

     // Returns the number of tokens in the KV cache (slow, use only for debug)

Comments (0)