
Commit 3711c43

cebtenzzre authored and pkrmf committed
llama : allow gguf RoPE keys to be overridden with defaults (ggml-org#3240)
1 parent 3f4d2b2 commit 3711c43

3 files changed: +29 −41 lines

common/common.cpp

+3 −3
@@ -647,9 +647,9 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
     printf("  --cfg-negative-prompt-file FNAME\n");
     printf("                        negative prompt file to use for guidance. (default: empty)\n");
     printf("  --cfg-scale N         strength of guidance (default: %f, 1.0 = disable)\n", params.cfg_scale);
-    printf("  --rope-scale N        RoPE context linear scaling factor, inverse of --rope-freq-scale (default: %g)\n", 1.0f/params.rope_freq_scale);
-    printf("  --rope-freq-base N    RoPE base frequency, used by NTK-aware scaling (default: %.1f)\n", params.rope_freq_base);
-    printf("  --rope-freq-scale N   RoPE frequency linear scaling factor, inverse of --rope-scale (default: %g)\n", params.rope_freq_scale);
+    printf("  --rope-scale N        RoPE context linear scaling factor, inverse of --rope-freq-scale\n");
+    printf("  --rope-freq-base N    RoPE base frequency, used by NTK-aware scaling (default: loaded from model)\n");
+    printf("  --rope-freq-scale N   RoPE frequency linear scaling factor (default: loaded from model)\n");
     printf("  --ignore-eos          ignore end of stream token and continue generating (implies --logit-bias 2-inf)\n");
     printf("  --no-penalize-nl      do not penalize newline token\n");
     printf("  --memory-f32          use f32 instead of f16 for memory key+value (default: disabled)\n");

examples/server/server.cpp

+2 −2
@@ -701,8 +701,8 @@ static void server_print_usage(const char *argv0, const gpt_params &params,
     printf("  -v, --verbose         verbose output (default: %s)\n", server_verbose ? "enabled" : "disabled");
     printf("  -t N, --threads N     number of threads to use during computation (default: %d)\n", params.n_threads);
     printf("  -c N, --ctx-size N    size of the prompt context (default: %d)\n", params.n_ctx);
-    printf("  --rope-freq-base N    RoPE base frequency (default: %.1f)\n", params.rope_freq_base);
-    printf("  --rope-freq-scale N   RoPE frequency scaling factor (default: %g)\n", params.rope_freq_scale);
+    printf("  --rope-freq-base N    RoPE base frequency (default: loaded from model)\n");
+    printf("  --rope-freq-scale N   RoPE frequency scaling factor (default: loaded from model)\n");
     printf("  -b N, --batch-size N  batch size for prompt processing (default: %d)\n", params.n_batch);
     printf("  --memory-f32          use f32 instead of f16 for memory key+value (default: disabled)\n");
     printf("                        not recommended: doubles context memory required and no measurable increase in quality\n");

llama.cpp

+24 −36
@@ -929,23 +929,22 @@ static const size_t kB = 1024;
 static const size_t MB = kB*kB;
 static const size_t GB = kB*kB*kB;
 
-// default hparams (LLaMA 7B)
 struct llama_hparams {
-    uint32_t n_vocab     = 32000;
-    uint32_t n_ctx_train = 2048;  // the context size used during training
-    uint32_t n_ctx       = 512;   // the context size used during inference
-    uint32_t n_embd      = 4096;
-    uint32_t n_head      = 32;
-    uint32_t n_head_kv   = 32;
-    uint32_t n_layer     = 32;
-    uint32_t n_rot       = 64;
-    uint32_t n_ff        = 11008;
-
-    float f_norm_eps     = 1e-5;
-    float f_norm_rms_eps = 1e-5;
-
-    float rope_freq_base  = 10000.0f;
-    float rope_freq_scale = 1.0f;
+    uint32_t n_vocab;
+    uint32_t n_ctx_train; // context size the model was trained on
+    uint32_t n_ctx;       // context size used during inference
+    uint32_t n_embd;
+    uint32_t n_head;
+    uint32_t n_head_kv;
+    uint32_t n_layer;
+    uint32_t n_rot;
+    uint32_t n_ff;
+
+    float f_norm_eps;
+    float f_norm_rms_eps;
+
+    float rope_freq_base;
+    float rope_freq_scale;
 
     bool operator!=(const llama_hparams & other) const {
         return static_cast<bool>(memcmp(this, &other, sizeof(llama_hparams))); // NOLINT
@@ -1076,7 +1075,7 @@ struct llama_model {
 
     std::string name = "n/a";
 
-    llama_hparams hparams;
+    llama_hparams hparams = {};
     llama_vocab vocab;
 
     struct ggml_tensor * tok_embeddings;
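
The hparams = {} change pairs with the removal of the in-class initializers above: llama_hparams is now an aggregate of plain scalars, so a default-initialized member would hold indeterminate values, which also matters because operator!= compares the struct bytewise with memcmp. A small standalone illustration of the distinction (not from the diff):

#include <cstdint>

// Illustrative only: an aggregate of plain scalars with no in-class
// initializers, like llama_hparams after this commit.
struct demo_hparams {
    uint32_t n_vocab;
    float    rope_freq_base;
};

int main() {
    demo_hparams a;      // default-initialized: members hold indeterminate values
    demo_hparams b = {}; // value-initialized: every member is zeroed
    (void)a; (void)b;
    return 0;
}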
@@ -1674,28 +1673,17 @@ static void llm_load_hparams(
     hparams.n_head_kv = hparams.n_head;
     GGUF_GET_KEY(ctx, hparams.n_head_kv, gguf_get_val_u32, GGUF_TYPE_UINT32, false, kv(LLM_KV_ATTENTION_HEAD_COUNT_KV));
 
-    // TODO: manually setting rope freq base and scale should override this
-    // FIXME: partial fix when the param specified is not the default value, but
-    //        will not work for overriding the model value to the params default
-
-    llama_context_params defaults = llama_context_default_params();
-
-    // rope_freq_base
-    {
-        float ropebase = 10000.0f;
-        GGUF_GET_KEY(ctx, ropebase, gguf_get_val_f32, GGUF_TYPE_FLOAT32, false, kv(LLM_KV_ROPE_FREQ_BASE));
-        if (ropebase != 10000.0f && rope_freq_base == defaults.rope_freq_base) {
-            rope_freq_base = ropebase;
-        }
+    // rope_freq_base (optional)
+    if (rope_freq_base == 0.0f) {
+        rope_freq_base = 10000.0f;
+        GGUF_GET_KEY(ctx, rope_freq_base, gguf_get_val_f32, GGUF_TYPE_FLOAT32, false, kv(LLM_KV_ROPE_FREQ_BASE));
     }
 
     // rope_freq_scale (inverse of the kv) is optional
-    {
+    if (rope_freq_scale == 0.0f) {
         float ropescale = 1.0f;
         GGUF_GET_KEY(ctx, ropescale, gguf_get_val_f32, GGUF_TYPE_FLOAT32, false, kv(LLM_KV_ROPE_SCALE_LINEAR));
-        if (ropescale != 1.0f && rope_freq_scale == defaults.rope_freq_scale) {
-            rope_freq_scale = 1.0f/ropescale;
-        }
+        rope_freq_scale = 1.0f/ropescale;
     }
 
     // sanity check for n_rot (optional)
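
The net effect of this hunk is a three-level precedence for each RoPE parameter, with 0.0f acting as the "unset" sentinel. A condensed sketch of that logic (hypothetical free function, not from the diff); rope_freq_scale follows the same pattern, except that the GGUF key LLM_KV_ROPE_SCALE_LINEAR is stored as its inverse:

// Precedence established by this commit:
// explicit user value > GGUF metadata key > historical hard-coded default.
static float resolve_rope_freq_base(float user_value, bool gguf_has_key, float gguf_value) {
    if (user_value != 0.0f) {
        return user_value; // a non-zero value from the CLI/API always wins
    }
    if (gguf_has_key) {
        return gguf_value; // otherwise take LLM_KV_ROPE_FREQ_BASE from the model
    }
    return 10000.0f;       // fall back to the old default when the key is absent
}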
@@ -6188,8 +6176,8 @@ struct llama_context_params llama_context_default_params() {
     /*.n_gpu_layers                =*/ 0,
     /*.main_gpu                    =*/ 0,
     /*.tensor_split                =*/ nullptr,
-    /*.rope_freq_base              =*/ 10000.0f,
-    /*.rope_freq_scale             =*/ 1.0f,
+    /*.rope_freq_base              =*/ 0.0f,
+    /*.rope_freq_scale             =*/ 0.0f,
     /*.progress_callback           =*/ nullptr,
     /*.progress_callback_user_data =*/ nullptr,
     /*.low_vram                    =*/ false,
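
With the defaults changed to 0.0f, callers that never touch these fields now inherit the model's RoPE values automatically, while any non-zero value still overrides the model. A brief usage sketch (assumes the llama.h API as of this commit):

#include "llama.h"

// Sketch: how a caller interacts with the new 0.0f sentinels.
llama_context_params make_params() {
    llama_context_params p = llama_context_default_params();
    // p.rope_freq_base stays 0.0f -> loaded from the GGUF (or 10000.0f if absent)
    p.rope_freq_scale = 0.5f; // non-zero -> overrides the model, e.g. 2x linear context scaling
    return p;
}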
