
Commit e17c849

switched to NTK aware scaling

1 parent e19483c

File tree

4 files changed: +26 -25 lines changed


ggml-cuda.cu (+2 -2)

@@ -2223,10 +2223,10 @@ inline void ggml_cuda_op_rope(
     const int n_ctx = ((int32_t *) src1->data)[3];
     GGML_ASSERT(mode == 0);
 
-    const float theta_scale = powf(10000.0, -2.0f/n_dims);
+    const float theta_scale = get_theta_scale(n_dims,n_past,n_ctx);
     const float p0 = ((mode & 1) == 0 ? n_past + i02 : i02);
 
-    const float p = n_ctx <= GGML_TRAINING_CTX ? p0 : p0 * GGML_TRAINING_CTX / n_ctx;
+    const float p = p0;
 
     // compute
     rope_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, p, theta_scale, cudaStream_main);

ggml.c (+21 -16)

@@ -4242,6 +4242,22 @@ static inline int ggml_up(int n, int m) {
 #define ggml_assert_aligned(ptr) \
     GGML_ASSERT(((uintptr_t) (ptr))%GGML_MEM_ALIGN == 0)
 
+float get_theta_scale(int n_dims,int n_past,int n_ctx)
+{
+    if(n_ctx<=2048) //normie mode
+    {
+        return powf(10000.0, -2.0f/n_dims);
+    }
+    else
+    {
+        //using scaled NTK aware ctx
+        float a = (n_ctx<=4096?4.0:8.0);
+        float m = powf(a, n_dims / (n_dims - 2.0));
+        float s = powf(10000.0 * m, -2.0f/n_dims);
+        return s;
+    }
+}
+
 ////////////////////////////////////////////////////////////////////////////////
 
 struct ggml_context * ggml_init(struct ggml_init_params params) {
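For reference, the new helper implements the NTK-aware trick: instead of compressing positions, it inflates the RoPE frequency base from 10000 to 10000 * a^(n_dims / (n_dims - 2)), using a = 4 for contexts up to 4096 and a = 8 above that, and derives theta_scale from that larger base. Below is a minimal standalone sketch, not part of the diff; n_dims = 128 is just an illustrative head size, and n_past is dropped here since the committed function does not use it either.

#include <math.h>
#include <stdio.h>

/* Sketch of the committed get_theta_scale(): a larger n_ctx gives a larger
 * effective base, so theta_scale moves closer to 1 and the higher dimension
 * pairs rotate more slowly, covering longer position ranges. */
static float theta_scale_sketch(int n_dims, int n_ctx) {
    if (n_ctx <= 2048) {
        return powf(10000.0f, -2.0f / n_dims);      /* original, unscaled base */
    }
    float a = (n_ctx <= 4096 ? 4.0f : 8.0f);        /* scale factor per context bucket */
    float m = powf(a, n_dims / (n_dims - 2.0f));    /* base multiplier */
    return powf(10000.0f * m, -2.0f / n_dims);      /* theta_scale from the inflated base */
}

int main(void) {
    const int n_dims = 128;                         /* e.g. a LLaMA head dimension */
    const int ctxs[] = { 2048, 4096, 8192 };
    for (int i = 0; i < 3; ++i) {
        printf("n_ctx=%d -> theta_scale=%f\n", ctxs[i], theta_scale_sketch(n_dims, ctxs[i]));
    }
    return 0;
}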
@@ -12531,7 +12547,7 @@ static void ggml_compute_forward_rope_f32(
     // row index used to determine which thread to use
     int ir = 0;
 
-    const float theta_scale = powf(10000.0, -2.0f/n_dims);
+    const float theta_scale = get_theta_scale(n_dims,n_past,n_ctx);
 
     const bool is_neox = mode & 2;
     const bool is_glm = mode & 4;
@@ -12571,9 +12587,7 @@ static void ggml_compute_forward_rope_f32(
                     dst_data[n_dims/2*3] = x2*sin_block_theta + x3*cos_block_theta;
                 }
             } else if (!is_neox) {
-                if (n_ctx > GGML_TRAINING_CTX) {
-                    theta = theta * GGML_TRAINING_CTX / n_ctx;
-                }
+
                 for (int64_t i0 = 0; i0 < ne0; i0 += 2) {
                     const float cos_theta = cosf(theta);
                     const float sin_theta = sinf(theta);
@@ -12674,7 +12688,7 @@ static void ggml_compute_forward_rope_f16(
     // row index used to determine which thread to use
     int ir = 0;
 
-    const float theta_scale = powf(10000.0, -2.0f/n_dims);
+    const float theta_scale = get_theta_scale(n_dims,n_past,n_ctx);
 
     const bool is_neox = mode & 2;
     const bool is_glm = mode & 4;
@@ -12714,9 +12728,6 @@ static void ggml_compute_forward_rope_f16(
                     dst_data[n_dims/2*3] = GGML_FP32_TO_FP16(x2*sin_block_theta + x3*cos_block_theta);
                 }
             } if (!is_neox) {
-                if (n_ctx > GGML_TRAINING_CTX) {
-                    theta = theta * GGML_TRAINING_CTX / n_ctx;
-                }
                 for (int64_t i0 = 0; i0 < ne0; i0 += 2) {
                     const float cos_theta = cosf(theta);
                     const float sin_theta = sinf(theta);
@@ -12842,7 +12853,7 @@ static void ggml_compute_forward_rope_back_f32(
     // row index used to determine which thread to use
     int ir = 0;
 
-    const float theta_scale = powf(10000.0, -2.0f/n_dims);
+    const float theta_scale = get_theta_scale(n_dims,n_past,n_ctx);
 
     const bool is_neox = mode & 2;
 
@@ -12856,9 +12867,6 @@ static void ggml_compute_forward_rope_back_f32(
             float theta = (float)p;
 
             if (!is_neox) {
-                if (n_ctx > GGML_TRAINING_CTX) {
-                    theta = theta * GGML_TRAINING_CTX / n_ctx;
-                }
                 for (int64_t i0 = 0; i0 < ne0; i0 += 2) {
                     const float cos_theta = cosf(theta);
                     const float sin_theta = sinf(theta);
@@ -12959,7 +12967,7 @@ static void ggml_compute_forward_rope_back_f16(
     // row index used to determine which thread to use
     int ir = 0;
 
-    const float theta_scale = powf(10000.0, -2.0f/n_dims);
+    const float theta_scale = get_theta_scale(n_dims,n_past,n_ctx);
 
     const bool is_neox = mode & 2;
 
@@ -12973,9 +12981,6 @@ static void ggml_compute_forward_rope_back_f16(
            float theta = (float)p;
 
            if (!is_neox) {
-                if (n_ctx > GGML_TRAINING_CTX) {
-                    theta = theta * GGML_TRAINING_CTX / n_ctx;
-                }
                for (int64_t i0 = 0; i0 < ne0; i0 += 2) {
                    const float cos_theta = cosf(theta);
                    const float sin_theta = sinf(theta);
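The four rope kernels in ggml.c (forward and backward, f32 and f16), like the CUDA path above, now take theta_scale from get_theta_scale() and no longer rescale theta by GGML_TRAINING_CTX / n_ctx. Roughly, the factor is consumed once per dimension pair, as in this hypothetical, simplified helper; it follows the cos/sin pair pattern visible in the hunks, while the real kernels also handle the neox and glm modes and tensor strides.

#include <math.h>
#include <stdint.h>

/* Hypothetical helper, not in the diff: rotates one row using the commit's
 * get_theta_scale(). theta starts at the raw token position (no training-ctx
 * rescaling anymore) and is multiplied by theta_scale per dimension pair. */
extern float get_theta_scale(int n_dims, int n_past, int n_ctx);

static void rope_row_sketch(float *dst, const float *src, int64_t ne0,
                            int n_dims, int pos, int n_past, int n_ctx) {
    const float theta_scale = get_theta_scale(n_dims, n_past, n_ctx);
    float theta = (float) pos;
    for (int64_t i0 = 0; i0 < ne0; i0 += 2) {
        const float cos_theta = cosf(theta);
        const float sin_theta = sinf(theta);
        const float x0 = src[i0];
        const float x1 = src[i0 + 1];
        dst[i0]     = x0*cos_theta - x1*sin_theta;   /* rotate each (x0, x1) pair */
        dst[i0 + 1] = x0*sin_theta + x1*cos_theta;
        theta *= theta_scale;                        /* geometric step per pair */
    }
}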

ggml.h (+2 -6)

@@ -201,12 +201,6 @@
 #define GGML_MAX_NAME 48
 #define GGML_DEFAULT_N_THREADS 4
 
-// Maximum training context of the model in use
-// For the LLaMA models this is normally 2048, but somehow "stepping out" by 128 gives better results (tested at 7B and 13B)
-#ifndef GGML_TRAINING_CTX
-#define GGML_TRAINING_CTX 2176
-#endif
-
 #define GGML_ASSERT(x) \
     do { \
         if (!(x)) { \
@@ -510,6 +504,8 @@ extern "C" {
     // use this to compute the memory overhead of a tensor
     GGML_API size_t ggml_tensor_overhead(void);
 
+    GGML_API float get_theta_scale(int n_dims,int n_past,int n_ctx);
+
     // main
 
     GGML_API struct ggml_context * ggml_init(struct ggml_init_params params);

llama.cpp (+1 -1)

@@ -2633,7 +2633,7 @@ struct llama_context * llama_new_context_with_model(
 
         ctx->buf_compute.resize(MEM_REQ_EVAL().at(ctx->model.type));
 
-        const size_t bigctxmul = (hparams.n_ctx>2048?2:1);
+        const size_t bigctxmul = (hparams.n_ctx>4096?3:(hparams.n_ctx>2048?2:1));
         ctx->buf_scratch[0].resize(MEM_REQ_SCRATCH0().at(ctx->model.type)*bigctxmul);
         ctx->buf_scratch[1].resize(MEM_REQ_SCRATCH1().at(ctx->model.type)*bigctxmul);
     }
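The scratch buffers have to grow with the NTK-extended context, so the multiplier gains a third tier. A quick illustration of the new bucketing follows; the base buffer sizes themselves still come from MEM_REQ_SCRATCH0/1 and depend on the model type.

#include <stdio.h>
#include <stddef.h>

/* Mirrors the new bigctxmul selection: 1x scratch up to 2048 ctx,
 * 2x up to 4096, 3x beyond that. */
static size_t big_ctx_mul(size_t n_ctx) {
    return n_ctx > 4096 ? 3 : (n_ctx > 2048 ? 2 : 1);
}

int main(void) {
    const size_t ctxs[] = { 2048, 4096, 8192 };
    for (int i = 0; i < 3; ++i) {
        printf("n_ctx=%zu -> scratch multiplier x%zu\n", ctxs[i], big_ctx_mul(ctxs[i]));
    }
    return 0;
}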
