Commit e0213e0

Some random ctx/eps stuff I pulled out from my fork
1 parent 2a20f48 commit e0213e0

6 files changed, +60 -34 lines changed

Makefile (+3 -3)

@@ -30,9 +30,9 @@ endif
 # Compile flags
 #

-CFLAGS = -I. -O3 -DNDEBUG -std=c11 -fPIC
-CXXFLAGS = -I. -I./examples -O3 -DNDEBUG -std=c++11 -fPIC
-LDFLAGS =
+CFLAGS = -I. -O2 -DNDEBUG -std=c11 -flto -fPIC
+CXXFLAGS = -I. -I./examples -O2 -DNDEBUG -std=c++11 -flto -fPIC
+LDFLAGS = -flto -fPIC

 # OS specific
 # TODO: support Windows

convert-pth-to-ggml.py (+1 -5)

@@ -127,10 +127,6 @@ def get_n_parts(dim):
         name = k
         shape = v.shape

-        # skip layers.X.attention.inner_attention.rope.freqs
-        if name[-5:] == "freqs":
-            continue
-
         print("Processing variable: " + name + " with shape: ", shape, " and type: ", v.dtype)

         #data = tf.train.load_variable(dir_model, name).squeeze()
@@ -169,7 +165,7 @@ def get_n_parts(dim):
             data.tofile(fout)

     # I hope this deallocates the memory ..
-    model = None
+    del model

     fout.close()
ggml.c (+5 -21)

@@ -2145,6 +2145,7 @@ struct ggml_context {
     bool mem_buffer_owned;

     int n_objects;
+    float_t eps;

     struct ggml_object * objects_begin;
     struct ggml_object * objects_end;
@@ -2159,26 +2160,6 @@ struct ggml_context_container {
     struct ggml_context context;
 };

-//
-// compute types
-//
-
-enum ggml_task_type {
-    GGML_TASK_INIT = 0,
-    GGML_TASK_COMPUTE,
-    GGML_TASK_FINALIZE,
-};
-
-struct ggml_compute_params {
-    enum ggml_task_type type;
-
-    int ith, nth;
-
-    // work buffer for all threads
-    size_t wsize;
-    void * wdata;
-};
-
 //
 // ggml state
 //
@@ -2422,6 +2403,7 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {
         /*.mem_buffer =*/ params.mem_buffer ? params.mem_buffer : malloc(params.mem_size),
         /*.mem_buffer_owned =*/ params.mem_buffer ? false : true,
         /*.n_objects =*/ 0,
+        /*.eps =*/ params.eps,
         /*.objects_begin =*/ NULL,
         /*.objects_end =*/ NULL,
         /*.scratch =*/ { 0, 0, NULL, },
@@ -5335,7 +5317,8 @@ static void ggml_compute_forward_norm_f32(
     const size_t nb2 = dst->nb[2];
     const size_t nb3 = dst->nb[3];

-    const ggml_float eps = 1e-5f; // TODO: make this a parameter
+    // if params->eps is zero, use default of 1e-6 otherwise use params->eps
+    const float eps = params->eps == 0.0f ? 1e-6f : params->eps;

     // TODO: optimize
     for (int i03 = 0; i03 < ne03; i03++) {
@@ -9378,6 +9361,7 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
                 /*.type =*/ GGML_TASK_INIT,
                 /*.ith =*/ 0,
                 /*.nth =*/ node->n_tasks,
+                /*.eps =*/ ctx->eps,
                 /*.wsize =*/ cgraph->work ? ggml_nbytes(cgraph->work) : 0,
                 /*.wdata =*/ cgraph->work ? cgraph->work->data : NULL,
             };
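For reference, the eps threaded through here feeds the usual layer-norm recipe: each row is centered, then scaled by 1/sqrt(variance + eps). The helper below is an illustrative sketch of that computation, not the actual ggml_compute_forward_norm_f32 body (the function name and looping are simplified assumptions):

#include <math.h>

// Illustrative per-row normalization: y[i] = (x[i] - mean) / sqrt(var + eps).
// An eps of 0.0f passed through ggml_init_params falls back to 1e-6 in the kernel above.
static void norm_row_sketch(const float * x, float * y, int n, float eps) {
    double mean = 0.0;
    for (int i = 0; i < n; i++) {
        mean += x[i];
    }
    mean /= n;

    double sum2 = 0.0;
    for (int i = 0; i < n; i++) {
        const double d = x[i] - mean;
        y[i] = (float) d;   // store the centered value
        sum2 += d*d;
    }

    const float scale = 1.0f/sqrtf((float)(sum2/n) + eps);
    for (int i = 0; i < n; i++) {
        y[i] *= scale;
    }
}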

ggml.h (+33 -1)

@@ -314,9 +314,36 @@ struct ggml_scratch {
 struct ggml_init_params {
     // memory pool
     size_t mem_size; // bytes
+
+    // eps
+    float eps;
+
+    // work buffer
     void * mem_buffer; // if NULL, memory will be allocated internally
 };

+//
+// compute types
+//
+
+enum ggml_task_type {
+    GGML_TASK_INIT = 0,
+    GGML_TASK_COMPUTE,
+    GGML_TASK_FINALIZE,
+};
+
+struct ggml_compute_params {
+    enum ggml_task_type type;
+
+    int ith, nth;
+
+    float eps;
+
+    // work buffer for all threads
+    size_t wsize;
+    void * wdata;
+};
+
 void ggml_time_init(void); // call this once at the beginning of the program
 int64_t ggml_time_ms(void);
 int64_t ggml_time_us(void);
@@ -477,7 +504,6 @@ struct ggml_tensor * ggml_silu(
         struct ggml_tensor * a);

 // normalize along rows
-// TODO: eps is hardcoded to 1e-5 for now
 struct ggml_tensor * ggml_norm(
         struct ggml_context * ctx,
         struct ggml_tensor * a);
@@ -585,6 +611,12 @@ struct ggml_tensor * ggml_rope(
         int n_past,
         int n_dims,
         int mode);
+
+static void ggml_compute_forward_rope(
+        const struct ggml_compute_params * params,
+        const struct ggml_tensor * src0,
+        const struct ggml_tensor * src1,
+        struct ggml_tensor * dst);

 // padding = 1
 // TODO: we don't support extra parameters for now
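Because eps now sits between mem_size and mem_buffer, any caller that fills ggml_init_params positionally (as quantize.cpp does further down) has to supply it explicitly; designated initializers make the intent clearer. A minimal caller-side sketch, assuming this field order (the make_ctx helper is hypothetical):

#include "ggml.h"

struct ggml_context * make_ctx(size_t mem_size) {
    struct ggml_init_params params = {
        /*.mem_size   =*/ mem_size,
        /*.eps        =*/ 1e-6f,  // 0.0f falls back to the 1e-6 default in the norm kernel
        /*.mem_buffer =*/ NULL,   // NULL lets ggml allocate the pool internally
    };
    return ggml_init(params);
}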

main.cpp (+17 -3)

@@ -62,6 +62,7 @@ struct llama_layer {
     struct ggml_tensor * w1;
     struct ggml_tensor * w2;
     struct ggml_tensor * w3;
+
 };

 struct llama_model {
@@ -72,6 +73,9 @@ struct llama_model {
     struct ggml_tensor * norm;
     struct ggml_tensor * output;

+    // rope frequencies
+    struct ggml_tensor * rope_freqs;
+
     std::vector<llama_layer> layers;

     // key + value memory
@@ -215,7 +219,7 @@ bool llama_model_load(const std::string & fname, llama_model & model, gpt_vocab
         ctx_size += n_ctx*n_layer*n_embd*ggml_type_sizef(GGML_TYPE_F32); // memory_k
         ctx_size += n_ctx*n_layer*n_embd*ggml_type_sizef(GGML_TYPE_F32); // memory_v

-        ctx_size += (5 + 10*n_layer)*256; // object overhead
+        ctx_size += (5 + 10*n_layer)*hparams.n_ctx; // object overhead

         printf("%s: ggml ctx size = %6.2f MB\n", __func__, ctx_size/(1024.0*1024.0));
     }
@@ -224,6 +228,7 @@ bool llama_model_load(const std::string & fname, llama_model & model, gpt_vocab
     {
         struct ggml_init_params params = {
             /*.mem_size =*/ ctx_size,
+            /*.eps =*/ 1e-6, // change to 1e-5 for 7/13B models
             /*.mem_buffer =*/ NULL,
         };

@@ -286,6 +291,10 @@ bool llama_model_load(const std::string & fname, llama_model & model, gpt_vocab
             model.tensors["layers." + std::to_string(i) + ".feed_forward.w2.weight"] = layer.w2;
             model.tensors["layers." + std::to_string(i) + ".feed_forward.w3.weight"] = layer.w3;
         }
+
+        model.rope_freqs = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 64);
+        model.tensors["rope.freqs"] = model.rope_freqs;
+
     }

     // key + value memory
@@ -543,7 +552,8 @@

     const int d_key = n_embd/n_head;

-    static size_t buf_size = 512u*1024*1024;
+    // allocate memory
+    static size_t buf_size = n_ctx*1024*1024;
     static void * buf = malloc(buf_size);

     if (mem_per_token > 0 && mem_per_token*N > buf_size) {
@@ -561,6 +571,7 @@

     struct ggml_init_params params = {
         /*.mem_size =*/ buf_size,
+        /*.eps =*/ 1e-6, // change to 1e-5 for 7/13B models
         /*.mem_buffer =*/ buf,
     };

@@ -603,6 +614,7 @@
         ggml_build_forward_expand(&gf, ggml_cpy(ctx0, Vcur, v));
     }

+    // Apply rotary embeddings
     // Q = Qcur.contiguous().view(n_embd/n_head, n_head, N).permute(0, 2, 1, 3)
     struct ggml_tensor * Q =
         ggml_permute(ctx0,
@@ -633,6 +645,7 @@
                 ggml_new_f32(ctx0, 1.0f/sqrt(float(n_embd)/n_head))
                 );

+    // Scoring
     // KQ_masked = mask_past(KQ_scaled)
     struct ggml_tensor * KQ_masked = ggml_diag_mask_inf(ctx0, KQ_scaled, n_past);

@@ -658,6 +671,7 @@
                 KQV_merged,
                 ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N));

+    // attention.wo(output)
     // projection (no bias)
     cur = ggml_mul_mat(ctx0,
             model.layers[il].wo,
@@ -795,7 +809,7 @@ int main(int argc, char ** argv) {
     {
         const int64_t t_start_us = ggml_time_us();

-        if (!llama_model_load(params.model, model, vocab, 512)) { // TODO: set context from user input ??
+        if (!llama_model_load(params.model, model, vocab, 1024)) { // TODO: set context from user input ??
             fprintf(stderr, "%s: failed to load model from '%s'\n", __func__, params.model.c_str());
             return 1;
         }
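The hard-coded length 64 in model.rope_freqs matches the 7B checkpoint: head_dim = n_embd/n_head = 4096/32 = 128, and RoPE keeps one inverse frequency per pair of channels. Below is a sketch of the values that tensor would hold, assuming the standard LLaMA parameterization (theta = 10000); the helper name is illustrative, not code from this commit:

#include <math.h>

// Fills freqs[0 .. head_dim/2 - 1] with RoPE inverse frequencies:
// freqs[i] = 1 / theta^(2i / head_dim). For head_dim = 128 that is 64 values.
static void fill_rope_freqs_sketch(float * freqs, int head_dim, float theta) {
    for (int i = 0; i < head_dim/2; i++) {
        freqs[i] = 1.0f / powf(theta, (2.0f*i) / head_dim);
    }
}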

quantize.cpp (+1 -1)

@@ -299,7 +299,7 @@ int main(int argc, char ** argv) {

     // needed to initialize f16 tables
     {
-        struct ggml_init_params params = { 0, NULL };
+        struct ggml_init_params params = { 0, 1e-6, NULL };
         struct ggml_context * ctx = ggml_init(params);
         ggml_free(ctx);
     }
