Commit 94b15b0

Author: Judd
Commit message: some updates
1 parent 8a2b085 commit 94b15b0

5 files changed: +106 -48 lines changed

src/backend.cpp

Lines changed: 49 additions & 21 deletions

@@ -60,26 +60,60 @@ namespace chatllm
     LayerBufAllocator::LayerBufAllocator(ggml_backend_allocator alloc, Backend *backend): LayerBufAllocator(alloc, alloc, backend) {}
     LayerBufAllocator::LayerBufAllocator(ggml_backend_allocator alloc_matrix, ggml_backend_allocator alloc_others, Backend *backend)
         : alloc_matrix(alloc_matrix), alloc_others(alloc_others), backend(backend)
-    {}
+    {
+        CHATLLM_CHECK(alloc_matrix == alloc_others) << " TODO: alloc_matrix must be alloc_others now.";
+    }

     BackendBuffer *LayerBufAllocator::alloc(size_t size, Usage usage)
     {
         total += size;
-        ggml_backend_buffer_t buf = nullptr;
+        ggml_backend_buffer_t buf = ggml_backend_buft_alloc_buffer(get_allocator(usage), size);
+
+        CHATLLM_CHECK(buf) << __FUNCTION__ << "() failed to allocate buffer";
+
+        auto r = new BackendBuffer(buf);
+        buffers.emplace_back(r);
+        return r;
+    }
+
+    bool LayerBufAllocator::alloc(ggml::tensor *tensor)
+    {
+        BackendBuffer *buf = alloc(ggml::nbytes(tensor), detect_usage(tensor));
+        if (nullptr == buf) return false;
+
+        buf->assign_to(tensor);
+        return true;
+    }
+
+    bool LayerBufAllocator::supported_by_backend(Backend *backend, ggml::tensor *tensor)
+    {
+        ggml_backend_allocator allocator = get_allocator(tensor); return false;
+        return ggml_backend_supports_buft(backend->backend, allocator);
+    }
+
+    BackendBufAllocator::Usage LayerBufAllocator::detect_usage(ggml::tensor *tensor)
+    {
+        int dims = ggml::n_dims(tensor);
+        return dims >= 2 ? Usage::Matrix : Usage::Others;
+    }
+
+    ggml_backend_allocator LayerBufAllocator::get_allocator(Usage usage)
+    {
         switch (usage)
         {
         case Usage::Matrix:
-            buf = ggml_backend_buft_alloc_buffer(alloc_matrix, size);
-            break;
+            return alloc_matrix;
         case Usage::Others:
-            buf = ggml_backend_buft_alloc_buffer(alloc_others, size);
-            break;
+            return alloc_others;
+        default:
+            CHATLLM_CHECK(false);
+            return nullptr;
         }
-        CHATLLM_CHECK(buf) << __FUNCTION__ << "() failed to allocate buffer";
+    }

-        auto r = new BackendBuffer(buf);
-        buffers.emplace_back(r);
-        return r;
+    ggml_backend_allocator LayerBufAllocator::get_allocator(ggml::tensor *tensor)
+    {
+        return get_allocator(detect_usage(tensor));
     }

     size_t LayerBufAllocator::get_alignment(Usage usage) const

@@ -377,7 +411,7 @@ namespace chatllm
         for (auto &cfg : gpu_cfgs) n_gpu_layers += cfg.n_layers;
         const bool use_gpu = n_gpu_layers > 0;

-        buf_compute_meta.resize(ggml_tensor_overhead()* graph_max_nodes_num + ggml_graph_overhead_custom(graph_max_nodes_num, false));
+        buf_compute_meta.resize(ggml_tensor_overhead() * graph_max_nodes_num + ggml_graph_overhead_custom(graph_max_nodes_num, false));

         backend_cpu = ggml_backend_cpu_init();
         CHATLLM_CHECK(backend_cpu != nullptr) << __func__ << ": failed to initialize CPU backend";

@@ -409,7 +443,6 @@ namespace chatllm
 #elif defined(GGML_USE_CUDA) || defined(GGML_USE_VULKAN) || defined(GGML_USE_SYCL) || defined(GGML_USE_CANN)
         if (use_gpu)
         {
-            const int total = ComputeManager::get_device_count();
             for (auto cfg : gpu_cfgs)
             {
                 int device = cfg.id >= 0 ? cfg.id : 0;

@@ -466,9 +499,9 @@ namespace chatllm
         return ggml_backend_sched_reserve(sched, gf);
     }

-    void BackendContext::alloc_graph(ggml_cgraph *gf)
+    bool BackendContext::alloc_graph(ggml_cgraph *gf)
     {
-        ggml_backend_sched_alloc_graph(sched, gf);
+        return ggml_backend_sched_alloc_graph(sched, gf);
     }

     void BackendContext::compute_graph(ggml_cgraph *gf, int n_threads)

@@ -538,11 +571,6 @@ namespace chatllm

     void ComputeContext::cb_op_tensor(ggml::tensor *tensor)
     {
-        if (get_sched() && get_backend())
-        {
-            if (ggml_backend_supports_op(get_backend()->backend, tensor) || ggml_backend_offload_op(get_backend()->backend, tensor))
-                ggml_backend_sched_set_tensor_backend(get_sched(), tensor, get_backend()->backend);
-        }
     }

     ggml_backend_sched_t ComputeContext::get_sched(void)

@@ -570,9 +598,9 @@ namespace chatllm
         backend_context->compute_graph(get_cgraph(), n_threads);
     }

-    void ComputeContext::allocate(void)
+    bool ComputeContext::allocate(void)
     {
-        backend_context->alloc_graph(get_cgraph());
+        return backend_context->alloc_graph(get_cgraph());
     }

     bool ComputeContext::reserve_memory(void)
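
Note: the new detect_usage() routes tensors with two or more dimensions to the Matrix allocator and everything else to Others. A minimal standalone sketch of that rule (stand-in types, not part of the commit):

    #include <cassert>

    enum class Usage { Matrix, Others };

    // Mirrors LayerBufAllocator::detect_usage(): n_dims >= 2 means the tensor
    // is treated as a weight matrix and routed to alloc_matrix.
    static Usage detect_usage(int n_dims)
    {
        return n_dims >= 2 ? Usage::Matrix : Usage::Others;
    }

    int main()
    {
        assert(detect_usage(1) == Usage::Others);  // e.g. position vectors
        assert(detect_usage(2) == Usage::Matrix);  // e.g. weight matrices
        return 0;
    }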

src/backend.h

Lines changed: 21 additions & 6 deletions

@@ -51,6 +51,8 @@ namespace chatllm
         ggml_backend_buffer_t buf;
     };

+    class Backend;
+
     class BackendBufAllocator
     {
     public:

@@ -61,15 +63,20 @@ namespace chatllm
         };

         virtual BackendBuffer *alloc(size_t size, Usage usage = Usage::Others) = 0;
+        virtual bool alloc(ggml::tensor *tensor) = 0;
+
         virtual size_t get_alignment(Usage usage) const = 0;
         virtual size_t get_max_size(Usage usage) const = 0;

+        virtual bool supported_by_backend(Backend *backend, ggml::tensor *tensor)
+        {
+            return false;
+        }
+
     protected:
         size_t total = 0;
     };

-    class Backend;
-
     class LayerBufAllocator : public BackendBufAllocator
     {
     public:

@@ -80,6 +87,9 @@ namespace chatllm
         LayerBufAllocator(ggml_backend_allocator alloc_matrix, ggml_backend_allocator alloc_others, Backend *backend);

         BackendBuffer *alloc(size_t size, Usage usage = Usage::Others) override;
+        bool alloc(ggml::tensor *tensor) override;
+
+        bool supported_by_backend(Backend *backend, ggml::tensor *tensor) override;

         size_t get_alignment(Usage usage) const override;

@@ -91,6 +101,11 @@ namespace chatllm

         bool operator ==(const LayerBufAllocator &b);

+    protected:
+        Usage detect_usage(ggml::tensor *tensor);
+        ggml_backend_allocator get_allocator(Usage usage);
+        ggml_backend_allocator get_allocator(ggml::tensor *tensor);
+
     protected:
         ggml_backend_allocator alloc_matrix;
         ggml_backend_allocator alloc_others;

@@ -199,7 +214,7 @@ namespace chatllm

         bool reserve_memory(ggml_cgraph *gf);

-        void alloc_graph(ggml_cgraph *gf);
+        bool alloc_graph(ggml_cgraph *gf);

         void compute_graph(ggml_cgraph *gf, int n_threads);

@@ -245,8 +260,6 @@ namespace chatllm
         virtual void cb_new_tensor(ggml::tensor *tensor);
         virtual void cb_op_tensor(ggml::tensor *tensor);

-        virtual ggml_backend_sched_t get_sched(void);
-
         virtual void move_to_layer(int layer_id);

         BackendBufAllocator *get_allocator(void);

@@ -255,7 +268,7 @@ namespace chatllm

         virtual void compute(int n_threads);

-        virtual void allocate(void);
+        virtual bool allocate(void);

         virtual bool reserve_memory(void);

@@ -265,6 +278,8 @@ namespace chatllm
         virtual size_t get_mem_size(void);

     protected:
+        virtual ggml_backend_sched_t get_sched(void);
+
         BackendContext *backend_context;
     public:
         // obsoleted
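
Note: with the tensor-granular alloc() and the conservative supported_by_backend() default in place, a caller can size a buffer from a tensor and bind it in one call. A hedged usage sketch (n and host_data are hypothetical; the commit itself ignores the bool result at the call sites in layers.h):

    // Create a 1-D int32 tensor, let the layer allocator size and bind a
    // buffer for it, then upload host data (the buffer may be device memory).
    ggml::tensor *indices = ggml::new_tensor_1d(ctx, GGML_TYPE_I32, n);
    if (!ctx->get_allocator()->alloc(indices))
        CHATLLM_CHECK(false) << "failed to allocate tensor buffer";
    Backend::write_tensor_data(indices, host_data.data());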

src/layers.cpp

Lines changed: 5 additions & 5 deletions

@@ -507,12 +507,12 @@ namespace chatllm
         return output;
     }

-    void fill_pos_vector(ggml::tensor *pos, int n_past, int qlen)
+    void fill_pos_vector(ComputeContext *ctx, std::vector<int> &v_pos, ggml::tensor *pos, int n_past, int qlen)
     {
-        int *p = (int *)pos->data;
         for (int i = 0; i < qlen; i++)
-            p[i] = n_past + i;
+            v_pos[i] = n_past + i;
         pos->ne[0] = qlen;
+        Backend::write_tensor_data(pos, v_pos.data(), 0, qlen * sizeof(v_pos[0]));
     }

     ggml::tensor *GLMSelfAttention::forward(ComputeContext *ctx, ggml::tensor *hidden_states, int n_past)

@@ -521,7 +521,7 @@ namespace chatllm
         int qlen = (int)hidden_states->ne[1];
         int head_size = hidden_size / num_attention_heads;
         int rope_dim = head_size / 2;
-        fill_pos_vector(pos, n_past, qlen);
+        fill_pos_vector(ctx, v_pos, pos, n_past, qlen);

         if (shift_pending.shift > 0)
         {

@@ -782,7 +782,7 @@ namespace chatllm

     void CoreAttention::before_forward(ComputeContext *ctx, const int n_past, const int qlen)
     {
-        fill_pos_vector(pos, n_past, qlen);
+        fill_pos_vector(ctx, v_pos, pos, n_past, qlen);
     }

     void KVCacheAttention::before_forward(ComputeContext *ctx, const int n_past, const int qlen)
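
Note: the rewritten fill_pos_vector() no longer writes through pos->data; positions are staged in a host-side vector and then copied into the tensor's backend buffer, which may live in device memory. The pattern, condensed from the hunk above:

    // Stage positions on the host, then copy qlen ints into the tensor's buffer.
    for (int i = 0; i < qlen; i++)
        v_pos[i] = n_past + i;
    pos->ne[0] = qlen;
    Backend::write_tensor_data(pos, v_pos.data(), 0, qlen * sizeof(v_pos[0]));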

src/layers.h

Lines changed: 25 additions & 13 deletions

@@ -315,11 +315,13 @@ namespace chatllm
             ln(ctx, embedding_dim),
             pad_index(2)
         {
-            // TODO:
-            indices->data = new char[ggml::nbytes(indices)];
-            int32_t *p = (int32_t *)indices->data;
+            std::vector<int> v_indices;
+            v_indices.resize(pos_max);
             for (int i = 0; i < pos_max; i++)
-                p[i] = i;
+                v_indices[i] = i;
+
+            ctx->get_allocator()->alloc(indices);
+            Backend::write_tensor_data(indices, v_indices.data());
         }

         using Block::forward;

@@ -427,8 +429,9 @@ namespace chatllm
             n_ctx(0),
             shift_pending()
         {
-            // TODO
-            pos->data = new char[ggml::nbytes(pos)]();
+            v_pos.resize(max_length);
+
+            ctx->get_allocator()->alloc(pos);
         }
         using Block::forward;
         ggml::tensor *forward(ComputeContext *ctx, ggml::tensor *hidden_states, int n_past) override;

@@ -468,6 +471,7 @@ namespace chatllm
         int n_ctx;
     private:
         ShiftPending shift_pending;
+        std::vector<int> v_pos;
     };

     class GLMBlock : public Block

@@ -841,7 +845,10 @@ namespace chatllm
            {
                ggml::set_name(v_cache, "v_cache");
            }
-            pos->data = new char[ggml::nbytes(pos)]();
+
+            v_pos.resize(max_length);
+
+            ctx->get_allocator()->alloc(pos);
         }

         void shift_cache(int shift, int total) override

@@ -935,6 +942,7 @@ namespace chatllm
         bool attn_scaling;
         bool causal;
         ggml::tensor *last_attn_scores;
+        std::vector<int> v_pos;
     };

     class KVCacheAttention : public CoreAttention

@@ -1090,7 +1098,7 @@ namespace chatllm
         ggml::tensor *raw_v;
     };

-    void fill_pos_vector(ggml::tensor *pos, int n_past, int qlen);
+    void fill_pos_vector(ComputeContext *ctx, std::vector<int> &v_pos, ggml::tensor *pos, int n_past, int qlen);

     // qlen must be 1.
     // This is just a proof of concept.

@@ -1102,15 +1110,17 @@ namespace chatllm
             cache_offset(0),
             indices(ggml::new_tensor_1d(ctx, GGML_TYPE_I32, sliding_window_len))
         {
-            indices->data = new char[ggml::nbytes(indices)];
+            v_indices.resize(sliding_window_len);
+
+            ctx->get_allocator()->alloc(indices);
         }

     protected:
         void before_forward(ComputeContext *ctx, const int n_past, const int qlen) override
         {
             if (n_past == 0) cache_offset = 0;

-            fill_pos_vector(pos, n_past, qlen);
+            fill_pos_vector(ctx, v_pos, pos, n_past, qlen);

             // shift cache
             if (shift_pending.shift > 0)

@@ -1165,9 +1175,10 @@ namespace chatllm
                const int total = n_past + qlen > cache_length ? cache_length : n_past + qlen;
                const int start = (cache_offset + n_past + qlen - total) % cache_length;

-                int32_t *p = (int32_t *)indices->data;
                for (int i = 0; i < total; i++)
-                    p[i] = (start + i) % cache_length;
+                    v_indices[i] = (start + i) % cache_length;
+
+                Backend::write_tensor_data(indices, v_indices.data(), 0, total * sizeof(v_indices[0]));
            }
         }

@@ -1213,6 +1224,7 @@ namespace chatllm
     public:
         int cache_offset;
         ggml::tensor *indices;
+        std::vector<int> v_indices;
     };

     template <int sliding_window_len> class BaseSlidingWindowAttentionFullCache : public BaseAttention

@@ -1300,7 +1312,7 @@ namespace chatllm
         {
             if (n_past == 0) cache_offset = 0;

-            fill_pos_vector(pos, n_past, qlen);
+            fill_pos_vector(ctx, v_pos, pos, n_past, qlen);

             // shift cache
             if (shift_pending.shift > 0)
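
Note: the sliding-window cache addresses its ring buffer with wrap-around indices; the index math from the @@ -1165 hunk, checked standalone (window size and positions are made-up values):

    #include <cstdio>

    int main()
    {
        const int cache_length = 4;   // hypothetical window size
        const int cache_offset = 0, n_past = 3, qlen = 2;
        // Keep at most cache_length entries, starting at the oldest slot.
        const int total = n_past + qlen > cache_length ? cache_length : n_past + qlen;
        const int start = (cache_offset + n_past + qlen - total) % cache_length;
        for (int i = 0; i < total; i++)
            printf("%d ", (start + i) % cache_length);  // prints: 1 2 3 0
        return 0;
    }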

src/models.cpp

Lines changed: 6 additions & 3 deletions

@@ -991,16 +991,19 @@ namespace chatllm
        }

        ForwardContext ctx(&backend_context);
+
        ctx.gctx = GGMLContext({.mem_size = backend_context.buf_compute_meta.size(), .mem_buffer = backend_context.buf_compute_meta.data(), .no_alloc = true});
-        int n_threads = gen_config.num_threads;
        ctx.gf = ggml::new_graph_custom(&ctx, GRAPH_SIZE, false);

        dbg_ctx = &ctx;

+        ctx.move_to_layer(LayerAllocatorManager::MiscLayer::Prolog);
        ggml::tensor *input_ids_tensor = ggml::new_tensor_1d(&ctx, GGML_TYPE_I32, input_ids.size());

        ggml::tensor *r = transformer->forward(&ctx, input_ids_tensor, past);

+        ctx.move_to_layer(LayerAllocatorManager::MiscLayer::Epilog);
+
        if (logit_scale > 0)
            r = ggml::scale_inplace(&ctx, r, logit_scale);

@@ -1010,9 +1013,9 @@ namespace chatllm

        output.resize(ggml::nbytes(r) / sizeof(output[0]));

-        ctx.allocate();
+        CHATLLM_CHECK(ctx.allocate()) << "failed to allocate memory for graph";
        Backend::write_tensor_data(input_ids_tensor, input_ids.data());
-        ctx.compute(n_threads);
+        ctx.compute(gen_config.num_threads);

        Backend::read_tensor_data(r, output.data());
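
Note: taken together, the hunks above give the graph build a strict build -> allocate -> upload -> compute -> download order, with allocation failure now checked. Condensed from the diff (error paths elided):

    ctx.move_to_layer(LayerAllocatorManager::MiscLayer::Prolog);
    ggml::tensor *input_ids_tensor = ggml::new_tensor_1d(&ctx, GGML_TYPE_I32, input_ids.size());
    ggml::tensor *r = transformer->forward(&ctx, input_ids_tensor, past);
    ctx.move_to_layer(LayerAllocatorManager::MiscLayer::Epilog);

    CHATLLM_CHECK(ctx.allocate()) << "failed to allocate memory for graph";  // may fail now
    Backend::write_tensor_data(input_ids_tensor, input_ids.data());          // upload after alloc
    ctx.compute(gen_config.num_threads);
    Backend::read_tensor_data(r, output.data());                             // download results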