Commit 94b15b0

Author: Judd
Commit message: some updates
1 parent 8a2b085 commit 94b15b0

5 files changed: +106 -48 lines changed

src/backend.cpp

Lines changed: 49 additions & 21 deletions

@@ -60,26 +60,60 @@ namespace chatllm
     LayerBufAllocator::LayerBufAllocator(ggml_backend_allocator alloc, Backend *backend): LayerBufAllocator(alloc, alloc, backend) {}
     LayerBufAllocator::LayerBufAllocator(ggml_backend_allocator alloc_matrix, ggml_backend_allocator alloc_others, Backend *backend)
         : alloc_matrix(alloc_matrix), alloc_others(alloc_others), backend(backend)
-    {}
+    {
+        CHATLLM_CHECK(alloc_matrix == alloc_others) << " TODO: alloc_matrix must be alloc_others now.";
+    }

     BackendBuffer *LayerBufAllocator::alloc(size_t size, Usage usage)
     {
         total += size;
-        ggml_backend_buffer_t buf = nullptr;
+        ggml_backend_buffer_t buf = ggml_backend_buft_alloc_buffer(get_allocator(usage), size);
+
+        CHATLLM_CHECK(buf) << __FUNCTION__ << "() failed to allocate buffer";
+
+        auto r = new BackendBuffer(buf);
+        buffers.emplace_back(r);
+        return r;
+    }
+
+    bool LayerBufAllocator::alloc(ggml::tensor *tensor)
+    {
+        BackendBuffer *buf = alloc(ggml::nbytes(tensor), detect_usage(tensor));
+        if (nullptr == buf) return false;
+
+        buf->assign_to(tensor);
+        return true;
+    }
+
+    bool LayerBufAllocator::supported_by_backend(Backend *backend, ggml::tensor *tensor)
+    {
+        ggml_backend_allocator allocator = get_allocator(tensor); return false;
+        return ggml_backend_supports_buft(backend->backend, allocator);
+    }
+
+    BackendBufAllocator::Usage LayerBufAllocator::detect_usage(ggml::tensor *tensor)
+    {
+        int dims = ggml::n_dims(tensor);
+        return dims >= 2 ? Usage::Matrix : Usage::Others;
+    }
+
+    ggml_backend_allocator LayerBufAllocator::get_allocator(Usage usage)
+    {
         switch (usage)
         {
         case Usage::Matrix:
-            buf = ggml_backend_buft_alloc_buffer(alloc_matrix, size);
-            break;
+            return alloc_matrix;
         case Usage::Others:
-            buf = ggml_backend_buft_alloc_buffer(alloc_others, size);
-            break;
+            return alloc_others;
+        default:
+            CHATLLM_CHECK(false);
+            return nullptr;
         }
-        CHATLLM_CHECK(buf) << __FUNCTION__ << "() failed to allocate buffer";
+    }

-        auto r = new BackendBuffer(buf);
-        buffers.emplace_back(r);
-        return r;
+    ggml_backend_allocator LayerBufAllocator::get_allocator(ggml::tensor *tensor)
+    {
+        return get_allocator(detect_usage(tensor));
     }

     size_t LayerBufAllocator::get_alignment(Usage usage) const

@@ -377,7 +411,7 @@ namespace chatllm
         for (auto &cfg : gpu_cfgs) n_gpu_layers += cfg.n_layers;
         const bool use_gpu = n_gpu_layers > 0;

-        buf_compute_meta.resize(ggml_tensor_overhead()* graph_max_nodes_num + ggml_graph_overhead_custom(graph_max_nodes_num, false));
+        buf_compute_meta.resize(ggml_tensor_overhead() * graph_max_nodes_num + ggml_graph_overhead_custom(graph_max_nodes_num, false));

         backend_cpu = ggml_backend_cpu_init();
         CHATLLM_CHECK(backend_cpu != nullptr) << __func__ << ": failed to initialize CPU backend";

@@ -409,7 +443,6 @@ namespace chatllm
 #elif defined(GGML_USE_CUDA) || defined(GGML_USE_VULKAN) || defined(GGML_USE_SYCL) || defined(GGML_USE_CANN)
         if (use_gpu)
         {
-            const int total = ComputeManager::get_device_count();
             for (auto cfg : gpu_cfgs)
             {
                 int device = cfg.id >= 0 ? cfg.id : 0;

@@ -466,9 +499,9 @@ namespace chatllm
         return ggml_backend_sched_reserve(sched, gf);
     }

-    void BackendContext::alloc_graph(ggml_cgraph *gf)
+    bool BackendContext::alloc_graph(ggml_cgraph *gf)
     {
-        ggml_backend_sched_alloc_graph(sched, gf);
+        return ggml_backend_sched_alloc_graph(sched, gf);
     }

     void BackendContext::compute_graph(ggml_cgraph *gf, int n_threads)

@@ -538,11 +571,6 @@ namespace chatllm

     void ComputeContext::cb_op_tensor(ggml::tensor *tensor)
     {
-        if (get_sched() && get_backend())
-        {
-            if (ggml_backend_supports_op(get_backend()->backend, tensor) || ggml_backend_offload_op(get_backend()->backend, tensor))
-                ggml_backend_sched_set_tensor_backend(get_sched(), tensor, get_backend()->backend);
-        }
     }

     ggml_backend_sched_t ComputeContext::get_sched(void)

@@ -570,9 +598,9 @@ namespace chatllm
         backend_context->compute_graph(get_cgraph(), n_threads);
     }

-    void ComputeContext::allocate(void)
+    bool ComputeContext::allocate(void)
     {
-        backend_context->alloc_graph(get_cgraph());
+        return backend_context->alloc_graph(get_cgraph());
     }

     bool ComputeContext::reserve_memory(void)
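
Note: the new detect_usage() routes tensors with two or more dimensions to the Matrix allocator and everything else to Others. A minimal standalone sketch of that rule (stand-in types, not part of the commit):

    #include <cassert>

    enum class Usage { Matrix, Others };

    // Mirrors LayerBufAllocator::detect_usage(): n_dims >= 2 means the tensor
    // is treated as a weight matrix and routed to alloc_matrix.
    static Usage detect_usage(int n_dims)
    {
        return n_dims >= 2 ? Usage::Matrix : Usage::Others;
    }

    int main()
    {
        assert(detect_usage(1) == Usage::Others);  // e.g. position vectors
        assert(detect_usage(2) == Usage::Matrix);  // e.g. weight matrices
        return 0;
    }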

src/backend.h

Lines changed: 21 additions & 6 deletions

@@ -51,6 +51,8 @@ namespace chatllm
         ggml_backend_buffer_t buf;
     };

+    class Backend;
+
     class BackendBufAllocator
     {
     public:

@@ -61,15 +63,20 @@ namespace chatllm
         };

         virtual BackendBuffer *alloc(size_t size, Usage usage = Usage::Others) = 0;
+        virtual bool alloc(ggml::tensor *tensor) = 0;
+
         virtual size_t get_alignment(Usage usage) const = 0;
         virtual size_t get_max_size(Usage usage) const = 0;

+        virtual bool supported_by_backend(Backend *backend, ggml::tensor *tensor)
+        {
+            return false;
+        }
+
     protected:
         size_t total = 0;
     };

-    class Backend;
-
     class LayerBufAllocator : public BackendBufAllocator
     {
     public:

@@ -80,6 +87,9 @@ namespace chatllm
         LayerBufAllocator(ggml_backend_allocator alloc_matrix, ggml_backend_allocator alloc_others, Backend *backend);

         BackendBuffer *alloc(size_t size, Usage usage = Usage::Others) override;
+        bool alloc(ggml::tensor *tensor) override;
+
+        bool supported_by_backend(Backend *backend, ggml::tensor *tensor) override;

         size_t get_alignment(Usage usage) const override;

@@ -91,6 +101,11 @@ namespace chatllm

         bool operator ==(const LayerBufAllocator &b);

+    protected:
+        Usage detect_usage(ggml::tensor *tensor);
+        ggml_backend_allocator get_allocator(Usage usage);
+        ggml_backend_allocator get_allocator(ggml::tensor *tensor);
+
     protected:
         ggml_backend_allocator alloc_matrix;
         ggml_backend_allocator alloc_others;

@@ -199,7 +214,7 @@ namespace chatllm

         bool reserve_memory(ggml_cgraph *gf);

-        void alloc_graph(ggml_cgraph *gf);
+        bool alloc_graph(ggml_cgraph *gf);

         void compute_graph(ggml_cgraph *gf, int n_threads);

@@ -245,8 +260,6 @@ namespace chatllm
         virtual void cb_new_tensor(ggml::tensor *tensor);
         virtual void cb_op_tensor(ggml::tensor *tensor);

-        virtual ggml_backend_sched_t get_sched(void);
-
         virtual void move_to_layer(int layer_id);

         BackendBufAllocator *get_allocator(void);

@@ -255,7 +268,7 @@ namespace chatllm

         virtual void compute(int n_threads);

-        virtual void allocate(void);
+        virtual bool allocate(void);

         virtual bool reserve_memory(void);

@@ -265,6 +278,8 @@ namespace chatllm
         virtual size_t get_mem_size(void);

     protected:
+        virtual ggml_backend_sched_t get_sched(void);
+
         BackendContext *backend_context;
     public:
         // obsoleted
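
Note: with the tensor-granular alloc() and the conservative supported_by_backend() default in place, a caller can size a buffer from a tensor and bind it in one call. A hedged usage sketch (n and host_data are hypothetical; the commit itself ignores the bool result at the call sites in layers.h):

    // Create a 1-D int32 tensor, let the layer allocator size and bind a
    // buffer for it, then upload host data (the buffer may be device memory).
    ggml::tensor *indices = ggml::new_tensor_1d(ctx, GGML_TYPE_I32, n);
    if (!ctx->get_allocator()->alloc(indices))
        CHATLLM_CHECK(false) << "failed to allocate tensor buffer";
    Backend::write_tensor_data(indices, host_data.data());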

src/layers.cpp

Lines changed: 5 additions & 5 deletions

@@ -507,12 +507,12 @@ namespace chatllm
         return output;
     }

-    void fill_pos_vector(ggml::tensor *pos, int n_past, int qlen)
+    void fill_pos_vector(ComputeContext *ctx, std::vector<int> &v_pos, ggml::tensor *pos, int n_past, int qlen)
     {
-        int *p = (int *)pos->data;
         for (int i = 0; i < qlen; i++)
-            p[i] = n_past + i;
+            v_pos[i] = n_past + i;
         pos->ne[0] = qlen;
+        Backend::write_tensor_data(pos, v_pos.data(), 0, qlen * sizeof(v_pos[0]));
     }

     ggml::tensor *GLMSelfAttention::forward(ComputeContext *ctx, ggml::tensor *hidden_states, int n_past)

@@ -521,7 +521,7 @@ namespace chatllm
         int qlen = (int)hidden_states->ne[1];
         int head_size = hidden_size / num_attention_heads;
         int rope_dim = head_size / 2;
-        fill_pos_vector(pos, n_past, qlen);
+        fill_pos_vector(ctx, v_pos, pos, n_past, qlen);

         if (shift_pending.shift > 0)
         {

@@ -782,7 +782,7 @@ namespace chatllm

     void CoreAttention::before_forward(ComputeContext *ctx, const int n_past, const int qlen)
     {
-        fill_pos_vector(pos, n_past, qlen);
+        fill_pos_vector(ctx, v_pos, pos, n_past, qlen);
     }

     void KVCacheAttention::before_forward(ComputeContext *ctx, const int n_past, const int qlen)
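
Note: the rewritten fill_pos_vector() no longer writes through pos->data; positions are staged in a host-side vector and then copied into the tensor's backend buffer, which may live in device memory. The pattern, condensed from the hunk above:

    // Stage positions on the host, then copy qlen ints into the tensor's buffer.
    for (int i = 0; i < qlen; i++)
        v_pos[i] = n_past + i;
    pos->ne[0] = qlen;
    Backend::write_tensor_data(pos, v_pos.data(), 0, qlen * sizeof(v_pos[0]));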

src/layers.h

Lines changed: 25 additions & 13 deletions

@@ -315,11 +315,13 @@ namespace chatllm
             ln(ctx, embedding_dim),
             pad_index(2)
         {
-            // TODO:
-            indices->data = new char[ggml::nbytes(indices)];
-            int32_t *p = (int32_t *)indices->data;
+            std::vector<int> v_indices;
+            v_indices.resize(pos_max);
             for (int i = 0; i < pos_max; i++)
-                p[i] = i;
+                v_indices[i] = i;
+
+            ctx->get_allocator()->alloc(indices);
+            Backend::write_tensor_data(indices, v_indices.data());
         }

         using Block::forward;

@@ -427,8 +429,9 @@ namespace chatllm
             n_ctx(0),
             shift_pending()
         {
-            // TODO
-            pos->data = new char[ggml::nbytes(pos)]();
+            v_pos.resize(max_length);
+
+            ctx->get_allocator()->alloc(pos);
         }
         using Block::forward;
         ggml::tensor *forward(ComputeContext *ctx, ggml::tensor *hidden_states, int n_past) override;

@@ -468,6 +471,7 @@ namespace chatllm
         int n_ctx;
     private:
         ShiftPending shift_pending;
+        std::vector<int> v_pos;
     };

     class GLMBlock : public Block

@@ -841,7 +845,10 @@ namespace chatllm
            {
                ggml::set_name(v_cache, "v_cache");
            }
-            pos->data = new char[ggml::nbytes(pos)]();
+
+            v_pos.resize(max_length);
+
+            ctx->get_allocator()->alloc(pos);
         }

         void shift_cache(int shift, int total) override

@@ -935,6 +942,7 @@ namespace chatllm
         bool attn_scaling;
         bool causal;
         ggml::tensor *last_attn_scores;
+        std::vector<int> v_pos;
     };

     class KVCacheAttention : public CoreAttention

@@ -1090,7 +1098,7 @@ namespace chatllm
         ggml::tensor *raw_v;
     };

-    void fill_pos_vector(ggml::tensor *pos, int n_past, int qlen);
+    void fill_pos_vector(ComputeContext *ctx, std::vector<int> &v_pos, ggml::tensor *pos, int n_past, int qlen);

     // qlen must be 1.
     // This is just a proof of concept.

@@ -1102,15 +1110,17 @@ namespace chatllm
             cache_offset(0),
             indices(ggml::new_tensor_1d(ctx, GGML_TYPE_I32, sliding_window_len))
         {
-            indices->data = new char[ggml::nbytes(indices)];
+            v_indices.resize(sliding_window_len);
+
+            ctx->get_allocator()->alloc(indices);
         }

     protected:
         void before_forward(ComputeContext *ctx, const int n_past, const int qlen) override
         {
             if (n_past == 0) cache_offset = 0;

-            fill_pos_vector(pos, n_past, qlen);
+            fill_pos_vector(ctx, v_pos, pos, n_past, qlen);

             // shift cache
             if (shift_pending.shift > 0)

@@ -1165,9 +1175,10 @@ namespace chatllm
                const int total = n_past + qlen > cache_length ? cache_length : n_past + qlen;
                const int start = (cache_offset + n_past + qlen - total) % cache_length;

-                int32_t *p = (int32_t *)indices->data;
                for (int i = 0; i < total; i++)
-                    p[i] = (start + i) % cache_length;
+                    v_indices[i] = (start + i) % cache_length;
+
+                Backend::write_tensor_data(indices, v_indices.data(), 0, total * sizeof(v_indices[0]));
            }
         }

@@ -1213,6 +1224,7 @@ namespace chatllm
     public:
         int cache_offset;
         ggml::tensor *indices;
+        std::vector<int> v_indices;
     };

     template <int sliding_window_len> class BaseSlidingWindowAttentionFullCache : public BaseAttention

@@ -1300,7 +1312,7 @@ namespace chatllm
         {
             if (n_past == 0) cache_offset = 0;

-            fill_pos_vector(pos, n_past, qlen);
+            fill_pos_vector(ctx, v_pos, pos, n_past, qlen);

             // shift cache
             if (shift_pending.shift > 0)
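
Note: the sliding-window cache addresses its ring buffer with wrap-around indices; the index math from the @@ -1165 hunk, checked standalone (window size and positions are made-up values):

    #include <cstdio>

    int main()
    {
        const int cache_length = 4;   // hypothetical window size
        const int cache_offset = 0, n_past = 3, qlen = 2;
        // Keep at most cache_length entries, starting at the oldest slot.
        const int total = n_past + qlen > cache_length ? cache_length : n_past + qlen;
        const int start = (cache_offset + n_past + qlen - total) % cache_length;
        for (int i = 0; i < total; i++)
            printf("%d ", (start + i) % cache_length);  // prints: 1 2 3 0
        return 0;
    }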

src/models.cpp

Lines changed: 6 additions & 3 deletions

@@ -991,16 +991,19 @@ namespace chatllm
        }

        ForwardContext ctx(&backend_context);
+
        ctx.gctx = GGMLContext({.mem_size = backend_context.buf_compute_meta.size(), .mem_buffer = backend_context.buf_compute_meta.data(), .no_alloc = true});
-        int n_threads = gen_config.num_threads;
        ctx.gf = ggml::new_graph_custom(&ctx, GRAPH_SIZE, false);

        dbg_ctx = &ctx;

+        ctx.move_to_layer(LayerAllocatorManager::MiscLayer::Prolog);
        ggml::tensor *input_ids_tensor = ggml::new_tensor_1d(&ctx, GGML_TYPE_I32, input_ids.size());

        ggml::tensor *r = transformer->forward(&ctx, input_ids_tensor, past);

+        ctx.move_to_layer(LayerAllocatorManager::MiscLayer::Epilog);
+
        if (logit_scale > 0)
            r = ggml::scale_inplace(&ctx, r, logit_scale);

@@ -1010,9 +1013,9 @@ namespace chatllm

        output.resize(ggml::nbytes(r) / sizeof(output[0]));

-        ctx.allocate();
+        CHATLLM_CHECK(ctx.allocate()) << "failed to allocate memory for graph";
        Backend::write_tensor_data(input_ids_tensor, input_ids.data());
-        ctx.compute(n_threads);
+        ctx.compute(gen_config.num_threads);

        Backend::read_tensor_data(r, output.data());
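
Note: taken together, the hunks above give the graph build a strict build -> allocate -> upload -> compute -> download order, with allocation failure now checked. Condensed from the diff (error paths elided):

    ctx.move_to_layer(LayerAllocatorManager::MiscLayer::Prolog);
    ggml::tensor *input_ids_tensor = ggml::new_tensor_1d(&ctx, GGML_TYPE_I32, input_ids.size());
    ggml::tensor *r = transformer->forward(&ctx, input_ids_tensor, past);
    ctx.move_to_layer(LayerAllocatorManager::MiscLayer::Epilog);

    CHATLLM_CHECK(ctx.allocate()) << "failed to allocate memory for graph";  // may fail now
    Backend::write_tensor_data(input_ids_tensor, input_ids.data());          // upload after alloc
    ctx.compute(gen_config.num_threads);
    Backend::read_tensor_data(r, output.data());                             // download results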