@@ -32,6 +32,309 @@ static int32_t llama_relative_position_bucket(llama_pos x, llama_pos y, uint64_t
     return relative_bucket;
 }
 
+llama_context::llama_context(const llama_model & model, const llama_context_params & params, std::function<ggml_cgraph *(llama_context &, const llama_ubatch &)> fn_build_graph_worst) :
+    model(model),
+    t_start_us(model.t_start_us),
+    t_load_us (model.t_load_us) {
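+    // fn_build_graph_worst is only used below to build worst-case graphs when reserving the
+    // scheduler's compute buffers; it is not stored in the context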
+
+    const auto & hparams = model.hparams;
+
+    cparams.n_seq_max        = std::max(1u, params.n_seq_max);
+    cparams.n_threads        = params.n_threads;
+    cparams.n_threads_batch  = params.n_threads_batch;
+    cparams.yarn_ext_factor  = params.yarn_ext_factor;
+    cparams.yarn_attn_factor = params.yarn_attn_factor;
+    cparams.yarn_beta_fast   = params.yarn_beta_fast;
+    cparams.yarn_beta_slow   = params.yarn_beta_slow;
+    cparams.defrag_thold     = params.defrag_thold;
+    cparams.embeddings       = params.embeddings;
+    cparams.offload_kqv      = params.offload_kqv;
+    cparams.flash_attn       = params.flash_attn;
+    cparams.no_perf          = params.no_perf;
+    cparams.pooling_type     = params.pooling_type;
+
+    cparams.n_ctx            = params.n_ctx           == 0    ? hparams.n_ctx_train           : params.n_ctx;
+    cparams.rope_freq_base   = params.rope_freq_base  == 0.0f ? hparams.rope_freq_base_train  : params.rope_freq_base;
+    cparams.rope_freq_scale  = params.rope_freq_scale == 0.0f ? hparams.rope_freq_scale_train : params.rope_freq_scale;
+
+    cparams.n_ctx            = GGML_PAD(cparams.n_ctx, get_ctx_padding(cparams));
+
+    // with causal attention, the batch size is limited by the context size
+    cparams.n_batch          = hparams.causal_attn ? std::min(cparams.n_ctx, params.n_batch) : params.n_batch;
+
+    // the batch has to be at least GGML_KQ_MASK_PAD because we will be padding the KQ_mask
+    // this is required by GPU kernels in order to avoid out-of-bounds accesses (e.g. ggml_flash_attn_ext)
+    // ref: https://github.com/ggerganov/llama.cpp/pull/5021
+    if (cparams.n_batch < GGML_KQ_MASK_PAD) {
+        LLAMA_LOG_WARN("%s: n_batch is less than GGML_KQ_MASK_PAD - increasing to %d\n", __func__, GGML_KQ_MASK_PAD);
+        cparams.n_batch = GGML_KQ_MASK_PAD;
+    }
+
+    cparams.n_ubatch         = std::min(cparams.n_batch, params.n_ubatch == 0 ? params.n_batch : params.n_ubatch);
+
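+    // YaRN original context size: prefer the user-supplied value, then the model metadata,
+    // and finally fall back to the training context size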
+    cparams.n_ctx_orig_yarn  = params.yarn_orig_ctx    != 0 ? params.yarn_orig_ctx    :
+                               hparams.n_ctx_orig_yarn != 0 ? hparams.n_ctx_orig_yarn :
+                                                              hparams.n_ctx_train;
+
+    cparams.cb_eval           = params.cb_eval;
+    cparams.cb_eval_user_data = params.cb_eval_user_data;
+
+    auto rope_scaling_type = params.rope_scaling_type;
+    if (rope_scaling_type == LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED) {
+        rope_scaling_type = hparams.rope_scaling_type_train;
+    }
+
+    if (rope_scaling_type == LLAMA_ROPE_SCALING_TYPE_NONE) {
+        cparams.rope_freq_scale = 1.0f; // never scale if scaling type is none
+    }
+
+    if (cparams.yarn_ext_factor < 0.0f) { // negative indicates 'not set'
+        cparams.yarn_ext_factor = rope_scaling_type == LLAMA_ROPE_SCALING_TYPE_YARN ? 1.0f : 0.0f;
+    }
+
+    cparams.yarn_attn_factor *= hparams.rope_attn_factor;
+
+    if (cparams.pooling_type == LLAMA_POOLING_TYPE_UNSPECIFIED) {
+        if (hparams.pooling_type == LLAMA_POOLING_TYPE_UNSPECIFIED) {
+            cparams.pooling_type = LLAMA_POOLING_TYPE_NONE;
+        } else {
+            cparams.pooling_type = hparams.pooling_type;
+        }
+    }
+
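+    // an explicitly requested attention type overrides the model's default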
+    if (params.attention_type == LLAMA_ATTENTION_TYPE_UNSPECIFIED) {
+        cparams.causal_attn = hparams.causal_attn;
+    } else {
+        cparams.causal_attn = params.attention_type == LLAMA_ATTENTION_TYPE_CAUSAL;
+    }
+
+    const uint32_t n_ctx_per_seq = cparams.n_ctx / cparams.n_seq_max;
+
+    LLAMA_LOG_INFO("%s: n_seq_max     = %u\n",   __func__, cparams.n_seq_max);
+    LLAMA_LOG_INFO("%s: n_ctx         = %u\n",   __func__, cparams.n_ctx);
+    LLAMA_LOG_INFO("%s: n_ctx_per_seq = %u\n",   __func__, n_ctx_per_seq);
+    LLAMA_LOG_INFO("%s: n_batch       = %u\n",   __func__, cparams.n_batch);
+    LLAMA_LOG_INFO("%s: n_ubatch      = %u\n",   __func__, cparams.n_ubatch);
+    LLAMA_LOG_INFO("%s: flash_attn    = %d\n",   __func__, cparams.flash_attn);
+    LLAMA_LOG_INFO("%s: freq_base     = %.1f\n", __func__, cparams.rope_freq_base);
+    LLAMA_LOG_INFO("%s: freq_scale    = %g\n",   __func__, cparams.rope_freq_scale);
+
+    if (n_ctx_per_seq < hparams.n_ctx_train) {
+        LLAMA_LOG_WARN("%s: n_ctx_per_seq (%u) < n_ctx_train (%u) -- the full capacity of the model will not be utilized\n",
+                __func__, n_ctx_per_seq, hparams.n_ctx_train);
+    }
+
+    if (n_ctx_per_seq > hparams.n_ctx_train) {
+        LLAMA_LOG_WARN("%s: n_ctx_per_seq (%u) > n_ctx_train (%u) -- possible training context overflow\n",
+                __func__, n_ctx_per_seq, hparams.n_ctx_train);
+    }
+
+    logits_all = params.logits_all;
+
+    // build worst-case graph for encoder if a model contains encoder
+    is_encoding = llama_model_has_encoder(&model); // TODO: model.has_encoder()
+
+    uint32_t kv_size = cparams.n_ctx;
+    ggml_type type_k = params.type_k;
+    ggml_type type_v = params.type_v;
+
+    // Mamba only needs a constant number of KV cache cells per sequence
+    if (llama_model_is_recurrent(&model)) {
+        // Mamba needs at least as many KV cells as there are sequences kept at any time
+        kv_size = std::max((uint32_t) 1, params.n_seq_max);
+        // it's probably best to keep as much precision as possible for the states
+        type_k = GGML_TYPE_F32; // required by ggml_ssm_conv for Mamba's conv_states
+        type_v = GGML_TYPE_F32; // required by ggml_ssm_scan for Mamba's ssm_states
+    }
+
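+    // quantized K/V types store values in blocks, so the head dimensions must be divisible by the block size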
+    GGML_ASSERT(hparams.n_embd_head_k % ggml_blck_size(type_k) == 0);
+    GGML_ASSERT(hparams.n_embd_head_v % ggml_blck_size(type_v) == 0);
+
+    if (!hparams.vocab_only) {
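+        // vocab-only contexts are used for tokenization only -- no backends, KV cache or compute buffers are needed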
+        // GPU backends
+        for (auto * dev : model.devices) {
+            ggml_backend_t backend = ggml_backend_dev_init(dev, nullptr);
+            if (backend == nullptr) {
+                LLAMA_LOG_ERROR("%s: failed to initialize %s backend\n", __func__, ggml_backend_dev_name(dev));
+                throw std::runtime_error("failed to initialize backend");
+            }
+            backends.emplace_back(backend);
+        }
+
+        // add ACCEL backends (such as BLAS)
+        for (size_t i = 0; i < ggml_backend_dev_count(); ++i) {
+            ggml_backend_dev_t dev = ggml_backend_dev_get(i);
+            if (ggml_backend_dev_type(dev) == GGML_BACKEND_DEVICE_TYPE_ACCEL) {
+                ggml_backend_t backend = ggml_backend_dev_init(dev, nullptr);
+                if (backend == nullptr) {
+                    LLAMA_LOG_ERROR("%s: failed to initialize %s backend\n", __func__, ggml_backend_dev_name(dev));
+                    throw std::runtime_error("failed to initialize backend");
+                }
+                backends.emplace_back(backend);
+            }
+        }
+
+        // add CPU backend
+        backend_cpu = ggml_backend_init_by_type(GGML_BACKEND_DEVICE_TYPE_CPU, nullptr);
+        if (backend_cpu == nullptr) {
+            LLAMA_LOG_ERROR("%s: failed to initialize CPU backend\n", __func__);
+            throw std::runtime_error("failed to initialize CPU backend");
+        }
+        backends.emplace_back(backend_cpu);
+
+        // create a list of the set_n_threads functions in the backends
+        for (auto & backend : backends) {
+            ggml_backend_dev_t dev = ggml_backend_get_device(backend.get());
+            ggml_backend_reg_t reg = dev ? ggml_backend_dev_backend_reg(dev) : nullptr;
+            if (reg) {
+                auto ggml_backend_set_n_threads_fn = (ggml_backend_set_n_threads_t) ggml_backend_reg_get_proc_address(reg, "ggml_backend_set_n_threads");
+                if (ggml_backend_set_n_threads_fn) {
+                    set_n_threads_fns.emplace_back(backend.get(), ggml_backend_set_n_threads_fn);
+                }
+            }
+        }
+
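+        // register the user-provided abort callback so long-running graph computation can be interrupted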
+        llama_set_abort_callback(this, params.abort_callback, params.abort_callback_data);
+
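+        // allocate the KV cache used by self-attention (recurrent models store their states in the same cache)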
+        if (!kv_self.init(model, cparams, type_k, type_v, kv_size, cparams.offload_kqv)) {
+            LLAMA_LOG_ERROR("%s: llama_kv_cache_init() failed for self-attention cache\n", __func__);
+            throw std::runtime_error("failed to initialize self-attention cache");
+        }
+
+        {
+            const size_t memory_size_k = kv_self.size_k_bytes();
+            const size_t memory_size_v = kv_self.size_v_bytes();
+
+            LLAMA_LOG_INFO("%s: KV self size  = %7.2f MiB, K (%s): %7.2f MiB, V (%s): %7.2f MiB\n", __func__,
+                    (float)(memory_size_k + memory_size_v) / (1024.0f * 1024.0f),
+                    ggml_type_name(type_k), (float)memory_size_k / (1024.0f * 1024.0f),
+                    ggml_type_name(type_v), (float)memory_size_v / (1024.0f * 1024.0f));
+        }
+
+        // graph outputs buffer
+        {
+            // resized during inference when a batch uses more outputs
+            if (llama_output_reserve(*this, params.n_seq_max) < params.n_seq_max) {
+                LLAMA_LOG_ERROR("%s: failed to reserve initial output buffer\n", __func__);
+                throw std::runtime_error("failed to reserve initial output buffer");
+            }
+
+            LLAMA_LOG_INFO("%s: %10s  output buffer size = %8.2f MiB\n", __func__,
+                    ggml_backend_buffer_name(buf_output.get()),
+                    ggml_backend_buffer_get_size(buf_output.get()) / 1024.0 / 1024.0);
+        }
+
+        // scheduler and compute buffers
+        {
+            // buffer types used for the compute buffer of each backend
+            std::vector<ggml_backend_buffer_type_t> backend_buft;
+            std::vector<ggml_backend_t> backend_ptrs;
+            for (auto & backend : backends) {
+                auto * buft = ggml_backend_get_default_buffer_type(backend.get());
+                auto backend_type = ggml_backend_dev_type(ggml_backend_get_device(backend.get()));
+                if (backend_type == GGML_BACKEND_DEVICE_TYPE_CPU && !model.devices.empty()) {
+                    // use the host buffer of the first device for faster transfer of the intermediate state
+                    auto * dev = model.devices[0];
+                    auto * host_buft = ggml_backend_dev_host_buffer_type(dev);
+                    if (host_buft) {
+                        buft = host_buft;
+                    }
+                }
+                backend_buft.push_back(buft);
+                backend_ptrs.push_back(backend.get());
+            }
+
+            const size_t max_nodes = model.max_nodes();
+
+            // buffer used to store the computation graph and the tensor meta data
+            buf_compute_meta.resize(ggml_tensor_overhead()*max_nodes + ggml_graph_overhead_custom(max_nodes, false));
+
+            // TODO: move these checks to ggml_backend_sched
+            // enabling pipeline parallelism in the scheduler increases memory usage, so it is only done when necessary
+            bool pipeline_parallel =
+                model.n_devices() > 1 &&
+                model.params.n_gpu_layers > (int) model.hparams.n_layer &&
+                model.params.split_mode == LLAMA_SPLIT_MODE_LAYER &&
+                params.offload_kqv;
+
+            // pipeline parallelism requires support for async compute and events in all devices
+            if (pipeline_parallel) {
+                for (auto & backend : backends) {
+                    auto dev_type = ggml_backend_dev_type(ggml_backend_get_device(backend.get()));
+                    if (dev_type == GGML_BACKEND_DEVICE_TYPE_CPU) {
+                        // ignore CPU backend
+                        continue;
+                    }
+                    auto * dev = ggml_backend_get_device(backend.get());
+                    ggml_backend_dev_props props;
+                    ggml_backend_dev_get_props(dev, &props);
+                    if (!props.caps.async || !props.caps.events) {
+                        // device does not support async compute or events
+                        pipeline_parallel = false;
+                        break;
+                    }
+                }
+            }
+
+            sched.reset(ggml_backend_sched_new(backend_ptrs.data(), backend_buft.data(), backend_ptrs.size(), max_nodes, pipeline_parallel));
+
+            if (pipeline_parallel) {
+                LLAMA_LOG_INFO("%s: pipeline parallelism enabled (n_copies=%d)\n", __func__, ggml_backend_sched_get_n_copies(sched.get()));
+            }
+
+            // initialize scheduler with the worst-case graph
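+            // "pp" = prompt processing (largest micro-batch), "tg" = token generation (single token);
+            // reserving with both shapes sizes the compute buffers for the worst case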
+            uint32_t n_seqs = 1; // TODO: worst-case number of sequences
+            uint32_t n_tokens = std::min(cparams.n_ctx, cparams.n_ubatch);
+            llama_token token = model.vocab.token_bos(); // not actually used by llama_build_graph, but required to choose between token and embedding inputs graph
+
+            llama_ubatch ubatch_pp = { true, n_tokens, n_tokens / n_seqs, n_seqs, &token, nullptr, nullptr, nullptr, nullptr, nullptr};
+            ggml_cgraph * gf_pp = fn_build_graph_worst(*this, ubatch_pp);
+
+            // reserve pp graph first so that buffers are only allocated once
+            ggml_backend_sched_reserve(sched.get(), gf_pp);
+            int n_splits_pp = ggml_backend_sched_get_n_splits(sched.get());
+            int n_nodes_pp  = ggml_graph_n_nodes(gf_pp);
+
+            // reserve with tg graph to get the number of splits and nodes
+            llama_ubatch ubatch_tg = { true, 1, 1, n_seqs, &token, nullptr, nullptr, nullptr, nullptr, nullptr};
+            ggml_cgraph * gf_tg = fn_build_graph_worst(*this, ubatch_tg);
+            ggml_backend_sched_reserve(sched.get(), gf_tg);
+            int n_splits_tg = ggml_backend_sched_get_n_splits(sched.get());
+            int n_nodes_tg  = ggml_graph_n_nodes(gf_tg);
+
+            // reserve again with pp graph to avoid ggml-alloc reallocations during inference
+            gf_pp = fn_build_graph_worst(*this, ubatch_pp);
+            if (!ggml_backend_sched_reserve(sched.get(), gf_pp)) {
+                LLAMA_LOG_ERROR("%s: failed to allocate compute buffers\n", __func__);
+                throw std::runtime_error("failed to allocate compute buffers");
+            }
+
+            for (size_t i = 0; i < backend_ptrs.size(); ++i) {
+                ggml_backend_t backend = backend_ptrs[i];
+                ggml_backend_buffer_type_t buft = backend_buft[i];
+                size_t size = ggml_backend_sched_get_buffer_size(sched.get(), backend);
+                if (size > 1) {
+                    LLAMA_LOG_INFO("%s: %10s compute buffer size = %8.2f MiB\n", __func__,
+                            ggml_backend_buft_name(buft),
+                            size / 1024.0 / 1024.0);
+                }
+            }
+
+            if (n_nodes_pp == n_nodes_tg) {
+                LLAMA_LOG_INFO("%s: graph nodes  = %d\n", __func__, n_nodes_pp);
+            } else {
+                LLAMA_LOG_INFO("%s: graph nodes  = %d (with bs=%d), %d (with bs=1)\n", __func__, n_nodes_pp, n_tokens, n_nodes_tg);
+            }
+            if (n_splits_pp == n_splits_tg) {
+                LLAMA_LOG_INFO("%s: graph splits = %d\n", __func__, n_splits_pp);
+            } else {
+                LLAMA_LOG_INFO("%s: graph splits = %d (with bs=%d), %d (with bs=1)\n", __func__, n_splits_pp, n_tokens, n_splits_tg);
+            }
+        }
+    }
+
+}
+
 struct llama_batch_manager : public llama_batch_manager_i {
     llama_batch_manager(llama_context & lctx, const llama_batch & batch, bool logits_all) : lctx(lctx), batch(batch), kv_slot_restorer(lctx.kv_self) {
         const auto & hparams = lctx.model.hparams;
@@ -81,7 +384,7 @@ struct llama_batch_manager : public llama_batch_manager_i {
 
         // non-causal masks do not use the KV cache
         if (hparams.causal_attn) {
-            llama_kv_self_update(&lctx);
+            lctx.kv_self_update();
 
             // if we have enough unused cells before the current head ->
             //   better to start searching from the beginning of the cache, hoping to fill it
@@ -106,6 +409,8 @@ struct llama_batch_manager : public llama_batch_manager_i {
             }
         }
 
+        // printf("kv_self.n = %5d, kv_self.used = %5d, kv_self.head = %5d\n", kv_self.n, kv_self.used, kv_self.head);
+
         return true;
     }