Commit be8f568

llama : adapter
ggml-ci
1 parent cdeebe5

File tree

7 files changed: +382 -329 lines

common/common.h

+10
@@ -481,7 +481,17 @@ std::string fs_get_cache_file(const std::string & filename);
 struct common_init_result {
     struct llama_model * model = nullptr;
     struct llama_context * context = nullptr;
+
     std::vector<common_lora_adapter_container> lora_adapters;
+
+    ~common_init_result() {
+        llama_free(context);
+        llama_free_model(model);
+
+        for (auto & lora_adapter : lora_adapters) {
+            llama_lora_adapter_free(lora_adapter.adapter);
+        }
+    }
 };

 struct common_init_result common_init_from_params(common_params & params);

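Note: with the new destructor, common_init_result owns the model, the context, and any loaded LoRA adapters, so callers can rely on scope-based cleanup instead of freeing each piece by hand. A minimal usage sketch (hypothetical params setup; error handling omitted):

    common_params params;
    params.model = "model.gguf"; // hypothetical model path

    {
        common_init_result llama_init = common_init_from_params(params);
        struct llama_model   * model = llama_init.model;
        struct llama_context * ctx   = llama_init.context;
        // ... run inference with model/ctx ...
    } // destructor frees the context, the model, and each LoRA adapter
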
include/llama.h

+2 -3
@@ -417,6 +417,7 @@ extern "C" {
             const char * path_model,
             struct llama_model_params params);

+    // TODO: rename to llama_model_free
     LLAMA_API void llama_free_model(struct llama_model * model);

     // TODO: rename to llama_init_from_model
@@ -507,7 +508,6 @@ extern "C" {
     //

     // Load a LoRA adapter from file
-    // The loaded adapter will be associated to the given model, and will be free when the model is deleted
     // TODO: rename to llama_adapter_lora_init
     LLAMA_API struct llama_lora_adapter * llama_lora_adapter_init(
             struct llama_model * model,
@@ -530,8 +530,7 @@ extern "C" {

     // Remove all LoRA adapters from given context
     // TODO: rename to llama_clear_adapter_lora
-    LLAMA_API void llama_lora_adapter_clear(
-            struct llama_context * ctx);
+    LLAMA_API void llama_lora_adapter_clear(struct llama_context * ctx);

     // Manually free a LoRA adapter
     // Note: loaded adapters will be free when the associated model is deleted

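Note: taken together, the pre-rename adapter API reads as below. A minimal sketch (hypothetical adapter path and scale; assumes a loaded model and context; llama_lora_adapter_set is the attach call from the same header, not shown in these hunks):

    // load once per model; the adapter is freed when the model is deleted,
    // or earlier with llama_lora_adapter_free
    struct llama_lora_adapter * adapter = llama_lora_adapter_init(model, "adapter.gguf");
    if (adapter != NULL) {
        llama_lora_adapter_set(ctx, adapter, 1.0f); // attach to the context with scale 1.0
        // ... decode with the adapter applied ...
        llama_lora_adapter_clear(ctx);              // detach all adapters from this context
    }
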
src/llama-adapter.cpp

+313
@@ -1,5 +1,318 @@
 #include "llama-adapter.h"

+#include "llama-model.h"
+
+#include <algorithm>
+#include <map>
+#include <cassert>
+
+// vec
+
+struct ggml_tensor * llama_control_vector::tensor_for(int il) const {
+    if (il < 0 || il < layer_start || il > layer_end || (size_t) il >= tensors.size()) {
+        return nullptr;
+    }
+
+    return tensors[il];
+}
+
+struct ggml_tensor * llama_control_vector::apply_to(struct ggml_context * ctx, struct ggml_tensor * cur, int il) const {
+    ggml_tensor * layer_dir = tensor_for(il);
+    if (layer_dir != nullptr) {
+        cur = ggml_add(ctx, cur, layer_dir);
+    }
+
+    return cur;
+}
+
+static bool llama_control_vector_init(struct llama_control_vector & cvec, const llama_model & model) {
+    const auto & hparams = model.hparams;
+
+    GGML_ASSERT(cvec.tensors.empty());
+    GGML_ASSERT(cvec.ctxs.empty());
+    GGML_ASSERT(cvec.bufs.empty());
+
+    // create a context for each buffer type
+    std::map<ggml_backend_buffer_type_t, ggml_context *> ctx_map;
+    auto ctx_for_buft = [&](ggml_backend_buffer_type_t buft) -> ggml_context * {
+        auto it = ctx_map.find(buft);
+        if (it == ctx_map.end()) {
+            struct ggml_init_params params = {
+                /*.mem_size   =*/ hparams.n_layer*ggml_tensor_overhead(),
+                /*.mem_buffer =*/ NULL,
+                /*.no_alloc   =*/ true,
+            };
+
+            ggml_context * ctx = ggml_init(params);
+            if (!ctx) {
+                return nullptr;
+            }
+
+            ctx_map[buft] = ctx;
+            cvec.ctxs.emplace_back(ctx);
+
+            return ctx;
+        }
+
+        return it->second;
+    };
+
+    // make tensors
+    cvec.tensors.reserve(hparams.n_layer);
+    cvec.tensors.push_back(nullptr); // there's never a tensor for layer 0
+    for (size_t il = 1; il < hparams.n_layer; il++) {
+        ggml_backend_buffer_type_t buft = llama_model_select_buft(model, il);
+        ggml_context * ctx = ctx_for_buft(buft);
+        if (!ctx) {
+            LLAMA_LOG_ERROR("%s: failed to allocate context for control vector\n", __func__);
+            return false;
+        }
+        ggml_tensor * tensor = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, hparams.n_embd);
+        cvec.tensors.push_back(tensor);
+    }
+
+    // allocate tensors / buffers and zero
+    cvec.bufs.reserve(ctx_map.size());
+    for (auto it : ctx_map) {
+        ggml_backend_buffer_type_t buft = it.first;
+        ggml_context * ctx = it.second;
+        ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft);
+        if (!buf) {
+            LLAMA_LOG_ERROR("%s: failed to allocate buffer for control vector\n", __func__);
+            return false;
+        }
+        ggml_backend_buffer_clear(buf, 0);
+        cvec.bufs.emplace_back(buf);
+    }
+
+    return true;
+}
+
+int32_t llama_control_vector_apply(
+        struct llama_control_vector & cvec,
+        const llama_model & model,
+        const float * data,
+        size_t len,
+        int32_t n_embd,
+        int32_t il_start,
+        int32_t il_end) {
+    const auto & hparams = model.hparams;
+
+    if (data == nullptr) {
+        // disable the current control vector (but leave allocated for later)
+        cvec.layer_start = -1;
+        cvec.layer_end   = -1;
+        return 0;
+    }
+
+    if (n_embd != (int) hparams.n_embd) {
+        LLAMA_LOG_ERROR("%s: control vector n_embd does not match model\n", __func__);
+        return 1;
+    }
+
+    if (cvec.tensors.empty()) {
+        if (!llama_control_vector_init(cvec, model)) {
+            return 1;
+        }
+    }
+
+    cvec.layer_start = il_start;
+    cvec.layer_end   = il_end;
+
+    for (size_t il = 1; il < hparams.n_layer; il++) {
+        assert(cvec.tensors[il] != nullptr);
+
+        const size_t off = n_embd * (il - 1); // buffer doesn't have data for layer 0, since it's never present
+        if (off + n_embd <= len) {
+            ggml_backend_tensor_set(cvec.tensors[il], data + off, 0, n_embd * ggml_element_size(cvec.tensors[il]));
+        }
+    }
+
+    return 0;
+}
+
+// lora
+
+llama_lora_weight * llama_lora_adapter::get_weight(struct ggml_tensor * w) {
+    const std::string name(w->name);
+
+    const auto pos = ab_map.find(name);
+    if (pos != ab_map.end()) {
+        return &pos->second;
+    }
+
+    return nullptr;
+}
+
 void llama_lora_adapter_free(struct llama_lora_adapter * adapter) {
     delete adapter;
 }
+
+void llama_lora_adapter_init_impl(struct llama_model & model, const char * path_lora, struct llama_lora_adapter & adapter) {
+    LLAMA_LOG_INFO("%s: loading lora adapter from '%s' ...\n", __func__, path_lora);
+
+    ggml_context * ctx_init;
+    struct gguf_init_params meta_gguf_params = {
+        /* .no_alloc = */ true,
+        /* .ctx      = */ &ctx_init,
+    };
+
+    gguf_context_ptr ctx_gguf { gguf_init_from_file(path_lora, meta_gguf_params) };
+    if (!ctx_gguf) {
+        throw std::runtime_error("failed to load lora adapter file from " + std::string(path_lora));
+    }
+
+    ggml_context_ptr ctx { ctx_init };
+
+    // check metadata
+    {
+        auto get_kv_str = [&](const std::string & key) -> std::string {
+            int id = gguf_find_key(ctx_gguf.get(), key.c_str());
+            return id < 0 ? "" : std::string(gguf_get_val_str(ctx_gguf.get(), id));
+        };
+        auto get_kv_f32 = [&](const std::string & key) -> float {
+            int id = gguf_find_key(ctx_gguf.get(), key.c_str());
+            return id < 0 ? 0.0f : gguf_get_val_f32(ctx_gguf.get(), id);
+        };
+        LLM_KV llm_kv = LLM_KV(LLM_ARCH_UNKNOWN);
+
+        auto general_type = get_kv_str(llm_kv(LLM_KV_GENERAL_TYPE));
+        if (general_type != "adapter") {
+            throw std::runtime_error("expect general.type to be 'adapter', but got: " + general_type);
+        }
+
+        auto general_arch_str = get_kv_str(llm_kv(LLM_KV_GENERAL_ARCHITECTURE));
+        auto general_arch = llm_arch_from_string(general_arch_str);
+        if (general_arch != model.arch) {
+            throw std::runtime_error("model arch and LoRA arch mismatch");
+        }
+
+        auto adapter_type = get_kv_str(llm_kv(LLM_KV_ADAPTER_TYPE));
+        if (adapter_type != "lora") {
+            throw std::runtime_error("expect adapter.type to be 'lora', but got: " + adapter_type);
+        }
+
+        adapter.alpha = get_kv_f32(llm_kv(LLM_KV_ADAPTER_LORA_ALPHA));
+    }
+
+    int n_tensors = gguf_get_n_tensors(ctx_gguf.get());
+
+    // contexts for each buffer type
+    std::map<ggml_backend_buffer_type_t, ggml_context *> ctx_map;
+    auto ctx_for_buft = [&](ggml_backend_buffer_type_t buft) -> ggml_context * {
+        auto it = ctx_map.find(buft);
+        if (it == ctx_map.end()) {
+            // add a new context
+            struct ggml_init_params params = {
+                /*.mem_size   =*/ n_tensors*ggml_tensor_overhead(),
+                /*.mem_buffer =*/ NULL,
+                /*.no_alloc   =*/ true,
+            };
+            ggml_context * buft_ctx = ggml_init(params);
+            if (!buft_ctx) {
+                return nullptr;
+            }
+            ctx_map[buft] = buft_ctx;
+            adapter.ctxs.emplace_back(buft_ctx);
+            return buft_ctx;
+        };
+        return it->second;
+    };
+
+    // bundle lora_a and lora_b into pairs
+    std::map<std::string, llama_lora_weight> ab_map;
+    auto str_endswith = [](const std::string & str, const std::string & suffix) {
+        return str.size() >= suffix.size() && str.compare(str.size()-suffix.size(), suffix.size(), suffix) == 0;
+    };
+
+    for (ggml_tensor * cur = ggml_get_first_tensor(ctx.get()); cur; cur = ggml_get_next_tensor(ctx.get(), cur)) {
+        std::string name(cur->name);
+        if (str_endswith(name, ".lora_a")) {
+            replace_all(name, ".lora_a", "");
+            if (ab_map.find(name) == ab_map.end()) {
+                ab_map[name] = llama_lora_weight(cur, nullptr);
+            } else {
+                ab_map[name].a = cur;
+            }
+        } else if (str_endswith(name, ".lora_b")) {
+            replace_all(name, ".lora_b", "");
+            if (ab_map.find(name) == ab_map.end()) {
+                ab_map[name] = llama_lora_weight(nullptr, cur);
+            } else {
+                ab_map[name].b = cur;
+            }
+        } else {
+            throw std::runtime_error("LoRA tensor '" + name + "' has unexpected suffix");
+        }
+    }
+
+    // add tensors
+    for (auto & it : ab_map) {
+        const std::string & name = it.first;
+        llama_lora_weight & w = it.second;
+
+        if (!w.a || !w.b) {
+            throw std::runtime_error("LoRA tensor pair for '" + name + "' is missing one component");
+        }
+
+        // device buft and device ctx
+        auto * model_tensor = llama_model_get_tensor(model, name.c_str());
+        if (!model_tensor) {
+            throw std::runtime_error("LoRA tensor '" + name + "' does not exist in base model");
+        }
+
+        struct ggml_context * dev_ctx = ctx_for_buft(ggml_backend_buffer_get_type(model_tensor->buffer));
+        // validate tensor shape
+        if (model_tensor->ne[0] != w.a->ne[0] || model_tensor->ne[1] != w.b->ne[1]) {
+            throw std::runtime_error("tensor '" + name + "' has incorrect shape");
+        }
+        if (w.a->ne[1] != w.b->ne[0]) {
+            throw std::runtime_error("lora_a tensor is not transposed (hint: adapter from \"finetune\" example is no longer supported)");
+        }
+
+        // save tensor to adapter
+        struct ggml_tensor * tensor_a = ggml_dup_tensor(dev_ctx, w.a);
+        struct ggml_tensor * tensor_b = ggml_dup_tensor(dev_ctx, w.b);
+        ggml_set_name(tensor_a, w.a->name);
+        ggml_set_name(tensor_b, w.b->name);
+        adapter.ab_map[name] = llama_lora_weight(tensor_a, tensor_b);
+    }
+
+    // allocate tensors / buffers and zero
+    {
+        adapter.ctxs.reserve(ctx_map.size());
+        adapter.bufs.reserve(ctx_map.size());
+        for (auto & it : ctx_map) {
+            ggml_backend_buffer_type_t buft = it.first;
+            ggml_context * ctx_dev = it.second;
+            ggml_backend_buffer_ptr buf { ggml_backend_alloc_ctx_tensors_from_buft(ctx_dev, buft) };
+            if (!buf) {
+                throw std::runtime_error("failed to allocate buffer for lora adapter\n");
+            }
+            LLAMA_LOG_INFO("%s: %10s LoRA buffer size = %8.2f MiB\n", __func__, ggml_backend_buffer_name(buf.get()), ggml_backend_buffer_get_size(buf.get())/1024.0/1024.0);
+            adapter.bufs.emplace_back(std::move(buf));
+        }
+    }
+
+    // set tensor data
+    {
+        llama_file gguf_file(path_lora, "rb");
+        std::vector<uint8_t> read_buf;
+        auto set_tensor = [&](struct ggml_tensor * orig, struct ggml_tensor * dev) {
+            size_t offs = gguf_get_data_offset(ctx_gguf.get()) + gguf_get_tensor_offset(ctx_gguf.get(), gguf_find_tensor(ctx_gguf.get(), orig->name));
+            size_t size = ggml_nbytes(orig);
+            read_buf.resize(size);
+            gguf_file.seek(offs, SEEK_SET);
+            gguf_file.read_raw(read_buf.data(), size);
+            ggml_backend_tensor_set(dev, read_buf.data(), 0, size);
+        };
+        for (auto & it : adapter.ab_map) {
+            auto orig = ab_map[it.first];
+            auto dev  = it.second;
+            set_tensor(orig.a, dev.a);
+            set_tensor(orig.b, dev.b);
+        }
+    }
+
+    LLAMA_LOG_INFO("%s: loaded %zu tensors from lora file\n", __func__, adapter.ab_map.size()*2);
+}
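
Note: as the apply loop shows, a control-vector buffer packs one n_embd-wide block of f32 directions per layer, starting at layer 1 (layer 0 never has one), so a buffer covering n_layer layers holds (n_layer - 1) * n_embd floats. A sketch against the context-level llama_control_vector_apply wrapper declared in llama.h (hypothetical layer count; assumes a valid model and ctx):

    // one direction vector per layer, flattened as [il - 1][n_embd]
    const int32_t n_embd  = llama_n_embd(model);
    const int32_t n_layer = 32; // hypothetical
    std::vector<float> data((size_t) (n_layer - 1) * n_embd, 0.0f);
    // ... fill data with per-layer directions ...
    llama_control_vector_apply(ctx, data.data(), data.size(), n_embd, 1, n_layer - 1);

    // passing data == nullptr later disables the vector without freeing its buffers
    llama_control_vector_apply(ctx, nullptr, 0, n_embd, 0, 0);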

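Note: the loader in llama_lora_adapter_init_impl expects an adapter GGUF of roughly the following shape; the tensor names below are illustrative, and each pair must match a tensor of the same name in the base model:

    general.type         = "adapter"
    general.architecture = <same architecture as the base model>
    adapter.type         = "lora"
    adapter.lora.alpha   = <float>

    blk.0.attn_q.weight.lora_a   # ne[0] = base tensor's ne[0], ne[1] = rank (stored transposed)
    blk.0.attn_q.weight.lora_b   # ne[0] = rank, ne[1] = base tensor's ne[1]
    ...

Any tensor whose name lacks a .lora_a/.lora_b suffix, or any pair missing one half, is rejected with a runtime_error.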