#include "llama-adapter.h"

+#include "llama-model.h"
+
+#include <algorithm>
+#include <map>
+#include <cassert>
+#include <stdexcept>
+#include <vector>
+
+// vec
+
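+// returns the direction tensor for layer il, or nullptr when il is outside the
+// active [layer_start, layer_end] range or has no tensor allocated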
+struct ggml_tensor * llama_control_vector::tensor_for(int il) const {
+    if (il < 0 || il < layer_start || il > layer_end || (size_t) il >= tensors.size()) {
+        return nullptr;
+    }
+
+    return tensors[il];
+}
+
+struct ggml_tensor * llama_control_vector::apply_to(struct ggml_context * ctx, struct ggml_tensor * cur, int il) const {
+    ggml_tensor * layer_dir = tensor_for(il);
+    if (layer_dir != nullptr) {
+        cur = ggml_add(ctx, cur, layer_dir);
+    }
+
+    return cur;
+}
+
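+// lazily allocates one F32 tensor of size n_embd per layer (layer 0 is always
+// skipped), using llama_model_select_buft() so each tensor lands on the same
+// buffer type as the corresponding model layer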
+static bool llama_control_vector_init(struct llama_control_vector & cvec, const llama_model & model) {
+    const auto & hparams = model.hparams;
+
+    GGML_ASSERT(cvec.tensors.empty());
+    GGML_ASSERT(cvec.ctxs.empty());
+    GGML_ASSERT(cvec.bufs.empty());
+
+    // create a context for each buffer type
+    std::map<ggml_backend_buffer_type_t, ggml_context *> ctx_map;
+    auto ctx_for_buft = [&](ggml_backend_buffer_type_t buft) -> ggml_context * {
+        auto it = ctx_map.find(buft);
+        if (it == ctx_map.end()) {
+            struct ggml_init_params params = {
+                /*.mem_size   =*/ hparams.n_layer*ggml_tensor_overhead(),
+                /*.mem_buffer =*/ NULL,
+                /*.no_alloc   =*/ true,
+            };
+
+            ggml_context * ctx = ggml_init(params);
+            if (!ctx) {
+                return nullptr;
+            }
+
+            ctx_map[buft] = ctx;
+            cvec.ctxs.emplace_back(ctx);
+
+            return ctx;
+        }
+
+        return it->second;
+    };
+
+    // make tensors
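+    // index 0 stays nullptr so tensors[il] can be indexed directly by layer id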
+    cvec.tensors.reserve(hparams.n_layer);
+    cvec.tensors.push_back(nullptr); // there's never a tensor for layer 0
+    for (size_t il = 1; il < hparams.n_layer; il++) {
+        ggml_backend_buffer_type_t buft = llama_model_select_buft(model, il);
+        ggml_context * ctx = ctx_for_buft(buft);
+        if (!ctx) {
+            LLAMA_LOG_ERROR("%s: failed to allocate context for control vector\n", __func__);
+            return false;
+        }
+        ggml_tensor * tensor = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, hparams.n_embd);
+        cvec.tensors.push_back(tensor);
+    }
+
+    // allocate tensors / buffers and zero
+    cvec.bufs.reserve(ctx_map.size());
+    for (auto it : ctx_map) {
+        ggml_backend_buffer_type_t buft = it.first;
+        ggml_context * ctx = it.second;
+        ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft);
+        if (!buf) {
+            LLAMA_LOG_ERROR("%s: failed to allocate buffer for control vector\n", __func__);
+            return false;
+        }
+        ggml_backend_buffer_clear(buf, 0);
+        cvec.bufs.emplace_back(buf);
+    }
+
+    return true;
+}
+
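+// loads control vector data into cvec, initializing the tensors on first use;
+// `data` holds n_embd floats per layer starting at layer 1, and data == nullptr
+// disables the control vector without freeing it. returns 0 on success, 1 on
+// n_embd mismatch or allocation failure.
+//
+// minimal call-site sketch (cv_data and n_layer are illustrative, not part of
+// this API):
+//
+//   std::vector<float> cv_data = ...; // n_embd floats per layer, layers 1..n_layer-1
+//   int32_t res = llama_control_vector_apply(cvec, model, cv_data.data(), cv_data.size(),
+//                                            model.hparams.n_embd, /*il_start=*/1, /*il_end=*/n_layer - 1);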
+int32_t llama_control_vector_apply(
+        struct llama_control_vector & cvec,
+        const llama_model & model,
+        const float * data,
+        size_t len,
+        int32_t n_embd,
+        int32_t il_start,
+        int32_t il_end) {
+    const auto & hparams = model.hparams;
+
+    if (data == nullptr) {
+        // disable the current control vector (but leave allocated for later)
+        cvec.layer_start = -1;
+        cvec.layer_end   = -1;
+        return 0;
+    }
+
+    if (n_embd != (int) hparams.n_embd) {
+        LLAMA_LOG_ERROR("%s: control vector n_embd does not match model\n", __func__);
+        return 1;
+    }
+
+    if (cvec.tensors.empty()) {
+        if (!llama_control_vector_init(cvec, model)) {
+            return 1;
+        }
+    }
+
+    cvec.layer_start = il_start;
+    cvec.layer_end   = il_end;
+
+    for (size_t il = 1; il < hparams.n_layer; il++) {
+        assert(cvec.tensors[il] != nullptr);
+
+        const size_t off = n_embd * (il - 1); // buffer doesn't have data for layer 0, since it's never present
+        if (off + n_embd <= len) {
+            ggml_backend_tensor_set(cvec.tensors[il], data + off, 0, n_embd * ggml_element_size(cvec.tensors[il]));
+        }
+    }
+
+    return 0;
+}
+
+// lora
+
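+// looks up the LoRA A/B pair for a base-model weight by the weight tensor's
+// name; returns nullptr if this adapter has no entry for that tensor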
+llama_lora_weight * llama_lora_adapter::get_weight(struct ggml_tensor * w) {
+    const std::string name(w->name);
+
+    const auto pos = ab_map.find(name);
+    if (pos != ab_map.end()) {
+        return &pos->second;
+    }
+
+    return nullptr;
+}
+
void llama_lora_adapter_free(struct llama_lora_adapter * adapter) {
    delete adapter;
}
+
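+// loads a LoRA adapter from a gguf file: validates the adapter metadata, pairs
+// the ".lora_a"/".lora_b" tensors, allocates device copies on the same buffer
+// types as the corresponding base-model weights, and uploads the tensor data;
+// throws std::runtime_error on any validation or allocation failure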
+void llama_lora_adapter_init_impl(struct llama_model & model, const char * path_lora, struct llama_lora_adapter & adapter) {
+    LLAMA_LOG_INFO("%s: loading lora adapter from '%s' ...\n", __func__, path_lora);
+
+    ggml_context * ctx_init;
+    struct gguf_init_params meta_gguf_params = {
+        /* .no_alloc = */ true,
+        /* .ctx      = */ &ctx_init,
+    };
+
+    gguf_context_ptr ctx_gguf { gguf_init_from_file(path_lora, meta_gguf_params) };
+    if (!ctx_gguf) {
+        throw std::runtime_error("failed to load lora adapter file from " + std::string(path_lora));
+    }
+
+    ggml_context_ptr ctx { ctx_init };
+
+    // check metadata
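+    // (LLM_KV_GENERAL_TYPE must be "adapter", LLM_KV_GENERAL_ARCHITECTURE must
+    //  match model.arch, LLM_KV_ADAPTER_TYPE must be "lora"; the value of
+    //  LLM_KV_ADAPTER_LORA_ALPHA is stored in adapter.alpha)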
+    {
+        auto get_kv_str = [&](const std::string & key) -> std::string {
+            int id = gguf_find_key(ctx_gguf.get(), key.c_str());
+            return id < 0 ? "" : std::string(gguf_get_val_str(ctx_gguf.get(), id));
+        };
+        auto get_kv_f32 = [&](const std::string & key) -> float {
+            int id = gguf_find_key(ctx_gguf.get(), key.c_str());
+            return id < 0 ? 0.0f : gguf_get_val_f32(ctx_gguf.get(), id);
+        };
+        LLM_KV llm_kv = LLM_KV(LLM_ARCH_UNKNOWN);
+
+        auto general_type = get_kv_str(llm_kv(LLM_KV_GENERAL_TYPE));
+        if (general_type != "adapter") {
+            throw std::runtime_error("expect general.type to be 'adapter', but got: " + general_type);
+        }
+
+        auto general_arch_str = get_kv_str(llm_kv(LLM_KV_GENERAL_ARCHITECTURE));
+        auto general_arch = llm_arch_from_string(general_arch_str);
+        if (general_arch != model.arch) {
+            throw std::runtime_error("model arch and LoRA arch mismatch");
+        }
+
+        auto adapter_type = get_kv_str(llm_kv(LLM_KV_ADAPTER_TYPE));
+        if (adapter_type != "lora") {
+            throw std::runtime_error("expect adapter.type to be 'lora', but got: " + adapter_type);
+        }
+
+        adapter.alpha = get_kv_f32(llm_kv(LLM_KV_ADAPTER_LORA_ALPHA));
+    }
+
+    int n_tensors = gguf_get_n_tensors(ctx_gguf.get());
+
+    // contexts for each buffer type
+    std::map<ggml_backend_buffer_type_t, ggml_context *> ctx_map;
+    auto ctx_for_buft = [&](ggml_backend_buffer_type_t buft) -> ggml_context * {
+        auto it = ctx_map.find(buft);
+        if (it == ctx_map.end()) {
+            // add a new context
+            struct ggml_init_params params = {
+                /*.mem_size   =*/ n_tensors*ggml_tensor_overhead(),
+                /*.mem_buffer =*/ NULL,
+                /*.no_alloc   =*/ true,
+            };
+            ggml_context * buft_ctx = ggml_init(params);
+            if (!buft_ctx) {
+                return nullptr;
+            }
+            ctx_map[buft] = buft_ctx;
+            adapter.ctxs.emplace_back(buft_ctx);
+            return buft_ctx;
+        }
+        return it->second;
+    };
+
+    // bundle lora_a and lora_b into pairs
+    std::map<std::string, llama_lora_weight> ab_map;
+    auto str_endswith = [](const std::string & str, const std::string & suffix) {
+        return str.size() >= suffix.size() && str.compare(str.size()-suffix.size(), suffix.size(), suffix) == 0;
+    };
+
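+    // adapter tensors are named "<base tensor name>.lora_a" / ".lora_b";
+    // the suffix is stripped so both halves share the same ab_map key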
+    for (ggml_tensor * cur = ggml_get_first_tensor(ctx.get()); cur; cur = ggml_get_next_tensor(ctx.get(), cur)) {
+        std::string name(cur->name);
+        if (str_endswith(name, ".lora_a")) {
+            replace_all(name, ".lora_a", "");
+            if (ab_map.find(name) == ab_map.end()) {
+                ab_map[name] = llama_lora_weight(cur, nullptr);
+            } else {
+                ab_map[name].a = cur;
+            }
+        } else if (str_endswith(name, ".lora_b")) {
+            replace_all(name, ".lora_b", "");
+            if (ab_map.find(name) == ab_map.end()) {
+                ab_map[name] = llama_lora_weight(nullptr, cur);
+            } else {
+                ab_map[name].b = cur;
+            }
+        } else {
+            throw std::runtime_error("LoRA tensor '" + name + "' has unexpected suffix");
+        }
+    }
+
+    // add tensors
+    for (auto & it : ab_map) {
+        const std::string & name = it.first;
+        llama_lora_weight & w = it.second;
+
+        if (!w.a || !w.b) {
+            throw std::runtime_error("LoRA tensor pair for '" + name + "' is missing one component");
+        }
+
+        // device buft and device ctx
+        auto * model_tensor = llama_model_get_tensor(model, name.c_str());
+        if (!model_tensor) {
+            throw std::runtime_error("LoRA tensor '" + name + "' does not exist in base model");
+        }
+
+        struct ggml_context * dev_ctx = ctx_for_buft(ggml_backend_buffer_get_type(model_tensor->buffer));
+        // validate tensor shape
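+        // (for a base weight W of shape [ne0, ne1], lora_a is expected as
+        //  [ne0, r] and lora_b as [r, ne1], with rank r = a->ne[1] = b->ne[0])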
+        if (model_tensor->ne[0] != w.a->ne[0] || model_tensor->ne[1] != w.b->ne[1]) {
+            throw std::runtime_error("tensor '" + name + "' has incorrect shape");
+        }
+        if (w.a->ne[1] != w.b->ne[0]) {
+            throw std::runtime_error("lora_a tensor is not transposed (hint: adapter from \"finetune\" example is no longer supported)");
+        }
+
+        // save tensor to adapter
+        struct ggml_tensor * tensor_a = ggml_dup_tensor(dev_ctx, w.a);
+        struct ggml_tensor * tensor_b = ggml_dup_tensor(dev_ctx, w.b);
+        ggml_set_name(tensor_a, w.a->name);
+        ggml_set_name(tensor_b, w.b->name);
+        adapter.ab_map[name] = llama_lora_weight(tensor_a, tensor_b);
+    }
+
+    // allocate tensors / buffers and zero
+    {
+        adapter.ctxs.reserve(ctx_map.size());
+        adapter.bufs.reserve(ctx_map.size());
+        for (auto & it : ctx_map) {
+            ggml_backend_buffer_type_t buft = it.first;
+            ggml_context * ctx_dev = it.second;
+            ggml_backend_buffer_ptr buf { ggml_backend_alloc_ctx_tensors_from_buft(ctx_dev, buft) };
+            if (!buf) {
+                throw std::runtime_error("failed to allocate buffer for lora adapter\n");
+            }
+            LLAMA_LOG_INFO("%s: %10s LoRA buffer size = %8.2f MiB\n", __func__, ggml_backend_buffer_name(buf.get()), ggml_backend_buffer_get_size(buf.get())/1024.0/1024.0);
+            adapter.bufs.emplace_back(std::move(buf));
+        }
+    }
+
+    // set tensor data
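+    // (read each A/B tensor's raw bytes from the file at the gguf data offset
+    //  plus the per-tensor offset, then upload them to the device copies with
+    //  ggml_backend_tensor_set)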
+    {
+        llama_file gguf_file(path_lora, "rb");
+        std::vector<uint8_t> read_buf;
+        auto set_tensor = [&](struct ggml_tensor * orig, struct ggml_tensor * dev) {
+            size_t offs = gguf_get_data_offset(ctx_gguf.get()) + gguf_get_tensor_offset(ctx_gguf.get(), gguf_find_tensor(ctx_gguf.get(), orig->name));
+            size_t size = ggml_nbytes(orig);
+            read_buf.resize(size);
+            gguf_file.seek(offs, SEEK_SET);
+            gguf_file.read_raw(read_buf.data(), size);
+            ggml_backend_tensor_set(dev, read_buf.data(), 0, size);
+        };
+        for (auto & it : adapter.ab_map) {
+            auto orig = ab_map[it.first];
+            auto dev  = it.second;
+            set_tensor(orig.a, dev.a);
+            set_tensor(orig.b, dev.b);
+        }
+    }
+
+    LLAMA_LOG_INFO("%s: loaded %zu tensors from lora file\n", __func__, adapter.ab_map.size()*2);
+}