Add multiple derived adaptions hosting #8415

Closed · wants to merge 6 commits
27 changes: 27 additions & 0 deletions common/common.cpp
@@ -654,6 +654,18 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
params.hf_file = argv[i];
return true;
}
if (arg == "-mpa" || arg == "--model-path-alias") {
CHECK_ARG
std::string model_derived_alias = argv[i];
size_t equals_pos = model_derived_alias.find('=');
if (equals_pos != std::string::npos) {
std::string alias = model_derived_alias.substr(0, equals_pos);
std::string model_path = model_derived_alias.substr(equals_pos + 1);
params.derived_model_paths.emplace_back(alias, model_path);
} else {
fprintf(stderr, "error: -mpa expects ALIAS=PATH\n");
invalid_param = true;
}

return true;
}
if (arg == "--lora") {
CHECK_ARG
params.lora_adapter.emplace_back(argv[i], 1.0f);
@@ -2045,6 +2057,21 @@ std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_par
}
}

for (unsigned int i = 0; i < params.derived_model_paths.size(); ++i) {
const auto & derived_model_path = params.derived_model_paths[i];
const std::string & derived_model_name = std::get<0>(derived_model_path);
const std::string & derived_model_file = std::get<1>(derived_model_path);

llama_model * derived_model = llama_load_model_from_file(derived_model_file.c_str(), mparams);

if (derived_model == NULL) {
fprintf(stderr, "%s: error: failed to load derived model '%s'\n", __func__, derived_model_file.c_str());
continue; // do not register a model that failed to load
}

llama_model_set_name(derived_model, derived_model_name.c_str());
llama_ctx_set_derived_model(lctx, derived_model);
}

for (unsigned int i = 0; i < params.lora_adapter.size(); ++i) {
const std::string & lora_adapter = std::get<0>(params.lora_adapter[i]);
float lora_scale = std::get<1>(params.lora_adapter[i]);
3 changes: 3 additions & 0 deletions common/common.h
@@ -124,6 +124,9 @@ struct gpt_params {
std::vector<std::string> antiprompt; // strings upon which more user input is prompted (a.k.a. reverse prompts)
std::vector<llama_model_kv_override> kv_overrides;

// derived model paths as (alias, path) pairs
std::vector<std::tuple<std::string, std::string>> derived_model_paths;

// TODO: avoid tuple, use struct
std::vector<std::tuple<std::string, float>> lora_adapter; // lora adapter path with user defined scale
std::string lora_base = ""; // base model path for the lora adapter
1 change: 1 addition & 0 deletions examples/CMakeLists.txt
@@ -33,6 +33,7 @@ else()
add_subdirectory(lookahead)
add_subdirectory(lookup)
add_subdirectory(main)
add_subdirectory(multi-adaptation)
add_subdirectory(parallel)
add_subdirectory(passkey)
add_subdirectory(perplexity)
89 changes: 76 additions & 13 deletions examples/gguf-split/gguf-split.cpp
@@ -32,8 +32,10 @@ struct split_params {
int n_split_tensors = 128;
std::string input;
std::string output;
std::string tensor_set_file;
bool no_tensor_first_split = false;
bool dry_run = false;
bool customized_split = false;
};

static void split_print_usage(const char * executable) {
@@ -47,6 +49,7 @@ static void split_print_usage(const char * executable) {
printf(" -h, --help show this help message and exit\n");
printf(" --version show version and build info\n");
printf(" --split split GGUF to multiple GGUF (enabled by default)\n");
printf(" --tensor-set customize tensor set used to split. File contains modules, e.g. 'ffn_up.weight'");
printf(" --merge merge multiple GGUF to a single GGUF\n");
printf(" --split-max-tensors max tensors in each split (default: %d)\n", default_params.n_split_tensors);
printf(" --split-max-size N(M|G) max size per split\n");
@@ -121,6 +124,16 @@ static void split_params_parse_ex(int argc, const char ** argv, split_params & p
params.operation = SPLIT_OP_SPLIT;
}

if (arg == "--tensor-set") {
arg_found = true;
if (++arg_idx >= argc) {
invalid_param = true;
break;
}
params.tensor_set_file = argv[arg_idx];
params.customized_split = true;
}

if (is_mode_set) {
throw std::invalid_argument("error: either --split-max-tensors or --split-max-size can be specified, but not both");
}
@@ -180,12 +193,27 @@ static void zeros(std::ofstream & file, size_t n) {
}
}

static std::vector<std::string> read_customized_tensors(const std::string & tensor_set_file) {
std::vector<std::string> tensor_set;
std::ifstream f_tensor_set(tensor_set_file);
if (!f_tensor_set.is_open()) {
fprintf(stderr, "error: failed to open tensor set file %s\n", tensor_set_file.c_str());
exit(EXIT_FAILURE);
}
std::string line;
while (std::getline(f_tensor_set, line)) {
if (!line.empty() && line.back() == '\r') line.pop_back(); // tolerate CRLF tensor set files
if (line.empty()) continue; // skip blank lines: strstr("") would match every tensor
tensor_set.push_back(line);
}
return tensor_set;
}

struct split_strategy {
const split_params params;
std::ifstream & f_input;
struct gguf_context * ctx_gguf;
struct ggml_context * ctx_meta = NULL;
const int n_tensors;
std::string tensor_set_file;

// one ctx_out per one output file
std::vector<struct gguf_context *> ctx_outs;
@@ -233,20 +261,45 @@ struct split_strategy {
new_ctx_out(true);
}

// process tensors one by one
size_t curr_tensors_size = 0; // current size by counting only tensors size (without metadata)
for (int i = 0; i < n_tensors; ++i) {
struct ggml_tensor * t = ggml_get_tensor(ctx_meta, gguf_get_tensor_name(ctx_gguf, i));
// calculate the "imaginary" size = the current size + next tensor size
size_t n_bytes = GGML_PAD(ggml_nbytes(t), GGUF_DEFAULT_ALIGNMENT);
size_t next_tensors_size = curr_tensors_size + n_bytes;
if (should_split(i, next_tensors_size)) {
new_ctx_out(false);
curr_tensors_size = n_bytes;
} else {
curr_tensors_size = next_tensors_size;
if (!params.customized_split) {
// process tensors one by one
size_t curr_tensors_size = 0; // current size by counting only tensors size (without metadata)
for (int i = 0; i < n_tensors; ++i) {
struct ggml_tensor * t = ggml_get_tensor(ctx_meta, gguf_get_tensor_name(ctx_gguf, i));
// calculate the "imaginary" size = the current size + next tensor size
size_t n_bytes = GGML_PAD(ggml_nbytes(t), GGUF_DEFAULT_ALIGNMENT);
size_t next_tensors_size = curr_tensors_size + n_bytes;
if (should_split(i, next_tensors_size)) {
new_ctx_out(false);
curr_tensors_size = n_bytes;
} else {
curr_tensors_size = next_tensors_size;
}
gguf_add_tensor(ctx_out, t);
}
} else {
// custom split based on tensor set
std::vector<std::string> tensor_set = read_customized_tensors(params.tensor_set_file);
if(tensor_set.empty()) {
fprintf(stderr, "error: tensor set is empty\n");
exit(EXIT_FAILURE);
}
for (int i = 0; i < n_tensors; ++i) {
const char * t_name = gguf_get_tensor_name(ctx_gguf, i);
if (is_tensor_in_customized_set(t_name, tensor_set)) {
struct ggml_tensor * t = ggml_get_tensor(ctx_meta, t_name);
gguf_add_tensor(ctx_out, t);
}
}
new_ctx_out(false);
// add left tensors to the next split
for (int i = 0; i < n_tensors; ++i) {
const char * t_name = gguf_get_tensor_name(ctx_gguf, i);
if (!is_tensor_in_customized_set(t_name, tensor_set)) {
struct ggml_tensor * t = ggml_get_tensor(ctx_meta, t_name);
gguf_add_tensor(ctx_out, t);
}
}
gguf_add_tensor(ctx_out, t);
}

// push the last ctx_out
@@ -274,6 +327,16 @@
}
}

bool is_tensor_in_customized_set(const char * t_name, const std::vector<std::string> & tensor_set) {
for (auto & s : tensor_set) {
if (strstr(t_name, s.c_str()) != NULL) {
return true;
}
}

return false;
}

void print_info() {
printf("n_split: %ld\n", ctx_outs.size());
int i_split = 0;
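
For reference, the file consumed by `--tensor-set` is read line by line, and each line is matched against tensor names as a substring. A hypothetical tensor set file that routes the FFN weights into the first split could look like the listing below; the exact module list is only an illustration, not part of this change:

```
ffn_up.weight
ffn_down.weight
ffn_gate.weight
```
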
5 changes: 5 additions & 0 deletions examples/multi-adaptation/CMakeLists.txt
@@ -0,0 +1,5 @@
set(TARGET llama_multi-adaptation)
add_executable(${TARGET} multi-adaptation.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_11)
33 changes: 33 additions & 0 deletions examples/multi-adaptation/README.md
@@ -0,0 +1,33 @@
# Server Multi Adaptations for Different Scenarios

## Goal
Serve multiple scenarios on memory-constrained devices. The GGUF models are stored in the same folder.

## Usage
Use the `-mpa` parameter to pass an alias and model path for each derived model.

### API to Switch the Derived Model
```c
llama_ctx_switch_derived_model(ctx, "summarize");
```
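
Below is a minimal sketch of how a host application might route incoming requests between adaptations, assuming the aliases were registered with `-mpa` as shown in the next section; `route_request` is a hypothetical helper, and only `llama_ctx_switch_derived_model` comes from this change:

```c
#include <string.h>
#include "llama.h"

// Hypothetical request router: select the adaptation before decoding.
// The alias strings must match those registered with -mpa at startup.
static void route_request(struct llama_context * ctx, const char * task) {
    if (strcmp(task, "summarize") == 0) {
        llama_ctx_switch_derived_model(ctx, "summarize");
    } else {
        llama_ctx_switch_derived_model(ctx, "code_writer");
    }
    // ... tokenize the prompt and run llama_decode() on ctx as usual ...
}
```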

### Pass Model Path and Alias for Derived Models
```sh
llama_multi-adaptation.exe -m models\Phi-3-mini-4k-instruct-adaptor-base.gguf \
-mpa code_writer=models\Phi-3-mini-4k-instruct-adaptor-code_writer.gguf \
-mpa summarize=models\Phi-3-mini-4k-instruct-adaptor-summarization.gguf
```

## Foundation Model
The **foundation** GGUF contains the weights shared across models.
The **adaptor** GGUF contains the task-specific weights.

Here are the combinations for hosting three models:
- `model-adaptor-base.gguf (0.77GB) + model-foundation.gguf (1.56GB)`
- `model-adaptor-taskA.gguf + model-foundation.gguf`
- `model-adaptor-taskB.gguf + model-foundation.gguf`

This setup supports hosting multiple scenarios while keeping only one copy of the shared weights in memory. Thanks to `mmap`, a task-specific GGUF is only paged in when the corresponding task is invoked.
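
The adaptor/foundation splits themselves can be produced with the `--tensor-set` option this change adds to `gguf-split`. The invocation below is only a sketch: the binary name, model file names, and `tensors.txt` (a plain list of task-specific module names, one per line) are placeholders.

```sh
# Sketch: put the tensors listed in tensors.txt into the first split (the adaptor)
# and everything else into the following split (the shared foundation).
./llama-gguf-split --split --tensor-set tensors.txt \
    Phi-3-mini-4k-instruct.gguf Phi-3-mini-4k-instruct-adaptor
```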

## Example
Use the GGUF splits in this model repository: [Phi-3-mini-4k-instruct_multi-adaptor_gguf](https://huggingface.co/zhhan/Phi-3-mini-4k-instruct_multi-adaptor_gguf)