diff --git a/.gitignore b/.gitignore index 420e0d6d016a2..f98acd707b422 100644 --- a/.gitignore +++ b/.gitignore @@ -53,6 +53,7 @@ models-mnt /result /save-load-state /server +/server-parallel /simple /batched /export-lora diff --git a/Makefile b/Makefile index 87e7bb604c0c8..41b5a63875bab 100644 --- a/Makefile +++ b/Makefile @@ -1,5 +1,5 @@ # Define the default target now so that it is always the first target -BUILD_TARGETS = main quantize quantize-stats perplexity embedding vdot q8dot train-text-from-scratch convert-llama2c-to-ggml simple batched save-load-state server embd-input-test gguf llama-bench baby-llama beam-search speculative infill benchmark-matmult parallel finetune export-lora tests/test-c.o +BUILD_TARGETS = main quantize quantize-stats perplexity embedding vdot q8dot train-text-from-scratch convert-llama2c-to-ggml simple batched save-load-state server server-parallel embd-input-test gguf llama-bench baby-llama beam-search speculative infill benchmark-matmult parallel finetune export-lora tests/test-c.o # Binaries only useful for tests TEST_TARGETS = tests/test-llama-grammar tests/test-grammar-parser tests/test-double-float tests/test-grad0 tests/test-opt tests/test-quantize-fns tests/test-quantize-perf tests/test-sampling tests/test-tokenizer-0-llama tests/test-tokenizer-0-falcon tests/test-tokenizer-1-llama tests/test-tokenizer-1-bpe @@ -520,7 +520,7 @@ OBJS += ggml-alloc.o ggml-backend.o llama.o: llama.cpp ggml.h ggml-alloc.h ggml-backend.h ggml-cuda.h ggml-metal.h llama.h $(CXX) $(CXXFLAGS) -c $< -o $@ -common.o: common/common.cpp common/common.h build-info.h common/log.h +common.o: common/common.cpp common/common.h build-info.h common/log.h common/httplib.h common/json.hpp $(CXX) $(CXXFLAGS) -c $< -o $@ console.o: common/console.cpp common/console.h @@ -572,9 +572,12 @@ embedding: examples/embedding/embedding.cpp build-info.h ggml. 
save-load-state: examples/save-load-state/save-load-state.cpp build-info.h ggml.o llama.o common.o $(OBJS) $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) -server: examples/server/server.cpp examples/server/httplib.h examples/server/json.hpp examples/server/index.html.hpp examples/server/index.js.hpp examples/server/completion.js.hpp build-info.h ggml.o llama.o common.o grammar-parser.o $(OBJS) +server: examples/server/server.cpp examples/server/index.html.hpp examples/server/index.js.hpp examples/server/completion.js.hpp build-info.h ggml.o llama.o common.o grammar-parser.o $(OBJS) $(CXX) $(CXXFLAGS) -Iexamples/server $(filter-out %.h,$(filter-out %.hpp,$^)) -o $@ $(LDFLAGS) $(LWINSOCK2) +server-parallel: examples/server-parallel/server.cpp examples/server-parallel/frontend.h build-info.h ggml.o llama.o common.o $(OBJS) + $(CXX) $(CXXFLAGS) -Iexamples/server-parallel $(filter-out %.h,$(filter-out %.hpp,$^)) -o $@ $(LDFLAGS) $(LWINSOCK2) + $(LIB_PRE)embdinput$(DSO_EXT): examples/embd-input/embd-input.h examples/embd-input/embd-input-lib.cpp build-info.h ggml.o llama.o common.o $(OBJS) $(CXX) --shared $(CXXFLAGS) $(filter-out %.h,$(filter-out %.hpp,$^)) -o $@ $(LDFLAGS) diff --git a/common/CMakeLists.txt b/common/CMakeLists.txt index 951aa8340c7e4..ac7189cd2bc30 100644 --- a/common/CMakeLists.txt +++ b/common/CMakeLists.txt @@ -11,6 +11,8 @@ add_library(${TARGET} OBJECT grammar-parser.cpp train.h train.cpp + json.hpp + httplib.h ) if (BUILD_SHARED_LIBS) diff --git a/examples/server/httplib.h b/common/httplib.h similarity index 100% rename from examples/server/httplib.h rename to common/httplib.h diff --git a/examples/server/json.hpp b/common/json.hpp similarity index 100% rename from examples/server/json.hpp rename to common/json.hpp diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt index de4cf7a691768..4741b7dec20f4 100644 --- a/examples/CMakeLists.txt +++ b/examples/CMakeLists.txt @@ -35,6 +35,7 @@ else() endif() if (LLAMA_BUILD_SERVER) add_subdirectory(server) + add_subdirectory(server-parallel) endif() add_subdirectory(export-lora) endif() diff --git a/examples/server-parallel/CMakeLists.txt b/examples/server-parallel/CMakeLists.txt new file mode 100644 index 0000000000000..4df8b462fea01 --- /dev/null +++ b/examples/server-parallel/CMakeLists.txt @@ -0,0 +1,15 @@ +set(TARGET server-parallel) +include_directories(${CMAKE_CURRENT_SOURCE_DIR}) +add_executable(${TARGET} server.cpp) +install(TARGETS ${TARGET} RUNTIME) +target_compile_definitions(${TARGET} PRIVATE + SERVER_VERBOSE=$<BOOL:${LLAMA_SERVER_VERBOSE}> +) +target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) +if (WIN32) + TARGET_LINK_LIBRARIES(${TARGET} PRIVATE ws2_32) +endif() +target_compile_features(${TARGET} PRIVATE cxx_std_11) +if(TARGET BUILD_INFO) + add_dependencies(${TARGET} BUILD_INFO) +endif() diff --git a/examples/server-parallel/README.md b/examples/server-parallel/README.md new file mode 100644 index 0000000000000..0244917597d6e --- /dev/null +++ b/examples/server-parallel/README.md @@ -0,0 +1,91 @@ +# llama.cpp/example/server-parallel +  +This example demonstrates a PoC HTTP API server that handles simultaneous requests. Long prompts are not supported. + +Command line options: + +- `--threads N`, `-t N`: Set the number of threads to use during generation. +- `-tb N, --threads-batch N`: Set the number of threads to use during batch and prompt processing. If not specified, the number of threads will be set to the number of threads used for generation.
+- `-m FNAME`, `--model FNAME`: Specify the path to the LLaMA model file (e.g., `models/7B/ggml-model.gguf`). +- `-a ALIAS`, `--alias ALIAS`: Set an alias for the model. The alias will be returned in API responses. +- `-c N`, `--ctx-size N`: Set the size of the prompt context. The default is 512, but LLaMA models were built with a context of 2048, which will provide better results for longer input/inference. The size may differ in other models; for example, Baichuan models were built with a context of 4096. +- `-ngl N`, `--n-gpu-layers N`: When compiled with appropriate support (currently CLBlast or cuBLAS), this option allows offloading some layers to the GPU for computation. Generally results in increased performance. +- `-mg i, --main-gpu i`: When using multiple GPUs, this option controls which GPU is used for small tensors for which the overhead of splitting the computation across all GPUs is not worthwhile. The GPU in question will use slightly more VRAM to store a scratch buffer for temporary results. By default GPU 0 is used. Requires cuBLAS. +- `-ts SPLIT, --tensor-split SPLIT`: When using multiple GPUs, this option controls how large tensors should be split across all GPUs. `SPLIT` is a comma-separated list of non-negative values that assigns the proportion of data that each GPU should get in order. For example, "3,2" will assign 60% of the data to GPU 0 and 40% to GPU 1. By default the data is split in proportion to VRAM, but this may not be optimal for performance. Requires cuBLAS. +- `-b N`, `--batch-size N`: Set the batch size for prompt processing. Default: `512`. +- `--memory-f32`: Use 32-bit floats instead of 16-bit floats for memory key+value. Not recommended. +- `--mlock`: Lock the model in memory, preventing it from being swapped out when memory-mapped. +- `--no-mmap`: Do not memory-map the model. By default, models are mapped into memory, which allows the system to load only the necessary parts of the model as needed. +- `--numa`: Attempt optimizations that help on some NUMA systems. +- `--lora FNAME`: Apply a LoRA (Low-Rank Adaptation) adapter to the model (implies --no-mmap). This allows you to adapt the pretrained model to specific tasks or domains. +- `--lora-base FNAME`: Optional model to use as a base for the layers modified by the LoRA adapter. This flag is used in conjunction with the `--lora` flag, and specifies the base model for the adaptation. +- `-to N`, `--timeout N`: Server read/write timeout in seconds. Default: `600`. +- `--host`: Set the hostname or IP address to listen on. Default: `127.0.0.1`. +- `--port`: Set the port to listen on. Default: `8080`.
+- `--path`: Path from which to serve static files. Default: `examples/server/public`. +- `-np N`, `--parallel N`: Set the number of slots for processing requests. Default: `1`. +- `-cb`, `--cont-batching`: Enable continuous batching (a.k.a. dynamic batching). Default: disabled. +- `-r ANTI_PROMPT`, `--reverse-prompt ANTI_PROMPT`: Set an anti-prompt, used as the user name during prompt generation. + +## Quick Start + +To get started right away, run the following command, making sure to use the correct path for the model you have: + +### Unix-based systems (Linux, macOS, etc.): + +```bash +./server-parallel -m models/7B/ggml-model.gguf --ctx_size 2048 -t 4 -ngl 33 --batch-size 512 --parallel 3 -n 512 --cont-batching +``` + +### Windows: + +```powershell +server-parallel.exe -m models\7B\ggml-model.gguf --ctx_size 2048 -t 4 -ngl 33 --batch-size 512 --parallel 3 -n 512 --cont-batching +``` +The above command will start a server that by default listens on `127.0.0.1:8080`. + +## API Endpoints + +- **GET** `/props`: Return the user and assistant names used to generate the prompt. + +*Response:* +```json +{ + "user_name": "User:", + "assistant_name": "Assistant:" +} +``` + +- **POST** `/completion`: Given a prompt, it returns the predicted completion (streaming mode only). + + *Options:* + + `temperature`: Adjust the randomness of the generated text (default: 0.1). + + `prompt`: Provide a prompt as a string. It should be a coherent continuation of the system prompt. + + `system_prompt`: Provide a system prompt as a string. + + `anti_prompt`: Provide the user name, consistent with the system prompt. + + `assistant_name`: Provide the assistant name, consistent with the system prompt. + +*Example request:* +```json +{ + "system_prompt": "A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions.\n\nHuman: Hello\nAssistant: Hi, how may I help you?\nHuman:", + "anti_prompt": "Human:", + "assistant_name": "Assistant:", + "prompt": "When is Independence Day in the US?", + "temperature": 0.2 +} +``` + +*Response:* +```json +{ + "content": "" +} +``` + +**Note:** This example is a proof of concept. It has some bugs and unexpected behaviors, and it does not support long prompts. diff --git a/examples/server-parallel/frontend.h b/examples/server-parallel/frontend.h new file mode 100644 index 0000000000000..122d58b44556c --- /dev/null +++ b/examples/server-parallel/frontend.h @@ -0,0 +1,298 @@ +const char* system_prompt_default = +R"(Transcript of a never ending dialog, where the User interacts with an Assistant. +The Assistant is helpful, kind, honest, good at writing, and never fails to answer the User's requests immediately and with precision. +User: Recommend a nice restaurant in the area. +Assistant: I recommend the restaurant "The Golden Duck". It is a 5 star restaurant with a great view of the city. The food is delicious and the service is excellent. The prices are reasonable and the portions are generous. The restaurant is located at 123 Main Street, New York, NY 10001. The phone number is (212) 555-1234. The hours are Monday through Friday from 11:00 am to 10:00 pm. The restaurant is closed on Saturdays and Sundays. +User: Who is Richard Feynman? +Assistant: Richard Feynman was an American physicist who is best known for his work in quantum mechanics and particle physics. He was awarded the Nobel Prize in Physics in 1965 for his contributions to the development of quantum electrodynamics.
He was a popular lecturer and author, and he wrote several books, including "Surely You're Joking, Mr. Feynman!" and "What Do You Care What Other People Think?". +User:)"; + +const char* index_html_ = R"( + + + + llama.cpp - server parallel + + + +
+

Server parallel

+
+ + +
+ +
+ + +
+ + +
+ + +

+ + + +
+
+
+
+ + +)"; + +const char* index_js_ = R"( +let conversation = []; +let current_message = -1; +let canceled = false; +let slot_id = -1; +var controller; + +const questions = [ + "Who is Elon Musk?", + "Who is Jeff Bezos?", + "How to get a job at google?", + "What are you?", + "When was born Abraham Lincoln?", + "What is a black hole?", + "How to prepare for an interview?", + "Will you destroy the humanity?", + "Who is smarter, you or I?", + "What is quantization in ML?" +]; + +let user_name = ""; +let assistant_name = ""; + +function toggle_system_prompt() { + if(document.getElementById("system_promt_cb").checked) { + document.getElementById("system_prompt_view").style.display = "block"; + } else { + document.getElementById("system_prompt_view").style.display = "none"; + } +} + +function clear_sp_props() { + document.getElementById("sp_text").value = ""; + document.getElementById("user_name").value = ""; + document.getElementById("assistant_name").value = ""; +} + +docReady(async () => { + document.getElementById("message").value = + questions[Math.floor(Math.random() * questions.length)]; + // to keep the same prompt format in all clients + const response = await fetch("/props"); + if (!response.ok) { + alert(`HTTP error! Status: ${response.status}`); + } + const data = await response.json(); + user_name = data.user_name; + assistant_name = data.assistant_name; +}); + +function docReady(fn) { + // see if DOM is already available + if ( + document.readyState === "complete" || + document.readyState === "interactive" + ) { + // call on next available tick + setTimeout(fn, 1); + } else { + document.addEventListener("DOMContentLoaded", fn); + } +} + +function updateView() { + let conv_view = document.getElementById("conversation_view"); + // build view + conv_view.innerHTML = ""; + for (let index in conversation) { + conversation[index].assistant = conversation[index].assistant.replace( + user_name, + "" + ); + conv_view.innerHTML += ` +

+        <div>
+          <p>User: ${conversation[index].user}</p>
+          <p>Assistant: ${conversation[index].assistant}</p>
+        </div>
`; + } +} + +async function call_llama(options) { + try { + controller = new AbortController(); + const response = await fetch("/completion", { + method: "POST", + body: JSON.stringify(options), + headers: { + "Content-Type": "application/json", + Accept: "text/event-stream", + }, + signal: controller.signal + }); + const reader = response.body.getReader(); + const decoder = new TextDecoder(); + let leftover = ""; // Buffer for partially read lines + while (current_message >= 0) { + const result = await reader.read(); + if (result.done) { + document.getElementById("btn_send").disabled = false; + document.getElementById("btn_cancel").disabled = true; + break; + } + + // Add any leftover data to the current chunk of data + const text = leftover + decoder.decode(result.value); + + // Check if the last character is a line break + const endsWithLineBreak = text.endsWith("\n"); + + // Split the text into lines + let lines = text.split("\n"); + + // If the text doesn't end with a line break, then the last line is incomplete + // Store it in leftover to be added to the next chunk of data + if (!endsWithLineBreak) { + leftover = lines.pop(); + } else { + leftover = ""; // Reset leftover if we have a line break at the end + } + + // Parse all sse events and add them to result + const regex = /^(\S+):\s(.*)$/gm; + for (const line of lines) { + const match = regex.exec(line); + if (match) { + result[match[1]] = match[2]; + // since we know this is llama.cpp, let's just decode the json in data + if (result.data && current_message >= 0) { + result.data = JSON.parse(result.data); + slot_id = result.data.slot_id; + conversation[current_message].assistant += result.data.content; + updateView(); + } + } + } + } + } catch (e) { + if (e.name !== "AbortError") { + console.error("llama error: ", e); + } + } +} + +function generatePrompt() { + // generate a good prompt to have coherence + let prompt = ""; + for (let index in conversation) { + if (index == 0) { + prompt += conversation[index].user + "\n"; + } else { + prompt += user_name + conversation[index].user + "\n"; + } + if (index == current_message) { + prompt += assistant_name; + } else { + prompt += assistant_name + conversation[index].assistant; + } + } + return prompt; +} + +async function resetView() { + if(controller) { + controller.abort(); + controller = null; + } + document.getElementById("slot_id").value = "-1"; + document.getElementById("temperature").value = "0.1"; + document.getElementById("message").value = + questions[Math.floor(Math.random() * questions.length)]; + document.getElementById("btn_cancel").disabled = true; + document.getElementById("btn_cancel").innerText = "Cancel"; + document.getElementById("btn_send").disabled = false; + document.getElementById("conversation_view").innerHTML = ""; + conversation = []; + current_message = -1; + canceled = false; +} + +async function cancel() { + if(!canceled) { + if(controller) { + controller.abort(); + controller = null; + } + document.getElementById("btn_send").disabled = false; + document.getElementById("btn_cancel").innerText = "Regenerate response"; + canceled = true; + } else { + perform(true); + } +} + +async function perform(regen) { + if(regen) { + document.getElementById("message").value = conversation.pop().user; + current_message--; + } + var slot_id = parseInt(document.getElementById("slot_id").value); + var temperature = parseFloat(document.getElementById("temperature").value); + var prompt = " " + document.getElementById("message").value; + if (prompt.length > 1 && 
!isNaN(slot_id) && !isNaN(temperature)) { + if(!regen && canceled) { // use the new message + conversation.pop(); // delete incomplete interaction + current_message--; + } + canceled = false; + let options = { + slot_id, + temperature + }; + if(document.getElementById("system_promt_cb").checked) { + let system_prompt = document.getElementById("sp_text").value; + let anti_prompt = document.getElementById("user_name").value; + let assistant_name_ = document.getElementById("assistant_name").value; + if(!system_prompt || !anti_prompt || !assistant_name_) { + document.getElementById("conversation_view").innerText = + "please, insert valid props."; + return; + } + conversation = []; + current_message = -1; + document.getElementById("system_promt_cb").checked = false; + document.getElementById("system_promt_cb").dispatchEvent(new Event("change")); + // include system prompt props + options.system_prompt = system_prompt; + options.anti_prompt = anti_prompt; + options.assistant_name = assistant_name_; + user_name = anti_prompt; + assistant_name = assistant_name_; + } + current_message++; + conversation.push({ + user: prompt, + assistant: "", + }); + updateView(); + document.getElementById("message").value = ""; + document.getElementById("btn_send").disabled = true; + document.getElementById("btn_cancel").disabled = false; + document.getElementById("btn_cancel").innerText = "Cancel"; + options.prompt = generatePrompt(); + await call_llama(options); + } else { + alert("please, insert valid props."); + } +} +)"; diff --git a/examples/server-parallel/server.cpp b/examples/server-parallel/server.cpp new file mode 100644 index 0000000000000..b98477856d3af --- /dev/null +++ b/examples/server-parallel/server.cpp @@ -0,0 +1,924 @@ +#include "frontend.h" +#include "common.h" +#include "llama.h" +#include "build-info.h" + +#include "httplib.h" +#include "json.hpp" + +#include <chrono> +#include <regex> +#include <sstream> +#include <string> +#include <thread> +#include <vector> + +using namespace httplib; +using namespace std; +using namespace nlohmann; + +struct server_params +{ + std::string hostname = "127.0.0.1"; + std::string public_path = "examples/server/public"; + int32_t port = 8080; + int32_t read_timeout = 600; + int32_t write_timeout = 600; +}; + +// utility functions taken from examples/server + +static bool ends_with(const std::string &str, const std::string &suffix) +{ + return str.size() >= suffix.size() && + 0 == str.compare(str.size() - suffix.size(), suffix.size(), suffix); +} + +static size_t find_partial_stop_string(const std::string &stop, + const std::string &text) +{ + if (!text.empty() && !stop.empty()) + { + const char text_last_char = text.back(); + for (int64_t char_index = stop.size() - 1; char_index >= 0; char_index--) + { + if (stop[char_index] == text_last_char) + { + const std::string current_partial = stop.substr(0, char_index + 1); + if (ends_with(text, current_partial)) + { + return text.size() - char_index - 1; + } + } + } + } + return std::string::npos; +} + +enum stop_type +{ + STOP_FULL, + STOP_PARTIAL, +}; + +enum slot_state +{ + IDLE, + PROCESSING +}; + +enum slot_command { + NONE, + LOAD_PROMPT, + RELEASE +}; + + +struct llama_client_slot +{ + int id; + int32_t n_prompt = 0; + int32_t n_decoded = 0; + int32_t i_batch = -1; + string prompt = ""; + string generated_text = ""; + int n_tokens_predicted = 0; + llama_token sampled; + std::vector<std::string> sampled_tokens; + std::vector<llama_token> tokens_prev; + slot_state state = IDLE; + slot_command command = NONE; + float temperature = 0.1f; + + void start(string prompt_, float temp_) { + prompt = prompt_;
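+        // hand the slot to the update loop, which tokenizes the prompt and starts decoding on its next pass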
command = LOAD_PROMPT; + temperature = temp_; + LOG_TEE("slot %i is processing\n", id); + } + + bool hasNewToken() { + return sampled_tokens.size() > 0; + } + + bool available() { + return state == IDLE && command == NONE; + } + + void addTokenString(string token) { + if(command == RELEASE) { + sampled_tokens.clear(); + return; + } + sampled_tokens.insert(sampled_tokens.begin(), token); + n_tokens_predicted++; + } + + void release() { + if(state == PROCESSING) { + command = RELEASE; + } + } +}; + +struct server_parallel_context { + // example props + vector<llama_client_slot> slots; + std::string system_prompt = ""; + bool update_system_prompt = true; + + // broadcast to all clients to keep the same prompt format + std::string user_name = ""; // this should be the anti prompt + std::string assistant_name = ""; // this is used to generate the prompt + + // llama native props + gpt_params params; + llama_model *model = NULL; + llama_context *ctx = NULL; + int n_ctx; + int n_vocab; + std::vector<llama_token_data> candidates; + std::vector<llama_token> tokens_system; + int32_t n_tokens_system = 0; + bool all_slots_are_idle = false; + llama_batch batch; + + bool loadModel(gpt_params params_) { + params = params_; + std::tie(model, ctx) = llama_init_from_gpt_params(params); + if (model == nullptr) + { + LOG_TEE("unable to load model: %s", params.model.c_str()); + return false; + } + n_ctx = llama_n_ctx(ctx); + n_vocab = llama_n_vocab(model); + candidates.reserve(n_vocab); + return true; + } + + void initialize() { + // create slots + LOG_TEE("Available slots:\n"); + for (int i = 0; i < params.n_parallel; i++) + { + llama_client_slot slot; + slot.id = i; + slot.prompt = "default"; + slot.state = IDLE; + slot.tokens_prev.resize(params.n_predict); + std::fill(slot.tokens_prev.begin(), slot.tokens_prev.end(), 0); + LOG_TEE(" - slot %i\n", slot.id); + slots.push_back(slot); + } + batch = llama_batch_init(params.n_ctx, 0); + + // always assign a default system prompt + system_prompt = system_prompt_default; + user_name = "User:"; + assistant_name = "Assistant:"; + params.antiprompt.push_back(user_name); + all_slots_are_idle = true; + } + + void updateSystemPrompt() { + tokens_system = ::llama_tokenize(ctx, system_prompt, true); + n_tokens_system = tokens_system.size(); + + batch.n_tokens = n_tokens_system; + + // clear the entire KV cache + for (int i = 0; i < params.n_parallel; ++i) + { + llama_kv_cache_seq_rm(ctx, i, 0, -1); + } + + for (int32_t i = 0; i < batch.n_tokens; ++i) + { + batch.token[i] = tokens_system[i]; + batch.pos[i] = i; + batch.seq_id[i] = 0; + batch.logits[i] = false; + } + + if (llama_decode(ctx, batch) != 0) + { + LOG_TEE("%s: llama_decode() failed\n", __func__); + return; + } + + // assign the system KV cache to all parallel sequences + for (int32_t i = 1; i < params.n_parallel; ++i) + { + llama_kv_cache_seq_cp(ctx, 0, i, 0, n_tokens_system); + } + + LOG_TEE("system prompt updated\n"); + update_system_prompt = false; + } + + void notifySystemPromptChanged() { + // release all slots + for (llama_client_slot &slot : slots) + { + slot.release(); + } + waitAllAreIdle(); + all_slots_are_idle = true; + // wait until the system prompt is loaded + update_system_prompt = true; + while(update_system_prompt) { + this_thread::sleep_for(chrono::milliseconds(5)); + } + // system prompt loaded, continue + } + + llama_client_slot* requestCompletion(json data) { + if(data.contains("system_prompt") && + data.contains("anti_prompt") && + data.contains("assistant_name")) { + system_prompt = data.value("system_prompt", ""); + user_name =
data.value("anti_prompt", ""); + assistant_name = data.value("assistant_name", ""); + params.antiprompt.clear(); + params.antiprompt.push_back(user_name); + notifySystemPromptChanged(); + } + int slot_id = data.value("slot_id", -1); + float temperature = data.value("temperature", 0.1f); + string prompt = data.value("prompt", ""); + for (llama_client_slot & slot : slots) + { + if ((slot_id == -1 && slot.available()) || slot.id == slot_id) + { + slot.start(prompt, temperature); + all_slots_are_idle = false; + return &slot; // return a pointer to slot (thread safe?) + } + } + return nullptr; + } + + size_t findStoppingStrings(const std::string &text, const size_t last_token_size, + const stop_type type) + { + size_t stop_pos = std::string::npos; + for (const std::string &word : params.antiprompt) + { + size_t pos; + if (type == STOP_FULL) + { + const size_t tmp = word.size() + last_token_size; + const size_t from_pos = text.size() > tmp ? text.size() - tmp : 0; + pos = text.find(word, from_pos); + } + else + { + pos = find_partial_stop_string(word, text); + } + if (pos != std::string::npos && + (stop_pos == std::string::npos || pos < stop_pos)) + { + stop_pos = pos; + } + } + return stop_pos; + } + + void waitAllAreIdle() { + bool wait = true; + while(wait) { + wait = false; + for (auto &slot : slots) + { + if (!slot.available()) + { + wait = true; + break; + } + } + } + } + + bool updateSlots() { + + // update the system prompt wait until all slots are idle state + if(update_system_prompt) { + updateSystemPrompt(); + } + + batch.n_tokens = 0; + int kv_cache_free = (n_ctx - n_tokens_system); + if(all_slots_are_idle) { + // avoid 100% usage of cpu all time + this_thread::sleep_for(chrono::milliseconds(5)); + } + // decode any currently ongoing sequences + for (auto & slot : slots) { + if (slot.state == PROCESSING && slot.command == RELEASE && !slot.hasNewToken()) + { + LOG_TEE("slot %i released\n", slot.id); + llama_kv_cache_seq_rm(ctx, slot.id, n_tokens_system, n_ctx); + slot.state = IDLE; + slot.command = NONE; + slot.n_prompt = 0; + slot.n_tokens_predicted = 0; + continue; + } + + kv_cache_free -= slot.n_prompt; + + if (slot.state == IDLE || slot.command == RELEASE) { + continue; + } + + batch.token [batch.n_tokens] = slot.sampled; + batch.pos [batch.n_tokens] = n_tokens_system + slot.n_prompt + slot.n_decoded; + batch.seq_id[batch.n_tokens] = slot.id; + batch.logits[batch.n_tokens] = true; + + slot.n_decoded += 1; + slot.i_batch = batch.n_tokens; + + batch.n_tokens += 1; + } + + // assign workload to the slots + if (params.cont_batching || batch.n_tokens == 0) { + for (llama_client_slot & slot : slots) { + // need process the prompt + if (slot.state == IDLE && slot.command == LOAD_PROMPT) { + slot.state = PROCESSING; + slot.command = NONE; + + std::fill(slot.tokens_prev.begin(), slot.tokens_prev.end(), 0); + + // do not prepend BOS because we have a system prompt! 
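+                    // prompt tokens are appended right after the shared system prompt (positions start at n_tokens_system)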
+ std::vector tokens_prompt; + tokens_prompt = ::llama_tokenize(ctx, slot.prompt, false); + + for (size_t i = 0; i < tokens_prompt.size(); ++i) { + batch.token [batch.n_tokens] = tokens_prompt[i]; + batch.pos [batch.n_tokens] = i + n_tokens_system; + batch.seq_id[batch.n_tokens] = slot.id; + batch.logits[batch.n_tokens] = false; + batch.n_tokens += 1; + } + + // extract the logits only for the last token + if (batch.n_tokens > 0) { + batch.logits[batch.n_tokens - 1] = true; + } + + slot.n_prompt = tokens_prompt.size(); + slot.n_decoded = 0; + slot.i_batch = batch.n_tokens - 1; + } + } + } + + if (batch.n_tokens == 0) { + all_slots_are_idle = true; + return true; + } + + // process in chunks of params.n_batch + int32_t n_batch = params.n_batch; + + for (int32_t i = 0; i < (int32_t) batch.n_tokens; i += n_batch) { + const int32_t n_tokens = std::min(n_batch, (int32_t) (batch.n_tokens - i)); + + llama_batch batch_view = { + n_tokens, + batch.token + i, + nullptr, + batch.pos + i, + batch.seq_id + i, + batch.logits + i, + 0, 0, 0, // unused + }; + + const int ret = llama_decode(ctx, batch_view); + if (ret != 0) { + if (n_batch == 1 || ret < 0) { + // if you get here, it means the KV cache is full - try increasing it via the context size + LOG_TEE("%s : failed to decode the batch, n_batch = %d, ret = %d\n", __func__, n_batch, ret); + return false; + } + + LOG("%s : failed to decode the batch, retrying with n_batch = %d\n", __func__, n_batch / 2); + + // retry with half the batch size to try to find a free slot in the KV cache + n_batch /= 2; + i -= n_batch; + continue; + } + + for (auto & slot : slots) { + if (slot.i_batch < (int) i || slot.i_batch >= (int) (i + n_tokens)) { + continue; + } + + params.temp = slot.temperature; + const llama_token id = llama_sample_token(ctx, NULL, NULL, params, slot.tokens_prev, candidates, slot.i_batch - i); + + // remember which tokens were sampled - used for repetition penalties during sampling + slot.tokens_prev.erase(slot.tokens_prev.begin()); + slot.tokens_prev.push_back(id); + + const std::string token_str = llama_token_to_piece(ctx, id); + slot.generated_text += token_str; + slot.sampled = id; + + size_t stop_pos = + findStoppingStrings(slot.generated_text, token_str.size(), STOP_FULL); + + slot.addTokenString(token_str); + + kv_cache_free -= slot.n_tokens_predicted; + + if (slot.n_decoded > 2 && + (id == llama_token_eos(ctx) || + (slot.n_decoded + slot.n_prompt >= + params.n_predict) || + stop_pos != std::string::npos)) { + //LOG_TEE("slot %i generated text:\n%s'------------------------------\n", slot.id, slot.generated_text.c_str()); + slot.generated_text.clear(); + slot.release(); + } + slot.i_batch = -1; + } + } + + if(kv_cache_free < 0) { + LOG_TEE("\nError: kv cache is full, increase context size."); + return false; + } + return true; + } +}; + +static void server_print_usage(const char *argv0, const gpt_params ¶ms, + const server_params &sparams) +{ + printf("usage: %s [options]\n", argv0); + printf("\n"); + printf("options:\n"); + printf(" -h, --help show this help message and exit\n"); + printf(" -t N, --threads N number of threads to use during computation (default: %d)\n", params.n_threads); + printf(" -c N, --ctx-size N size of the prompt context (default: %d)\n", params.n_ctx); + printf(" --rope-freq-base N RoPE base frequency (default: loaded from model)\n"); + printf(" --rope-freq-scale N RoPE frequency scaling factor (default: loaded from model)\n"); + printf(" -b N, --batch-size N batch size for prompt processing (default: %d)\n", 
params.n_batch); + printf(" --memory-f32 use f32 instead of f16 for memory key+value (default: disabled)\n"); + printf(" not recommended: doubles context memory required and no measurable increase in quality\n"); + if (llama_mlock_supported()) + { + printf(" --mlock force system to keep model in RAM rather than swapping or compressing\n"); + } + if (llama_mmap_supported()) + { + printf(" --no-mmap do not memory-map model (slower load but may reduce pageouts if not using mlock)\n"); + } + printf(" --numa attempt optimizations that help on some NUMA systems\n"); +#ifdef LLAMA_SUPPORTS_GPU_OFFLOAD + printf(" -ngl N, --n-gpu-layers N\n"); + printf(" number of layers to store in VRAM\n"); + printf(" -ts SPLIT --tensor-split SPLIT\n"); + printf(" how to split tensors across multiple GPUs, comma-separated list of proportions, e.g. 3,1\n"); + printf(" -mg i, --main-gpu i the GPU to use for scratch and small tensors\n"); + printf(" -nommq, --no-mul-mat-q\n"); + printf(" use cuBLAS instead of custom mul_mat_q CUDA kernels.\n"); + printf(" Not recommended since this is both slower and uses more VRAM.\n"); +#endif + printf(" -m FNAME, --model FNAME\n"); + printf(" model path (default: %s)\n", params.model.c_str()); + printf(" -a ALIAS, --alias ALIAS\n"); + printf(" set an alias for the model, will be added as `model` field in completion response\n"); + printf(" --lora FNAME apply LoRA adapter (implies --no-mmap)\n"); + printf(" --lora-base FNAME optional model to use as a base for the layers modified by the LoRA adapter\n"); + printf(" --host ip address to listen (default (default: %s)\n", sparams.hostname.c_str()); + printf(" --port PORT port to listen (default (default: %d)\n", sparams.port); + printf(" --path PUBLIC_PATH path from which to serve static files (default %s)\n", sparams.public_path.c_str()); + printf(" -to N, --timeout N server read/write timeout in seconds (default: %d)\n", sparams.read_timeout); + + // new arguments + printf(" -np N, --parallel N number of slots for process requests (default: %d)\n", params.n_parallel); + printf(" -cb, --cont-batching enable continuous batching (a.k.a dynamic batching) (default: disabled)\n"); + printf(" -r ANTI_PROMPT, --reverse-prompt ANTI_PROMPT\n"); + printf(" set a anti prompt, used as user name in prompt generation\n"); + printf("\n"); +} + +static void server_params_parse(int argc, char **argv, server_params &sparams, + gpt_params ¶ms) +{ + gpt_params default_params; + server_params default_sparams; + std::string arg; + bool invalid_param = false; + + for (int i = 1; i < argc; i++) + { + arg = argv[i]; + if (arg == "--port") + { + if (++i >= argc) + { + invalid_param = true; + break; + } + sparams.port = std::stoi(argv[i]); + } + else if (arg == "--host") + { + if (++i >= argc) + { + invalid_param = true; + break; + } + sparams.hostname = argv[i]; + } + else if (arg == "--path") + { + if (++i >= argc) + { + invalid_param = true; + break; + } + sparams.public_path = argv[i]; + } + else if (arg == "--timeout" || arg == "-to") + { + if (++i >= argc) + { + invalid_param = true; + break; + } + sparams.read_timeout = std::stoi(argv[i]); + sparams.write_timeout = std::stoi(argv[i]); + } + else if (arg == "-m" || arg == "--model") + { + if (++i >= argc) + { + invalid_param = true; + break; + } + params.model = argv[i]; + } + else if (arg == "-a" || arg == "--alias") + { + if (++i >= argc) + { + invalid_param = true; + break; + } + params.model_alias = argv[i]; + } + else if (arg == "-h" || arg == "--help") + { + server_print_usage(argv[0], 
default_params, default_sparams); + exit(0); + } + else if (arg == "-c" || arg == "--ctx-size" || arg == "--ctx_size") + { + if (++i >= argc) + { + invalid_param = true; + break; + } + params.n_ctx = std::stoi(argv[i]); + } + else if (arg == "--rope-freq-base") + { + if (++i >= argc) + { + invalid_param = true; + break; + } + params.rope_freq_base = std::stof(argv[i]); + } + else if (arg == "--rope-freq-scale") + { + if (++i >= argc) + { + invalid_param = true; + break; + } + params.rope_freq_scale = std::stof(argv[i]); + } + else if (arg == "--memory-f32" || arg == "--memory_f32") + { + params.memory_f16 = false; + } + else if (arg == "--threads" || arg == "-t") + { + if (++i >= argc) + { + invalid_param = true; + break; + } + params.n_threads = std::stoi(argv[i]); + } + else if (arg == "-b" || arg == "--batch-size") + { + if (++i >= argc) + { + invalid_param = true; + break; + } + params.n_batch = std::stoi(argv[i]); + params.n_batch = std::min(512, params.n_batch); + } + else if (arg == "--gpu-layers" || arg == "-ngl" || arg == "--n-gpu-layers") + { + if (++i >= argc) + { + invalid_param = true; + break; + } +#ifdef LLAMA_SUPPORTS_GPU_OFFLOAD + params.n_gpu_layers = std::stoi(argv[i]); +#else + LOG_TEE("Not compiled with GPU offload support, --n-gpu-layers option will be ignored. " + "See main README.md for information on enabling GPU BLAS support\n"); +#endif + } + else if (arg == "--tensor-split" || arg == "-ts") + { + if (++i >= argc) + { + invalid_param = true; + break; + } +#ifdef GGML_USE_CUBLAS + std::string arg_next = argv[i]; + + // split string by , and / + const std::regex regex{R"([,/]+)"}; + std::sregex_token_iterator it{arg_next.begin(), arg_next.end(), regex, -1}; + std::vector split_arg{it, {}}; + GGML_ASSERT(split_arg.size() <= LLAMA_MAX_DEVICES); + + for (size_t i_device = 0; i_device < LLAMA_MAX_DEVICES; ++i_device) + { + if (i_device < split_arg.size()) + { + params.tensor_split[i_device] = std::stof(split_arg[i_device]); + } + else + { + params.tensor_split[i_device] = 0.0f; + } + } +#else + LOG_TEE("llama.cpp was compiled without cuBLAS. It is not possible to set a tensor split.\n"); +#endif // GGML_USE_CUBLAS + } + else if (arg == "--no-mul-mat-q" || arg == "-nommq") + { +#ifdef GGML_USE_CUBLAS + params.mul_mat_q = false; +#else + LOG_TEE("warning: llama.cpp was compiled without cuBLAS. Disabling mul_mat_q kernels has no effect.\n"); +#endif // GGML_USE_CUBLAS + } + else if (arg == "--main-gpu" || arg == "-mg") + { + if (++i >= argc) + { + invalid_param = true; + break; + } +#ifdef GGML_USE_CUBLAS + params.main_gpu = std::stoi(argv[i]); +#else + LOG_TEE("llama.cpp was compiled without cuBLAS. 
It is not possible to set a main GPU."); +#endif + } + else if (arg == "--lora") + { + if (++i >= argc) + { + invalid_param = true; + break; + } + params.lora_adapter.push_back({argv[i], 1.0f}); + params.use_mmap = false; + } + else if (arg == "--lora-scaled") + { + if (++i >= argc) + { + invalid_param = true; + break; + } + const char * lora_adapter = argv[i]; + if (++i >= argc) + { + invalid_param = true; + break; + } + params.lora_adapter.push_back(make_tuple(lora_adapter, std::stof(argv[i]))); + params.use_mmap = false; + } + else if (arg == "--lora-base") + { + if (++i >= argc) + { + invalid_param = true; + break; + } + params.lora_base = argv[i]; + } + else if (arg == "--mlock") + { + params.use_mlock = true; + } + else if (arg == "--no-mmap") + { + params.use_mmap = false; + } + else if (arg == "--numa") + { + params.numa = true; + } else if (arg == "-cb" || arg == "--cont-batching") + { + params.cont_batching = true; + } + else if (arg == "-np" || arg == "--parallel") + { + if (++i >= argc) + { + invalid_param = true; + break; + } + params.n_parallel = std::stoi(argv[i]); + } else if (arg == "-n" || arg == "--n-predict") + { + if (++i >= argc) + { + invalid_param = true; + break; + } + params.n_predict = std::stoi(argv[i]); + if(params.n_predict <= 128) { // this example don't support long prompts + params.n_predict = 128; + } + } else if (arg == "-r" || arg == "--reverse-prompt") + { + if (++i >= argc) + { + invalid_param = true; + break; + } + params.antiprompt.push_back(argv[i]); + } + else + { + fprintf(stderr, "error: unknown argument: %s\n", arg.c_str()); + server_print_usage(argv[0], default_params, default_sparams); + exit(1); + } + } + + if (invalid_param) + { + fprintf(stderr, "error: invalid parameter for argument: %s\n", arg.c_str()); + server_print_usage(argv[0], default_params, default_sparams); + exit(1); + } +} + +int main(int argc, char **argv) +{ + gpt_params params; + + server_params sparams; + + server_params_parse(argc, argv, sparams, params); + +#ifndef LOG_DISABLE_LOGS + log_set_target(log_filename_generator("server-parallel", "log")); + LOG_TEE("Log start\n"); + log_dump_cmdline(argc, argv); +#endif // LOG_DISABLE_LOGS + + if (params.seed == LLAMA_DEFAULT_SEED) { + params.seed = time(NULL); + } + + LOG_TEE("%s: seed = %u\n", __func__, params.seed); + + llama_backend_init(params.numa); + + // load the target model + params.logits_all = true; + server_parallel_context llama; + + LOG_TEE("%s: build = %d (%s)\n", __func__, BUILD_NUMBER, BUILD_COMMIT); + LOG_TEE("%s: built with %s for %s\n", __func__, BUILD_COMPILER, BUILD_TARGET); + + if(!llama.loadModel(params)) { + return 1; + } + + // print system information + { + LOG_TEE("\n"); + LOG_TEE("%s\n", get_system_info(params).c_str()); + } + + llama.initialize(); + + Server svr; + + svr.Options("/(.*)", + [&](const Request & /*req*/, Response &res) { + res.set_header("Access-Control-Allow-Methods", "*"); + res.set_header("Access-Control-Allow-Headers", "content-type"); + res.set_header("Access-Control-Allow-Origin", "*"); + }); + + svr.Get("/", [&](const Request & /*req*/, Response &res) + { res.set_content(index_html_, "text/html"); }); + + svr.Get("/index.js", [&](const Request & /*req*/, Response &res) + { res.set_content(index_js_, "text/html"); }); + + svr.Get("/props", [&llama](const Request & /*req*/, Response &res) + { + res.set_header("Access-Control-Allow-Origin", "*"); + json data = { + { "user_name", llama.user_name.c_str() }, + { "assistant_name", llama.assistant_name.c_str() } + }; + 
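+        // the frontend fetches these once at startup so every client builds prompts with the same user/assistant names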
res.set_content(data.dump(), "application/json"); }); + + svr.Post("/completion", [&llama](const Request &req, Response &res) + { + res.set_header("Access-Control-Allow-Origin", "*"); + llama_client_slot* slot = llama.requestCompletion(json::parse(req.body)); + // Verify if the slot exist + if (slot) { + auto content_provider = [slot](size_t /*offset*/, DataSink &sink) { + if(slot->hasNewToken()) { // new token notification + stringstream ss; + json res_d = { + { "content", slot->sampled_tokens.back() }, + { "slot_id", slot->id }}; + slot->sampled_tokens.pop_back(); + ss << "data: " << res_d.dump() << "\n\n"; + string result = ss.str(); + if(!sink.write(result.c_str(), result.size())) { + slot->release(); + return false; + } + } else { + this_thread::sleep_for(chrono::milliseconds(5)); + } + if(slot->available()) { // slot has been released + sink.done(); + return false; + } + return true; + }; + auto on_complete = [slot] (bool) { + slot->sampled_tokens.clear(); + slot->release(); + }; + res.set_chunked_content_provider("text/event-stream", content_provider, on_complete); + } else { + LOG_TEE("slot unavailable\n"); + res.status = 404; + res.set_content("slot_error", "text/plain"); + } }); + + thread t([&llama]() + { + bool running = true; + while (running) + { + running = llama.updateSlots(); + } }); + + svr.set_read_timeout(sparams.read_timeout); + svr.set_write_timeout(sparams.write_timeout); + + if (!svr.bind_to_port(sparams.hostname, sparams.port)) + { + fprintf(stderr, "\ncouldn't bind to server socket: hostname=%s port=%d\n\n", sparams.hostname.c_str(), sparams.port); + return 1; + } + + // Set the base directory for serving static files + svr.set_base_dir(sparams.public_path); + + // to make it ctrl+clickable: + printf("\nllama server listening at http://%s:%d\n\n", sparams.hostname.c_str(), sparams.port); + + if (!svr.listen_after_bind()) + { + return 1; + } +} diff --git a/examples/server/CMakeLists.txt b/examples/server/CMakeLists.txt index 3782f9b80ab82..407cf7b5de253 100644 --- a/examples/server/CMakeLists.txt +++ b/examples/server/CMakeLists.txt @@ -1,7 +1,7 @@ set(TARGET server) option(LLAMA_SERVER_VERBOSE "Build verbose logging option for Server" ON) include_directories(${CMAKE_CURRENT_SOURCE_DIR}) -add_executable(${TARGET} server.cpp json.hpp httplib.h) +add_executable(${TARGET} server.cpp) install(TARGETS ${TARGET} RUNTIME) target_compile_definitions(${TARGET} PRIVATE SERVER_VERBOSE=$