From a7a6ceb7ae20a5bd70d05c0438002e6664fa1be0 Mon Sep 17 00:00:00 2001
From: FSSRepo
Date: Thu, 5 Oct 2023 15:12:39 -0400
Subject: [PATCH 01/19] server handling multiple clients with cam

---
 examples/CMakeLists.txt                 |   1 +
 examples/server-parallel/CMakeLists.txt |  15 +
 examples/server-parallel/README.md      |  73 ++
 examples/server-parallel/frontend.h     | 263 +++
 examples/server-parallel/server.cpp     | 871 ++++++++++++++++++++++++
 5 files changed, 1223 insertions(+)
 create mode 100644 examples/server-parallel/CMakeLists.txt
 create mode 100644 examples/server-parallel/README.md
 create mode 100644 examples/server-parallel/frontend.h
 create mode 100644 examples/server-parallel/server.cpp

diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt
index de4cf7a691768..4741b7dec20f4 100644
--- a/examples/CMakeLists.txt
+++ b/examples/CMakeLists.txt
@@ -35,6 +35,7 @@ else()
     endif()
     if (LLAMA_BUILD_SERVER)
         add_subdirectory(server)
+        add_subdirectory(server-parallel)
     endif()
     add_subdirectory(export-lora)
 endif()
diff --git a/examples/server-parallel/CMakeLists.txt b/examples/server-parallel/CMakeLists.txt
new file mode 100644
index 0000000000000..48c29e9ad18fe
--- /dev/null
+++ b/examples/server-parallel/CMakeLists.txt
@@ -0,0 +1,15 @@
+set(TARGET server-parallel)
+include_directories(${CMAKE_CURRENT_SOURCE_DIR})
+add_executable(${TARGET} server.cpp ../server/json.hpp ../server/httplib.h)
+install(TARGETS ${TARGET} RUNTIME)
+target_compile_definitions(${TARGET} PRIVATE
+    SERVER_VERBOSE=$
+)
+target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
+if (WIN32)
+    TARGET_LINK_LIBRARIES(${TARGET} PRIVATE ws2_32)
+endif()
+target_compile_features(${TARGET} PRIVATE cxx_std_11)
+if(TARGET BUILD_INFO)
+    add_dependencies(${TARGET} BUILD_INFO)
+endif()
diff --git a/examples/server-parallel/README.md b/examples/server-parallel/README.md
new file mode 100644
index 0000000000000..b4181f28be759
--- /dev/null
+++ b/examples/server-parallel/README.md
@@ -0,0 +1,73 @@
+# llama.cpp/example/server-parallel
+
+This example demonstrates a PoC HTTP API server that handles simultaneous requests. Long prompts are not supported.
+
+## Quick Start
+
+To get started right away, run the following command, making sure to use the correct path for the model you have:
+
+### Unix-based systems (Linux, macOS, etc.):
+
+```bash
+./server-parallel -m models/7B/ggml-model.gguf --ctx_size 2048 -t 4 -ngl 33 --batch-size 512 --parallel 3 -n 512 --cont-batching
+```
+
+### Windows:
+
+```powershell
+server-parallel.exe -m models\7B\ggml-model.gguf --ctx_size 2048 -t 4 -ngl 33 --batch-size 512 --parallel 3 -n 512 --cont-batching
+```
+The above command will start a server that by default listens on `127.0.0.1:8080`.
+
+## API Endpoints
+
+- **GET** `/props`: Returns the user and assistant names used to generate the prompt.
+
+*Response:*
+```json
+{
+  "user_name": "User:",
+  "assistant_name": "Assistant:"
+}
+```
+
+- **POST** `/completion`: Given a prompt, it returns the predicted completion (streaming mode only).
+
+    *Options:*
+
+    `temperature`: Adjust the randomness of the generated text (default: 0.1).
+
+    `prompt`: Provide a prompt as a string. It should be a coherent continuation of the system prompt.
+
+    `system_prompt`: Provide a system prompt as a string.
+
+    `anti_prompt`: Provide the name of the user, consistent with the system prompt.
+
+    `assistant_name`: Provide the name of the assistant, consistent with the system prompt.
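+
+For a quick test from the command line, the endpoint can also be called with `curl` (a rough sketch; it assumes the server is running on the default `127.0.0.1:8080`):
+
+```bash
+# -N disables buffering so the streamed tokens are printed as they arrive
+curl -N -X POST http://127.0.0.1:8080/completion \
+  -H "Content-Type: application/json" \
+  -d '{"prompt": " Hello, who are you?", "temperature": 0.2}'
+```
+
+Each generated token is sent as a separate server-sent event of the form `data: {"content": "..."}`.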
+ +*Example request:* +```json +{ + // this changes the system prompt on runtime + "system_prompt": "A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. + +Human: Hello +Assistant: Hi, how may I help you? +Human:", + "anti_prompt": "Human:", + "assistant_name": "Assistant:", + + // required options + "prompt": "When is the day of independency of US?", + "temperature": 0.2 +} +``` + +*Response:* +```json +{ + "content": "" +} +``` + +# This example is a Proof of Concept, have some bugs and unexpected behaivors, this not supports long prompts. diff --git a/examples/server-parallel/frontend.h b/examples/server-parallel/frontend.h new file mode 100644 index 0000000000000..e3656ea218da0 --- /dev/null +++ b/examples/server-parallel/frontend.h @@ -0,0 +1,263 @@ +const char* system_prompt_default = +R"(Transcript of a never ending dialog, where the User interacts with an Assistant. +The Assistant is helpful, kind, honest, good at writing, and never fails to answer the User's requests immediately and with precision. +User: Recommend a nice restaurant in the area. +Assistant: I recommend the restaurant "The Golden Duck". It is a 5 star restaurant with a great view of the city. The food is delicious and the service is excellent. The prices are reasonable and the portions are generous. The restaurant is located at 123 Main Street, New York, NY 10001. The phone number is (212) 555-1234. The hours are Monday through Friday from 11:00 am to 10:00 pm. The restaurant is closed on Saturdays and Sundays. +User: Who is Richard Feynman? +Assistant: Richard Feynman was an American physicist who is best known for his work in quantum mechanics and particle physics. He was awarded the Nobel Prize in Physics in 1965 for his contributions to the development of quantum electrodynamics. He was a popular lecturer and author, and he wrote several books, including "Surely You're Joking, Mr. Feynman!" and "What Do You Care What Other People Think?". +User:)"; + +const char* index_html_ = R"( + + + + llama.cpp - server parallel PoC + + + +
+

Server parallel - PoC

+
+ + +
+ +
+ + +
+ + +
+ + +

+ +
+
+ +
+
+
+
+ + +)"; + +const char* index_js_ = R"( + let conversation = []; + let current_message = -1; + +const questions = [ + "Who is Elon Musk?", + "Who is Jeff Bezos?", + "How to get a job at google?", + "What are you?", + "When was born Abraham Lincoln?", +]; + +let user_name = ""; +let assistant_name = ""; + +function toggleSP() { + if(document.getElementById("system_promt_cb").checked) { + document.getElementById("system_prompt_view").style.display = "block"; + } else { + document.getElementById("system_prompt_view").style.display = "none"; + } +} + +function clearSP() { + document.getElementById("sp_text").value = ""; + document.getElementById("anti_prompt").value = ""; + document.getElementById("assistant_name").value = ""; +} + +docReady(async () => { + document.getElementById("message").value = + questions[Math.floor(Math.random() * questions.length)]; + + // to keep the same prompt format in all clients + const response = await fetch("/props"); + if (!response.ok) { + alert(`HTTP error! Status: ${response.status}`); + } + const data = await response.json(); + user_name = data.user_name; + assistant_name = data.assistant_name; +}); + +function docReady(fn) { + // see if DOM is already available + if ( + document.readyState === "complete" || + document.readyState === "interactive" + ) { + // call on next available tick + setTimeout(fn, 1); + } else { + document.addEventListener("DOMContentLoaded", fn); + } +} + +function updateView() { + let conv_view = document.getElementById("conversation_view"); + // build view + conv_view.innerHTML = ""; + for (let index in conversation) { + conversation[index].assistant = conversation[index].assistant.replace( + user_name, + "" + ); + conv_view.innerHTML += ` +

User: ${conversation[index].user}

+

Assistant: ${conversation[index].assistant}

`; + } +} + +async function call_llama(options) { + const response = await fetch("/completion", { + method: "POST", + body: JSON.stringify(options), + headers: { + Connection: "keep-alive", + "Content-Type": "application/json", + Accept: "text/event-stream", + }, + }); + + const reader = response.body.getReader(); + let cont = true; + const decoder = new TextDecoder(); + let leftover = ""; // Buffer for partially read lines + + try { + let cont = true; + + while (cont) { + const result = await reader.read(); + if (result.done) { + document.getElementById("btn_send").disabled = false; + break; + } + + // Add any leftover data to the current chunk of data + const text = leftover + decoder.decode(result.value); + + // Check if the last character is a line break + const endsWithLineBreak = text.endsWith("\n"); + + // Split the text into lines + let lines = text.split("\n"); + + // If the text doesn't end with a line break, then the last line is incomplete + // Store it in leftover to be added to the next chunk of data + if (!endsWithLineBreak) { + leftover = lines.pop(); + } else { + leftover = ""; // Reset leftover if we have a line break at the end + } + + // Parse all sse events and add them to result + const regex = /^(\S+):\s(.*)$/gm; + for (const line of lines) { + const match = regex.exec(line); + if (match) { + result[match[1]] = match[2]; + // since we know this is llama.cpp, let's just decode the json in data + if (result.data) { + result.data = JSON.parse(result.data); + conversation[current_message].assistant += result.data.content; + updateView(); + } + } + } + } + } catch (e) { + if (e.name !== "AbortError") { + console.error("llama error: ", e); + } + throw e; + } +} + +function generatePrompt() { + // generate a good prompt to have coherence + let prompt = ""; + for (let index in conversation) { + if (index == 0) { + prompt += conversation[index].user + "\n"; + } else { + prompt += user_name + conversation[index].user + "\n"; + } + if (index == current_message) { + prompt += assistant_name; + } else { + prompt += assistant_name + conversation[index].assistant; + } + } + return prompt; +} + +function resetBtn() { + document.getElementById("slot_id").value = "-1"; + document.getElementById("temperature").value = "0.1"; + document.getElementById("message").value = + questions[Math.floor(Math.random() * questions.length)]; + document.getElementById("conversation_view").innerHTML = ""; + conversation = []; + current_message = -1; +} + +async function perform() { + var slot_id = parseInt(document.getElementById("slot_id").value); + var temperature = parseFloat(document.getElementById("temperature").value); + var prompt = " " + document.getElementById("message").value; + if (!isNaN(slot_id) && !isNaN(temperature) && prompt.length > 0) { + let options = { + slot_id, + temperature + }; + if(document.getElementById("system_promt_cb").checked) { + let system_prompt = document.getElementById("sp_text").value; + let anti_prompt = document.getElementById("user_name").value; + let assistant_name_ = document.getElementById("assistant_name").value; + if(!system_prompt || !anti_prompt || !assistant_name_) { + document.getElementById("conversation_view").innerText = + "please, insert valid props."; + return; + } + conversation = []; + current_message = -1; + document.getElementById("system_promt_cb").checked = false; + document.getElementById("system_promt_cb").dispatchEvent(new Event("change")); + options.system_prompt = system_prompt; + options.anti_prompt = anti_prompt; + 
options.assistant_name = assistant_name_; + user_name = anti_prompt; + assistant_name = assistant_name_; + } + current_message++; + conversation.push({ + user: prompt, + assistant: "", + }); + updateView(); + document.getElementById("message").value = ""; + document.getElementById("btn_send").disabled = true; + options.prompt = generatePrompt(); + await call_llama(options); + } else { + document.getElementById("conversation_view").innerText = + "please, insert valid props."; + } +} + +)"; diff --git a/examples/server-parallel/server.cpp b/examples/server-parallel/server.cpp new file mode 100644 index 0000000000000..6bd6dadc9d459 --- /dev/null +++ b/examples/server-parallel/server.cpp @@ -0,0 +1,871 @@ +#include +#include "../server/httplib.h" +#include "../server/json.hpp" +#include +#include +#include +#include +#include "frontend.h" +#include "common.h" +#include "llama.h" + +using namespace httplib; +using namespace std; +using namespace nlohmann; + +struct server_params +{ + std::string hostname = "127.0.0.1"; + std::string public_path = "examples/server/public"; + int32_t port = 8080; + int32_t read_timeout = 600; + int32_t write_timeout = 600; +}; + +// utils functions taken of examples/server + +static bool ends_with(const std::string &str, const std::string &suffix) +{ + return str.size() >= suffix.size() && + 0 == str.compare(str.size() - suffix.size(), suffix.size(), suffix); +} + +static size_t find_partial_stop_string(const std::string &stop, + const std::string &text) +{ + if (!text.empty() && !stop.empty()) + { + const char text_last_char = text.back(); + for (int64_t char_index = stop.size() - 1; char_index >= 0; char_index--) + { + if (stop[char_index] == text_last_char) + { + const std::string current_partial = stop.substr(0, char_index + 1); + if (ends_with(text, current_partial)) + { + return text.size() - char_index - 1; + } + } + } + } + return std::string::npos; +} + +enum stop_type +{ + STOP_FULL, + STOP_PARTIAL, +}; + +enum slot_state +{ + IDLE, + PROCESSING +}; + +enum slot_command { + NONE, + LOAD_PROMPT, + RELEASE +}; + + +struct llama_client_slot +{ + int id; + int32_t n_prompt = 0; + int32_t n_decoded = 0; + int32_t i_batch = -1; + string prompt = ""; + string sampled_token_str; + string generated_text = ""; + llama_token sampled; + std::vector tokens_prev; + slot_state state = IDLE; + slot_command command = NONE; + bool newToken = false; + float temperature = 0.1f; + + void start(string prompt_, float temp_) { + prompt = prompt_; + command = LOAD_PROMPT; + temperature = temp_; + newToken = false; + } + + bool hasNewToken() { + if(newToken) { + newToken = false; + return true; + } + return false; + } + + bool available() { + return state == IDLE && command == NONE; + } + + void nofity() { + newToken = !newToken; + } + + void release() { + if(state == PROCESSING) { + command = RELEASE; + } + } +}; + +struct server_parallel_context { + // example props + vector slots; + std::string system_prompt = ""; + bool update_system_prompt = true; + + // broadcast to all clients to keep the same prompt format + std::string user_name = ""; // this should be the anti prompt + std::string assistant_name = ""; // this is for generate the prompt + + // llama native props + gpt_params params; + llama_model *model = NULL; + llama_context *ctx = NULL; + int n_ctx; + int n_vocab; + std::vector candidates; + std::vector tokens_system; + int32_t n_tokens_system = 0; + llama_batch batch; + + bool loadModel(gpt_params params_) { + params = params_; + std::tie(model, ctx) = 
llama_init_from_gpt_params(params); + if (model == nullptr) + { + LOG_TEE("unable to load model: %s", params.model.c_str()); + return false; + } + n_ctx = llama_n_ctx(ctx); + n_vocab = llama_n_vocab(model); + candidates.reserve(n_vocab); + return true; + } + + void initialize() { + // create slots + LOG_TEE("Available slots:\n"); + for (int i = 0; i < params.n_parallel; i++) + { + llama_client_slot slot; + slot.id = i; + slot.prompt = "default"; + slot.state = IDLE; + slot.tokens_prev.resize(std::max(256, params.n_predict)); + std::fill(slot.tokens_prev.begin(), slot.tokens_prev.end(), 0); + LOG_TEE(" - slot %i\n", slot.id); + slots.push_back(slot); + } + batch = llama_batch_init(params.n_ctx, 0); + + // always assign a default system prompt + system_prompt = system_prompt_default; + user_name = "User:"; + assistant_name = "Assistant:"; + params.antiprompt.push_back(user_name); + } + + void updateSystemPrompt() { + tokens_system = ::llama_tokenize(ctx, system_prompt, true); + n_tokens_system = tokens_system.size(); + + batch.n_tokens = n_tokens_system; + + // clear the entire KV cache + for (int i = 0; i < params.n_parallel; ++i) + { + llama_kv_cache_seq_rm(ctx, i, 0, -1); + } + + for (int32_t i = 0; i < batch.n_tokens; ++i) + { + batch.token[i] = tokens_system[i]; + batch.pos[i] = i; + batch.seq_id[i] = 0; + batch.logits[i] = false; + } + + if (llama_decode(ctx, batch) != 0) + { + LOG_TEE("%s: llama_decode() failed\n", __func__); + return; + } + + // assign the system KV cache to all parallel sequences + for (int32_t i = 1; i < params.n_parallel; ++i) + { + llama_kv_cache_seq_cp(ctx, 0, i, 0, n_tokens_system); + } + + LOG_TEE("system prompt updated\n"); + update_system_prompt = false; + } + + void notifySystemPromptChanged() { + // release all slots + for (llama_client_slot &slot : slots) + { + slot.release(); + } + waitAllAreIdle(); + // wait until system prompt load + update_system_prompt = true; + while(update_system_prompt) { + this_thread::sleep_for(chrono::milliseconds(5)); + } + // system prompt loaded, continue + } + + llama_client_slot* requestCompletion(json data) { + if(data.contains("system_prompt") && + data.contains("anti_prompt") && + data.contains("assistant_name")) { + system_prompt = data.value("system_prompt", ""); + user_name = data.value("anti_prompt", ""); + assistant_name = data.value("assistant_name", ""); + params.antiprompt.clear(); + params.antiprompt.push_back(user_name); + notifySystemPromptChanged(); + } + int slot_id = data.value("slot_id", -1); + float temperature = data.value("temperature", 0.1f); + string prompt = data.value("prompt", ""); + for (llama_client_slot & slot : slots) + { + if ( + slot_id == -1 && slot.available() || + slot.id == slot_id) + { + slot.start(prompt, temperature); + LOG_TEE("slot %i is processing\n", slot.id); + return &slot; // return a pointer to slot (thread safe?) + } + } + return nullptr; + } + + size_t findStoppingStrings(const std::string &text, const size_t last_token_size, + const stop_type type) + { + size_t stop_pos = std::string::npos; + for (const std::string &word : params.antiprompt) + { + size_t pos; + if (type == STOP_FULL) + { + const size_t tmp = word.size() + last_token_size; + const size_t from_pos = text.size() > tmp ? 
text.size() - tmp : 0; + pos = text.find(word, from_pos); + } + else + { + pos = find_partial_stop_string(word, text); + } + if (pos != std::string::npos && + (stop_pos == std::string::npos || pos < stop_pos)) + { + stop_pos = pos; + } + } + return stop_pos; + } + + void waitAllAreIdle() { + bool wait = true; + while(wait) { + wait = false; + for (auto &slot : slots) + { + if (!slot.available()) + { + wait = true; + break; + } + } + } + } + + bool updateSlots() { + // update the system prompt wait until all slots are idle state + if(update_system_prompt) { + updateSystemPrompt(); + } + + batch.n_tokens = 0; + + // decode any currently ongoing sequences + for (auto & slot : slots) { + if (slot.state == PROCESSING && slot.command == RELEASE) + { + LOG_TEE("slot %i released\n", slot.id); + llama_kv_cache_seq_rm(ctx, slot.id, n_tokens_system, n_ctx); + slot.state = IDLE; + slot.command = NONE; + continue; + } + + // no decode wait until the token had been send to client + // improves performance and avoid decoherence? + + if (slot.state == IDLE || slot.newToken) { + continue; + } + + batch.token [batch.n_tokens] = slot.sampled; + batch.pos [batch.n_tokens] = n_tokens_system + slot.n_prompt + slot.n_decoded; + batch.seq_id[batch.n_tokens] = slot.id; + batch.logits[batch.n_tokens] = true; + + slot.n_decoded += 1; + slot.i_batch = batch.n_tokens; + + batch.n_tokens += 1; + } + + // assign workload to the slots + if (params.cont_batching || batch.n_tokens == 0) { + for (llama_client_slot & slot : slots) { + // need process the prompt + if (slot.state == IDLE && slot.command == LOAD_PROMPT) { + slot.state = PROCESSING; + slot.command = NONE; + //LOG_TEE("slot %i process prompt:\n%s%s'------------------------------\n", slot.id, system_prompt.c_str(), slot.prompt.c_str()); + std::fill(slot.tokens_prev.begin(), slot.tokens_prev.end(), 0); + + // do not prepend BOS because we have a system prompt! 
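+                    // note: the prompt tokens are appended right after the shared system prompt,
+                    // so their cache positions below start at n_tokens_system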
+ std::vector tokens_prompt; + tokens_prompt = ::llama_tokenize(ctx, slot.prompt, false); + + for (size_t i = 0; i < tokens_prompt.size(); ++i) { + batch.token [batch.n_tokens] = tokens_prompt[i]; + batch.pos [batch.n_tokens] = i + n_tokens_system; + batch.seq_id[batch.n_tokens] = slot.id; + batch.logits[batch.n_tokens] = false; + batch.n_tokens += 1; + } + + // extract the logits only for the last token + if (batch.n_tokens > 0) { + batch.logits[batch.n_tokens - 1] = true; + } + + slot.n_prompt = tokens_prompt.size(); + slot.n_decoded = 0; + slot.i_batch = batch.n_tokens - 1; + + // insert new requests one-by-one + //if (cont_batching) { + // break; + //} + } + } + } + + if (batch.n_tokens == 0) { + return true; + } + + // process in chunks of params.n_batch + int32_t n_batch = params.n_batch; + + for (int32_t i = 0; i < (int32_t) batch.n_tokens; i += n_batch) { + // experiment: process in powers of 2 + //if (i + n_batch > (int32_t) batch.n_tokens && n_batch > 32) { + // n_batch /= 2; + // i -= n_batch; + // continue; + //} + + const int32_t n_tokens = std::min(n_batch, (int32_t) (batch.n_tokens - i)); + + llama_batch batch_view = { + n_tokens, + batch.token + i, + nullptr, + batch.pos + i, + batch.seq_id + i, + batch.logits + i, + 0, 0, 0, // unused + }; + + const int ret = llama_decode(ctx, batch_view); + if (ret != 0) { + if (n_batch == 1 || ret < 0) { + // if you get here, it means the KV cache is full - try increasing it via the context size + LOG_TEE("%s : failed to decode the batch, n_batch = %d, ret = %d\n", __func__, n_batch, ret); + return false; + } + + LOG("%s : failed to decode the batch, retrying with n_batch = %d\n", __func__, n_batch / 2); + + // retry with half the batch size to try to find a free slot in the KV cache + n_batch /= 2; + i -= n_batch; + continue; + } + + for (auto & slot : slots) { + if (slot.i_batch < (int) i || slot.i_batch >= (int) (i + n_tokens)) { + continue; + } + + params.temp = slot.temperature; + const llama_token id = llama_sample_token(ctx, NULL, NULL, params, slot.tokens_prev, candidates, slot.i_batch - i); + + // remember which tokens were sampled - used for repetition penalties during sampling + slot.tokens_prev.erase(slot.tokens_prev.begin()); + slot.tokens_prev.push_back(id); + + const std::string token_str = llama_token_to_piece(ctx, id); + slot.generated_text += token_str; + slot.sampled = id; + + size_t pos = 0; + + size_t stop_pos = + findStoppingStrings(slot.generated_text, token_str.size(), STOP_FULL); + + slot.sampled_token_str = token_str; + // notify new token + slot.nofity(); + + if (slot.n_decoded > 2 && + (id == llama_token_eos(ctx) || + (params.n_predict > 0 && + slot.n_decoded + slot.n_prompt >= + params.n_predict) || + stop_pos != std::string::npos)) { + //LOG_TEE("slot %i generated text:\n%s'------------------------------\n", slot.id, slot.generated_text.c_str()); + slot.generated_text.clear(); + slot.release(); + } + slot.i_batch = -1; + } + } + return true; + } +}; + +static void server_print_usage(const char *argv0, const gpt_params ¶ms, + const server_params &sparams) +{ + printf("usage: %s [options]\n", argv0); + printf("\n"); + printf("options:\n"); + printf(" -h, --help show this help message and exit\n"); + printf(" -t N, --threads N number of threads to use during computation (default: %d)\n", params.n_threads); + printf(" -c N, --ctx-size N size of the prompt context (default: %d)\n", params.n_ctx); + printf(" --rope-freq-base N RoPE base frequency (default: loaded from model)\n"); + printf(" --rope-freq-scale N 
RoPE frequency scaling factor (default: loaded from model)\n"); + printf(" -b N, --batch-size N batch size for prompt processing (default: %d)\n", params.n_batch); + printf(" --memory-f32 use f32 instead of f16 for memory key+value (default: disabled)\n"); + printf(" not recommended: doubles context memory required and no measurable increase in quality\n"); + if (llama_mlock_supported()) + { + printf(" --mlock force system to keep model in RAM rather than swapping or compressing\n"); + } + if (llama_mmap_supported()) + { + printf(" --no-mmap do not memory-map model (slower load but may reduce pageouts if not using mlock)\n"); + } + printf(" --numa attempt optimizations that help on some NUMA systems\n"); +#ifdef LLAMA_SUPPORTS_GPU_OFFLOAD + printf(" -ngl N, --n-gpu-layers N\n"); + printf(" number of layers to store in VRAM\n"); + printf(" -ts SPLIT --tensor-split SPLIT\n"); + printf(" how to split tensors across multiple GPUs, comma-separated list of proportions, e.g. 3,1\n"); + printf(" -mg i, --main-gpu i the GPU to use for scratch and small tensors\n"); + printf(" -nommq, --no-mul-mat-q\n"); + printf(" use cuBLAS instead of custom mul_mat_q CUDA kernels.\n"); + printf(" Not recommended since this is both slower and uses more VRAM.\n"); +#endif + printf(" -m FNAME, --model FNAME\n"); + printf(" model path (default: %s)\n", params.model.c_str()); + printf(" -a ALIAS, --alias ALIAS\n"); + printf(" set an alias for the model, will be added as `model` field in completion response\n"); + printf(" --lora FNAME apply LoRA adapter (implies --no-mmap)\n"); + printf(" --lora-base FNAME optional model to use as a base for the layers modified by the LoRA adapter\n"); + printf(" --host ip address to listen (default (default: %s)\n", sparams.hostname.c_str()); + printf(" --port PORT port to listen (default (default: %d)\n", sparams.port); + printf(" --path PUBLIC_PATH path from which to serve static files (default %s)\n", sparams.public_path.c_str()); + printf(" -to N, --timeout N server read/write timeout in seconds (default: %d)\n", sparams.read_timeout); + + // new arguments + printf(" -np N, --parallel N number of parallel sequences to decode (default: %d)\n", params.n_parallel); + printf(" -cb, --cont-batching enable continuous batching (a.k.a dynamic batching) (default: disabled)\n"); + printf(" -f FNAME, --file FNAME\n"); + printf(" load a system prompt from a file.\n"); + printf("\n"); +} + +static void server_params_parse(int argc, char **argv, server_params &sparams, + gpt_params ¶ms) +{ + gpt_params default_params; + server_params default_sparams; + std::string arg; + bool invalid_param = false; + + for (int i = 1; i < argc; i++) + { + arg = argv[i]; + if (arg == "--port") + { + if (++i >= argc) + { + invalid_param = true; + break; + } + sparams.port = std::stoi(argv[i]); + } + else if (arg == "--host") + { + if (++i >= argc) + { + invalid_param = true; + break; + } + sparams.hostname = argv[i]; + } + else if (arg == "--path") + { + if (++i >= argc) + { + invalid_param = true; + break; + } + sparams.public_path = argv[i]; + } + else if (arg == "--timeout" || arg == "-to") + { + if (++i >= argc) + { + invalid_param = true; + break; + } + sparams.read_timeout = std::stoi(argv[i]); + sparams.write_timeout = std::stoi(argv[i]); + } + else if (arg == "-m" || arg == "--model") + { + if (++i >= argc) + { + invalid_param = true; + break; + } + params.model = argv[i]; + } + else if (arg == "-a" || arg == "--alias") + { + if (++i >= argc) + { + invalid_param = true; + break; + } + params.model_alias 
= argv[i]; + } + else if (arg == "-h" || arg == "--help") + { + server_print_usage(argv[0], default_params, default_sparams); + exit(0); + } + else if (arg == "-c" || arg == "--ctx-size" || arg == "--ctx_size") + { + if (++i >= argc) + { + invalid_param = true; + break; + } + params.n_ctx = std::stoi(argv[i]); + } + else if (arg == "--rope-freq-base") + { + if (++i >= argc) + { + invalid_param = true; + break; + } + params.rope_freq_base = std::stof(argv[i]); + } + else if (arg == "--rope-freq-scale") + { + if (++i >= argc) + { + invalid_param = true; + break; + } + params.rope_freq_scale = std::stof(argv[i]); + } + else if (arg == "--memory-f32" || arg == "--memory_f32") + { + params.memory_f16 = false; + } + else if (arg == "--threads" || arg == "-t") + { + if (++i >= argc) + { + invalid_param = true; + break; + } + params.n_threads = std::stoi(argv[i]); + } + else if (arg == "-b" || arg == "--batch-size") + { + if (++i >= argc) + { + invalid_param = true; + break; + } + params.n_batch = std::stoi(argv[i]); + params.n_batch = std::min(512, params.n_batch); + } + else if (arg == "--gpu-layers" || arg == "-ngl" || arg == "--n-gpu-layers") + { + if (++i >= argc) + { + invalid_param = true; + break; + } +#ifdef LLAMA_SUPPORTS_GPU_OFFLOAD + params.n_gpu_layers = std::stoi(argv[i]); +#else + LOG_TEE("Not compiled with GPU offload support, --n-gpu-layers option will be ignored. " + "See main README.md for information on enabling GPU BLAS support\n"); +#endif + } + else if (arg == "--tensor-split" || arg == "-ts") + { + if (++i >= argc) + { + invalid_param = true; + break; + } +#ifdef GGML_USE_CUBLAS + std::string arg_next = argv[i]; + + // split string by , and / + const std::regex regex{R"([,/]+)"}; + std::sregex_token_iterator it{arg_next.begin(), arg_next.end(), regex, -1}; + std::vector split_arg{it, {}}; + GGML_ASSERT(split_arg.size() <= LLAMA_MAX_DEVICES); + + for (size_t i_device = 0; i_device < LLAMA_MAX_DEVICES; ++i_device) + { + if (i_device < split_arg.size()) + { + params.tensor_split[i_device] = std::stof(split_arg[i_device]); + } + else + { + params.tensor_split[i_device] = 0.0f; + } + } +#else + LOG_TEE("llama.cpp was compiled without cuBLAS. It is not possible to set a tensor split.\n"); +#endif // GGML_USE_CUBLAS + } + else if (arg == "--no-mul-mat-q" || arg == "-nommq") + { +#ifdef GGML_USE_CUBLAS + params.mul_mat_q = false; +#else + LOG_TEE("warning: llama.cpp was compiled without cuBLAS. Disabling mul_mat_q kernels has no effect.\n"); +#endif // GGML_USE_CUBLAS + } + else if (arg == "--main-gpu" || arg == "-mg") + { + if (++i >= argc) + { + invalid_param = true; + break; + } +#ifdef GGML_USE_CUBLAS + params.main_gpu = std::stoi(argv[i]); +#else + LOG_TEE("llama.cpp was compiled without cuBLAS. 
It is not possible to set a main GPU."); +#endif + } + else if (arg == "--lora") + { + if (++i >= argc) + { + invalid_param = true; + break; + } + params.lora_adapter.push_back({argv[i], 1.0f}); + params.use_mmap = false; + } + else if (arg == "--lora-scaled") + { + if (++i >= argc) + { + invalid_param = true; + break; + } + const char * lora_adapter = argv[i]; + if (++i >= argc) + { + invalid_param = true; + break; + } + params.lora_adapter.push_back(make_tuple(lora_adapter, std::stof(argv[i]))); + params.use_mmap = false; + } + else if (arg == "--lora-base") + { + if (++i >= argc) + { + invalid_param = true; + break; + } + params.lora_base = argv[i]; + } + else if (arg == "--mlock") + { + params.use_mlock = true; + } + else if (arg == "--no-mmap") + { + params.use_mmap = false; + } + else if (arg == "--numa") + { + params.numa = true; + } else if (arg == "-cb" || arg == "--cont-batching") { + params.cont_batching = true; + } else if (arg == "-np" || arg == "--parallel") { + if (++i >= argc) { + invalid_param = true; + break; + } + params.n_parallel = std::stoi(argv[i]); + } else if (arg == "-n" || arg == "--n-predict") { + if (++i >= argc) { + invalid_param = true; + break; + } + params.n_predict = std::stoi(argv[i]); + } + else + { + fprintf(stderr, "error: unknown argument: %s\n", arg.c_str()); + server_print_usage(argv[0], default_params, default_sparams); + exit(1); + } + } + + if (invalid_param) + { + fprintf(stderr, "error: invalid parameter for argument: %s\n", arg.c_str()); + server_print_usage(argv[0], default_params, default_sparams); + exit(1); + } +} + +int main(int argc, char **argv) +{ + gpt_params params; + + server_params sparams; + + server_params_parse(argc, argv, sparams, params); + +#ifndef LOG_DISABLE_LOGS + log_set_target(log_filename_generator("server-parallel", "log")); + LOG_TEE("Log start\n"); + log_dump_cmdline(argc, argv); +#endif // LOG_DISABLE_LOGS + + llama_backend_init(params.numa); + + // load the target model + params.logits_all = true; + server_parallel_context llama; + + if(!llama.loadModel(params)) { + return 1; + } + + llama.initialize(); + + Server svr; + + svr.Get("/", [&](const Request & /*req*/, Response &res) + { res.set_content(index_html_, "text/html"); }); + + svr.Get("/index.js", [&](const Request & /*req*/, Response &res) + { res.set_content(index_js_, "text/html"); }); + + svr.Get("/props", [&llama](const Request & /*req*/, Response &res) + { + json data = { + { "user_name", llama.user_name.c_str() }, + { "assistant_name", llama.assistant_name.c_str() } + }; + res.set_content(data.dump(), "application/json"); }); + svr.Post("/completion", [&llama](const Request &req, Response &res) + { + llama_client_slot* slot = llama.requestCompletion(json::parse(req.body)); + // Verify if the slot exist + if (slot) { + res.set_chunked_content_provider("text/event-stream", + [slot](size_t /*offset*/, DataSink &sink) { + if(slot->available()) { // slot has been released + sink.done(); + return false; + } + if(slot->hasNewToken()) { // new token notification + stringstream ss; + json res_d = {{ "content", slot->sampled_token_str }}; + ss << "data: " << res_d.dump() << "\n\n"; + string result = ss.str(); + if(!sink.write(result.c_str(), result.size())) { + slot->release(); + return false; + } + } + return true; + }); + } else { + LOG_TEE("slot unavailable\n"); + res.status = 404; + res.set_content("slot_error", "text/plain"); + } }); + + thread t([&llama]() + { + bool running = true; + while (running) + { + running = llama.updateSlots(); + } }); + + 
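+    // all llama_decode() calls happen on the update thread created above; the HTTP handlers
+    // only set slot state/commands and stream back tokens as they are sampled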
svr.set_read_timeout(sparams.read_timeout); + svr.set_write_timeout(sparams.write_timeout); + + if (!svr.bind_to_port(sparams.hostname, sparams.port)) + { + fprintf(stderr, "\ncouldn't bind to server socket: hostname=%s port=%d\n\n", sparams.hostname.c_str(), sparams.port); + return 1; + } + + // Set the base directory for serving static files + svr.set_base_dir(sparams.public_path); + + // to make it ctrl+clickable: + printf("\nllama server listening at http://%s:%d\n\n", sparams.hostname.c_str(), sparams.port); + + if (!svr.listen_after_bind()) + { + return 1; + } +} From eb75395b5c99256951e82e067a8087bbc19620a0 Mon Sep 17 00:00:00 2001 From: FSSRepo Date: Thu, 5 Oct 2023 15:18:47 -0400 Subject: [PATCH 02/19] remove trail whitespace --- examples/server-parallel/server.cpp | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/examples/server-parallel/server.cpp b/examples/server-parallel/server.cpp index 6bd6dadc9d459..2b12fc1004748 100644 --- a/examples/server-parallel/server.cpp +++ b/examples/server-parallel/server.cpp @@ -205,7 +205,7 @@ struct server_parallel_context { { llama_kv_cache_seq_cp(ctx, 0, i, 0, n_tokens_system); } - + LOG_TEE("system prompt updated\n"); update_system_prompt = false; } @@ -801,7 +801,7 @@ int main(int argc, char **argv) svr.Get("/", [&](const Request & /*req*/, Response &res) { res.set_content(index_html_, "text/html"); }); - + svr.Get("/index.js", [&](const Request & /*req*/, Response &res) { res.set_content(index_js_, "text/html"); }); @@ -812,6 +812,7 @@ int main(int argc, char **argv) { "assistant_name", llama.assistant_name.c_str() } }; res.set_content(data.dump(), "application/json"); }); + svr.Post("/completion", [&llama](const Request &req, Response &res) { llama_client_slot* slot = llama.requestCompletion(json::parse(req.body)); From afc09db51ccd782500eee374db21f7dbd585b264 Mon Sep 17 00:00:00 2001 From: FSSRepo Date: Thu, 5 Oct 2023 15:23:58 -0400 Subject: [PATCH 03/19] fix json format README --- examples/server-parallel/README.md | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/examples/server-parallel/README.md b/examples/server-parallel/README.md index b4181f28be759..5f8b8af8eeffc 100644 --- a/examples/server-parallel/README.md +++ b/examples/server-parallel/README.md @@ -48,16 +48,9 @@ The above command will start a server that by default listens on `127.0.0.1:8080 *Example request:* ```json { - // this changes the system prompt on runtime - "system_prompt": "A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. - -Human: Hello -Assistant: Hi, how may I help you? -Human:", + "system_prompt": "A chat between a curious user and an artificial intelligence assistant. 
The assistant gives helpful, detailed, and polite answers to the user's questions.\n\nHuman: Hello\nAssistant: Hi, how may I help you?\nHuman:", "anti_prompt": "Human:", "assistant_name": "Assistant:", - - // required options "prompt": "When is the day of independency of US?", "temperature": 0.2 } From 5ab6c2132aad2354092a26c096cc5c8f55801141 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Fri, 6 Oct 2023 14:32:19 +0300 Subject: [PATCH 04/19] server-parallel : add "--reverse-prompt" + compiler warning fixes --- examples/server-parallel/server.cpp | 40 +++++++++++++++++++---------- 1 file changed, 26 insertions(+), 14 deletions(-) diff --git a/examples/server-parallel/server.cpp b/examples/server-parallel/server.cpp index 2b12fc1004748..607d173e62af1 100644 --- a/examples/server-parallel/server.cpp +++ b/examples/server-parallel/server.cpp @@ -1,13 +1,15 @@ -#include +#include "frontend.h" +#include "common.h" +#include "llama.h" + #include "../server/httplib.h" #include "../server/json.hpp" + #include #include #include #include -#include "frontend.h" -#include "common.h" -#include "llama.h" +#include using namespace httplib; using namespace std; @@ -241,9 +243,7 @@ struct server_parallel_context { string prompt = data.value("prompt", ""); for (llama_client_slot & slot : slots) { - if ( - slot_id == -1 && slot.available() || - slot.id == slot_id) + if ((slot_id == -1 && slot.available()) || slot.id == slot_id) { slot.start(prompt, temperature); LOG_TEE("slot %i is processing\n", slot.id); @@ -429,8 +429,6 @@ struct server_parallel_context { slot.generated_text += token_str; slot.sampled = id; - size_t pos = 0; - size_t stop_pos = findStoppingStrings(slot.generated_text, token_str.size(), STOP_FULL); @@ -740,20 +738,34 @@ static void server_params_parse(int argc, char **argv, server_params &sparams, else if (arg == "--numa") { params.numa = true; - } else if (arg == "-cb" || arg == "--cont-batching") { + } else if (arg == "-cb" || arg == "--cont-batching") + { params.cont_batching = true; - } else if (arg == "-np" || arg == "--parallel") { - if (++i >= argc) { + } + else if (arg == "-np" || arg == "--parallel") + { + if (++i >= argc) + { invalid_param = true; break; } params.n_parallel = std::stoi(argv[i]); - } else if (arg == "-n" || arg == "--n-predict") { - if (++i >= argc) { + } else if (arg == "-n" || arg == "--n-predict") + { + if (++i >= argc) + { invalid_param = true; break; } params.n_predict = std::stoi(argv[i]); + } else if (arg == "-r" || arg == "--reverse-prompt") + { + if (++i >= argc) + { + invalid_param = true; + break; + } + params.antiprompt.push_back(argv[i]); } else { From c12e18f2f1110f43f09a00689c880ce034beada4 Mon Sep 17 00:00:00 2001 From: FSSRepo Date: Fri, 6 Oct 2023 09:40:08 -0400 Subject: [PATCH 05/19] httplib.h json.hpp -> common lib --- common/CMakeLists.txt | 2 ++ {examples/server => common}/httplib.h | 0 {examples/server => common}/json.hpp | 0 examples/server-parallel/CMakeLists.txt | 2 +- examples/server-parallel/server.cpp | 4 ++-- examples/server/CMakeLists.txt | 2 +- 6 files changed, 6 insertions(+), 4 deletions(-) rename {examples/server => common}/httplib.h (100%) rename {examples/server => common}/json.hpp (100%) diff --git a/common/CMakeLists.txt b/common/CMakeLists.txt index 951aa8340c7e4..2ce3c7c756f85 100644 --- a/common/CMakeLists.txt +++ b/common/CMakeLists.txt @@ -11,6 +11,8 @@ add_library(${TARGET} OBJECT grammar-parser.cpp train.h train.cpp + json.hpp + httplib.h ) if (BUILD_SHARED_LIBS) diff --git a/examples/server/httplib.h 
b/common/httplib.h similarity index 100% rename from examples/server/httplib.h rename to common/httplib.h diff --git a/examples/server/json.hpp b/common/json.hpp similarity index 100% rename from examples/server/json.hpp rename to common/json.hpp diff --git a/examples/server-parallel/CMakeLists.txt b/examples/server-parallel/CMakeLists.txt index 48c29e9ad18fe..4df8b462fea01 100644 --- a/examples/server-parallel/CMakeLists.txt +++ b/examples/server-parallel/CMakeLists.txt @@ -1,6 +1,6 @@ set(TARGET server-parallel) include_directories(${CMAKE_CURRENT_SOURCE_DIR}) -add_executable(${TARGET} server.cpp ../server/json.hpp ../server/httplib.h) +add_executable(${TARGET} server.cpp) install(TARGETS ${TARGET} RUNTIME) target_compile_definitions(${TARGET} PRIVATE SERVER_VERBOSE=$ diff --git a/examples/server-parallel/server.cpp b/examples/server-parallel/server.cpp index 607d173e62af1..3db0cb7536d4e 100644 --- a/examples/server-parallel/server.cpp +++ b/examples/server-parallel/server.cpp @@ -2,8 +2,8 @@ #include "common.h" #include "llama.h" -#include "../server/httplib.h" -#include "../server/json.hpp" +#include "httplib.h" +#include "json.hpp" #include #include diff --git a/examples/server/CMakeLists.txt b/examples/server/CMakeLists.txt index 3782f9b80ab82..407cf7b5de253 100644 --- a/examples/server/CMakeLists.txt +++ b/examples/server/CMakeLists.txt @@ -1,7 +1,7 @@ set(TARGET server) option(LLAMA_SERVER_VERBOSE "Build verbose logging option for Server" ON) include_directories(${CMAKE_CURRENT_SOURCE_DIR}) -add_executable(${TARGET} server.cpp json.hpp httplib.h) +add_executable(${TARGET} server.cpp) install(TARGETS ${TARGET} RUNTIME) target_compile_definitions(${TARGET} PRIVATE SERVER_VERBOSE=$ From c71d933d5bb0c606ad194a858af8f8d63a037505 Mon Sep 17 00:00:00 2001 From: FSSRepo Date: Fri, 6 Oct 2023 09:53:36 -0400 Subject: [PATCH 06/19] ci: wrong indent style fixed --- common/CMakeLists.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/common/CMakeLists.txt b/common/CMakeLists.txt index 2ce3c7c756f85..ac7189cd2bc30 100644 --- a/common/CMakeLists.txt +++ b/common/CMakeLists.txt @@ -11,8 +11,8 @@ add_library(${TARGET} OBJECT grammar-parser.cpp train.h train.cpp - json.hpp - httplib.h + json.hpp + httplib.h ) if (BUILD_SHARED_LIBS) From cdceda30c9a8a7eccf033b909b8209cfa98d1d98 Mon Sep 17 00:00:00 2001 From: FSSRepo Date: Fri, 6 Oct 2023 10:02:37 -0400 Subject: [PATCH 07/19] added cors middleware --- examples/server-parallel/server.cpp | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/examples/server-parallel/server.cpp b/examples/server-parallel/server.cpp index 3db0cb7536d4e..09dc336ca01d6 100644 --- a/examples/server-parallel/server.cpp +++ b/examples/server-parallel/server.cpp @@ -811,6 +811,10 @@ int main(int argc, char **argv) Server svr; + svr.set_default_headers({{"Server", "llama.cpp"}, + {"Access-Control-Allow-Origin", "*"}, + {"Access-Control-Allow-Headers", "content-type"}}); + svr.Get("/", [&](const Request & /*req*/, Response &res) { res.set_content(index_html_, "text/html"); }); From f0c646f02325e306f9415bc7476f07088e8e05e5 Mon Sep 17 00:00:00 2001 From: FSSRepo Date: Fri, 6 Oct 2023 10:31:14 -0400 Subject: [PATCH 08/19] fix makefile server build --- Makefile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Makefile b/Makefile index b8b0d4b562512..be376c7e29988 100644 --- a/Makefile +++ b/Makefile @@ -517,7 +517,7 @@ OBJS += ggml-alloc.o llama.o: llama.cpp ggml.h ggml-alloc.h ggml-cuda.h ggml-metal.h llama.h $(CXX) $(CXXFLAGS) -c $< -o $@ 
-common.o: common/common.cpp common/common.h build-info.h common/log.h +common.o: common/common.cpp common/common.h build-info.h common/log.h common/httplib.h common/json.hpp $(CXX) $(CXXFLAGS) -c $< -o $@ console.o: common/console.cpp common/console.h @@ -569,7 +569,7 @@ embedding: examples/embedding/embedding.cpp build-info.h ggml. save-load-state: examples/save-load-state/save-load-state.cpp build-info.h ggml.o llama.o common.o $(OBJS) $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) -server: examples/server/server.cpp examples/server/httplib.h examples/server/json.hpp examples/server/index.html.hpp examples/server/index.js.hpp examples/server/completion.js.hpp build-info.h ggml.o llama.o common.o grammar-parser.o $(OBJS) +server: examples/server/server.cpp examples/server/index.html.hpp examples/server/index.js.hpp examples/server/completion.js.hpp build-info.h ggml.o llama.o common.o grammar-parser.o $(OBJS) $(CXX) $(CXXFLAGS) -Iexamples/server $(filter-out %.h,$(filter-out %.hpp,$^)) -o $@ $(LDFLAGS) $(LWINSOCK2) $(LIB_PRE)embdinput$(DSO_EXT): examples/embd-input/embd-input.h examples/embd-input/embd-input-lib.cpp build-info.h ggml.o llama.o common.o $(OBJS) From 6a5d6733fc0d4382c450e5dd2682a9fcfc7cabf6 Mon Sep 17 00:00:00 2001 From: FSSRepo Date: Fri, 6 Oct 2023 11:25:58 -0400 Subject: [PATCH 09/19] log sys - build info + rnd seed --- examples/server-parallel/server.cpp | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/examples/server-parallel/server.cpp b/examples/server-parallel/server.cpp index 09dc336ca01d6..901309206ebde 100644 --- a/examples/server-parallel/server.cpp +++ b/examples/server-parallel/server.cpp @@ -1,6 +1,7 @@ #include "frontend.h" #include "common.h" #include "llama.h" +#include "build-info.h" #include "httplib.h" #include "json.hpp" @@ -797,16 +798,31 @@ int main(int argc, char **argv) log_dump_cmdline(argc, argv); #endif // LOG_DISABLE_LOGS + if (params.seed == LLAMA_DEFAULT_SEED) { + params.seed = time(NULL); + } + + LOG_TEE("%s: seed = %u\n", __func__, params.seed); + llama_backend_init(params.numa); // load the target model params.logits_all = true; server_parallel_context llama; + LOG_TEE("%s: build = %d (%s)\n", __func__, BUILD_NUMBER, BUILD_COMMIT); + LOG_TEE("%s: built with %s for %s\n", __func__, BUILD_COMPILER, BUILD_TARGET); + if(!llama.loadModel(params)) { return 1; } + // print system information + { + LOG_TEE("\n"); + LOG_TEE("%s\n", get_system_info(params).c_str()); + } + llama.initialize(); Server svr; From 2fdc181dcb5e7ea9da1257f5f4bb5a2f4be9cf41 Mon Sep 17 00:00:00 2001 From: FSSRepo Date: Fri, 6 Oct 2023 11:46:51 -0400 Subject: [PATCH 10/19] example added to makefile --- Makefile | 5 ++++- examples/server-parallel/server.cpp | 4 ++-- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/Makefile b/Makefile index be376c7e29988..6c0c3a952e218 100644 --- a/Makefile +++ b/Makefile @@ -1,5 +1,5 @@ # Define the default target now so that it is always the first target -BUILD_TARGETS = main quantize quantize-stats perplexity embedding vdot q8dot train-text-from-scratch convert-llama2c-to-ggml simple batched save-load-state server embd-input-test gguf llama-bench baby-llama beam-search speculative infill benchmark-matmult parallel finetune export-lora tests/test-c.o +BUILD_TARGETS = main quantize quantize-stats perplexity embedding vdot q8dot train-text-from-scratch convert-llama2c-to-ggml simple batched save-load-state server server-parallel embd-input-test gguf llama-bench baby-llama beam-search speculative infill 
benchmark-matmult parallel finetune export-lora tests/test-c.o # Binaries only useful for tests TEST_TARGETS = tests/test-llama-grammar tests/test-grammar-parser tests/test-double-float tests/test-grad0 tests/test-opt tests/test-quantize-fns tests/test-quantize-perf tests/test-sampling tests/test-tokenizer-0-llama tests/test-tokenizer-0-falcon tests/test-tokenizer-1-llama tests/test-tokenizer-1-bpe @@ -572,6 +572,9 @@ save-load-state: examples/save-load-state/save-load-state.cpp build-info.h ggml. server: examples/server/server.cpp examples/server/index.html.hpp examples/server/index.js.hpp examples/server/completion.js.hpp build-info.h ggml.o llama.o common.o grammar-parser.o $(OBJS) $(CXX) $(CXXFLAGS) -Iexamples/server $(filter-out %.h,$(filter-out %.hpp,$^)) -o $@ $(LDFLAGS) $(LWINSOCK2) +server-parallel: examples/server-parallel/server.cpp examples/server-parallel/frontend.h build-info.h ggml.o llama.o common.o $(OBJS) + $(CXX) $(CXXFLAGS) -Iexamples/server-parallel $(filter-out %.h,$(filter-out %.hpp,$^)) -o $@ $(LDFLAGS) $(LWINSOCK2) + $(LIB_PRE)embdinput$(DSO_EXT): examples/embd-input/embd-input.h examples/embd-input/embd-input-lib.cpp build-info.h ggml.o llama.o common.o $(OBJS) $(CXX) --shared $(CXXFLAGS) $(filter-out %.h,$(filter-out %.hpp,$^)) -o $@ $(LDFLAGS) diff --git a/examples/server-parallel/server.cpp b/examples/server-parallel/server.cpp index 901309206ebde..602bb921781ff 100644 --- a/examples/server-parallel/server.cpp +++ b/examples/server-parallel/server.cpp @@ -501,8 +501,8 @@ static void server_print_usage(const char *argv0, const gpt_params ¶ms, // new arguments printf(" -np N, --parallel N number of parallel sequences to decode (default: %d)\n", params.n_parallel); printf(" -cb, --cont-batching enable continuous batching (a.k.a dynamic batching) (default: disabled)\n"); - printf(" -f FNAME, --file FNAME\n"); - printf(" load a system prompt from a file.\n"); + printf(" -r ANTI_PROMPT, --reverse-prompt ANTI_PROMPT\n"); + printf(" set a anti prompt, used as user name in prompt generation\n"); printf("\n"); } From c1ac53fbdb465f1a35e60c4e4069cf6e3f4011fc Mon Sep 17 00:00:00 2001 From: FSSRepo Date: Fri, 6 Oct 2023 14:18:03 -0400 Subject: [PATCH 11/19] improve README + more questions --- examples/server-parallel/README.md | 25 +++++++++++++++++++++++++ examples/server-parallel/frontend.h | 5 +++++ examples/server-parallel/server.cpp | 2 +- 3 files changed, 31 insertions(+), 1 deletion(-) diff --git a/examples/server-parallel/README.md b/examples/server-parallel/README.md index 5f8b8af8eeffc..0244917597d6e 100644 --- a/examples/server-parallel/README.md +++ b/examples/server-parallel/README.md @@ -2,6 +2,31 @@ This example demonstrates a PoC HTTP API server that handles simulataneus requests. Long prompts are not supported. +Command line options: + +- `--threads N`, `-t N`: Set the number of threads to use during generation. +- `-tb N, --threads-batch N`: Set the number of threads to use during batch and prompt processing. If not specified, the number of threads will be set to the number of threads used for generation. +- `-m FNAME`, `--model FNAME`: Specify the path to the LLaMA model file (e.g., `models/7B/ggml-model.gguf`). +- `-m ALIAS`, `--alias ALIAS`: Set an alias for the model. The alias will be returned in API responses. +- `-c N`, `--ctx-size N`: Set the size of the prompt context. The default is 512, but LLaMA models were built with a context of 2048, which will provide better results for longer input/inference. 
The size may differ in other models, for example, baichuan models were build with a context of 4096. +- `-ngl N`, `--n-gpu-layers N`: When compiled with appropriate support (currently CLBlast or cuBLAS), this option allows offloading some layers to the GPU for computation. Generally results in increased performance. +- `-mg i, --main-gpu i`: When using multiple GPUs this option controls which GPU is used for small tensors for which the overhead of splitting the computation across all GPUs is not worthwhile. The GPU in question will use slightly more VRAM to store a scratch buffer for temporary results. By default GPU 0 is used. Requires cuBLAS. +- `-ts SPLIT, --tensor-split SPLIT`: When using multiple GPUs this option controls how large tensors should be split across all GPUs. `SPLIT` is a comma-separated list of non-negative values that assigns the proportion of data that each GPU should get in order. For example, "3,2" will assign 60% of the data to GPU 0 and 40% to GPU 1. By default the data is split in proportion to VRAM but this may not be optimal for performance. Requires cuBLAS. +- `-b N`, `--batch-size N`: Set the batch size for prompt processing. Default: `512`. +- `--memory-f32`: Use 32-bit floats instead of 16-bit floats for memory key+value. Not recommended. +- `--mlock`: Lock the model in memory, preventing it from being swapped out when memory-mapped. +- `--no-mmap`: Do not memory-map the model. By default, models are mapped into memory, which allows the system to load only the necessary parts of the model as needed. +- `--numa`: Attempt optimizations that help on some NUMA systems. +- `--lora FNAME`: Apply a LoRA (Low-Rank Adaptation) adapter to the model (implies --no-mmap). This allows you to adapt the pretrained model to specific tasks or domains. +- `--lora-base FNAME`: Optional model to use as a base for the layers modified by the LoRA adapter. This flag is used in conjunction with the `--lora` flag, and specifies the base model for the adaptation. +- `-to N`, `--timeout N`: Server read/write timeout in seconds. Default `600`. +- `--host`: Set the hostname or ip address to listen. Default `127.0.0.1`. +- `--port`: Set the port to listen. Default: `8080`. +- `--path`: path from which to serve static files (default examples/server/public) +- `-np N`, `--parallel N`: Set the number of slots for process requests (default: 1) +- `-cb`, `--cont-batching`: enable continuous batching (a.k.a dynamic batching) (default: disabled) +- `-r ANTI_PROMPT`, `--reverse-prompt ANTI_PROMPT`: Set a anti prompt, used as user name in prompt generation + ## Quick Start To get started right away, run the following command, making sure to use the correct path for the model you have: diff --git a/examples/server-parallel/frontend.h b/examples/server-parallel/frontend.h index e3656ea218da0..b6909c0ff821e 100644 --- a/examples/server-parallel/frontend.h +++ b/examples/server-parallel/frontend.h @@ -61,6 +61,11 @@ const questions = [ "How to get a job at google?", "What are you?", "When was born Abraham Lincoln?", + "What is a black hole?", + "How to prepare for an interview?", + "Will you destroy the humanity?", + "Who is smarter, you or I?", + "What is quantization in ML?" 
]; let user_name = ""; diff --git a/examples/server-parallel/server.cpp b/examples/server-parallel/server.cpp index 602bb921781ff..72ec7c2bc85e6 100644 --- a/examples/server-parallel/server.cpp +++ b/examples/server-parallel/server.cpp @@ -499,7 +499,7 @@ static void server_print_usage(const char *argv0, const gpt_params ¶ms, printf(" -to N, --timeout N server read/write timeout in seconds (default: %d)\n", sparams.read_timeout); // new arguments - printf(" -np N, --parallel N number of parallel sequences to decode (default: %d)\n", params.n_parallel); + printf(" -np N, --parallel N number of slots for process requests (default: %d)\n", params.n_parallel); printf(" -cb, --cont-batching enable continuous batching (a.k.a dynamic batching) (default: disabled)\n"); printf(" -r ANTI_PROMPT, --reverse-prompt ANTI_PROMPT\n"); printf(" set a anti prompt, used as user name in prompt generation\n"); From a8435c3e32a8ce29844c1db2cc80cb5314ca517f Mon Sep 17 00:00:00 2001 From: FSSRepo Date: Fri, 6 Oct 2023 18:22:07 -0400 Subject: [PATCH 12/19] improved token gen logic and limits --- examples/server-parallel/server.cpp | 70 ++++++++++++++--------------- 1 file changed, 34 insertions(+), 36 deletions(-) diff --git a/examples/server-parallel/server.cpp b/examples/server-parallel/server.cpp index 72ec7c2bc85e6..dbc361fd31594 100644 --- a/examples/server-parallel/server.cpp +++ b/examples/server-parallel/server.cpp @@ -80,36 +80,33 @@ struct llama_client_slot int32_t n_decoded = 0; int32_t i_batch = -1; string prompt = ""; - string sampled_token_str; string generated_text = ""; + int n_tokens_predicted = 0; llama_token sampled; + std::vector sampled_tokens; std::vector tokens_prev; slot_state state = IDLE; slot_command command = NONE; - bool newToken = false; float temperature = 0.1f; void start(string prompt_, float temp_) { prompt = prompt_; command = LOAD_PROMPT; temperature = temp_; - newToken = false; + LOG_TEE("slot %i is processing\n", id); } bool hasNewToken() { - if(newToken) { - newToken = false; - return true; - } - return false; + return sampled_tokens.size() > 0; } bool available() { return state == IDLE && command == NONE; } - void nofity() { - newToken = !newToken; + void addTokenString(string token) { + sampled_tokens.insert(sampled_tokens.begin(), token); + n_tokens_predicted++; } void release() { @@ -163,7 +160,7 @@ struct server_parallel_context { slot.id = i; slot.prompt = "default"; slot.state = IDLE; - slot.tokens_prev.resize(std::max(256, params.n_predict)); + slot.tokens_prev.resize(params.n_predict); std::fill(slot.tokens_prev.begin(), slot.tokens_prev.end(), 0); LOG_TEE(" - slot %i\n", slot.id); slots.push_back(slot); @@ -247,7 +244,6 @@ struct server_parallel_context { if ((slot_id == -1 && slot.available()) || slot.id == slot_id) { slot.start(prompt, temperature); - LOG_TEE("slot %i is processing\n", slot.id); return &slot; // return a pointer to slot (thread safe?) } } @@ -302,6 +298,7 @@ struct server_parallel_context { } batch.n_tokens = 0; + int kv_cache_free = (n_ctx - n_tokens_system); // decode any currently ongoing sequences for (auto & slot : slots) { @@ -311,13 +308,17 @@ struct server_parallel_context { llama_kv_cache_seq_rm(ctx, slot.id, n_tokens_system, n_ctx); slot.state = IDLE; slot.command = NONE; + slot.n_prompt = 0; + slot.n_tokens_predicted = 0; continue; } + kv_cache_free -= slot.n_prompt; + // no decode wait until the token had been send to client // improves performance and avoid decoherence? 
- if (slot.state == IDLE || slot.newToken) { + if (slot.state == IDLE) { continue; } @@ -339,12 +340,13 @@ struct server_parallel_context { if (slot.state == IDLE && slot.command == LOAD_PROMPT) { slot.state = PROCESSING; slot.command = NONE; - //LOG_TEE("slot %i process prompt:\n%s%s'------------------------------\n", slot.id, system_prompt.c_str(), slot.prompt.c_str()); + std::fill(slot.tokens_prev.begin(), slot.tokens_prev.end(), 0); // do not prepend BOS because we have a system prompt! std::vector tokens_prompt; tokens_prompt = ::llama_tokenize(ctx, slot.prompt, false); + slot.n_tokens_predicted = 0; for (size_t i = 0; i < tokens_prompt.size(); ++i) { batch.token [batch.n_tokens] = tokens_prompt[i]; @@ -362,11 +364,6 @@ struct server_parallel_context { slot.n_prompt = tokens_prompt.size(); slot.n_decoded = 0; slot.i_batch = batch.n_tokens - 1; - - // insert new requests one-by-one - //if (cont_batching) { - // break; - //} } } } @@ -379,13 +376,6 @@ struct server_parallel_context { int32_t n_batch = params.n_batch; for (int32_t i = 0; i < (int32_t) batch.n_tokens; i += n_batch) { - // experiment: process in powers of 2 - //if (i + n_batch > (int32_t) batch.n_tokens && n_batch > 32) { - // n_batch /= 2; - // i -= n_batch; - // continue; - //} - const int32_t n_tokens = std::min(n_batch, (int32_t) (batch.n_tokens - i)); llama_batch batch_view = { @@ -431,18 +421,17 @@ struct server_parallel_context { slot.sampled = id; size_t stop_pos = - findStoppingStrings(slot.generated_text, token_str.size(), STOP_FULL); + findStoppingStrings(slot.generated_text, token_str.size(), STOP_FULL); - slot.sampled_token_str = token_str; - // notify new token - slot.nofity(); + slot.addTokenString(token_str); + + kv_cache_free -= slot.n_tokens_predicted; if (slot.n_decoded > 2 && - (id == llama_token_eos(ctx) || - (params.n_predict > 0 && - slot.n_decoded + slot.n_prompt >= - params.n_predict) || - stop_pos != std::string::npos)) { + (id == llama_token_eos(ctx) || + (slot.n_decoded + slot.n_prompt >= + params.n_predict) || + stop_pos != std::string::npos)) { //LOG_TEE("slot %i generated text:\n%s'------------------------------\n", slot.id, slot.generated_text.c_str()); slot.generated_text.clear(); slot.release(); @@ -450,6 +439,11 @@ struct server_parallel_context { slot.i_batch = -1; } } + + if(kv_cache_free < 0) { + LOG_TEE("\nError: kv cache is full, increase context size."); + return false; + } return true; } }; @@ -759,6 +753,9 @@ static void server_params_parse(int argc, char **argv, server_params &sparams, break; } params.n_predict = std::stoi(argv[i]); + if(params.n_predict <= 128) { // this example don't support long prompts + params.n_predict = 128; + } } else if (arg == "-r" || arg == "--reverse-prompt") { if (++i >= argc) @@ -858,7 +855,8 @@ int main(int argc, char **argv) } if(slot->hasNewToken()) { // new token notification stringstream ss; - json res_d = {{ "content", slot->sampled_token_str }}; + json res_d = {{ "content", slot->sampled_tokens.back() }}; + slot->sampled_tokens.pop_back(); ss << "data: " << res_d.dump() << "\n\n"; string result = ss.str(); if(!sink.write(result.c_str(), result.size())) { From f861ff916d573dc701fcda25e1d13044ed37a323 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sun, 8 Oct 2023 13:54:54 +0300 Subject: [PATCH 13/19] gitignore : server-parallel --- .gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitignore b/.gitignore index 420e0d6d016a2..f98acd707b422 100644 --- a/.gitignore +++ b/.gitignore @@ -53,6 +53,7 @@ models-mnt /result 
 /save-load-state
 /server
+/server-parallel
 /simple
 /batched
 /export-lora

From 8a8535bb6dad549f5fec276e768a3e91c5d14a57 Mon Sep 17 00:00:00 2001
From: FSSRepo
Date: Sun, 8 Oct 2023 22:30:43 -0400
Subject: [PATCH 14/19] fix cors + regen + cancel funcs

---
 examples/server-parallel/frontend.h | 120 +++++++++++++++++++---------
 examples/server-parallel/server.cpp |  45 ++++++++++-
 2 files changed, 122 insertions(+), 43 deletions(-)

diff --git a/examples/server-parallel/frontend.h b/examples/server-parallel/frontend.h
index b6909c0ff821e..8fdc0fb09bb97 100644
--- a/examples/server-parallel/frontend.h
+++ b/examples/server-parallel/frontend.h
@@ -11,14 +11,14 @@ const char* index_html_ = R"(
-    llama.cpp - server parallel PoC
+    llama.cpp - server parallel

-

Server parallel - PoC

+

Server parallel

- +

@@ -39,10 +39,9 @@ const char* index_html_ = R"(

- -
-
- + + +
@@ -52,8 +51,12 @@ const char* index_html_ = R"( )"; const char* index_js_ = R"( - let conversation = []; - let current_message = -1; +let conversation = []; +let current_message = -1; +let request_cancel = false; +let canceled = false; +let running = false; +let slot_id = -1; const questions = [ "Who is Elon Musk?", @@ -71,7 +74,7 @@ const questions = [ let user_name = ""; let assistant_name = ""; -function toggleSP() { +function toggle_system_prompt() { if(document.getElementById("system_promt_cb").checked) { document.getElementById("system_prompt_view").style.display = "block"; } else { @@ -79,16 +82,15 @@ function toggleSP() { } } -function clearSP() { +function clear_sp_props() { document.getElementById("sp_text").value = ""; - document.getElementById("anti_prompt").value = ""; + document.getElementById("user_name").value = ""; document.getElementById("assistant_name").value = ""; } docReady(async () => { document.getElementById("message").value = questions[Math.floor(Math.random() * questions.length)]; - // to keep the same prompt format in all clients const response = await fetch("/props"); if (!response.ok) { @@ -128,28 +130,31 @@ function updateView() { } async function call_llama(options) { - const response = await fetch("/completion", { - method: "POST", - body: JSON.stringify(options), - headers: { - Connection: "keep-alive", - "Content-Type": "application/json", - Accept: "text/event-stream", - }, - }); - - const reader = response.body.getReader(); - let cont = true; - const decoder = new TextDecoder(); - let leftover = ""; // Buffer for partially read lines - try { - let cont = true; - - while (cont) { + controller = new AbortController(); + const response = await fetch("/completion", { + method: "POST", + body: JSON.stringify(options), + headers: { + "Content-Type": "application/json", + Accept: "text/event-stream", + } + }); + const reader = response.body.getReader(); + const decoder = new TextDecoder(); + let leftover = ""; // Buffer for partially read lines + running = true; + while (running) { + // this no disposes the slot + if(request_cancel) { + running = false; + break; + } const result = await reader.read(); if (result.done) { document.getElementById("btn_send").disabled = false; + document.getElementById("btn_cancel").disabled = true; + running = false; break; } @@ -177,8 +182,9 @@ async function call_llama(options) { if (match) { result[match[1]] = match[2]; // since we know this is llama.cpp, let's just decode the json in data - if (result.data) { + if (result.data && !request_cancel) { result.data = JSON.parse(result.data); + slot_id = result.data.slot_id; conversation[current_message].assistant += result.data.content; updateView(); } @@ -211,21 +217,56 @@ function generatePrompt() { return prompt; } -function resetBtn() { +async function resetView() { + if(running) { + await sendCancelSignal(); + } document.getElementById("slot_id").value = "-1"; document.getElementById("temperature").value = "0.1"; document.getElementById("message").value = questions[Math.floor(Math.random() * questions.length)]; + document.getElementById("btn_cancel").disabled = true; + document.getElementById("btn_cancel").innerText = "Cancel"; + document.getElementById("btn_send").disabled = false; document.getElementById("conversation_view").innerHTML = ""; conversation = []; current_message = -1; + canceled = false; +} + +async function sendCancelSignal() { + await fetch( + "/cancel?slot_id=" + slot_id + ); + request_cancel = true; +} + +async function cancel() { + if(!canceled) { + await 
sendCancelSignal(); + document.getElementById("btn_send").disabled = false; + document.getElementById("btn_cancel").innerText = "Regenerate response"; + canceled = true; + } else { + perform(true); + } } -async function perform() { +async function perform(regen) { + if(regen) { + document.getElementById("message").value = conversation.pop().user; + current_message--; + } + request_cancel = false; var slot_id = parseInt(document.getElementById("slot_id").value); var temperature = parseFloat(document.getElementById("temperature").value); var prompt = " " + document.getElementById("message").value; - if (!isNaN(slot_id) && !isNaN(temperature) && prompt.length > 0) { + if (prompt.length > 1 && !isNaN(slot_id) && !isNaN(temperature)) { + if(!regen && canceled) { // use the new message + conversation.pop(); // delete incomplete interaction + current_message--; + } + canceled = false; let options = { slot_id, temperature @@ -243,6 +284,7 @@ async function perform() { current_message = -1; document.getElementById("system_promt_cb").checked = false; document.getElementById("system_promt_cb").dispatchEvent(new Event("change")); + // include system prompt props options.system_prompt = system_prompt; options.anti_prompt = anti_prompt; options.assistant_name = assistant_name_; @@ -257,12 +299,12 @@ async function perform() { updateView(); document.getElementById("message").value = ""; document.getElementById("btn_send").disabled = true; + document.getElementById("btn_cancel").disabled = false; + document.getElementById("btn_cancel").innerText = "Cancel"; options.prompt = generatePrompt(); await call_llama(options); } else { - document.getElementById("conversation_view").innerText = - "please, insert valid props."; + alert("please, insert valid props."); } } - )"; diff --git a/examples/server-parallel/server.cpp b/examples/server-parallel/server.cpp index dbc361fd31594..431211621fbde 100644 --- a/examples/server-parallel/server.cpp +++ b/examples/server-parallel/server.cpp @@ -210,6 +210,16 @@ struct server_parallel_context { update_system_prompt = false; } + bool releaseSlot(int id) { + for(llama_client_slot &slot : slots) { + if(slot.id == id) { + slot.release(); + return true; + } + } + return false; + } + void notifySystemPromptChanged() { // release all slots for (llama_client_slot &slot : slots) @@ -824,9 +834,12 @@ int main(int argc, char **argv) Server svr; - svr.set_default_headers({{"Server", "llama.cpp"}, - {"Access-Control-Allow-Origin", "*"}, - {"Access-Control-Allow-Headers", "content-type"}}); + svr.Options("/(.*)", + [&](const Request & /*req*/, Response &res) { + res.set_header("Access-Control-Allow-Methods", "*"); + res.set_header("Access-Control-Allow-Headers", "content-type"); + res.set_header("Access-Control-Allow-Origin", "*"); + }); svr.Get("/", [&](const Request & /*req*/, Response &res) { res.set_content(index_html_, "text/html"); }); @@ -836,14 +849,36 @@ int main(int argc, char **argv) svr.Get("/props", [&llama](const Request & /*req*/, Response &res) { + res.set_header("Access-Control-Allow-Origin", "*"); json data = { { "user_name", llama.user_name.c_str() }, { "assistant_name", llama.assistant_name.c_str() } }; res.set_content(data.dump(), "application/json"); }); + svr.Get("/cancel", [&llama](const Request & req/*req*/, Response &res) { + res.set_header("Access-Control-Allow-Origin", "*"); + if(req.has_param("slot_id")) { + int slot_id = std::stoi(req.get_param_value("slot_id")); + string result = "done"; + if(!llama.releaseSlot(slot_id)) { + result = "wrong slot ID"; + } 
+ json data = { + { "status", result } + }; + res.set_content(data.dump(), "application/json"); + } else { + json data = { + { "error", "Missing parameter" } + }; + res.set_content(data.dump(), "application/json"); + } + }); + svr.Post("/completion", [&llama](const Request &req, Response &res) { + res.set_header("Access-Control-Allow-Origin", "*"); llama_client_slot* slot = llama.requestCompletion(json::parse(req.body)); // Verify if the slot exist if (slot) { @@ -855,7 +890,9 @@ int main(int argc, char **argv) } if(slot->hasNewToken()) { // new token notification stringstream ss; - json res_d = {{ "content", slot->sampled_tokens.back() }}; + json res_d = { + { "content", slot->sampled_tokens.back() }, + { "slot_id", slot->id }}; slot->sampled_tokens.pop_back(); ss << "data: " << res_d.dump() << "\n\n"; string result = ss.str(); From c8d7b1b8971d9c67c7c1829348f2f1c89b2929c3 Mon Sep 17 00:00:00 2001 From: FSSRepo Date: Sun, 8 Oct 2023 22:36:16 -0400 Subject: [PATCH 15/19] remove useless line --- examples/server-parallel/frontend.h | 1 - 1 file changed, 1 deletion(-) diff --git a/examples/server-parallel/frontend.h b/examples/server-parallel/frontend.h index 8fdc0fb09bb97..23d439677d0f7 100644 --- a/examples/server-parallel/frontend.h +++ b/examples/server-parallel/frontend.h @@ -131,7 +131,6 @@ function updateView() { async function call_llama(options) { try { - controller = new AbortController(); const response = await fetch("/completion", { method: "POST", body: JSON.stringify(options), From 59e7c0c51b4a2c93f8f1be5a0ef64f4d4c0554f5 Mon Sep 17 00:00:00 2001 From: FSSRepo Date: Mon, 9 Oct 2023 07:53:00 -0400 Subject: [PATCH 16/19] fixed cancel + removed useless code --- examples/server-parallel/frontend.h | 43 ++++++++++++----------------- examples/server-parallel/server.cpp | 40 +++++---------------------- 2 files changed, 24 insertions(+), 59 deletions(-) diff --git a/examples/server-parallel/frontend.h b/examples/server-parallel/frontend.h index 23d439677d0f7..abe9eb701d2bc 100644 --- a/examples/server-parallel/frontend.h +++ b/examples/server-parallel/frontend.h @@ -53,10 +53,9 @@ const char* index_html_ = R"( const char* index_js_ = R"( let conversation = []; let current_message = -1; -let request_cancel = false; let canceled = false; -let running = false; let slot_id = -1; +var controller; const questions = [ "Who is Elon Musk?", @@ -92,7 +91,7 @@ docReady(async () => { document.getElementById("message").value = questions[Math.floor(Math.random() * questions.length)]; // to keep the same prompt format in all clients - const response = await fetch("/props"); + const response = await fetch("http://localhost:8080/props"); if (!response.ok) { alert(`HTTP error! 
Status: ${response.status}`); } @@ -131,29 +130,25 @@ function updateView() { async function call_llama(options) { try { - const response = await fetch("/completion", { + controller = new AbortController(); + signal = controller.signal; + const response = await fetch("http://localhost:8080/completion", { method: "POST", body: JSON.stringify(options), headers: { "Content-Type": "application/json", Accept: "text/event-stream", - } + }, + signal: signal }); const reader = response.body.getReader(); const decoder = new TextDecoder(); let leftover = ""; // Buffer for partially read lines - running = true; - while (running) { - // this no disposes the slot - if(request_cancel) { - running = false; - break; - } + while (current_message >= 0) { const result = await reader.read(); if (result.done) { document.getElementById("btn_send").disabled = false; document.getElementById("btn_cancel").disabled = true; - running = false; break; } @@ -181,7 +176,7 @@ async function call_llama(options) { if (match) { result[match[1]] = match[2]; // since we know this is llama.cpp, let's just decode the json in data - if (result.data && !request_cancel) { + if (result.data && current_message >= 0) { result.data = JSON.parse(result.data); slot_id = result.data.slot_id; conversation[current_message].assistant += result.data.content; @@ -194,7 +189,6 @@ async function call_llama(options) { if (e.name !== "AbortError") { console.error("llama error: ", e); } - throw e; } } @@ -213,12 +207,14 @@ function generatePrompt() { prompt += assistant_name + conversation[index].assistant; } } + console.log(prompt) return prompt; } async function resetView() { - if(running) { - await sendCancelSignal(); + if(controller) { + controller.abort(); + controller = null; } document.getElementById("slot_id").value = "-1"; document.getElementById("temperature").value = "0.1"; @@ -233,16 +229,12 @@ async function resetView() { canceled = false; } -async function sendCancelSignal() { - await fetch( - "/cancel?slot_id=" + slot_id - ); - request_cancel = true; -} - async function cancel() { if(!canceled) { - await sendCancelSignal(); + if(controller) { + controller.abort(); + controller = null; + } document.getElementById("btn_send").disabled = false; document.getElementById("btn_cancel").innerText = "Regenerate response"; canceled = true; @@ -256,7 +248,6 @@ async function perform(regen) { document.getElementById("message").value = conversation.pop().user; current_message--; } - request_cancel = false; var slot_id = parseInt(document.getElementById("slot_id").value); var temperature = parseFloat(document.getElementById("temperature").value); var prompt = " " + document.getElementById("message").value; diff --git a/examples/server-parallel/server.cpp b/examples/server-parallel/server.cpp index 431211621fbde..13dd1fcc07b7b 100644 --- a/examples/server-parallel/server.cpp +++ b/examples/server-parallel/server.cpp @@ -210,16 +210,6 @@ struct server_parallel_context { update_system_prompt = false; } - bool releaseSlot(int id) { - for(llama_client_slot &slot : slots) { - if(slot.id == id) { - slot.release(); - return true; - } - } - return false; - } - void notifySystemPromptChanged() { // release all slots for (llama_client_slot &slot : slots) @@ -357,6 +347,7 @@ struct server_parallel_context { std::vector tokens_prompt; tokens_prompt = ::llama_tokenize(ctx, slot.prompt, false); slot.n_tokens_predicted = 0; + slot.sampled_tokens.clear(); for (size_t i = 0; i < tokens_prompt.size(); ++i) { batch.token [batch.n_tokens] = tokens_prompt[i]; @@ 
-856,34 +847,13 @@ int main(int argc, char **argv) }; res.set_content(data.dump(), "application/json"); }); - svr.Get("/cancel", [&llama](const Request & req/*req*/, Response &res) { - res.set_header("Access-Control-Allow-Origin", "*"); - if(req.has_param("slot_id")) { - int slot_id = std::stoi(req.get_param_value("slot_id")); - string result = "done"; - if(!llama.releaseSlot(slot_id)) { - result = "wrong slot ID"; - } - json data = { - { "status", result } - }; - res.set_content(data.dump(), "application/json"); - } else { - json data = { - { "error", "Missing parameter" } - }; - res.set_content(data.dump(), "application/json"); - } - }); - svr.Post("/completion", [&llama](const Request &req, Response &res) { res.set_header("Access-Control-Allow-Origin", "*"); llama_client_slot* slot = llama.requestCompletion(json::parse(req.body)); // Verify if the slot exist if (slot) { - res.set_chunked_content_provider("text/event-stream", - [slot](size_t /*offset*/, DataSink &sink) { + auto content_provider = [slot](size_t /*offset*/, DataSink &sink) { if(slot->available()) { // slot has been released sink.done(); return false; @@ -902,7 +872,11 @@ int main(int argc, char **argv) } } return true; - }); + }; + auto on_complete = [slot] (bool) { + slot->release(); + }; + res.set_chunked_content_provider("text/event-stream", content_provider, on_complete); } else { LOG_TEE("slot unavailable\n"); res.status = 404; From 8d3681ddbe7690317880760d971d2bfe586177c3 Mon Sep 17 00:00:00 2001 From: FSSRepo Date: Mon, 9 Oct 2023 08:00:59 -0400 Subject: [PATCH 17/19] refactored some issues --- examples/server-parallel/frontend.h | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/examples/server-parallel/frontend.h b/examples/server-parallel/frontend.h index abe9eb701d2bc..7c2b8ca7cb48f 100644 --- a/examples/server-parallel/frontend.h +++ b/examples/server-parallel/frontend.h @@ -91,7 +91,7 @@ docReady(async () => { document.getElementById("message").value = questions[Math.floor(Math.random() * questions.length)]; // to keep the same prompt format in all clients - const response = await fetch("http://localhost:8080/props"); + const response = await fetch("/props"); if (!response.ok) { alert(`HTTP error! 
Status: ${response.status}`); } @@ -132,7 +132,7 @@ async function call_llama(options) { try { controller = new AbortController(); signal = controller.signal; - const response = await fetch("http://localhost:8080/completion", { + const response = await fetch("/completion", { method: "POST", body: JSON.stringify(options), headers: { @@ -207,7 +207,6 @@ function generatePrompt() { prompt += assistant_name + conversation[index].assistant; } } - console.log(prompt) return prompt; } From 6a2e064d684637cb7fe5049546619608c9ffcc89 Mon Sep 17 00:00:00 2001 From: FSSRepo Date: Mon, 9 Oct 2023 08:14:42 -0400 Subject: [PATCH 18/19] fix unexpected behavior when multiple requests are canceled --- examples/server-parallel/frontend.h | 3 +-- examples/server-parallel/server.cpp | 3 +-- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/examples/server-parallel/frontend.h b/examples/server-parallel/frontend.h index 7c2b8ca7cb48f..122d58b44556c 100644 --- a/examples/server-parallel/frontend.h +++ b/examples/server-parallel/frontend.h @@ -131,7 +131,6 @@ function updateView() { async function call_llama(options) { try { controller = new AbortController(); - signal = controller.signal; const response = await fetch("/completion", { method: "POST", body: JSON.stringify(options), @@ -139,7 +138,7 @@ async function call_llama(options) { "Content-Type": "application/json", Accept: "text/event-stream", }, - signal: signal + signal: controller.signal }); const reader = response.body.getReader(); const decoder = new TextDecoder(); diff --git a/examples/server-parallel/server.cpp b/examples/server-parallel/server.cpp index 13dd1fcc07b7b..0d98db4de30f6 100644 --- a/examples/server-parallel/server.cpp +++ b/examples/server-parallel/server.cpp @@ -310,6 +310,7 @@ struct server_parallel_context { slot.command = NONE; slot.n_prompt = 0; slot.n_tokens_predicted = 0; + slot.sampled_tokens.clear(); continue; } @@ -346,8 +347,6 @@ struct server_parallel_context { // do not prepend BOS because we have a system prompt! 
std::vector tokens_prompt; tokens_prompt = ::llama_tokenize(ctx, slot.prompt, false); - slot.n_tokens_predicted = 0; - slot.sampled_tokens.clear(); for (size_t i = 0; i < tokens_prompt.size(); ++i) { batch.token [batch.n_tokens] = tokens_prompt[i]; From e86a7d2ebdfc361e6ffb3b838f2bf716bd67aa88 Mon Sep 17 00:00:00 2001 From: FSSRepo Date: Wed, 11 Oct 2023 13:03:35 -0400 Subject: [PATCH 19/19] avoid 100% cpu usage all time --- examples/server-parallel/server.cpp | 34 +++++++++++++++++++---------- 1 file changed, 23 insertions(+), 11 deletions(-) diff --git a/examples/server-parallel/server.cpp b/examples/server-parallel/server.cpp index 0d98db4de30f6..b98477856d3af 100644 --- a/examples/server-parallel/server.cpp +++ b/examples/server-parallel/server.cpp @@ -105,6 +105,10 @@ struct llama_client_slot } void addTokenString(string token) { + if(command == RELEASE) { + sampled_tokens.clear(); + return; + } sampled_tokens.insert(sampled_tokens.begin(), token); n_tokens_predicted++; } @@ -135,6 +139,7 @@ struct server_parallel_context { std::vector candidates; std::vector tokens_system; int32_t n_tokens_system = 0; + bool all_slots_are_idle = false; llama_batch batch; bool loadModel(gpt_params params_) { @@ -172,6 +177,7 @@ struct server_parallel_context { user_name = "User:"; assistant_name = "Assistant:"; params.antiprompt.push_back(user_name); + all_slots_are_idle = true; } void updateSystemPrompt() { @@ -217,6 +223,7 @@ struct server_parallel_context { slot.release(); } waitAllAreIdle(); + all_slots_are_idle = true; // wait until system prompt load update_system_prompt = true; while(update_system_prompt) { @@ -244,6 +251,7 @@ struct server_parallel_context { if ((slot_id == -1 && slot.available()) || slot.id == slot_id) { slot.start(prompt, temperature); + all_slots_are_idle = false; return &slot; // return a pointer to slot (thread safe?) } } @@ -292,6 +300,7 @@ struct server_parallel_context { } bool updateSlots() { + // update the system prompt wait until all slots are idle state if(update_system_prompt) { updateSystemPrompt(); @@ -299,10 +308,13 @@ struct server_parallel_context { batch.n_tokens = 0; int kv_cache_free = (n_ctx - n_tokens_system); - + if(all_slots_are_idle) { + // avoid 100% usage of cpu all time + this_thread::sleep_for(chrono::milliseconds(5)); + } // decode any currently ongoing sequences for (auto & slot : slots) { - if (slot.state == PROCESSING && slot.command == RELEASE) + if (slot.state == PROCESSING && slot.command == RELEASE && !slot.hasNewToken()) { LOG_TEE("slot %i released\n", slot.id); llama_kv_cache_seq_rm(ctx, slot.id, n_tokens_system, n_ctx); @@ -310,16 +322,12 @@ struct server_parallel_context { slot.command = NONE; slot.n_prompt = 0; slot.n_tokens_predicted = 0; - slot.sampled_tokens.clear(); continue; } kv_cache_free -= slot.n_prompt; - // no decode wait until the token had been send to client - // improves performance and avoid decoherence? 
- - if (slot.state == IDLE) { + if (slot.state == IDLE || slot.command == RELEASE) { continue; } @@ -369,6 +377,7 @@ struct server_parallel_context { } if (batch.n_tokens == 0) { + all_slots_are_idle = true; return true; } @@ -853,10 +862,6 @@ int main(int argc, char **argv) // Verify if the slot exist if (slot) { auto content_provider = [slot](size_t /*offset*/, DataSink &sink) { - if(slot->available()) { // slot has been released - sink.done(); - return false; - } if(slot->hasNewToken()) { // new token notification stringstream ss; json res_d = { @@ -869,10 +874,17 @@ int main(int argc, char **argv) slot->release(); return false; } + } else { + this_thread::sleep_for(chrono::milliseconds(5)); + } + if(slot->available()) { // slot has been released + sink.done(); + return false; } return true; }; auto on_complete = [slot] (bool) { + slot->sampled_tokens.clear(); slot->release(); }; res.set_chunked_content_provider("text/event-stream", content_provider, on_complete);
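
To make the streaming hand-off in the patches above easier to follow, here is a small self-contained sketch (not part of the patch series) of how a slot's `sampled_tokens` buffer behaves: the decode loop (`addTokenString`) inserts each detokenized piece at the front, while the chunked content provider pops from the back, so the HTTP client receives tokens in generation order. The `demo_slot` type and the `main` harness are illustrative stand-ins; only the push-front/pop-back pattern mirrors the patched `server.cpp`.

```cpp
// Illustrative sketch only -- not part of the patches. It isolates the FIFO
// behaviour of the slot's sampled_tokens buffer: the producer inserts each
// detokenized string at the front, the consumer takes the oldest one from the back.
#include <iostream>
#include <string>
#include <vector>

struct demo_slot {                       // hypothetical stand-in for llama_client_slot
    std::vector<std::string> sampled_tokens;

    // producer side (decode loop): newest token goes to the front
    void add_token_string(const std::string &token) {
        sampled_tokens.insert(sampled_tokens.begin(), token);
    }

    // consumer side (chunked content provider)
    bool has_new_token() const { return !sampled_tokens.empty(); }

    std::string next_token() {
        std::string token = sampled_tokens.back();  // oldest token first
        sampled_tokens.pop_back();
        return token;
    }
};

int main() {
    demo_slot slot;
    slot.add_token_string("Hello");
    slot.add_token_string(", world");
    slot.add_token_string("!");

    while (slot.has_new_token()) {
        std::cout << slot.next_token();  // prints "Hello, world!" in generation order
    }
    std::cout << "\n";
}
```

A `std::deque` (or a mutex-guarded queue, since the HTTP handler thread and the decode loop both touch this buffer) would express the same FIFO intent more directly; the vector's insert-at-front costs O(n) per token, which is negligible at these sizes but worth noting for larger generations.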