From 68d1d8fe28e9887c7e3eff4f714d3faa7def3081 Mon Sep 17 00:00:00 2001 From: Pierrick HYMBERT Date: Fri, 8 Mar 2024 13:16:16 +0100 Subject: [PATCH 01/14] server: bench: Init a bench scenario with K6 See #5827 --- examples/server/bench/README.md | 64 +++++++++++++++++++++++++ examples/server/bench/script.js | 84 +++++++++++++++++++++++++++++++++ 2 files changed, 148 insertions(+) create mode 100644 examples/server/bench/README.md create mode 100644 examples/server/bench/script.js diff --git a/examples/server/bench/README.md b/examples/server/bench/README.md new file mode 100644 index 0000000000000..049d233174cf8 --- /dev/null +++ b/examples/server/bench/README.md @@ -0,0 +1,64 @@ +### Server benchmark tools + +Benchmark is using [k6](https://k6.io/). + +##### Install k6 - ubuntu +```shell +snap install k6 +``` + +#### Downloading the ShareGPT dataset + +```shell +wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json +``` + +#### Download a model +Example for PHI-2 + +```shell +../../../scripts/hf.sh --repo ggml-org/models --file phi-2/ggml-model-q4_0.gguf +``` + +#### Start the server +The server must listen on `localhost:8080`. + +Example: +```shell +server --host localhost --port 8080 \ + --model ggml-model-q4_0.gguf \ + --cont-batching \ + --metrics \ + --parallel 8 \ + --batch-size 512 \ + --ctx-size 4096 \ + --log-format text \ + -ngl 33 +``` + +#### Run the bench +```shell +k6 run script.js +``` + +#### Change the number of concurrent user +in the `script.js`, change the ramping period according to your number of slots. + +#### Metrics + +Following metrics are available: +- `llamacpp_prompt_tokens` Gauge of OAI response `usage.prompt_tokens` +- `llamacpp_prompt_tokens_total_counter` Counter of OAI response `usage.prompt_tokens` +- `llamacpp_completion_tokens` Gauge of OAI response `usage.completion_tokens` +- `llamacpp_completion_tokens_total_counter` Counter of OAI response `usage.completion_tokens` +- `llamacpp_completions_tokens_seconds` Gauge of `usage.completion_tokens` divided by the request time in second +- `llamacpp_completions_truncated_rate` Rate of completions truncated, i.e. if `finish_reason === 'length'` +- `llamacpp_completions_stop_rate` Rate of completions truncated, i.e. if `finish_reason === 'stop'` + +The script will fail if too many completions are truncated, see `llamacpp_completions_truncated_rate`. + +K6 metrics might be compared against [server metrics](../README.md), with: + +```shell +curl http://localhost:8080/metrics +``` \ No newline at end of file diff --git a/examples/server/bench/script.js b/examples/server/bench/script.js new file mode 100644 index 0000000000000..9d963e49de08b --- /dev/null +++ b/examples/server/bench/script.js @@ -0,0 +1,84 @@ +import http from 'k6/http'; +import { check, sleep } from 'k6'; +import { SharedArray } from 'k6/data'; +import { Counter, Gauge, Rate } from 'k6/metrics'; + +const data = new SharedArray('conversations', function () { + return JSON.parse(open('./ShareGPT_V3_unfiltered_cleaned_split.json')) + + // Filter out the conversations with less than 2 turns. + .filter(data => data["conversations"].length >= 2) + // Only keep the first two turns of each conversation. 
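+        // Array(a, b) with two arguments builds the pair [a, b], i.e. [first turn, second turn]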
+ .map(data => Array(data["conversations"][0]["value"], data["conversations"][1]["value"])); +}); + +const llamacpp_prompt_tokens = new Gauge('llamacpp_prompt_tokens'); +const llamacpp_completion_tokens = new Gauge('llamacpp_completion_tokens'); + +const llamacpp_completions_tokens_seconds = new Gauge('llamacpp_completions_tokens_seconds'); + +const llamacpp_prompt_tokens_total_counter = new Counter('llamacpp_prompt_tokens_total_counter'); +const llamacpp_completion_tokens_total_counter = new Counter('llamacpp_completion_tokens_total_counter'); + +const llamacpp_completions_truncated_rate = new Rate('llamacpp_completions_truncated_rate'); +const llamacpp_completions_stop_rate = new Rate('llamacpp_completions_stop_rate'); + +export const options = { + thresholds: { + llamacpp_completions_truncated_rate: [ + // more than 10% of truncated input will abort the test + { threshold: 'rate < 0.1', abortOnFail: true, delayAbortEval: '1m' }, + ], + }, + scenarios: { + completions: { + executor: 'ramping-vus', + startVUs: 1, + stages: [ + {duration: '1m', target: 8}, + {duration: '3m', target: 8}, + {duration: '1m', target: 0}, + ], + gracefulRampDown: '30s', + }, + }, +}; + +export default function () { + const conversation = data[0] + const payload = { + "messages": [ + { + "role": "system", + "content": conversation[0], + }, + { + "role": "user", + "content": conversation[1], + } + ], + "model": "model", + "stream": false, + } + let res = http.post('http://localhost:8080/v1/chat/completions', JSON.stringify(payload), { + headers: { 'Content-Type': 'application/json' }, + }) + + check(res, {'success completion': (r) => r.status === 200}) + + const completions = res.json() + + llamacpp_prompt_tokens.add(completions.usage.prompt_tokens) + llamacpp_prompt_tokens_total_counter.add(completions.usage.prompt_tokens) + + llamacpp_completion_tokens.add(completions.usage.completion_tokens) + llamacpp_completion_tokens_total_counter.add(completions.usage.completion_tokens) + + llamacpp_completions_tokens_seconds.add(completions.usage.completion_tokens / res.timings.duration * 1e3) + + llamacpp_completions_truncated_rate.add(completions.choices[0].finish_reason === 'length') + llamacpp_completions_stop_rate.add(completions.choices[0].finish_reason === 'stop') + + + sleep(0.3) +} \ No newline at end of file From 0b822b6a0f3cfaaef26a79a8f3134df1749b948a Mon Sep 17 00:00:00 2001 From: Pierrick HYMBERT Date: Fri, 8 Mar 2024 19:49:49 +0100 Subject: [PATCH 02/14] server: bench: EOL EOF --- examples/server/bench/README.md | 2 +- examples/server/bench/script.js | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/server/bench/README.md b/examples/server/bench/README.md index 049d233174cf8..b8edc8587e04a 100644 --- a/examples/server/bench/README.md +++ b/examples/server/bench/README.md @@ -61,4 +61,4 @@ K6 metrics might be compared against [server metrics](../README.md), with: ```shell curl http://localhost:8080/metrics -``` \ No newline at end of file +``` diff --git a/examples/server/bench/script.js b/examples/server/bench/script.js index 9d963e49de08b..c52eb182a885a 100644 --- a/examples/server/bench/script.js +++ b/examples/server/bench/script.js @@ -81,4 +81,4 @@ export default function () { sleep(0.3) -} \ No newline at end of file +} From 548bc9635a8326406f6a0382731902e132a3532d Mon Sep 17 00:00:00 2001 From: Pierrick HYMBERT Date: Sat, 9 Mar 2024 00:13:54 +0100 Subject: [PATCH 03/14] server: bench: PR feedback and improved k6 script configuration --- examples/server/bench/README.md | 27 
++++++++--- examples/server/bench/script.js | 81 +++++++++++++++++++-------------- 2 files changed, 69 insertions(+), 39 deletions(-) diff --git a/examples/server/bench/README.md b/examples/server/bench/README.md index b8edc8587e04a..67367b8101745 100644 --- a/examples/server/bench/README.md +++ b/examples/server/bench/README.md @@ -2,12 +2,18 @@ Benchmark is using [k6](https://k6.io/). -##### Install k6 - ubuntu +##### Install k6 + +Follow instruction from: https://k6.io/docs/get-started/installation/ + +Example for ubuntu: ```shell snap install k6 ``` -#### Downloading the ShareGPT dataset +#### Download a dataset + +This dataset was originally proposed in [vLLM benchmarks](https://github.com/vllm-project/vllm/blob/main/benchmarks/README.md). ```shell wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json @@ -21,7 +27,7 @@ Example for PHI-2 ``` #### Start the server -The server must listen on `localhost:8080`. +The server must answer OAI Chat completion requests on `http://localhost:8080/v1` or according to the environment variable `SERVER_BENCH_URL`. Example: ```shell @@ -36,13 +42,22 @@ server --host localhost --port 8080 \ -ngl 33 ``` -#### Run the bench +#### Run the benchmark + ```shell k6 run script.js ``` -#### Change the number of concurrent user -in the `script.js`, change the ramping period according to your number of slots. +The benchmark values can be overridden with: +- `SERVER_BENCH_URL` server url prefix for chat completions, default `http://localhost:8080/v1` +- `SERVER_BENCH_N_PROMPTS` total prompts to randomly select in the benchmark, default `480` +- `SERVER_BENCH_MODEL_ALIAS` model alias to pass in the completion request, default `my-model` + +Or with [k6 options](https://k6.io/docs/using-k6/k6-options/reference/): + +```shell +SERVER_BENCH_N_PROMPTS=500 k6 run script.js --duration 10m --iterations 500 --vus 8 +``` #### Metrics diff --git a/examples/server/bench/script.js b/examples/server/bench/script.js index c52eb182a885a..e2068fb928c14 100644 --- a/examples/server/bench/script.js +++ b/examples/server/bench/script.js @@ -1,51 +1,58 @@ -import http from 'k6/http'; -import { check, sleep } from 'k6'; -import { SharedArray } from 'k6/data'; -import { Counter, Gauge, Rate } from 'k6/metrics'; +import http from 'k6/http' +import {check, sleep} from 'k6' +import {SharedArray} from 'k6/data' +import {Counter, Gauge, Rate} from 'k6/metrics' -const data = new SharedArray('conversations', function () { - return JSON.parse(open('./ShareGPT_V3_unfiltered_cleaned_split.json')) +// Server chat completions prefix +const server_url = __ENV.SERVER_BENCH_URL ? __ENV.SERVER_BENCH_URL : 'http://localhost:8080/v1' + +// Number of total prompts in the dataset - default 10m / 10 seconds/request * number of users +const n_prompt = __ENV.SERVER_BENCH_N_PROMPTS ? parseInt(__ENV.SERVER_BENCH_N_PROMPTS) : 600 / 10 * 8 + +// Model name to request +const model = __ENV.SERVER_BENCH_MODEL_ALIAS ? __ENV.SERVER_BENCH_MODEL_ALIAS : 'my-model' + +// Dataset path +const dataset_path = __ENV.SERVER_BENCH_DATASET ? __ENV.SERVER_BENCH_DATASET : './ShareGPT_V3_unfiltered_cleaned_split.json' +export function setup() { + console.info(`Benchmark config: server_url=${server_url} n_prompt=${n_prompt} model=${model} dataset_path=${dataset_path}`) +} + +const data = new SharedArray('conversations', function () { + return JSON.parse(open(dataset_path)) // Filter out the conversations with less than 2 turns. 
.filter(data => data["conversations"].length >= 2) // Only keep the first two turns of each conversation. - .map(data => Array(data["conversations"][0]["value"], data["conversations"][1]["value"])); -}); + .map(data => Array(data["conversations"][0]["value"], data["conversations"][1]["value"])) + // Keep only first n prompts + .slice(0, n_prompt) +}) -const llamacpp_prompt_tokens = new Gauge('llamacpp_prompt_tokens'); -const llamacpp_completion_tokens = new Gauge('llamacpp_completion_tokens'); +const llamacpp_prompt_tokens = new Gauge('llamacpp_prompt_tokens') +const llamacpp_completion_tokens = new Gauge('llamacpp_completion_tokens') -const llamacpp_completions_tokens_seconds = new Gauge('llamacpp_completions_tokens_seconds'); +const llamacpp_completions_tokens_seconds = new Gauge('llamacpp_completions_tokens_seconds') -const llamacpp_prompt_tokens_total_counter = new Counter('llamacpp_prompt_tokens_total_counter'); -const llamacpp_completion_tokens_total_counter = new Counter('llamacpp_completion_tokens_total_counter'); +const llamacpp_prompt_tokens_total_counter = new Counter('llamacpp_prompt_tokens_total_counter') +const llamacpp_completion_tokens_total_counter = new Counter('llamacpp_completion_tokens_total_counter') -const llamacpp_completions_truncated_rate = new Rate('llamacpp_completions_truncated_rate'); -const llamacpp_completions_stop_rate = new Rate('llamacpp_completions_stop_rate'); +const llamacpp_completions_truncated_rate = new Rate('llamacpp_completions_truncated_rate') +const llamacpp_completions_stop_rate = new Rate('llamacpp_completions_stop_rate') export const options = { thresholds: { llamacpp_completions_truncated_rate: [ // more than 10% of truncated input will abort the test - { threshold: 'rate < 0.1', abortOnFail: true, delayAbortEval: '1m' }, + {threshold: 'rate < 0.1', abortOnFail: true, delayAbortEval: '1m'}, ], }, - scenarios: { - completions: { - executor: 'ramping-vus', - startVUs: 1, - stages: [ - {duration: '1m', target: 8}, - {duration: '3m', target: 8}, - {duration: '1m', target: 0}, - ], - gracefulRampDown: '30s', - }, - }, -}; + duration: '10m', + vus: 8, +} export default function () { - const conversation = data[0] + const conversation = data[Math.floor(Math.random() * data.length)] const payload = { "messages": [ { @@ -57,15 +64,23 @@ export default function () { "content": conversation[1], } ], - "model": "model", + "model": model, "stream": false, } - let res = http.post('http://localhost:8080/v1/chat/completions', JSON.stringify(payload), { - headers: { 'Content-Type': 'application/json' }, + + const body = JSON.stringify(payload) + + console.debug(`request: ${body}`) + + let res = http.post(`${server_url}/chat/completions`, body, { + headers: {'Content-Type': 'application/json'}, + timeout: '300s' }) check(res, {'success completion': (r) => r.status === 200}) + console.debug(`response: ${res.body}`) + const completions = res.json() llamacpp_prompt_tokens.add(completions.usage.prompt_tokens) From ab0a59d6d32e4a90a9c8099bbb73dea2a225e9bc Mon Sep 17 00:00:00 2001 From: Pierrick HYMBERT Date: Sat, 9 Mar 2024 01:09:56 +0100 Subject: [PATCH 04/14] server: bench: remove llamacpp_completions_tokens_seconds as it include prompt processing time and it's misleading server: bench: add max_tokens from SERVER_BENCH_MAX_TOKENS server: bench: increase truncated rate to 80% before failing --- examples/server/bench/README.md | 5 +++-- examples/server/bench/script.js | 35 ++++++++++++++++++--------------- 2 files changed, 22 insertions(+), 18 deletions(-) diff 
--git a/examples/server/bench/README.md b/examples/server/bench/README.md index 67367b8101745..0c8f6b5161c34 100644 --- a/examples/server/bench/README.md +++ b/examples/server/bench/README.md @@ -44,14 +44,16 @@ server --host localhost --port 8080 \ #### Run the benchmark +For 500 chat completions request with 8 concurrent users during maximum 10 minutes, run: ```shell -k6 run script.js +k6 run script.js --duration 10m --iterations 500 --vus 8 ``` The benchmark values can be overridden with: - `SERVER_BENCH_URL` server url prefix for chat completions, default `http://localhost:8080/v1` - `SERVER_BENCH_N_PROMPTS` total prompts to randomly select in the benchmark, default `480` - `SERVER_BENCH_MODEL_ALIAS` model alias to pass in the completion request, default `my-model` +- `SERVER_BENCH_MAX_TOKENS` max tokens to predict, default: `1024` Or with [k6 options](https://k6.io/docs/using-k6/k6-options/reference/): @@ -66,7 +68,6 @@ Following metrics are available: - `llamacpp_prompt_tokens_total_counter` Counter of OAI response `usage.prompt_tokens` - `llamacpp_completion_tokens` Gauge of OAI response `usage.completion_tokens` - `llamacpp_completion_tokens_total_counter` Counter of OAI response `usage.completion_tokens` -- `llamacpp_completions_tokens_seconds` Gauge of `usage.completion_tokens` divided by the request time in second - `llamacpp_completions_truncated_rate` Rate of completions truncated, i.e. if `finish_reason === 'length'` - `llamacpp_completions_stop_rate` Rate of completions truncated, i.e. if `finish_reason === 'stop'` diff --git a/examples/server/bench/script.js b/examples/server/bench/script.js index e2068fb928c14..fb942d6abf833 100644 --- a/examples/server/bench/script.js +++ b/examples/server/bench/script.js @@ -15,8 +15,11 @@ const model = __ENV.SERVER_BENCH_MODEL_ALIAS ? __ENV.SERVER_BENCH_MODEL_ALIAS : // Dataset path const dataset_path = __ENV.SERVER_BENCH_DATASET ? __ENV.SERVER_BENCH_DATASET : './ShareGPT_V3_unfiltered_cleaned_split.json' +// Max tokens to predict +const max_tokens = __ENV.SERVER_BENCH_MAX_TOKENS ? 
parseInt(__ENV.SERVER_BENCH_MAX_TOKENS) : 512 + export function setup() { - console.info(`Benchmark config: server_url=${server_url} n_prompt=${n_prompt} model=${model} dataset_path=${dataset_path}`) + console.info(`Benchmark config: server_url=${server_url} n_prompt=${n_prompt} model=${model} dataset_path=${dataset_path} max_tokens=${max_tokens}`) } const data = new SharedArray('conversations', function () { @@ -32,8 +35,6 @@ const data = new SharedArray('conversations', function () { const llamacpp_prompt_tokens = new Gauge('llamacpp_prompt_tokens') const llamacpp_completion_tokens = new Gauge('llamacpp_completion_tokens') -const llamacpp_completions_tokens_seconds = new Gauge('llamacpp_completions_tokens_seconds') - const llamacpp_prompt_tokens_total_counter = new Counter('llamacpp_prompt_tokens_total_counter') const llamacpp_completion_tokens_total_counter = new Counter('llamacpp_completion_tokens_total_counter') @@ -43,8 +44,8 @@ const llamacpp_completions_stop_rate = new Rate('llamacpp_completions_stop_rate' export const options = { thresholds: { llamacpp_completions_truncated_rate: [ - // more than 10% of truncated input will abort the test - {threshold: 'rate < 0.1', abortOnFail: true, delayAbortEval: '1m'}, + // more than 80% of truncated input will abort the test + {threshold: 'rate < 0.8', abortOnFail: true, delayAbortEval: '1m'}, ], }, duration: '10m', @@ -66,6 +67,7 @@ export default function () { ], "model": model, "stream": false, + "max_tokens": max_tokens } const body = JSON.stringify(payload) @@ -79,21 +81,22 @@ export default function () { check(res, {'success completion': (r) => r.status === 200}) - console.debug(`response: ${res.body}`) - - const completions = res.json() + if (res.status === 200) { + console.debug(`response: ${res.body}`) - llamacpp_prompt_tokens.add(completions.usage.prompt_tokens) - llamacpp_prompt_tokens_total_counter.add(completions.usage.prompt_tokens) + const completions = res.json() - llamacpp_completion_tokens.add(completions.usage.completion_tokens) - llamacpp_completion_tokens_total_counter.add(completions.usage.completion_tokens) + llamacpp_prompt_tokens.add(completions.usage.prompt_tokens) + llamacpp_prompt_tokens_total_counter.add(completions.usage.prompt_tokens) - llamacpp_completions_tokens_seconds.add(completions.usage.completion_tokens / res.timings.duration * 1e3) - - llamacpp_completions_truncated_rate.add(completions.choices[0].finish_reason === 'length') - llamacpp_completions_stop_rate.add(completions.choices[0].finish_reason === 'stop') + llamacpp_completion_tokens.add(completions.usage.completion_tokens) + llamacpp_completion_tokens_total_counter.add(completions.usage.completion_tokens) + llamacpp_completions_truncated_rate.add(completions.choices[0].finish_reason === 'length') + llamacpp_completions_stop_rate.add(completions.choices[0].finish_reason === 'stop') + } else { + console.error(`response: ${res.body}`) + } sleep(0.3) } From f425240e1dc40db18cfffbabbbe0756761518807 Mon Sep 17 00:00:00 2001 From: Pierrick HYMBERT Date: Sat, 9 Mar 2024 01:23:52 +0100 Subject: [PATCH 05/14] server: bench: fix doc --- examples/server/bench/README.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/examples/server/bench/README.md b/examples/server/bench/README.md index 0c8f6b5161c34..481dc5c4d322d 100644 --- a/examples/server/bench/README.md +++ b/examples/server/bench/README.md @@ -53,7 +53,8 @@ The benchmark values can be overridden with: - `SERVER_BENCH_URL` server url prefix for chat completions, default 
`http://localhost:8080/v1` - `SERVER_BENCH_N_PROMPTS` total prompts to randomly select in the benchmark, default `480` - `SERVER_BENCH_MODEL_ALIAS` model alias to pass in the completion request, default `my-model` -- `SERVER_BENCH_MAX_TOKENS` max tokens to predict, default: `1024` +- `SERVER_BENCH_MAX_TOKENS` max tokens to predict, default: `512` +- `SERVER_BENCH_DATASET` path to the benchmark dataset file Or with [k6 options](https://k6.io/docs/using-k6/k6-options/reference/): From bed1cdda9a8057e04075c53b0dd5e76bceda714d Mon Sep 17 00:00:00 2001 From: Pierrick HYMBERT Date: Sat, 9 Mar 2024 08:58:22 +0100 Subject: [PATCH 06/14] server: bench: change gauge custom metrics to trend --- examples/server/bench/script.js | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/examples/server/bench/script.js b/examples/server/bench/script.js index fb942d6abf833..d076b8c34f320 100644 --- a/examples/server/bench/script.js +++ b/examples/server/bench/script.js @@ -1,7 +1,7 @@ import http from 'k6/http' import {check, sleep} from 'k6' import {SharedArray} from 'k6/data' -import {Counter, Gauge, Rate} from 'k6/metrics' +import {Counter, Rate, Trend} from 'k6/metrics' // Server chat completions prefix const server_url = __ENV.SERVER_BENCH_URL ? __ENV.SERVER_BENCH_URL : 'http://localhost:8080/v1' @@ -32,8 +32,8 @@ const data = new SharedArray('conversations', function () { .slice(0, n_prompt) }) -const llamacpp_prompt_tokens = new Gauge('llamacpp_prompt_tokens') -const llamacpp_completion_tokens = new Gauge('llamacpp_completion_tokens') +const llamacpp_prompt_tokens = new Trend('llamacpp_prompt_tokens') +const llamacpp_completion_tokens = new Trend('llamacpp_completion_tokens') const llamacpp_prompt_tokens_total_counter = new Counter('llamacpp_prompt_tokens_total_counter') const llamacpp_completion_tokens_total_counter = new Counter('llamacpp_completion_tokens_total_counter') From 572758a665e9435ec235ae6c788ed2e3f099d8cc Mon Sep 17 00:00:00 2001 From: Pierrick HYMBERT Date: Sat, 9 Mar 2024 09:15:15 +0100 Subject: [PATCH 07/14] server: bench: change gauge custom metrics to trend server: bench: add trend custom metrics for total tokens per second average --- examples/server/bench/README.md | 13 +++++++------ examples/server/bench/script.js | 3 +++ 2 files changed, 10 insertions(+), 6 deletions(-) diff --git a/examples/server/bench/README.md b/examples/server/bench/README.md index 481dc5c4d322d..6e1709ee52a9e 100644 --- a/examples/server/bench/README.md +++ b/examples/server/bench/README.md @@ -64,13 +64,14 @@ SERVER_BENCH_N_PROMPTS=500 k6 run script.js --duration 10m --iterations 500 --vu #### Metrics -Following metrics are available: -- `llamacpp_prompt_tokens` Gauge of OAI response `usage.prompt_tokens` -- `llamacpp_prompt_tokens_total_counter` Counter of OAI response `usage.prompt_tokens` -- `llamacpp_completion_tokens` Gauge of OAI response `usage.completion_tokens` -- `llamacpp_completion_tokens_total_counter` Counter of OAI response `usage.completion_tokens` +Following metrics are available computed from the OAI chat completions response `usage`: +- `llamacpp_tokens_second` Trend of `usage.total_tokens / request duration` +- `llamacpp_prompt_tokens` Trend of `usage.prompt_tokens` +- `llamacpp_prompt_tokens_total_counter` Counter of `usage.prompt_tokens` +- `llamacpp_completion_tokens` Trend of `usage.completion_tokens` +- `llamacpp_completion_tokens_total_counter` Counter of `usage.completion_tokens` - `llamacpp_completions_truncated_rate` Rate of completions truncated, i.e. 
if `finish_reason === 'length'` -- `llamacpp_completions_stop_rate` Rate of completions truncated, i.e. if `finish_reason === 'stop'` +- `llamacpp_completions_stop_rate` Rate of completions stopped by the model, i.e. if `finish_reason === 'stop'` The script will fail if too many completions are truncated, see `llamacpp_completions_truncated_rate`. diff --git a/examples/server/bench/script.js b/examples/server/bench/script.js index d076b8c34f320..94b8aa94a6a80 100644 --- a/examples/server/bench/script.js +++ b/examples/server/bench/script.js @@ -34,6 +34,7 @@ const data = new SharedArray('conversations', function () { const llamacpp_prompt_tokens = new Trend('llamacpp_prompt_tokens') const llamacpp_completion_tokens = new Trend('llamacpp_completion_tokens') +const llamacpp_tokens_second = new Trend('llamacpp_tokens_second') const llamacpp_prompt_tokens_total_counter = new Counter('llamacpp_prompt_tokens_total_counter') const llamacpp_completion_tokens_total_counter = new Counter('llamacpp_completion_tokens_total_counter') @@ -94,6 +95,8 @@ export default function () { llamacpp_completions_truncated_rate.add(completions.choices[0].finish_reason === 'length') llamacpp_completions_stop_rate.add(completions.choices[0].finish_reason === 'stop') + + llamacpp_tokens_second.add(completions.usage.total_tokens / res.timings.duration * 1.e3) } else { console.error(`response: ${res.body}`) } From 06e225f843cabca482f8f0a7622cfe60ff1e919c Mon Sep 17 00:00:00 2001 From: Pierrick HYMBERT Date: Sat, 9 Mar 2024 09:55:11 +0100 Subject: [PATCH 08/14] server: bench: doc add an option to debug http request --- examples/server/bench/README.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/examples/server/bench/README.md b/examples/server/bench/README.md index 6e1709ee52a9e..108eb56ba0534 100644 --- a/examples/server/bench/README.md +++ b/examples/server/bench/README.md @@ -62,6 +62,8 @@ Or with [k6 options](https://k6.io/docs/using-k6/k6-options/reference/): SERVER_BENCH_N_PROMPTS=500 k6 run script.js --duration 10m --iterations 500 --vus 8 ``` +To [debug http request](https://k6.io/docs/using-k6/http-debugging/) use `--http-debug="full"` + #### Metrics Following metrics are available computed from the OAI chat completions response `usage`: From a4b0d107d3c55a61a721a9a355c954fcecd3bdd8 Mon Sep 17 00:00:00 2001 From: Pierrick HYMBERT Date: Sat, 9 Mar 2024 09:56:31 +0100 Subject: [PATCH 09/14] server: bench: filter dataset too short and too long sequences --- examples/server/bench/script.js | 22 +++++++++++++++------- 1 file changed, 15 insertions(+), 7 deletions(-) diff --git a/examples/server/bench/script.js b/examples/server/bench/script.js index 94b8aa94a6a80..7cd44e070238a 100644 --- a/examples/server/bench/script.js +++ b/examples/server/bench/script.js @@ -23,11 +23,23 @@ export function setup() { } const data = new SharedArray('conversations', function () { + const tokenizer = (message) => message.split(" ") + return JSON.parse(open(dataset_path)) // Filter out the conversations with less than 2 turns. .filter(data => data["conversations"].length >= 2) // Only keep the first two turns of each conversation. 
- .map(data => Array(data["conversations"][0]["value"], data["conversations"][1]["value"])) + .map(data => { + return { + prompt: data["conversations"][0]["value"], + n_prompt_tokens: tokenizer(data["conversations"][0]["value"]).length, + n_completion_tokens: tokenizer(data["conversations"][1]["value"]).length, + } + }) + // Filter out too short sequences + .filter(conv => conv.n_prompt_tokens >= 4 && conv.n_completion_tokens >= 4) + // Filter out too long sequences. + .filter(conv => conv.n_prompt_tokens <= 1024 && conv.n_prompt_tokens + conv.n_completion_tokens <= 2048) // Keep only first n prompts .slice(0, n_prompt) }) @@ -59,11 +71,11 @@ export default function () { "messages": [ { "role": "system", - "content": conversation[0], + "content": "You are ChatGPT, an AI assistant.", }, { "role": "user", - "content": conversation[1], + "content": conversation.prompt, } ], "model": model, @@ -73,8 +85,6 @@ export default function () { const body = JSON.stringify(payload) - console.debug(`request: ${body}`) - let res = http.post(`${server_url}/chat/completions`, body, { headers: {'Content-Type': 'application/json'}, timeout: '300s' @@ -83,8 +93,6 @@ export default function () { check(res, {'success completion': (r) => r.status === 200}) if (res.status === 200) { - console.debug(`response: ${res.body}`) - const completions = res.json() llamacpp_prompt_tokens.add(completions.usage.prompt_tokens) From 29c635b41198c1be4db777acb71b4fd0c0fff739 Mon Sep 17 00:00:00 2001 From: Pierrick HYMBERT Date: Sat, 9 Mar 2024 10:57:14 +0100 Subject: [PATCH 10/14] server: bench: allow to filter out conversation in the dataset based on env variable --- examples/server/bench/README.md | 6 +++++- examples/server/bench/script.js | 12 +++++++++--- 2 files changed, 14 insertions(+), 4 deletions(-) diff --git a/examples/server/bench/README.md b/examples/server/bench/README.md index 108eb56ba0534..a53ad64d7359b 100644 --- a/examples/server/bench/README.md +++ b/examples/server/bench/README.md @@ -55,6 +55,10 @@ The benchmark values can be overridden with: - `SERVER_BENCH_MODEL_ALIAS` model alias to pass in the completion request, default `my-model` - `SERVER_BENCH_MAX_TOKENS` max tokens to predict, default: `512` - `SERVER_BENCH_DATASET` path to the benchmark dataset file +- `SERVER_BENCH_MAX_PROMPT_TOKENS` maximum prompt tokens to filter out in the dataset: default `1024` +- `SERVER_BENCH_MAX_CONTEXT` maximum context size of the completions request to filter out in the dataset: prompt + predicted tokens, default `2048` + +Note: the local tokenizer is just a string space split, real number of tokens will differ. Or with [k6 options](https://k6.io/docs/using-k6/k6-options/reference/): @@ -62,7 +66,7 @@ Or with [k6 options](https://k6.io/docs/using-k6/k6-options/reference/): SERVER_BENCH_N_PROMPTS=500 k6 run script.js --duration 10m --iterations 500 --vus 8 ``` -To [debug http request](https://k6.io/docs/using-k6/http-debugging/) use `--http-debug="full"` +To [debug http request](https://k6.io/docs/using-k6/http-debugging/) use `--http-debug="full"`. #### Metrics diff --git a/examples/server/bench/script.js b/examples/server/bench/script.js index 7cd44e070238a..3a0594e2370d6 100644 --- a/examples/server/bench/script.js +++ b/examples/server/bench/script.js @@ -18,12 +18,18 @@ const dataset_path = __ENV.SERVER_BENCH_DATASET ? __ENV.SERVER_BENCH_DATASET : ' // Max tokens to predict const max_tokens = __ENV.SERVER_BENCH_MAX_TOKENS ? 
parseInt(__ENV.SERVER_BENCH_MAX_TOKENS) : 512 +// Max prompt tokens +const n_prompt_tokens = __ENV.SERVER_BENCH_MAX_PROMPT_TOKENS ? parseInt(__ENV.SERVER_BENCH_MAX_PROMPT_TOKENS) : 1024 + +// Max slot context +const n_ctx_slot = __ENV.SERVER_BENCH_MAX_CONTEXT ? parseInt(__ENV.SERVER_BENCH_MAX_CONTEXT) : 2048 + export function setup() { console.info(`Benchmark config: server_url=${server_url} n_prompt=${n_prompt} model=${model} dataset_path=${dataset_path} max_tokens=${max_tokens}`) } const data = new SharedArray('conversations', function () { - const tokenizer = (message) => message.split(" ") + const tokenizer = (message) => message.split(/[\s,'".?]/) return JSON.parse(open(dataset_path)) // Filter out the conversations with less than 2 turns. @@ -39,7 +45,7 @@ const data = new SharedArray('conversations', function () { // Filter out too short sequences .filter(conv => conv.n_prompt_tokens >= 4 && conv.n_completion_tokens >= 4) // Filter out too long sequences. - .filter(conv => conv.n_prompt_tokens <= 1024 && conv.n_prompt_tokens + conv.n_completion_tokens <= 2048) + .filter(conv => conv.n_prompt_tokens <= n_prompt_tokens && conv.n_prompt_tokens + conv.n_completion_tokens <= n_ctx_slot) // Keep only first n prompts .slice(0, n_prompt) }) @@ -106,7 +112,7 @@ export default function () { llamacpp_tokens_second.add(completions.usage.total_tokens / res.timings.duration * 1.e3) } else { - console.error(`response: ${res.body}`) + console.error(`response: ${res.body} request=${payload}`) } sleep(0.3) From ba7114c0e88d46ba8303d367cfd641690ce6bea0 Mon Sep 17 00:00:00 2001 From: Pierrick HYMBERT Date: Sat, 9 Mar 2024 10:57:33 +0100 Subject: [PATCH 11/14] server: bench: fix assistant message sent instead of user message --- examples/server/bench/script.js | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/server/bench/script.js b/examples/server/bench/script.js index 3a0594e2370d6..510a01cb57366 100644 --- a/examples/server/bench/script.js +++ b/examples/server/bench/script.js @@ -37,7 +37,7 @@ const data = new SharedArray('conversations', function () { // Only keep the first two turns of each conversation. .map(data => { return { - prompt: data["conversations"][0]["value"], + prompt: data["conversations"][1]["value"], n_prompt_tokens: tokenizer(data["conversations"][0]["value"]).length, n_completion_tokens: tokenizer(data["conversations"][1]["value"]).length, } From c4d1b5aaf143aab9dca8f295290539f087e6ba46 Mon Sep 17 00:00:00 2001 From: Pierrick HYMBERT Date: Sat, 9 Mar 2024 11:04:27 +0100 Subject: [PATCH 12/14] server: bench: fix assistant message sent instead of user message --- examples/server/bench/script.js | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/server/bench/script.js b/examples/server/bench/script.js index 510a01cb57366..49ed9db8e898c 100644 --- a/examples/server/bench/script.js +++ b/examples/server/bench/script.js @@ -34,10 +34,10 @@ const data = new SharedArray('conversations', function () { return JSON.parse(open(dataset_path)) // Filter out the conversations with less than 2 turns. .filter(data => data["conversations"].length >= 2) - // Only keep the first two turns of each conversation. 
+ .filter(data => data["conversations"][0]["from"] === "human") .map(data => { return { - prompt: data["conversations"][1]["value"], + prompt: data["conversations"][0]["value"], n_prompt_tokens: tokenizer(data["conversations"][0]["value"]).length, n_completion_tokens: tokenizer(data["conversations"][1]["value"]).length, } From 52c76d57a582c5e2a3202056cf47ea04059484f7 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sat, 9 Mar 2024 20:44:35 +0200 Subject: [PATCH 13/14] server : add defrag thold parameter --- examples/server/server.cpp | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 2374b7e4ab232..e1efebecc0eee 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -2133,6 +2133,8 @@ static void server_print_usage(const char * argv0, const gpt_params & params, co printf(" --yarn-beta-slow N YaRN: high correction dim or alpha (default: %.1f)\n", params.yarn_beta_slow); printf(" --yarn-beta-fast N YaRN: low correction dim or beta (default: %.1f)\n", params.yarn_beta_fast); printf(" --pooling {none,mean,cls} pooling type for embeddings, use model default if unspecified\n"); + printf(" -dt N, --defrag-thold N\n"); + printf(" KV cache defragmentation threshold (default: %.1f, < 0 - disabled)\n", params.defrag_thold); printf(" -b N, --batch-size N batch size for prompt processing (default: %d)\n", params.n_batch); printf(" --memory-f32 use f32 instead of f16 for memory key+value (default: disabled)\n"); printf(" not recommended: doubles context memory required and no measurable increase in quality\n"); @@ -2354,6 +2356,12 @@ static void server_params_parse(int argc, char ** argv, server_params & sparams, else if (value == "mean") { params.pooling_type = LLAMA_POOLING_TYPE_MEAN; } else if (value == "cls") { params.pooling_type = LLAMA_POOLING_TYPE_CLS; } else { invalid_param = true; break; } + } else if (arg == "--defrag-thold" || arg == "-dt") { + if (++i >= argc) { + invalid_param = true; + break; + } + params.defrag_thold = std::stof(argv[i]); } else if (arg == "--threads" || arg == "-t") { if (++i >= argc) { From 6bfb80eb7576ea201886b3152f1b16e64c37eab5 Mon Sep 17 00:00:00 2001 From: Pierrick HYMBERT Date: Sat, 9 Mar 2024 22:54:24 +0100 Subject: [PATCH 14/14] server: bench: select prompts based on the current iteration id not randomly to make the bench more reproducible --- examples/server/bench/script.js | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/examples/server/bench/script.js b/examples/server/bench/script.js index 49ed9db8e898c..a4f5ac5ab22ad 100644 --- a/examples/server/bench/script.js +++ b/examples/server/bench/script.js @@ -2,6 +2,7 @@ import http from 'k6/http' import {check, sleep} from 'k6' import {SharedArray} from 'k6/data' import {Counter, Rate, Trend} from 'k6/metrics' +import exec from 'k6/execution'; // Server chat completions prefix const server_url = __ENV.SERVER_BENCH_URL ? __ENV.SERVER_BENCH_URL : 'http://localhost:8080/v1' @@ -72,7 +73,7 @@ export const options = { } export default function () { - const conversation = data[Math.floor(Math.random() * data.length)] + const conversation = data[exec.scenario.iterationInInstance % data.length] const payload = { "messages": [ {
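
The k6 script reports its custom `llamacpp_*` metrics in the end-of-test summary, and the README suggests comparing them against the server's `/metrics` endpoint. Not part of the patches above, but as a minimal sketch of how that comparison could be automated, the following exports could accompany `script.js`. It assumes a recent k6 version with the `handleSummary()` callback, a hypothetical output file name `bench-summary.json`, and a server started with `--metrics` on `localhost:8080` as in the README.

```js
import http from 'k6/http'

// Snapshot the server-side Prometheus metrics once the scenario has finished,
// so they can be compared with the client-side k6 metrics.
export function teardown() {
    const res = http.get('http://localhost:8080/metrics')
    console.info(`server /metrics after the run:\n${res.body}`)
}

// Write the aggregated end-of-test summary, including the llamacpp_* custom
// metrics, to a JSON file. Defining handleSummary() replaces k6's default
// console summary, so the metrics are also echoed to stdout here.
export function handleSummary(data) {
    return {
        'stdout': JSON.stringify(data.metrics, null, 2) + '\n',
        'bench-summary.json': JSON.stringify(data, null, 2),
    }
}
```

With these exports in place, a plain `k6 run script.js` would print the server metrics at the end of the run and write `bench-summary.json` in the working directory, which makes it easier to archive or diff benchmark runs (when merging into `script.js`, the duplicate `http` import would be dropped).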