From 68d1d8fe28e9887c7e3eff4f714d3faa7def3081 Mon Sep 17 00:00:00 2001 From: Pierrick HYMBERT Date: Fri, 8 Mar 2024 13:16:16 +0100 Subject: [PATCH 01/14] server: bench: Init a bench scenario with K6 See #5827 --- examples/server/bench/README.md | 64 +++++++++++++++++++++++++ examples/server/bench/script.js | 84 +++++++++++++++++++++++++++++++++ 2 files changed, 148 insertions(+) create mode 100644 examples/server/bench/README.md create mode 100644 examples/server/bench/script.js diff --git a/examples/server/bench/README.md b/examples/server/bench/README.md new file mode 100644 index 0000000000000..049d233174cf8 --- /dev/null +++ b/examples/server/bench/README.md @@ -0,0 +1,64 @@ +### Server benchmark tools + +Benchmark is using [k6](https://k6.io/). + +##### Install k6 - ubuntu +```shell +snap install k6 +``` + +#### Downloading the ShareGPT dataset + +```shell +wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json +``` + +#### Download a model +Example for PHI-2 + +```shell +../../../scripts/hf.sh --repo ggml-org/models --file phi-2/ggml-model-q4_0.gguf +``` + +#### Start the server +The server must listen on `localhost:8080`. + +Example: +```shell +server --host localhost --port 8080 \ + --model ggml-model-q4_0.gguf \ + --cont-batching \ + --metrics \ + --parallel 8 \ + --batch-size 512 \ + --ctx-size 4096 \ + --log-format text \ + -ngl 33 +``` + +#### Run the bench +```shell +k6 run script.js +``` + +#### Change the number of concurrent user +in the `script.js`, change the ramping period according to your number of slots. + +#### Metrics + +Following metrics are available: +- `llamacpp_prompt_tokens` Gauge of OAI response `usage.prompt_tokens` +- `llamacpp_prompt_tokens_total_counter` Counter of OAI response `usage.prompt_tokens` +- `llamacpp_completion_tokens` Gauge of OAI response `usage.completion_tokens` +- `llamacpp_completion_tokens_total_counter` Counter of OAI response `usage.completion_tokens` +- `llamacpp_completions_tokens_seconds` Gauge of `usage.completion_tokens` divided by the request time in second +- `llamacpp_completions_truncated_rate` Rate of completions truncated, i.e. if `finish_reason === 'length'` +- `llamacpp_completions_stop_rate` Rate of completions truncated, i.e. if `finish_reason === 'stop'` + +The script will fail if too many completions are truncated, see `llamacpp_completions_truncated_rate`. + +K6 metrics might be compared against [server metrics](../README.md), with: + +```shell +curl http://localhost:8080/metrics +``` \ No newline at end of file diff --git a/examples/server/bench/script.js b/examples/server/bench/script.js new file mode 100644 index 0000000000000..9d963e49de08b --- /dev/null +++ b/examples/server/bench/script.js @@ -0,0 +1,84 @@ +import http from 'k6/http'; +import { check, sleep } from 'k6'; +import { SharedArray } from 'k6/data'; +import { Counter, Gauge, Rate } from 'k6/metrics'; + +const data = new SharedArray('conversations', function () { + return JSON.parse(open('./ShareGPT_V3_unfiltered_cleaned_split.json')) + + // Filter out the conversations with less than 2 turns. + .filter(data => data["conversations"].length >= 2) + // Only keep the first two turns of each conversation. 
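+        // Array(a, b) with two arguments builds the pair [a, b], i.e. [first turn, second turn]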
+ .map(data => Array(data["conversations"][0]["value"], data["conversations"][1]["value"])); +}); + +const llamacpp_prompt_tokens = new Gauge('llamacpp_prompt_tokens'); +const llamacpp_completion_tokens = new Gauge('llamacpp_completion_tokens'); + +const llamacpp_completions_tokens_seconds = new Gauge('llamacpp_completions_tokens_seconds'); + +const llamacpp_prompt_tokens_total_counter = new Counter('llamacpp_prompt_tokens_total_counter'); +const llamacpp_completion_tokens_total_counter = new Counter('llamacpp_completion_tokens_total_counter'); + +const llamacpp_completions_truncated_rate = new Rate('llamacpp_completions_truncated_rate'); +const llamacpp_completions_stop_rate = new Rate('llamacpp_completions_stop_rate'); + +export const options = { + thresholds: { + llamacpp_completions_truncated_rate: [ + // more than 10% of truncated input will abort the test + { threshold: 'rate < 0.1', abortOnFail: true, delayAbortEval: '1m' }, + ], + }, + scenarios: { + completions: { + executor: 'ramping-vus', + startVUs: 1, + stages: [ + {duration: '1m', target: 8}, + {duration: '3m', target: 8}, + {duration: '1m', target: 0}, + ], + gracefulRampDown: '30s', + }, + }, +}; + +export default function () { + const conversation = data[0] + const payload = { + "messages": [ + { + "role": "system", + "content": conversation[0], + }, + { + "role": "user", + "content": conversation[1], + } + ], + "model": "model", + "stream": false, + } + let res = http.post('http://localhost:8080/v1/chat/completions', JSON.stringify(payload), { + headers: { 'Content-Type': 'application/json' }, + }) + + check(res, {'success completion': (r) => r.status === 200}) + + const completions = res.json() + + llamacpp_prompt_tokens.add(completions.usage.prompt_tokens) + llamacpp_prompt_tokens_total_counter.add(completions.usage.prompt_tokens) + + llamacpp_completion_tokens.add(completions.usage.completion_tokens) + llamacpp_completion_tokens_total_counter.add(completions.usage.completion_tokens) + + llamacpp_completions_tokens_seconds.add(completions.usage.completion_tokens / res.timings.duration * 1e3) + + llamacpp_completions_truncated_rate.add(completions.choices[0].finish_reason === 'length') + llamacpp_completions_stop_rate.add(completions.choices[0].finish_reason === 'stop') + + + sleep(0.3) +} \ No newline at end of file From 0b822b6a0f3cfaaef26a79a8f3134df1749b948a Mon Sep 17 00:00:00 2001 From: Pierrick HYMBERT Date: Fri, 8 Mar 2024 19:49:49 +0100 Subject: [PATCH 02/14] server: bench: EOL EOF --- examples/server/bench/README.md | 2 +- examples/server/bench/script.js | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/server/bench/README.md b/examples/server/bench/README.md index 049d233174cf8..b8edc8587e04a 100644 --- a/examples/server/bench/README.md +++ b/examples/server/bench/README.md @@ -61,4 +61,4 @@ K6 metrics might be compared against [server metrics](../README.md), with: ```shell curl http://localhost:8080/metrics -``` \ No newline at end of file +``` diff --git a/examples/server/bench/script.js b/examples/server/bench/script.js index 9d963e49de08b..c52eb182a885a 100644 --- a/examples/server/bench/script.js +++ b/examples/server/bench/script.js @@ -81,4 +81,4 @@ export default function () { sleep(0.3) -} \ No newline at end of file +} From 548bc9635a8326406f6a0382731902e132a3532d Mon Sep 17 00:00:00 2001 From: Pierrick HYMBERT Date: Sat, 9 Mar 2024 00:13:54 +0100 Subject: [PATCH 03/14] server: bench: PR feedback and improved k6 script configuration --- examples/server/bench/README.md | 27 
++++++++--- examples/server/bench/script.js | 81 +++++++++++++++++++-------------- 2 files changed, 69 insertions(+), 39 deletions(-) diff --git a/examples/server/bench/README.md b/examples/server/bench/README.md index b8edc8587e04a..67367b8101745 100644 --- a/examples/server/bench/README.md +++ b/examples/server/bench/README.md @@ -2,12 +2,18 @@ Benchmark is using [k6](https://k6.io/). -##### Install k6 - ubuntu +##### Install k6 + +Follow instruction from: https://k6.io/docs/get-started/installation/ + +Example for ubuntu: ```shell snap install k6 ``` -#### Downloading the ShareGPT dataset +#### Download a dataset + +This dataset was originally proposed in [vLLM benchmarks](https://github.com/vllm-project/vllm/blob/main/benchmarks/README.md). ```shell wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json @@ -21,7 +27,7 @@ Example for PHI-2 ``` #### Start the server -The server must listen on `localhost:8080`. +The server must answer OAI Chat completion requests on `http://localhost:8080/v1` or according to the environment variable `SERVER_BENCH_URL`. Example: ```shell @@ -36,13 +42,22 @@ server --host localhost --port 8080 \ -ngl 33 ``` -#### Run the bench +#### Run the benchmark + ```shell k6 run script.js ``` -#### Change the number of concurrent user -in the `script.js`, change the ramping period according to your number of slots. +The benchmark values can be overridden with: +- `SERVER_BENCH_URL` server url prefix for chat completions, default `http://localhost:8080/v1` +- `SERVER_BENCH_N_PROMPTS` total prompts to randomly select in the benchmark, default `480` +- `SERVER_BENCH_MODEL_ALIAS` model alias to pass in the completion request, default `my-model` + +Or with [k6 options](https://k6.io/docs/using-k6/k6-options/reference/): + +```shell +SERVER_BENCH_N_PROMPTS=500 k6 run script.js --duration 10m --iterations 500 --vus 8 +``` #### Metrics diff --git a/examples/server/bench/script.js b/examples/server/bench/script.js index c52eb182a885a..e2068fb928c14 100644 --- a/examples/server/bench/script.js +++ b/examples/server/bench/script.js @@ -1,51 +1,58 @@ -import http from 'k6/http'; -import { check, sleep } from 'k6'; -import { SharedArray } from 'k6/data'; -import { Counter, Gauge, Rate } from 'k6/metrics'; +import http from 'k6/http' +import {check, sleep} from 'k6' +import {SharedArray} from 'k6/data' +import {Counter, Gauge, Rate} from 'k6/metrics' -const data = new SharedArray('conversations', function () { - return JSON.parse(open('./ShareGPT_V3_unfiltered_cleaned_split.json')) +// Server chat completions prefix +const server_url = __ENV.SERVER_BENCH_URL ? __ENV.SERVER_BENCH_URL : 'http://localhost:8080/v1' + +// Number of total prompts in the dataset - default 10m / 10 seconds/request * number of users +const n_prompt = __ENV.SERVER_BENCH_N_PROMPTS ? parseInt(__ENV.SERVER_BENCH_N_PROMPTS) : 600 / 10 * 8 + +// Model name to request +const model = __ENV.SERVER_BENCH_MODEL_ALIAS ? __ENV.SERVER_BENCH_MODEL_ALIAS : 'my-model' + +// Dataset path +const dataset_path = __ENV.SERVER_BENCH_DATASET ? __ENV.SERVER_BENCH_DATASET : './ShareGPT_V3_unfiltered_cleaned_split.json' +export function setup() { + console.info(`Benchmark config: server_url=${server_url} n_prompt=${n_prompt} model=${model} dataset_path=${dataset_path}`) +} + +const data = new SharedArray('conversations', function () { + return JSON.parse(open(dataset_path)) // Filter out the conversations with less than 2 turns. 
.filter(data => data["conversations"].length >= 2) // Only keep the first two turns of each conversation. - .map(data => Array(data["conversations"][0]["value"], data["conversations"][1]["value"])); -}); + .map(data => Array(data["conversations"][0]["value"], data["conversations"][1]["value"])) + // Keep only first n prompts + .slice(0, n_prompt) +}) -const llamacpp_prompt_tokens = new Gauge('llamacpp_prompt_tokens'); -const llamacpp_completion_tokens = new Gauge('llamacpp_completion_tokens'); +const llamacpp_prompt_tokens = new Gauge('llamacpp_prompt_tokens') +const llamacpp_completion_tokens = new Gauge('llamacpp_completion_tokens') -const llamacpp_completions_tokens_seconds = new Gauge('llamacpp_completions_tokens_seconds'); +const llamacpp_completions_tokens_seconds = new Gauge('llamacpp_completions_tokens_seconds') -const llamacpp_prompt_tokens_total_counter = new Counter('llamacpp_prompt_tokens_total_counter'); -const llamacpp_completion_tokens_total_counter = new Counter('llamacpp_completion_tokens_total_counter'); +const llamacpp_prompt_tokens_total_counter = new Counter('llamacpp_prompt_tokens_total_counter') +const llamacpp_completion_tokens_total_counter = new Counter('llamacpp_completion_tokens_total_counter') -const llamacpp_completions_truncated_rate = new Rate('llamacpp_completions_truncated_rate'); -const llamacpp_completions_stop_rate = new Rate('llamacpp_completions_stop_rate'); +const llamacpp_completions_truncated_rate = new Rate('llamacpp_completions_truncated_rate') +const llamacpp_completions_stop_rate = new Rate('llamacpp_completions_stop_rate') export const options = { thresholds: { llamacpp_completions_truncated_rate: [ // more than 10% of truncated input will abort the test - { threshold: 'rate < 0.1', abortOnFail: true, delayAbortEval: '1m' }, + {threshold: 'rate < 0.1', abortOnFail: true, delayAbortEval: '1m'}, ], }, - scenarios: { - completions: { - executor: 'ramping-vus', - startVUs: 1, - stages: [ - {duration: '1m', target: 8}, - {duration: '3m', target: 8}, - {duration: '1m', target: 0}, - ], - gracefulRampDown: '30s', - }, - }, -}; + duration: '10m', + vus: 8, +} export default function () { - const conversation = data[0] + const conversation = data[Math.floor(Math.random() * data.length)] const payload = { "messages": [ { @@ -57,15 +64,23 @@ export default function () { "content": conversation[1], } ], - "model": "model", + "model": model, "stream": false, } - let res = http.post('http://localhost:8080/v1/chat/completions', JSON.stringify(payload), { - headers: { 'Content-Type': 'application/json' }, + + const body = JSON.stringify(payload) + + console.debug(`request: ${body}`) + + let res = http.post(`${server_url}/chat/completions`, body, { + headers: {'Content-Type': 'application/json'}, + timeout: '300s' }) check(res, {'success completion': (r) => r.status === 200}) + console.debug(`response: ${res.body}`) + const completions = res.json() llamacpp_prompt_tokens.add(completions.usage.prompt_tokens) From ab0a59d6d32e4a90a9c8099bbb73dea2a225e9bc Mon Sep 17 00:00:00 2001 From: Pierrick HYMBERT Date: Sat, 9 Mar 2024 01:09:56 +0100 Subject: [PATCH 04/14] server: bench: remove llamacpp_completions_tokens_seconds as it include prompt processing time and it's misleading server: bench: add max_tokens from SERVER_BENCH_MAX_TOKENS server: bench: increase truncated rate to 80% before failing --- examples/server/bench/README.md | 5 +++-- examples/server/bench/script.js | 35 ++++++++++++++++++--------------- 2 files changed, 22 insertions(+), 18 deletions(-) diff 
--git a/examples/server/bench/README.md b/examples/server/bench/README.md index 67367b8101745..0c8f6b5161c34 100644 --- a/examples/server/bench/README.md +++ b/examples/server/bench/README.md @@ -44,14 +44,16 @@ server --host localhost --port 8080 \ #### Run the benchmark +For 500 chat completions request with 8 concurrent users during maximum 10 minutes, run: ```shell -k6 run script.js +k6 run script.js --duration 10m --iterations 500 --vus 8 ``` The benchmark values can be overridden with: - `SERVER_BENCH_URL` server url prefix for chat completions, default `http://localhost:8080/v1` - `SERVER_BENCH_N_PROMPTS` total prompts to randomly select in the benchmark, default `480` - `SERVER_BENCH_MODEL_ALIAS` model alias to pass in the completion request, default `my-model` +- `SERVER_BENCH_MAX_TOKENS` max tokens to predict, default: `1024` Or with [k6 options](https://k6.io/docs/using-k6/k6-options/reference/): @@ -66,7 +68,6 @@ Following metrics are available: - `llamacpp_prompt_tokens_total_counter` Counter of OAI response `usage.prompt_tokens` - `llamacpp_completion_tokens` Gauge of OAI response `usage.completion_tokens` - `llamacpp_completion_tokens_total_counter` Counter of OAI response `usage.completion_tokens` -- `llamacpp_completions_tokens_seconds` Gauge of `usage.completion_tokens` divided by the request time in second - `llamacpp_completions_truncated_rate` Rate of completions truncated, i.e. if `finish_reason === 'length'` - `llamacpp_completions_stop_rate` Rate of completions truncated, i.e. if `finish_reason === 'stop'` diff --git a/examples/server/bench/script.js b/examples/server/bench/script.js index e2068fb928c14..fb942d6abf833 100644 --- a/examples/server/bench/script.js +++ b/examples/server/bench/script.js @@ -15,8 +15,11 @@ const model = __ENV.SERVER_BENCH_MODEL_ALIAS ? __ENV.SERVER_BENCH_MODEL_ALIAS : // Dataset path const dataset_path = __ENV.SERVER_BENCH_DATASET ? __ENV.SERVER_BENCH_DATASET : './ShareGPT_V3_unfiltered_cleaned_split.json' +// Max tokens to predict +const max_tokens = __ENV.SERVER_BENCH_MAX_TOKENS ? 
parseInt(__ENV.SERVER_BENCH_MAX_TOKENS) : 512 + export function setup() { - console.info(`Benchmark config: server_url=${server_url} n_prompt=${n_prompt} model=${model} dataset_path=${dataset_path}`) + console.info(`Benchmark config: server_url=${server_url} n_prompt=${n_prompt} model=${model} dataset_path=${dataset_path} max_tokens=${max_tokens}`) } const data = new SharedArray('conversations', function () { @@ -32,8 +35,6 @@ const data = new SharedArray('conversations', function () { const llamacpp_prompt_tokens = new Gauge('llamacpp_prompt_tokens') const llamacpp_completion_tokens = new Gauge('llamacpp_completion_tokens') -const llamacpp_completions_tokens_seconds = new Gauge('llamacpp_completions_tokens_seconds') - const llamacpp_prompt_tokens_total_counter = new Counter('llamacpp_prompt_tokens_total_counter') const llamacpp_completion_tokens_total_counter = new Counter('llamacpp_completion_tokens_total_counter') @@ -43,8 +44,8 @@ const llamacpp_completions_stop_rate = new Rate('llamacpp_completions_stop_rate' export const options = { thresholds: { llamacpp_completions_truncated_rate: [ - // more than 10% of truncated input will abort the test - {threshold: 'rate < 0.1', abortOnFail: true, delayAbortEval: '1m'}, + // more than 80% of truncated input will abort the test + {threshold: 'rate < 0.8', abortOnFail: true, delayAbortEval: '1m'}, ], }, duration: '10m', @@ -66,6 +67,7 @@ export default function () { ], "model": model, "stream": false, + "max_tokens": max_tokens } const body = JSON.stringify(payload) @@ -79,21 +81,22 @@ export default function () { check(res, {'success completion': (r) => r.status === 200}) - console.debug(`response: ${res.body}`) - - const completions = res.json() + if (res.status === 200) { + console.debug(`response: ${res.body}`) - llamacpp_prompt_tokens.add(completions.usage.prompt_tokens) - llamacpp_prompt_tokens_total_counter.add(completions.usage.prompt_tokens) + const completions = res.json() - llamacpp_completion_tokens.add(completions.usage.completion_tokens) - llamacpp_completion_tokens_total_counter.add(completions.usage.completion_tokens) + llamacpp_prompt_tokens.add(completions.usage.prompt_tokens) + llamacpp_prompt_tokens_total_counter.add(completions.usage.prompt_tokens) - llamacpp_completions_tokens_seconds.add(completions.usage.completion_tokens / res.timings.duration * 1e3) - - llamacpp_completions_truncated_rate.add(completions.choices[0].finish_reason === 'length') - llamacpp_completions_stop_rate.add(completions.choices[0].finish_reason === 'stop') + llamacpp_completion_tokens.add(completions.usage.completion_tokens) + llamacpp_completion_tokens_total_counter.add(completions.usage.completion_tokens) + llamacpp_completions_truncated_rate.add(completions.choices[0].finish_reason === 'length') + llamacpp_completions_stop_rate.add(completions.choices[0].finish_reason === 'stop') + } else { + console.error(`response: ${res.body}`) + } sleep(0.3) } From f425240e1dc40db18cfffbabbbe0756761518807 Mon Sep 17 00:00:00 2001 From: Pierrick HYMBERT Date: Sat, 9 Mar 2024 01:23:52 +0100 Subject: [PATCH 05/14] server: bench: fix doc --- examples/server/bench/README.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/examples/server/bench/README.md b/examples/server/bench/README.md index 0c8f6b5161c34..481dc5c4d322d 100644 --- a/examples/server/bench/README.md +++ b/examples/server/bench/README.md @@ -53,7 +53,8 @@ The benchmark values can be overridden with: - `SERVER_BENCH_URL` server url prefix for chat completions, default 
`http://localhost:8080/v1` - `SERVER_BENCH_N_PROMPTS` total prompts to randomly select in the benchmark, default `480` - `SERVER_BENCH_MODEL_ALIAS` model alias to pass in the completion request, default `my-model` -- `SERVER_BENCH_MAX_TOKENS` max tokens to predict, default: `1024` +- `SERVER_BENCH_MAX_TOKENS` max tokens to predict, default: `512` +- `SERVER_BENCH_DATASET` path to the benchmark dataset file Or with [k6 options](https://k6.io/docs/using-k6/k6-options/reference/): From bed1cdda9a8057e04075c53b0dd5e76bceda714d Mon Sep 17 00:00:00 2001 From: Pierrick HYMBERT Date: Sat, 9 Mar 2024 08:58:22 +0100 Subject: [PATCH 06/14] server: bench: change gauge custom metrics to trend --- examples/server/bench/script.js | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/examples/server/bench/script.js b/examples/server/bench/script.js index fb942d6abf833..d076b8c34f320 100644 --- a/examples/server/bench/script.js +++ b/examples/server/bench/script.js @@ -1,7 +1,7 @@ import http from 'k6/http' import {check, sleep} from 'k6' import {SharedArray} from 'k6/data' -import {Counter, Gauge, Rate} from 'k6/metrics' +import {Counter, Rate, Trend} from 'k6/metrics' // Server chat completions prefix const server_url = __ENV.SERVER_BENCH_URL ? __ENV.SERVER_BENCH_URL : 'http://localhost:8080/v1' @@ -32,8 +32,8 @@ const data = new SharedArray('conversations', function () { .slice(0, n_prompt) }) -const llamacpp_prompt_tokens = new Gauge('llamacpp_prompt_tokens') -const llamacpp_completion_tokens = new Gauge('llamacpp_completion_tokens') +const llamacpp_prompt_tokens = new Trend('llamacpp_prompt_tokens') +const llamacpp_completion_tokens = new Trend('llamacpp_completion_tokens') const llamacpp_prompt_tokens_total_counter = new Counter('llamacpp_prompt_tokens_total_counter') const llamacpp_completion_tokens_total_counter = new Counter('llamacpp_completion_tokens_total_counter') From 572758a665e9435ec235ae6c788ed2e3f099d8cc Mon Sep 17 00:00:00 2001 From: Pierrick HYMBERT Date: Sat, 9 Mar 2024 09:15:15 +0100 Subject: [PATCH 07/14] server: bench: change gauge custom metrics to trend server: bench: add trend custom metrics for total tokens per second average --- examples/server/bench/README.md | 13 +++++++------ examples/server/bench/script.js | 3 +++ 2 files changed, 10 insertions(+), 6 deletions(-) diff --git a/examples/server/bench/README.md b/examples/server/bench/README.md index 481dc5c4d322d..6e1709ee52a9e 100644 --- a/examples/server/bench/README.md +++ b/examples/server/bench/README.md @@ -64,13 +64,14 @@ SERVER_BENCH_N_PROMPTS=500 k6 run script.js --duration 10m --iterations 500 --vu #### Metrics -Following metrics are available: -- `llamacpp_prompt_tokens` Gauge of OAI response `usage.prompt_tokens` -- `llamacpp_prompt_tokens_total_counter` Counter of OAI response `usage.prompt_tokens` -- `llamacpp_completion_tokens` Gauge of OAI response `usage.completion_tokens` -- `llamacpp_completion_tokens_total_counter` Counter of OAI response `usage.completion_tokens` +Following metrics are available computed from the OAI chat completions response `usage`: +- `llamacpp_tokens_second` Trend of `usage.total_tokens / request duration` +- `llamacpp_prompt_tokens` Trend of `usage.prompt_tokens` +- `llamacpp_prompt_tokens_total_counter` Counter of `usage.prompt_tokens` +- `llamacpp_completion_tokens` Trend of `usage.completion_tokens` +- `llamacpp_completion_tokens_total_counter` Counter of `usage.completion_tokens` - `llamacpp_completions_truncated_rate` Rate of completions truncated, i.e. 
if `finish_reason === 'length'` -- `llamacpp_completions_stop_rate` Rate of completions truncated, i.e. if `finish_reason === 'stop'` +- `llamacpp_completions_stop_rate` Rate of completions stopped by the model, i.e. if `finish_reason === 'stop'` The script will fail if too many completions are truncated, see `llamacpp_completions_truncated_rate`. diff --git a/examples/server/bench/script.js b/examples/server/bench/script.js index d076b8c34f320..94b8aa94a6a80 100644 --- a/examples/server/bench/script.js +++ b/examples/server/bench/script.js @@ -34,6 +34,7 @@ const data = new SharedArray('conversations', function () { const llamacpp_prompt_tokens = new Trend('llamacpp_prompt_tokens') const llamacpp_completion_tokens = new Trend('llamacpp_completion_tokens') +const llamacpp_tokens_second = new Trend('llamacpp_tokens_second') const llamacpp_prompt_tokens_total_counter = new Counter('llamacpp_prompt_tokens_total_counter') const llamacpp_completion_tokens_total_counter = new Counter('llamacpp_completion_tokens_total_counter') @@ -94,6 +95,8 @@ export default function () { llamacpp_completions_truncated_rate.add(completions.choices[0].finish_reason === 'length') llamacpp_completions_stop_rate.add(completions.choices[0].finish_reason === 'stop') + + llamacpp_tokens_second.add(completions.usage.total_tokens / res.timings.duration * 1.e3) } else { console.error(`response: ${res.body}`) } From 06e225f843cabca482f8f0a7622cfe60ff1e919c Mon Sep 17 00:00:00 2001 From: Pierrick HYMBERT Date: Sat, 9 Mar 2024 09:55:11 +0100 Subject: [PATCH 08/14] server: bench: doc add an option to debug http request --- examples/server/bench/README.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/examples/server/bench/README.md b/examples/server/bench/README.md index 6e1709ee52a9e..108eb56ba0534 100644 --- a/examples/server/bench/README.md +++ b/examples/server/bench/README.md @@ -62,6 +62,8 @@ Or with [k6 options](https://k6.io/docs/using-k6/k6-options/reference/): SERVER_BENCH_N_PROMPTS=500 k6 run script.js --duration 10m --iterations 500 --vus 8 ``` +To [debug http request](https://k6.io/docs/using-k6/http-debugging/) use `--http-debug="full"` + #### Metrics Following metrics are available computed from the OAI chat completions response `usage`: From a4b0d107d3c55a61a721a9a355c954fcecd3bdd8 Mon Sep 17 00:00:00 2001 From: Pierrick HYMBERT Date: Sat, 9 Mar 2024 09:56:31 +0100 Subject: [PATCH 09/14] server: bench: filter dataset too short and too long sequences --- examples/server/bench/script.js | 22 +++++++++++++++------- 1 file changed, 15 insertions(+), 7 deletions(-) diff --git a/examples/server/bench/script.js b/examples/server/bench/script.js index 94b8aa94a6a80..7cd44e070238a 100644 --- a/examples/server/bench/script.js +++ b/examples/server/bench/script.js @@ -23,11 +23,23 @@ export function setup() { } const data = new SharedArray('conversations', function () { + const tokenizer = (message) => message.split(" ") + return JSON.parse(open(dataset_path)) // Filter out the conversations with less than 2 turns. .filter(data => data["conversations"].length >= 2) // Only keep the first two turns of each conversation. 
- .map(data => Array(data["conversations"][0]["value"], data["conversations"][1]["value"])) + .map(data => { + return { + prompt: data["conversations"][0]["value"], + n_prompt_tokens: tokenizer(data["conversations"][0]["value"]).length, + n_completion_tokens: tokenizer(data["conversations"][1]["value"]).length, + } + }) + // Filter out too short sequences + .filter(conv => conv.n_prompt_tokens >= 4 && conv.n_completion_tokens >= 4) + // Filter out too long sequences. + .filter(conv => conv.n_prompt_tokens <= 1024 && conv.n_prompt_tokens + conv.n_completion_tokens <= 2048) // Keep only first n prompts .slice(0, n_prompt) }) @@ -59,11 +71,11 @@ export default function () { "messages": [ { "role": "system", - "content": conversation[0], + "content": "You are ChatGPT, an AI assistant.", }, { "role": "user", - "content": conversation[1], + "content": conversation.prompt, } ], "model": model, @@ -73,8 +85,6 @@ export default function () { const body = JSON.stringify(payload) - console.debug(`request: ${body}`) - let res = http.post(`${server_url}/chat/completions`, body, { headers: {'Content-Type': 'application/json'}, timeout: '300s' @@ -83,8 +93,6 @@ export default function () { check(res, {'success completion': (r) => r.status === 200}) if (res.status === 200) { - console.debug(`response: ${res.body}`) - const completions = res.json() llamacpp_prompt_tokens.add(completions.usage.prompt_tokens) From 29c635b41198c1be4db777acb71b4fd0c0fff739 Mon Sep 17 00:00:00 2001 From: Pierrick HYMBERT Date: Sat, 9 Mar 2024 10:57:14 +0100 Subject: [PATCH 10/14] server: bench: allow to filter out conversation in the dataset based on env variable --- examples/server/bench/README.md | 6 +++++- examples/server/bench/script.js | 12 +++++++++--- 2 files changed, 14 insertions(+), 4 deletions(-) diff --git a/examples/server/bench/README.md b/examples/server/bench/README.md index 108eb56ba0534..a53ad64d7359b 100644 --- a/examples/server/bench/README.md +++ b/examples/server/bench/README.md @@ -55,6 +55,10 @@ The benchmark values can be overridden with: - `SERVER_BENCH_MODEL_ALIAS` model alias to pass in the completion request, default `my-model` - `SERVER_BENCH_MAX_TOKENS` max tokens to predict, default: `512` - `SERVER_BENCH_DATASET` path to the benchmark dataset file +- `SERVER_BENCH_MAX_PROMPT_TOKENS` maximum prompt tokens to filter out in the dataset: default `1024` +- `SERVER_BENCH_MAX_CONTEXT` maximum context size of the completions request to filter out in the dataset: prompt + predicted tokens, default `2048` + +Note: the local tokenizer is just a string space split, real number of tokens will differ. Or with [k6 options](https://k6.io/docs/using-k6/k6-options/reference/): @@ -62,7 +66,7 @@ Or with [k6 options](https://k6.io/docs/using-k6/k6-options/reference/): SERVER_BENCH_N_PROMPTS=500 k6 run script.js --duration 10m --iterations 500 --vus 8 ``` -To [debug http request](https://k6.io/docs/using-k6/http-debugging/) use `--http-debug="full"` +To [debug http request](https://k6.io/docs/using-k6/http-debugging/) use `--http-debug="full"`. #### Metrics diff --git a/examples/server/bench/script.js b/examples/server/bench/script.js index 7cd44e070238a..3a0594e2370d6 100644 --- a/examples/server/bench/script.js +++ b/examples/server/bench/script.js @@ -18,12 +18,18 @@ const dataset_path = __ENV.SERVER_BENCH_DATASET ? __ENV.SERVER_BENCH_DATASET : ' // Max tokens to predict const max_tokens = __ENV.SERVER_BENCH_MAX_TOKENS ? 
parseInt(__ENV.SERVER_BENCH_MAX_TOKENS) : 512 +// Max prompt tokens +const n_prompt_tokens = __ENV.SERVER_BENCH_MAX_PROMPT_TOKENS ? parseInt(__ENV.SERVER_BENCH_MAX_PROMPT_TOKENS) : 1024 + +// Max slot context +const n_ctx_slot = __ENV.SERVER_BENCH_MAX_CONTEXT ? parseInt(__ENV.SERVER_BENCH_MAX_CONTEXT) : 2048 + export function setup() { console.info(`Benchmark config: server_url=${server_url} n_prompt=${n_prompt} model=${model} dataset_path=${dataset_path} max_tokens=${max_tokens}`) } const data = new SharedArray('conversations', function () { - const tokenizer = (message) => message.split(" ") + const tokenizer = (message) => message.split(/[\s,'".?]/) return JSON.parse(open(dataset_path)) // Filter out the conversations with less than 2 turns. @@ -39,7 +45,7 @@ const data = new SharedArray('conversations', function () { // Filter out too short sequences .filter(conv => conv.n_prompt_tokens >= 4 && conv.n_completion_tokens >= 4) // Filter out too long sequences. - .filter(conv => conv.n_prompt_tokens <= 1024 && conv.n_prompt_tokens + conv.n_completion_tokens <= 2048) + .filter(conv => conv.n_prompt_tokens <= n_prompt_tokens && conv.n_prompt_tokens + conv.n_completion_tokens <= n_ctx_slot) // Keep only first n prompts .slice(0, n_prompt) }) @@ -106,7 +112,7 @@ export default function () { llamacpp_tokens_second.add(completions.usage.total_tokens / res.timings.duration * 1.e3) } else { - console.error(`response: ${res.body}`) + console.error(`response: ${res.body} request=${payload}`) } sleep(0.3) From ba7114c0e88d46ba8303d367cfd641690ce6bea0 Mon Sep 17 00:00:00 2001 From: Pierrick HYMBERT Date: Sat, 9 Mar 2024 10:57:33 +0100 Subject: [PATCH 11/14] server: bench: fix assistant message sent instead of user message --- examples/server/bench/script.js | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/server/bench/script.js b/examples/server/bench/script.js index 3a0594e2370d6..510a01cb57366 100644 --- a/examples/server/bench/script.js +++ b/examples/server/bench/script.js @@ -37,7 +37,7 @@ const data = new SharedArray('conversations', function () { // Only keep the first two turns of each conversation. .map(data => { return { - prompt: data["conversations"][0]["value"], + prompt: data["conversations"][1]["value"], n_prompt_tokens: tokenizer(data["conversations"][0]["value"]).length, n_completion_tokens: tokenizer(data["conversations"][1]["value"]).length, } From c4d1b5aaf143aab9dca8f295290539f087e6ba46 Mon Sep 17 00:00:00 2001 From: Pierrick HYMBERT Date: Sat, 9 Mar 2024 11:04:27 +0100 Subject: [PATCH 12/14] server: bench: fix assistant message sent instead of user message --- examples/server/bench/script.js | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/server/bench/script.js b/examples/server/bench/script.js index 510a01cb57366..49ed9db8e898c 100644 --- a/examples/server/bench/script.js +++ b/examples/server/bench/script.js @@ -34,10 +34,10 @@ const data = new SharedArray('conversations', function () { return JSON.parse(open(dataset_path)) // Filter out the conversations with less than 2 turns. .filter(data => data["conversations"].length >= 2) - // Only keep the first two turns of each conversation. 
+ .filter(data => data["conversations"][0]["from"] === "human") .map(data => { return { - prompt: data["conversations"][1]["value"], + prompt: data["conversations"][0]["value"], n_prompt_tokens: tokenizer(data["conversations"][0]["value"]).length, n_completion_tokens: tokenizer(data["conversations"][1]["value"]).length, } From 52c76d57a582c5e2a3202056cf47ea04059484f7 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sat, 9 Mar 2024 20:44:35 +0200 Subject: [PATCH 13/14] server : add defrag thold parameter --- examples/server/server.cpp | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 2374b7e4ab232..e1efebecc0eee 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -2133,6 +2133,8 @@ static void server_print_usage(const char * argv0, const gpt_params & params, co printf(" --yarn-beta-slow N YaRN: high correction dim or alpha (default: %.1f)\n", params.yarn_beta_slow); printf(" --yarn-beta-fast N YaRN: low correction dim or beta (default: %.1f)\n", params.yarn_beta_fast); printf(" --pooling {none,mean,cls} pooling type for embeddings, use model default if unspecified\n"); + printf(" -dt N, --defrag-thold N\n"); + printf(" KV cache defragmentation threshold (default: %.1f, < 0 - disabled)\n", params.defrag_thold); printf(" -b N, --batch-size N batch size for prompt processing (default: %d)\n", params.n_batch); printf(" --memory-f32 use f32 instead of f16 for memory key+value (default: disabled)\n"); printf(" not recommended: doubles context memory required and no measurable increase in quality\n"); @@ -2354,6 +2356,12 @@ static void server_params_parse(int argc, char ** argv, server_params & sparams, else if (value == "mean") { params.pooling_type = LLAMA_POOLING_TYPE_MEAN; } else if (value == "cls") { params.pooling_type = LLAMA_POOLING_TYPE_CLS; } else { invalid_param = true; break; } + } else if (arg == "--defrag-thold" || arg == "-dt") { + if (++i >= argc) { + invalid_param = true; + break; + } + params.defrag_thold = std::stof(argv[i]); } else if (arg == "--threads" || arg == "-t") { if (++i >= argc) { From 6bfb80eb7576ea201886b3152f1b16e64c37eab5 Mon Sep 17 00:00:00 2001 From: Pierrick HYMBERT Date: Sat, 9 Mar 2024 22:54:24 +0100 Subject: [PATCH 14/14] server: bench: select prompts based on the current iteration id not randomly to make the bench more reproducible --- examples/server/bench/script.js | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/examples/server/bench/script.js b/examples/server/bench/script.js index 49ed9db8e898c..a4f5ac5ab22ad 100644 --- a/examples/server/bench/script.js +++ b/examples/server/bench/script.js @@ -2,6 +2,7 @@ import http from 'k6/http' import {check, sleep} from 'k6' import {SharedArray} from 'k6/data' import {Counter, Rate, Trend} from 'k6/metrics' +import exec from 'k6/execution'; // Server chat completions prefix const server_url = __ENV.SERVER_BENCH_URL ? __ENV.SERVER_BENCH_URL : 'http://localhost:8080/v1' @@ -72,7 +73,7 @@ export const options = { } export default function () { - const conversation = data[Math.floor(Math.random() * data.length)] + const conversation = data[exec.scenario.iterationInInstance % data.length] const payload = { "messages": [ {
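
The k6 script reports its custom `llamacpp_*` metrics in the end-of-test summary, and the README suggests comparing them against the server's `/metrics` endpoint. Not part of the patches above, but as a minimal sketch of how that comparison could be automated, the following exports could accompany `script.js`. It assumes a recent k6 version with the `handleSummary()` callback, a hypothetical output file name `bench-summary.json`, and a server started with `--metrics` on `localhost:8080` as in the README.

```js
import http from 'k6/http'

// Snapshot the server-side Prometheus metrics once the scenario has finished,
// so they can be compared with the client-side k6 metrics.
export function teardown() {
    const res = http.get('http://localhost:8080/metrics')
    console.info(`server /metrics after the run:\n${res.body}`)
}

// Write the aggregated end-of-test summary, including the llamacpp_* custom
// metrics, to a JSON file. Defining handleSummary() replaces k6's default
// console summary, so the metrics are also echoed to stdout here.
export function handleSummary(data) {
    return {
        'stdout': JSON.stringify(data.metrics, null, 2) + '\n',
        'bench-summary.json': JSON.stringify(data, null, 2),
    }
}
```

With these exports in place, a plain `k6 run script.js` would print the server metrics at the end of the run and write `bench-summary.json` in the working directory, which makes it easier to archive or diff benchmark runs (when merging into `script.js`, the duplicate `http` import would be dropped).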