From 3e9026437b881c3da0303b77fea760de15a7c8f4 Mon Sep 17 00:00:00 2001
From: Kellen Swain
Date: Wed, 26 Mar 2025 17:39:02 +0000
Subject: [PATCH] Updating llama 2 7b to llama 3.1 8b Instruct and adding new
 LoRA adapters

---
 config/charts/inferencepool/README.md       | 14 +++----
 config/charts/inferencepool/values.yaml     |  2 +-
 config/manifests/benchmark/benchmark.yaml   |  4 +-
 config/manifests/gateway/patch_policy.yaml  |  2 +-
 config/manifests/inferencemodel.yaml        | 14 +++----
 config/manifests/inferencepool.yaml         | 20 +++++-----
 config/manifests/vllm/cpu-deployment.yaml   | 14 +++----
 config/manifests/vllm/gpu-deployment.yaml   | 31 +++++++-------
 hack/test-e2e.sh                            |  4 +-
 pkg/epp/datastore/datastore_test.go         |  8 ++--
 pkg/epp/handlers/response.go                |  4 +-
 pkg/epp/handlers/response_test.go           |  6 +--
 site-src/guides/adapter-rollout.md          | 40 +++++++++----------
 site-src/guides/index.md                    | 10 ++---
 site-src/guides/metrics.md                  |  2 +-
 test/e2e/epp/README.md                      |  4 +-
 test/e2e/epp/e2e_suite_test.go              |  6 +--
 test/integration/epp/hermetic_test.go       | 30 +++++++-------
 test/testdata/envoy.yaml                    |  4 +-
 .../inferencepool-with-model-hermetic.yaml  | 12 +++---
 tools/dynamic-lora-sidecar/deployment.yaml  | 10 ++---
 .../sidecar/test_sidecar.py                 | 14 +++----
 22 files changed, 127 insertions(+), 128 deletions(-)

diff --git a/config/charts/inferencepool/README.md b/config/charts/inferencepool/README.md
index 12f9959c..30087527 100644
--- a/config/charts/inferencepool/README.md
+++ b/config/charts/inferencepool/README.md
@@ -5,12 +5,12 @@ A chart to deploy an InferencePool and a corresponding EndpointPicker (epp) depl
 
 ## Install
 
-To install an InferencePool named `vllm-llama2-7b` that selects from endpoints with label `app: vllm-llama2-7b` and listening on port `8000`, you can run the following command:
+To install an InferencePool named `vllm-llama3-8b-instruct` that selects from endpoints with label `app: vllm-llama3-8b-instruct` and listening on port `8000`, you can run the following command:
 
 ```txt
-$ helm install vllm-llama2-7b ./config/charts/inferencepool \
-  --set inferencePool.name=vllm-llama2-7b \
-  --set inferencePool.modelServers.matchLabels.app=vllm-llama2-7b \
+$ helm install vllm-llama3-8b-instruct ./config/charts/inferencepool \
+  --set inferencePool.name=vllm-llama3-8b-instruct \
+  --set inferencePool.modelServers.matchLabels.app=vllm-llama3-8b-instruct \
   --set inferencePool.targetPortNumber=8000
 ```
 
@@ -19,9 +19,9 @@ where `inferencePool.targetPortNumber` is the pod that vllm backends served on a
 To install via the latest published chart in staging (--version v0 indicates latest dev version), you can run the following command:
 
 ```txt
-$ helm install vllm-llama2-7b \
-  --set inferencePool.name=vllm-llama2-7b \
-  --set inferencePool.modelServers.matchLabels.app=vllm-llama2-7b \
+$ helm install vllm-llama3-8b-instruct \
+  --set inferencePool.name=vllm-llama3-8b-instruct \
+  --set inferencePool.modelServers.matchLabels.app=vllm-llama3-8b-instruct \
   --set inferencePool.targetPortNumber=8000 \
   oci://us-central1-docker.pkg.dev/k8s-staging-images/gateway-api-inference-extension/charts/inferencepool --version v0
 ```
diff --git a/config/charts/inferencepool/values.yaml b/config/charts/inferencepool/values.yaml
index 5cece88c..7b0c8f96 100644
--- a/config/charts/inferencepool/values.yaml
+++ b/config/charts/inferencepool/values.yaml
@@ -12,4 +12,4 @@ inferencePool:
   targetPortNumber: 8000
   # modelServers: # REQUIRED
   #   matchLabels:
-  #     app: vllm-llama2-7b
+  #     app: vllm-llama3-8b-instruct
diff --git a/config/manifests/benchmark/benchmark.yaml b/config/manifests/benchmark/benchmark.yaml
index a47b4617..c784730e 100644
--- a/config/manifests/benchmark/benchmark.yaml
+++ b/config/manifests/benchmark/benchmark.yaml
@@ -31,9 +31,9 @@ spec:
         - name: BENCHMARK_TIME_SECONDS
           value: '60'
         - name: TOKENIZER
-          value: 'meta-llama/Llama-2-7b-hf'
+          value: 'meta-llama/Llama-3.1-8B-Instruct'
         - name: MODELS
-          value: 'meta-llama/Llama-2-7b-hf'
+          value: 'meta-llama/Llama-3.1-8B-Instruct'
         - name: BACKEND
           value: vllm
         - name: PORT
diff --git a/config/manifests/gateway/patch_policy.yaml b/config/manifests/gateway/patch_policy.yaml
index a40c8e27..923ce22c 100644
--- a/config/manifests/gateway/patch_policy.yaml
+++ b/config/manifests/gateway/patch_policy.yaml
@@ -99,7 +99,7 @@ spec:
       - backendRefs:
         - group: ""
           kind: Service
-          name: vllm-llama2-7b-epp
+          name: vllm-llama3-8b-instruct-epp
           port: 9002
         processingMode:
           allowModeOverride: true
diff --git a/config/manifests/inferencemodel.yaml b/config/manifests/inferencemodel.yaml
index 4c7824ca..bdd4405a 100644
--- a/config/manifests/inferencemodel.yaml
+++ b/config/manifests/inferencemodel.yaml
@@ -3,12 +3,12 @@ kind: InferenceModel
 metadata:
   name: inferencemodel-sample
 spec:
-  modelName: tweet-summary
-  criticality: Critical
+  modelName: food-review
+  criticality: Standard
   poolRef:
-    name: vllm-llama2-7b
+    name: vllm-llama3-8b-instruct
   targetModels:
-  - name: tweet-summary-1
+  - name: food-review-1
     weight: 100
 
 ---
@@ -17,10 +17,10 @@ kind: InferenceModel
 metadata:
   name: inferencemodel-base-model
 spec:
-  modelName: meta-llama/Llama-2-7b-hf
+  modelName: meta-llama/Llama-3.1-8B-Instruct
   criticality: Critical
   poolRef:
-    name: vllm-llama2-7b
+    name: vllm-llama3-8b-instruct
 
 ---
 apiVersion: inference.networking.x-k8s.io/v1alpha2
 kind: InferenceModel
 metadata:
@@ -31,4 +31,4 @@ spec:
   modelName: Qwen/Qwen2.5-1.5B-Instruct
   criticality: Critical
   poolRef:
-    name: vllm-llama2-7b
+    name: vllm-llama3-8b-instruct
diff --git a/config/manifests/inferencepool.yaml b/config/manifests/inferencepool.yaml
index def892f5..639157c1 100644
--- a/config/manifests/inferencepool.yaml
+++ b/config/manifests/inferencepool.yaml
@@ -2,22 +2,22 @@ apiVersion: inference.networking.x-k8s.io/v1alpha2
 kind: InferencePool
 metadata:
   labels:
-  name: vllm-llama2-7b
+  name: vllm-llama3-8b-instruct
 spec:
   targetPortNumber: 8000
   selector:
-    app: vllm-llama2-7b
+    app: vllm-llama3-8b-instruct
   extensionRef:
-    name: vllm-llama2-7b-epp
+    name: vllm-llama3-8b-instruct-epp
 ---
 apiVersion: v1
 kind: Service
 metadata:
-  name: vllm-llama2-7b-epp
+  name: vllm-llama3-8b-instruct-epp
   namespace: default
 spec:
   selector:
-    app: vllm-llama2-7b-epp
+    app: vllm-llama3-8b-instruct-epp
   ports:
     - protocol: TCP
       port: 9002
@@ -27,19 +27,19 @@ spec:
 apiVersion: apps/v1
 kind: Deployment
 metadata:
-  name: vllm-llama2-7b-epp
+  name: vllm-llama3-8b-instruct-epp
   namespace: default
   labels:
-    app: vllm-llama2-7b-epp
+    app: vllm-llama3-8b-instruct-epp
 spec:
   replicas: 1
   selector:
     matchLabels:
-      app: vllm-llama2-7b-epp
+      app: vllm-llama3-8b-instruct-epp
   template:
     metadata:
       labels:
-        app: vllm-llama2-7b-epp
+        app: vllm-llama3-8b-instruct-epp
     spec:
       containers:
       - name: epp
@@ -47,7 +47,7 @@ spec:
         imagePullPolicy: Always
         args:
         - -poolName
-        - "vllm-llama2-7b"
+        - "vllm-llama3-8b-instruct"
        - -v
        - "4"
        - --zap-encoder
diff --git a/config/manifests/vllm/cpu-deployment.yaml b/config/manifests/vllm/cpu-deployment.yaml
index 6ac1014c..6fb40950 100644
--- a/config/manifests/vllm/cpu-deployment.yaml
+++ b/config/manifests/vllm/cpu-deployment.yaml
@@ -1,16 +1,16 @@
 apiVersion: apps/v1
 kind: Deployment
 metadata:
-  name: vllm-llama2-7b
+  name: vllm-llama3-8b-instruct
 spec:
   replicas: 3
   selector:
     matchLabels:
-      app: vllm-llama2-7b
+      app: vllm-llama3-8b-instruct
   template:
     metadata:
       labels:
-        app: vllm-llama2-7b
+        app: vllm-llama3-8b-instruct
     spec:
       containers:
       - name: lora
@@ -26,8 +26,8 @@ spec:
         - "--max-loras"
         - "4"
         - "--lora-modules"
-        - '{"name": "tweet-summary-0", "path": "SriSanth2345/Qwen-1.5B-Tweet-Generations", "base_model_name": "Qwen/Qwen2.5-1.5B"}'
-        - '{"name": "tweet-summary-1", "path": "SriSanth2345/Qwen-1.5B-Tweet-Generations", "base_model_name": "Qwen/Qwen2.5-1.5B"}'
+        - '{"name": "food-review-0", "path": "SriSanth2345/Qwen-1.5B-Tweet-Generations", "base_model_name": "Qwen/Qwen2.5-1.5B"}'
+        - '{"name": "food-review-1", "path": "SriSanth2345/Qwen-1.5B-Tweet-Generations", "base_model_name": "Qwen/Qwen2.5-1.5B"}'
         env:
         - name: PORT
           value: "8000"
@@ -108,10 +108,10 @@ metadata:
 data:
   configmap.yaml: |
     vLLMLoRAConfig:
-      name: vllm-llama2-7b
+      name: vllm-llama3-8b-instruct
       port: 8000
       ensureExist:
         models:
         - base-model: Qwen/Qwen2.5-1.5B
-          id: tweet-summary-1
+          id: food-review-1
           source: SriSanth2345/Qwen-1.5B-Tweet-Generations
\ No newline at end of file
diff --git a/config/manifests/vllm/gpu-deployment.yaml b/config/manifests/vllm/gpu-deployment.yaml
index e9507601..c405b33c 100644
--- a/config/manifests/vllm/gpu-deployment.yaml
+++ b/config/manifests/vllm/gpu-deployment.yaml
@@ -1,37 +1,34 @@
 apiVersion: apps/v1
 kind: Deployment
 metadata:
-  name: vllm-llama2-7b
+  name: vllm-llama3-8b-instruct
 spec:
   replicas: 3
   selector:
     matchLabels:
-      app: vllm-llama2-7b
+      app: vllm-llama3-8b-instruct
   template:
     metadata:
       labels:
-        app: vllm-llama2-7b
+        app: vllm-llama3-8b-instruct
     spec:
       containers:
-      - name: lora
+      - name: vllm
        image: "vllm/vllm-openai:latest"
        imagePullPolicy: Always
        command: ["python3", "-m", "vllm.entrypoints.openai.api_server"]
        args:
        - "--model"
-        - "meta-llama/Llama-2-7b-hf"
+        - "meta-llama/Llama-3.1-8B-Instruct"
        - "--tensor-parallel-size"
        - "1"
        - "--port"
        - "8000"
        - "--enable-lora"
        - "--max-loras"
-        - "4"
+        - "2"
        - "--max-cpu-loras"
        - "12"
-        - "--lora-modules"
-        - '{"name": "tweet-summary-0", "path": "vineetsharma/qlora-adapter-Llama-2-7b-hf-TweetSumm", "base_model_name": "llama-2"}'
-        - '{"name": "tweet-summary-1", "path": "vineetsharma/qlora-adapter-Llama-2-7b-hf-TweetSumm", "base_model_name": "llama-2"}'
        env:
          # Enabling LoRA support temporarily disables automatic v1, we want to force it on
          # until 0.8.3 vLLM is released.
@@ -238,20 +235,22 @@ spec:
           emptyDir: {}
         - name: config-volume
           configMap:
-            name: vllm-llama2-7b-adapters
+            name: vllm-llama3.1-8b-adapters
 ---
 apiVersion: v1
 kind: ConfigMap
 metadata:
-  name: vllm-llama2-7b-adapters
+  name: vllm-llama3.1-8b-adapters
 data:
   configmap.yaml: |
     vLLMLoRAConfig:
-      name: vllm-llama2-7b
+      name: vllm-llama3.1-8b-instruct
       port: 8000
       ensureExist:
         models:
-        - base-model: meta-llama/Llama-2-7b-hf
-          id: tweet-summary-1
-          source: vineetsharma/qlora-adapter-Llama-2-7b-hf-TweetSumm
-
+        - base-model: meta-llama/Llama-3.1-8B-Instruct
+          id: food-review
+          source: Kawon/llama3.1-food-finetune_v14_r8
+        - base-model: meta-llama/Llama-3.1-8B-Instruct
+          id: cad-fabricator
+          source: redcathode/fabricator
diff --git a/hack/test-e2e.sh b/hack/test-e2e.sh
index 716e626a..0d6bdfc0 100755
--- a/hack/test-e2e.sh
+++ b/hack/test-e2e.sh
@@ -124,14 +124,14 @@ if [[ "$CURL_POD" == "true" ]]; then
   while [ $SECONDS -lt $end ]; do
     kubectl exec po/curl -- curl -i "$IP:$PORT/v1/completions" \
     -H 'Content-Type: application/json' \
-    -d '{"model": "tweet-summary","prompt": "Write as if you were a critic: San Francisco","max_tokens": 100,"temperature": 0}'
+    -d '{"model": "food-review","prompt": "Write as if you were a critic: San Francisco","max_tokens": 100,"temperature": 0}'
     sleep 5
   done
 else
   while [ $SECONDS -lt $end ]; do
     curl -i "$IP:$PORT/v1/completions" \
     -H 'Content-Type: application/json' \
-    -d '{"model": "tweet-summary","prompt": "Write as if you were a critic: San Francisco","max_tokens": 100,"temperature": 0}'
+    -d '{"model": "food-review","prompt": "Write as if you were a critic: San Francisco","max_tokens": 100,"temperature": 0}'
     sleep 5
   done
 fi
diff --git a/pkg/epp/datastore/datastore_test.go b/pkg/epp/datastore/datastore_test.go
index 1a88e5dc..22bb0365 100644
--- a/pkg/epp/datastore/datastore_test.go
+++ b/pkg/epp/datastore/datastore_test.go
@@ -97,7 +97,7 @@ func TestPool(t *testing.T) {
 
 func TestModel(t *testing.T) {
 	chatModel := "chat"
-	tsModel := "tweet-summary"
+	tsModel := "food-review"
 	model1ts := testutil.MakeInferenceModel("model1").
 		CreationTimestamp(metav1.Unix(1000, 0)).
 		ModelName(tsModel).ObjRef()
@@ -126,7 +126,7 @@ func TestModel(t *testing.T) {
 		wantModels []*v1alpha2.InferenceModel
 	}{
 		{
-			name: "Add model1 with tweet-summary as modelName",
+			name: "Add model1 with food-review as modelName",
 			op: func(ds Datastore) bool {
 				return ds.ModelSetIfOlder(model1ts)
 			},
@@ -161,7 +161,7 @@ func TestModel(t *testing.T) {
 			wantModels: []*v1alpha2.InferenceModel{model2ts},
 		},
 		{
-			name: "Set model1 with the tweet-summary modelName, both models should exist",
+			name: "Set model1 with the food-review modelName, both models should exist",
 			existingModels: []*v1alpha2.InferenceModel{model2chat},
 			op: func(ds Datastore) bool {
 				return ds.ModelSetIfOlder(model1ts)
@@ -170,7 +170,7 @@ func TestModel(t *testing.T) {
 			wantModels: []*v1alpha2.InferenceModel{model2chat, model1ts},
 		},
 		{
-			name: "Set model1 with the tweet-summary modelName, both models should exist",
+			name: "Set model1 with the food-review modelName, both models should exist",
 			existingModels: []*v1alpha2.InferenceModel{model2chat, model1ts},
 			op: func(ds Datastore) bool {
 				return ds.ModelSetIfOlder(model1ts)
diff --git a/pkg/epp/handlers/response.go b/pkg/epp/handlers/response.go
index cf64f4a4..991b7d16 100644
--- a/pkg/epp/handlers/response.go
+++ b/pkg/epp/handlers/response.go
@@ -127,7 +127,7 @@ func (s *Server) HandleResponseHeaders(
 	"id": "cmpl-573498d260f2423f9e42817bbba3743a",
 	"object": "text_completion",
 	"created": 1732563765,
-	"model": "meta-llama/Llama-2-7b-hf",
+	"model": "meta-llama/Llama-3.1-8B-Instruct",
 	"choices": [
 		{
 			"index": 0,
@@ -217,7 +217,7 @@ func (s *Server) HandleStreaming(
 }
 
 // Example message if "stream_options": {"include_usage": "true"} is included in the request:
-// data: {"id":"...","object":"text_completion","created":1739400043,"model":"tweet-summary-0","choices":[],
+// data: {"id":"...","object":"text_completion","created":1739400043,"model":"food-review-0","choices":[],
 // "usage":{"prompt_tokens":7,"total_tokens":17,"completion_tokens":10}}
 //
 // data: [DONE]
diff --git a/pkg/epp/handlers/response_test.go b/pkg/epp/handlers/response_test.go
index edfa3edb..074b45c9 100644
--- a/pkg/epp/handlers/response_test.go
+++ b/pkg/epp/handlers/response_test.go
@@ -31,7 +31,7 @@ const (
 	"id": "cmpl-573498d260f2423f9e42817bbba3743a",
 	"object": "text_completion",
 	"created": 1732563765,
-	"model": "meta-llama/Llama-2-7b-hf",
+	"model": "meta-llama/Llama-3.1-8B-Instruct",
 	"choices": [
 		{
 			"index": 0,
@@ -50,10 +50,10 @@ const (
 	}
 	`
 
-	streamingBodyWithoutUsage = `data: {"id":"cmpl-41764c93-f9d2-4f31-be08-3ba04fa25394","object":"text_completion","created":1740002445,"model":"tweet-summary-0","choices":[],"usage":null}
+	streamingBodyWithoutUsage = `data: {"id":"cmpl-41764c93-f9d2-4f31-be08-3ba04fa25394","object":"text_completion","created":1740002445,"model":"food-review-0","choices":[],"usage":null}
 `
 
-	streamingBodyWithUsage = `data: {"id":"cmpl-41764c93-f9d2-4f31-be08-3ba04fa25394","object":"text_completion","created":1740002445,"model":"tweet-summary-0","choices":[],"usage":{"prompt_tokens":7,"total_tokens":17,"completion_tokens":10}}
+	streamingBodyWithUsage = `data: {"id":"cmpl-41764c93-f9d2-4f31-be08-3ba04fa25394","object":"text_completion","created":1740002445,"model":"food-review-0","choices":[],"usage":{"prompt_tokens":7,"total_tokens":17,"completion_tokens":10}}
 
 data: [DONE]
 `
 )
diff --git a/site-src/guides/adapter-rollout.md b/site-src/guides/adapter-rollout.md
index 9ce8c3a4..18d60ece 100644
--- a/site-src/guides/adapter-rollout.md
+++ b/site-src/guides/adapter-rollout.md
@@ -18,7 +18,7 @@
 Modify the LoRA syncer ConfigMap to initiate loading of the new adapter version.
 
 ```bash
-kubectl edit configmap vllm-llama2-7b-adapters
+kubectl edit configmap vllm-llama3-8b-instruct-adapters
 ```
 
 Change the ConfigMap to match the following (note the new entry under models):
@@ -27,19 +27,19 @@ Change the ConfigMap to match the following (note the new entry under models):
 apiVersion: v1
 kind: ConfigMap
 metadata:
-  name: vllm-llama2-7b-adapters
+  name: vllm-llama3-8b-instruct-adapters
 data:
   configmap.yaml: |
     vLLMLoRAConfig:
-      name: vllm-llama2-7b-adapters
+      name: vllm-llama3-8b-instruct-adapters
       port: 8000
       ensureExist:
         models:
-        - base-model: meta-llama/Llama-2-7b-hf
-          id: tweet-summary-1
+        - base-model: meta-llama/Llama-3.1-8B-Instruct
+          id: food-review-1
           source: vineetsharma/qlora-adapter-Llama-2-7b-hf-TweetSumm
-        - base-model: meta-llama/Llama-2-7b-hf
-          id: tweet-summary-2
+        - base-model: meta-llama/Llama-3.1-8B-Instruct
+          id: food-review-2
           source: mahimairaja/tweet-summarization-llama-2-finetuned
 ```
 
@@ -48,11 +48,11 @@ The new adapter version is applied to the model servers live, without requiring
 
 ### Direct traffic to the new adapter version
 
-Modify the InferenceModel to configure a canary rollout with traffic splitting. In this example, 10% of traffic for tweet-summary model will be sent to the new ***tweet-summary-2*** adapter.
+Modify the InferenceModel to configure a canary rollout with traffic splitting. In this example, 10% of traffic for food-review model will be sent to the new ***food-review-2*** adapter.
 
 ```bash
-kubectl edit inferencemodel tweet-summary
+kubectl edit inferencemodel food-review
 ```
 
 Change the targetModels list in InferenceModel to match the following:
 
@@ -64,14 +64,14 @@ kind: InferenceModel
 metadata:
   name: inferencemodel-sample
 spec:
-  modelName: tweet-summary
+  modelName: food-review
   criticality: Critical
   poolRef:
-    name: vllm-llama2-7b-pool
+    name: vllm-llama3-8b-instruct-pool
   targetModels:
-  - name: tweet-summary-1
+  - name: food-review-1
     weight: 90
-  - name: tweet-summary-2
+  - name: food-review-2
     weight: 10
 ```
 
@@ -86,7 +86,7 @@ IP=$(kubectl get gateway/inference-gateway -o jsonpath='{.status.addresses[0].va
 2. Send a few requests as follows:
 ```bash
 curl -i ${IP}:${PORT}/v1/completions -H 'Content-Type: application/json' -d '{
-"model": "tweet-summary",
+"model": "food-review",
 "prompt": "Write as if you were a critic: San Francisco",
 "max_tokens": 100,
 "temperature": 0
@@ -100,9 +100,9 @@ Modify the InferenceModel to direct 100% of the traffic to the latest version of
 
 ```yaml
 model:
-  name: tweet-summary
+  name: food-review
   targetModels:
-  targetModelName: tweet-summary-2
+  targetModelName: food-review-2
   weight: 100
 ```
 
@@ -120,13 +120,13 @@ Unload the older versions from the servers by updating the LoRA syncer ConfigMap
       port: 8000
       ensureExist:
         models:
-        - base-model: meta-llama/Llama-2-7b-hf
-          id: tweet-summary-2
+        - base-model: meta-llama/Llama-3.1-8B-Instruct
+          id: food-review-2
           source: mahimairaja/tweet-summarization-llama-2-finetuned
       ensureNotExist:
         models:
-        - base-model: meta-llama/Llama-2-7b-hf
-          id: tweet-summary-1
+        - base-model: meta-llama/Llama-3.1-8B-Instruct
+          id: food-review-1
           source: vineetsharma/qlora-adapter-Llama-2-7b-hf-TweetSumm
 ```
diff --git a/site-src/guides/index.md b/site-src/guides/index.md
index bcea5f9b..99b78129 100644
--- a/site-src/guides/index.md
+++ b/site-src/guides/index.md
@@ -17,7 +17,7 @@ This quickstart guide is intended for engineers familiar with k8s and model serv
 Two options are supported for running the model server:
 
 1. GPU-based model server.
-   Requirements: a Hugging Face access token that grants access to the model [meta-llama/Llama-2-7b-hf](https://huggingface.co/meta-llama/Llama-2-7b-hf).
+   Requirements: a Hugging Face access token that grants access to the model [meta-llama/Llama-3.1-8B-Instruct](https://huggingface.co/meta-llama/Llama-3.1-8B-Instruct).
 1. CPU-based model server (not using GPUs).
    The sample uses the model [Qwen/Qwen2.5-1.5B-Instruct](https://huggingface.co/Qwen/Qwen2.5-1.5B-Instruct).
 
@@ -27,11 +27,11 @@ This quickstart guide is intended for engineers familiar with k8s and model serv
 === "GPU-Based Model Server"
 
     For this setup, you will need 3 GPUs to run the sample model server. Adjust the number of replicas in `./config/manifests/vllm/gpu-deployment.yaml` as needed.
-    Create a Hugging Face secret to download the model [meta-llama/Llama-2-7b-hf](https://huggingface.co/meta-llama/Llama-2-7b-hf). Ensure that the token grants access to this model.
+    Create a Hugging Face secret to download the model [meta-llama/Llama-3.1-8B-Instruct](https://huggingface.co/meta-llama/Llama-3.1-8B-Instruct). Ensure that the token grants access to this model.
     Deploy a sample vLLM deployment with the proper protocol to work with the LLM Instance Gateway.
     ```bash
-    kubectl create secret generic hf-token --from-literal=token=$HF_TOKEN # Your Hugging Face Token with access to Llama2
+    kubectl create secret generic hf-token --from-literal=token=$HF_TOKEN # Your Hugging Face Token with access to the set of Llama models
     kubectl apply -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/config/manifests/vllm/gpu-deployment.yaml
     ```
 
@@ -59,7 +59,7 @@ This quickstart guide is intended for engineers familiar with k8s and model serv
 
    ### Deploy InferenceModel
 
-   Deploy the sample InferenceModel which is configured to load balance traffic between the `tweet-summary-0` and `tweet-summary-1`
+   Deploy the sample InferenceModel which is configured to load balance traffic between the `food-review-0` and `food-review-1`
    [LoRA adapters](https://docs.vllm.ai/en/latest/features/lora.html) of the sample model server.
   ```bash
   kubectl apply -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/config/manifests/inferencemodel.yaml
@@ -116,7 +116,7 @@ This quickstart guide is intended for engineers familiar with k8s and model serv
   PORT=8081
 
   curl -i ${IP}:${PORT}/v1/completions -H 'Content-Type: application/json' -d '{
-  "model": "tweet-summary",
+  "model": "food-review",
   "prompt": "Write as if you were a critic: San Francisco",
   "max_tokens": 100,
   "temperature": 0
diff --git a/site-src/guides/metrics.md b/site-src/guides/metrics.md
index a904145d..12ff892e 100644
--- a/site-src/guides/metrics.md
+++ b/site-src/guides/metrics.md
@@ -29,7 +29,7 @@ If you want to include usage metrics for vLLM model server streaming request, se
 ```
 curl -i ${IP}:${PORT}/v1/completions -H 'Content-Type: application/json' -d '{
-"model": "tweet-summary",
+"model": "food-review",
 "prompt": "whats your fav movie?",
 "max_tokens": 10,
 "temperature": 0,
diff --git a/test/e2e/epp/README.md b/test/e2e/epp/README.md
index 584d8914..247e8b12 100644
--- a/test/e2e/epp/README.md
+++ b/test/e2e/epp/README.md
@@ -10,7 +10,7 @@ The end-to-end tests are designed to validate end-to-end Gateway API Inference E
 
 - [Go](https://golang.org/doc/install) installed on your machine.
 - [Make](https://www.gnu.org/software/make/manual/make.html) installed to run the end-to-end test target.
-- A Hugging Face Hub token with access to the [meta-llama/Llama-2-7b-hf](https://huggingface.co/meta-llama/Llama-2-7b-hf) model.
+- A Hugging Face Hub token with access to the [meta-llama/Llama-3.1-8B-Instruct](https://huggingface.co/meta-llama/Llama-3.1-8B-Instruct) model.
 
 ## Running the End-to-End Tests
 
@@ -34,5 +34,5 @@ Follow these steps to run the end-to-end tests:
    make test-e2e
    ```
 
-   The test suite prints details for each step. Note that the `vllm-llama2-7b-pool` model server deployment
+   The test suite prints details for each step. Note that the `vllm-llama3-8b-instruct-pool` model server deployment
    may take several minutes to report an `Available=True` status due to the time required for bootstraping.
diff --git a/test/e2e/epp/e2e_suite_test.go b/test/e2e/epp/e2e_suite_test.go
index 92521bf7..f9dea1cc 100644
--- a/test/e2e/epp/e2e_suite_test.go
+++ b/test/e2e/epp/e2e_suite_test.go
@@ -57,15 +57,15 @@ const (
 	// TODO [danehans]: Must be "default" until https://github.com/kubernetes-sigs/gateway-api-inference-extension/issues/227 is fixed
 	nsName = "default"
 	// modelServerName is the name of the model server test resources.
-	modelServerName = "vllm-llama2-7b"
+	modelServerName = "vllm-llama3-8b-instruct"
 	// modelName is the test model name.
-	modelName = "tweet-summary"
+	modelName = "food-review"
 	// envoyName is the name of the envoy proxy test resources.
 	envoyName = "envoy"
 	// envoyPort is the listener port number of the test envoy proxy.
 	envoyPort = "8081"
 	// inferExtName is the name of the inference extension test resources.
-	inferExtName = "vllm-llama2-7b-epp"
+	inferExtName = "vllm-llama3-8b-instruct-epp"
 	// clientManifest is the manifest for the client test resources.
 	clientManifest = "../../testdata/client.yaml"
 	// modelServerSecretManifest is the manifest for the model server secret resource.
diff --git a/test/integration/epp/hermetic_test.go b/test/integration/epp/hermetic_test.go
index b12925ed..8e02aca4 100644
--- a/test/integration/epp/hermetic_test.go
+++ b/test/integration/epp/hermetic_test.go
@@ -1198,42 +1198,42 @@ func TestFullDuplexStreamed_KubeInferenceModelRequest(t *testing.T) {
 			{
 				Request: &extProcPb.ProcessingRequest_ResponseBody{
 					ResponseBody: &extProcPb.HttpBody{
-						Body: []byte(`data: {"id":"cmpl-0fee233f-7d56-404a-acd3-4dad775d03d9","object":"text_completion","created":1741379018,"model":"tweet-summary-1","choices":[{"index":0,"text":"NEVER","logprobs":null,"finish_reason":null,"stop_reason":null}],"usage":null}`),
+						Body: []byte(`data: {"id":"cmpl-0fee233f-7d56-404a-acd3-4dad775d03d9","object":"text_completion","created":1741379018,"model":"food-review-1","choices":[{"index":0,"text":"NEVER","logprobs":null,"finish_reason":null,"stop_reason":null}],"usage":null}`),
 						EndOfStream: false},
 				},
 			},
 			{
 				Request: &extProcPb.ProcessingRequest_ResponseBody{
 					ResponseBody: &extProcPb.HttpBody{
-						Body: []byte(`data: {"id":"cmpl-0fee233f-7d56-404a-acd3-4dad775d03d9","object":"text_completion","created":1741379018,"model":"tweet-summary-1","choices":[{"index":0,"text":"GONNA","logprobs":null,"finish_reason":null,"stop_reason":null}],"usage":null}`),
+						Body: []byte(`data: {"id":"cmpl-0fee233f-7d56-404a-acd3-4dad775d03d9","object":"text_completion","created":1741379018,"model":"food-review-1","choices":[{"index":0,"text":"GONNA","logprobs":null,"finish_reason":null,"stop_reason":null}],"usage":null}`),
 						EndOfStream: false},
 				},
 			},
 			{
 				Request: &extProcPb.ProcessingRequest_ResponseBody{
 					ResponseBody: &extProcPb.HttpBody{
-						Body: []byte(`data: {"id":"cmpl-0fee233f-7d56-404a-acd3-4dad775d03d9","object":"text_completion","created":1741379018,"model":"tweet-summary-1","choices":[{"index":0,"text":"GIVE","logprobs":null,"finish_reason":null,"stop_reason":null}],"usage":null}`),
+						Body: []byte(`data: {"id":"cmpl-0fee233f-7d56-404a-acd3-4dad775d03d9","object":"text_completion","created":1741379018,"model":"food-review-1","choices":[{"index":0,"text":"GIVE","logprobs":null,"finish_reason":null,"stop_reason":null}],"usage":null}`),
 						EndOfStream: false},
 				},
 			},
 			{
 				Request: &extProcPb.ProcessingRequest_ResponseBody{
 					ResponseBody: &extProcPb.HttpBody{
-						Body: []byte(`data: {"id":"cmpl-0fee233f-7d56-404a-acd3-4dad775d03d9","object":"text_completion","created":1741379018,"model":"tweet-summary-1","choices":[{"index":0,"text":"YOU","logprobs":null,"finish_reason":null,"stop_reason":null}],"usage":null}`),
+						Body: []byte(`data: {"id":"cmpl-0fee233f-7d56-404a-acd3-4dad775d03d9","object":"text_completion","created":1741379018,"model":"food-review-1","choices":[{"index":0,"text":"YOU","logprobs":null,"finish_reason":null,"stop_reason":null}],"usage":null}`),
 						EndOfStream: false},
 				},
 			},
 			{
 				Request: &extProcPb.ProcessingRequest_ResponseBody{
 					ResponseBody: &extProcPb.HttpBody{
-						Body: []byte(`data: {"id":"cmpl-0fee233f-7d56-404a-acd3-4dad775d03d9","object":"text_completion","created":1741379018,"model":"tweet-summary-1","choices":[{"index":0,"text":"UP","logprobs":null,"finish_reason":null,"stop_reason":null}],"usage":null}`),
+						Body: []byte(`data: {"id":"cmpl-0fee233f-7d56-404a-acd3-4dad775d03d9","object":"text_completion","created":1741379018,"model":"food-review-1","choices":[{"index":0,"text":"UP","logprobs":null,"finish_reason":null,"stop_reason":null}],"usage":null}`),
 						EndOfStream: false},
 				},
 			},
 			{
 				Request: &extProcPb.ProcessingRequest_ResponseBody{
 					ResponseBody: &extProcPb.HttpBody{
{"id":"cmpl-0fee233f-7d56-404a-acd3-4dad775d03d9","object":"text_completion","created":1741379018,"model":"tweet-summary-1","choices":[],"usage":{"prompt_tokens":7,"total_tokens":17,"completion_tokens":10}} + Body: []byte(`data: {"id":"cmpl-0fee233f-7d56-404a-acd3-4dad775d03d9","object":"text_completion","created":1741379018,"model":"food-review-1","choices":[],"usage":{"prompt_tokens":7,"total_tokens":17,"completion_tokens":10}} data: [DONE]`, ), EndOfStream: false}, @@ -1300,7 +1300,7 @@ func TestFullDuplexStreamed_KubeInferenceModelRequest(t *testing.T) { BodyMutation: &extProcPb.BodyMutation{ Mutation: &extProcPb.BodyMutation_StreamedResponse{ StreamedResponse: &extProcPb.StreamedBodyResponse{ - Body: []byte(`data: {"id":"cmpl-0fee233f-7d56-404a-acd3-4dad775d03d9","object":"text_completion","created":1741379018,"model":"tweet-summary-1","choices":[{"index":0,"text":"NEVER","logprobs":null,"finish_reason":null,"stop_reason":null}],"usage":null}`), + Body: []byte(`data: {"id":"cmpl-0fee233f-7d56-404a-acd3-4dad775d03d9","object":"text_completion","created":1741379018,"model":"food-review-1","choices":[{"index":0,"text":"NEVER","logprobs":null,"finish_reason":null,"stop_reason":null}],"usage":null}`), EndOfStream: false, }, }, @@ -1316,7 +1316,7 @@ func TestFullDuplexStreamed_KubeInferenceModelRequest(t *testing.T) { BodyMutation: &extProcPb.BodyMutation{ Mutation: &extProcPb.BodyMutation_StreamedResponse{ StreamedResponse: &extProcPb.StreamedBodyResponse{ - Body: []byte(`data: {"id":"cmpl-0fee233f-7d56-404a-acd3-4dad775d03d9","object":"text_completion","created":1741379018,"model":"tweet-summary-1","choices":[{"index":0,"text":"GONNA","logprobs":null,"finish_reason":null,"stop_reason":null}],"usage":null}`), + Body: []byte(`data: {"id":"cmpl-0fee233f-7d56-404a-acd3-4dad775d03d9","object":"text_completion","created":1741379018,"model":"food-review-1","choices":[{"index":0,"text":"GONNA","logprobs":null,"finish_reason":null,"stop_reason":null}],"usage":null}`), EndOfStream: false, }, }, @@ -1332,7 +1332,7 @@ func TestFullDuplexStreamed_KubeInferenceModelRequest(t *testing.T) { BodyMutation: &extProcPb.BodyMutation{ Mutation: &extProcPb.BodyMutation_StreamedResponse{ StreamedResponse: &extProcPb.StreamedBodyResponse{ - Body: []byte(`data: {"id":"cmpl-0fee233f-7d56-404a-acd3-4dad775d03d9","object":"text_completion","created":1741379018,"model":"tweet-summary-1","choices":[{"index":0,"text":"GIVE","logprobs":null,"finish_reason":null,"stop_reason":null}],"usage":null}`), + Body: []byte(`data: {"id":"cmpl-0fee233f-7d56-404a-acd3-4dad775d03d9","object":"text_completion","created":1741379018,"model":"food-review-1","choices":[{"index":0,"text":"GIVE","logprobs":null,"finish_reason":null,"stop_reason":null}],"usage":null}`), EndOfStream: false, }, }, @@ -1348,7 +1348,7 @@ func TestFullDuplexStreamed_KubeInferenceModelRequest(t *testing.T) { BodyMutation: &extProcPb.BodyMutation{ Mutation: &extProcPb.BodyMutation_StreamedResponse{ StreamedResponse: &extProcPb.StreamedBodyResponse{ - Body: []byte(`data: {"id":"cmpl-0fee233f-7d56-404a-acd3-4dad775d03d9","object":"text_completion","created":1741379018,"model":"tweet-summary-1","choices":[{"index":0,"text":"YOU","logprobs":null,"finish_reason":null,"stop_reason":null}],"usage":null}`), + Body: []byte(`data: {"id":"cmpl-0fee233f-7d56-404a-acd3-4dad775d03d9","object":"text_completion","created":1741379018,"model":"food-review-1","choices":[{"index":0,"text":"YOU","logprobs":null,"finish_reason":null,"stop_reason":null}],"usage":null}`), EndOfStream: 
 							EndOfStream: false,
 						},
 					},
@@ -1364,7 +1364,7 @@ func TestFullDuplexStreamed_KubeInferenceModelRequest(t *testing.T) {
 				BodyMutation: &extProcPb.BodyMutation{
 					Mutation: &extProcPb.BodyMutation_StreamedResponse{
 						StreamedResponse: &extProcPb.StreamedBodyResponse{
-							Body: []byte(`data: {"id":"cmpl-0fee233f-7d56-404a-acd3-4dad775d03d9","object":"text_completion","created":1741379018,"model":"tweet-summary-1","choices":[{"index":0,"text":"UP","logprobs":null,"finish_reason":null,"stop_reason":null}],"usage":null}`),
+							Body: []byte(`data: {"id":"cmpl-0fee233f-7d56-404a-acd3-4dad775d03d9","object":"text_completion","created":1741379018,"model":"food-review-1","choices":[{"index":0,"text":"UP","logprobs":null,"finish_reason":null,"stop_reason":null}],"usage":null}`),
 							EndOfStream: false,
 						},
 					},
@@ -1380,7 +1380,7 @@ func TestFullDuplexStreamed_KubeInferenceModelRequest(t *testing.T) {
 				BodyMutation: &extProcPb.BodyMutation{
 					Mutation: &extProcPb.BodyMutation_StreamedResponse{
 						StreamedResponse: &extProcPb.StreamedBodyResponse{
-							Body: []byte(`data: {"id":"cmpl-0fee233f-7d56-404a-acd3-4dad775d03d9","object":"text_completion","created":1741379018,"model":"tweet-summary-1","choices":[],"usage":{"prompt_tokens":7,"total_tokens":17,"completion_tokens":10}}
+							Body: []byte(`data: {"id":"cmpl-0fee233f-7d56-404a-acd3-4dad775d03d9","object":"text_completion","created":1741379018,"model":"food-review-1","choices":[],"usage":{"prompt_tokens":7,"total_tokens":17,"completion_tokens":10}}
 data: [DONE]`,
 							),
 							EndOfStream: false,
@@ -1507,7 +1507,7 @@ func setUpHermeticServer(t *testing.T, podAndMetrics map[backendmetrics.Pod]*bac
 
 	// TODO: this should be consistent with the inference pool
 	podLabels := map[string]string{
-		"app": "vllm-llama2-7b-pool",
+		"app": "vllm-llama3-8b-instruct-pool",
 	}
 
 	for pod := range podAndMetrics {
@@ -1602,7 +1602,7 @@ func BeforeSuite() func() {
 	// Init runtime.
 	ctrl.SetLogger(logger)
-	mgr, err := server.NewManagerWithOptions(cfg, managerTestOptions("default", "vllm-llama2-7b-pool"))
+	mgr, err := server.NewManagerWithOptions(cfg, managerTestOptions("default", "vllm-llama3-8b-instruct-pool"))
 	if err != nil {
 		logutil.Fatal(logger, err, "Failed to create controller manager")
 	}
@@ -1615,7 +1615,7 @@ func BeforeSuite() func() {
 	serverRunner.TestPodMetricsClient = &backendmetrics.FakePodMetricsClient{}
 	pmf := backendmetrics.NewPodMetricsFactory(serverRunner.TestPodMetricsClient, 10*time.Millisecond)
 	// Adjust from defaults
-	serverRunner.PoolName = "vllm-llama2-7b-pool"
+	serverRunner.PoolName = "vllm-llama3-8b-instruct-pool"
 	serverRunner.Datastore = datastore.NewDatastore(context.Background(), pmf)
 	serverRunner.SecureServing = false
diff --git a/test/testdata/envoy.yaml b/test/testdata/envoy.yaml
index 2598428c..fc32b5aa 100644
--- a/test/testdata/envoy.yaml
+++ b/test/testdata/envoy.yaml
@@ -100,7 +100,7 @@ data:
                 grpc_service:
                   envoy_grpc:
                     cluster_name: ext_proc
-                    authority: vllm-llama2-7b-epp.default:9002
+                    authority: vllm-llama3-8b-instruct-epp.default:9002
                 timeout: 10s
                 processing_mode:
                   request_header_mode: SEND
@@ -194,7 +194,7 @@ data:
         - endpoint:
             address:
              socket_address:
-                address: vllm-llama2-7b-epp.default
+                address: vllm-llama3-8b-instruct-epp.default
                port_value: 9002
            health_status: HEALTHY
            load_balancing_weight: 1
diff --git a/test/testdata/inferencepool-with-model-hermetic.yaml b/test/testdata/inferencepool-with-model-hermetic.yaml
index 36b6e539..d006e047 100644
--- a/test/testdata/inferencepool-with-model-hermetic.yaml
+++ b/test/testdata/inferencepool-with-model-hermetic.yaml
@@ -1,12 +1,12 @@
 apiVersion: inference.networking.x-k8s.io/v1alpha2
 kind: InferencePool
 metadata:
-  name: vllm-llama2-7b-pool
+  name: vllm-llama3-8b-instruct-pool
   namespace: default
 spec:
   targetPortNumber: 8000
   selector:
-    app: vllm-llama2-7b-pool
+    app: vllm-llama3-8b-instruct-pool
   extensionRef:
     name: epp
 ---
@@ -19,7 +19,7 @@ spec:
   modelName: sql-lora
   criticality: Critical
   poolRef:
-    name: vllm-llama2-7b-pool
+    name: vllm-llama3-8b-instruct-pool
   targetModels:
   - name: sql-lora-1fdg2
     weight: 100
@@ -32,7 +32,7 @@ metadata:
 spec:
   modelName: sql-lora-sheddable
   poolRef:
-    name: vllm-llama2-7b-pool
+    name: vllm-llama3-8b-instruct-pool
   targetModels:
   - name: sql-lora-1fdg3
     weight: 100
@@ -46,7 +46,7 @@ spec:
   modelName: my-model
   criticality: Critical
   poolRef:
-    name: vllm-llama2-7b-pool
+    name: vllm-llama3-8b-instruct-pool
   targetModels:
   - name: my-model-12345
     weight: 100
@@ -60,4 +60,4 @@ spec:
   modelName: direct-model
   criticality: Critical
   poolRef:
-    name: vllm-llama2-7b-pool
\ No newline at end of file
+    name: vllm-llama3-8b-instruct-pool
\ No newline at end of file
diff --git a/tools/dynamic-lora-sidecar/deployment.yaml b/tools/dynamic-lora-sidecar/deployment.yaml
index 9e9fc130..0a20ec66 100644
--- a/tools/dynamic-lora-sidecar/deployment.yaml
+++ b/tools/dynamic-lora-sidecar/deployment.yaml
@@ -32,7 +32,7 @@ spec:
             nvidia.com/gpu : 1
         command: ["/bin/sh", "-c"]
         args:
-          - vllm serve meta-llama/Llama-2-7b-hf
+          - vllm serve meta-llama/Llama-3.1-8B-Instruct
           - --host=0.0.0.0
           - --port=8000
           - --tensor-parallel-size=1
@@ -111,17 +111,17 @@ data:
       port: modelServerPort
       ensureExist:
        models:
-        - base-model: meta-llama/Llama-2-7b-hf
+        - base-model: meta-llama/Llama-3.1-8B-Instruct
          id: sql-lora-v1
          source: yard1/llama-2-7b-sql-lora-test
-        - base-model: meta-llama/Llama-2-7b-hf
+        - base-model: meta-llama/Llama-3.1-8B-Instruct
          id: sql-lora-v3
          source: yard1/llama-2-7b-sql-lora-test
-        - base-model: meta-llama/Llama-2-7b-hf
+        - base-model: meta-llama/Llama-3.1-8B-Instruct
          id: sql-lora-v4
          source: yard1/llama-2-7b-sql-lora-test
      ensureNotExist:
        models:
-        - base-model: meta-llama/Llama-2-7b-hf
+        - base-model: meta-llama/Llama-3.1-8B-Instruct
          id: sql-lora-v2
          source: yard1/llama-2-7b-sql-lora-test
\ No newline at end of file
diff --git a/tools/dynamic-lora-sidecar/sidecar/test_sidecar.py b/tools/dynamic-lora-sidecar/sidecar/test_sidecar.py
index 738c7449..6f7e447f 100644
--- a/tools/dynamic-lora-sidecar/sidecar/test_sidecar.py
+++ b/tools/dynamic-lora-sidecar/sidecar/test_sidecar.py
@@ -12,17 +12,17 @@
         "ensureExist": {
             "models": [
                 {
-                    "base-model": "meta-llama/Llama-2-7b-hf",
+                    "base-model": "meta-llama/Llama-3.1-8B-Instruct",
                     "id": "sql-lora-v1",
                     "source": "yard1/llama-2-7b-sql-lora-test",
                 },
                 {
-                    "base-model": "meta-llama/Llama-2-7b-hf",
+                    "base-model": "meta-llama/Llama-3.1-8B-Instruct",
                     "id": "sql-lora-v3",
                     "source": "yard1/llama-2-7b-sql-lora-test",
                 },
                 {
-                    "base-model": "meta-llama/Llama-2-7b-hf",
+                    "base-model": "meta-llama/Llama-3.1-8B-Instruct",
                     "id": "already_exists",
                     "source": "yard1/llama-2-7b-sql-lora-test",
                 },
@@ -31,17 +31,17 @@
         "ensureNotExist": {
             "models": [
                 {
-                    "base-model": "meta-llama/Llama-2-7b-hf",
+                    "base-model": "meta-llama/Llama-3.1-8B-Instruct",
                     "id": "sql-lora-v2",
                     "source": "yard1/llama-2-7b-sql-lora-test",
                 },
                 {
-                    "base-model": "meta-llama/Llama-2-7b-hf",
+                    "base-model": "meta-llama/Llama-3.1-8B-Instruct",
                     "id": "sql-lora-v3",
                     "source": "yard1/llama-2-7b-sql-lora-test",
                 },
                 {
-                    "base-model": "meta-llama/Llama-2-7b-hf",
+                    "base-model": "meta-llama/Llama-3.1-8B-Instruct",
                     "id": "to_remove",
                     "source": "yard1/llama-2-7b-sql-lora-test",
                 },
@@ -67,7 +67,7 @@
             "object": "model",
             "created": 1729693000,
             "owned_by": "vllm",
-            "root": "meta-llama/Llama-2-7b-hf",
+            "root": "meta-llama/Llama-3.1-8B-Instruct",
             "parent": None,
             "max_model_len": 4096,
         },