
Commit 3e90264
Updating Llama 2 7B to Llama 3.1 8B Instruct and adding new LoRA adapters
1 parent 5480639

22 files changed: +127 −128 lines

config/charts/inferencepool/README.md (+7 −7)

@@ -5,12 +5,12 @@ A chart to deploy an InferencePool and a corresponding EndpointPicker (epp) depl
 
 ## Install
 
-To install an InferencePool named `vllm-llama2-7b` that selects from endpoints with label `app: vllm-llama2-7b` and listening on port `8000`, you can run the following command:
+To install an InferencePool named `vllm-llama3-8b-instruct` that selects from endpoints with label `app: vllm-llama3-8b-instruct` and listening on port `8000`, you can run the following command:
 
 ```txt
-$ helm install vllm-llama2-7b ./config/charts/inferencepool \
-  --set inferencePool.name=vllm-llama2-7b \
-  --set inferencePool.modelServers.matchLabels.app=vllm-llama2-7b \
+$ helm install vllm-llama3-8b-instruct ./config/charts/inferencepool \
+  --set inferencePool.name=vllm-llama3-8b-instruct \
+  --set inferencePool.modelServers.matchLabels.app=vllm-llama3-8b-instruct \
   --set inferencePool.targetPortNumber=8000
 ```
 
@@ -19,9 +19,9 @@ where `inferencePool.targetPortNumber` is the pod that vllm backends served on a
 To install via the latest published chart in staging (--version v0 indicates latest dev version), you can run the following command:
 
 ```txt
-$ helm install vllm-llama2-7b \
-  --set inferencePool.name=vllm-llama2-7b \
-  --set inferencePool.modelServers.matchLabels.app=vllm-llama2-7b \
+$ helm install vllm-llama3-8b-instruct \
+  --set inferencePool.name=vllm-llama3-8b-instruct \
+  --set inferencePool.modelServers.matchLabels.app=vllm-llama3-8b-instruct \
   --set inferencePool.targetPortNumber=8000 \
   oci://us-central1-docker.pkg.dev/k8s-staging-images/gateway-api-inference-extension/charts/inferencepool --version v0
 ```
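
Either install path can be sanity-checked right after, as in this sketch (it assumes the InferencePool CRD from this project is already installed in the cluster):

```bash
# Confirm the Helm release and the InferencePool object it created.
helm status vllm-llama3-8b-instruct
kubectl get inferencepool vllm-llama3-8b-instruct -o yaml
```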

config/charts/inferencepool/values.yaml (+1 −1)

@@ -12,4 +12,4 @@ inferencePool:
   targetPortNumber: 8000
   # modelServers: # REQUIRED
   #   matchLabels:
-  #     app: vllm-llama2-7b
+  #     app: vllm-llama3-8b-instruct

config/manifests/benchmark/benchmark.yaml (+2 −2)

@@ -31,9 +31,9 @@ spec:
         - name: BENCHMARK_TIME_SECONDS
           value: '60'
         - name: TOKENIZER
-          value: 'meta-llama/Llama-2-7b-hf'
+          value: 'meta-llama/Llama-3.1-8B-Instruct'
         - name: MODELS
-          value: 'meta-llama/Llama-2-7b-hf'
+          value: 'meta-llama/Llama-3.1-8B-Instruct'
         - name: BACKEND
           value: vllm
         - name: PORT
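
With TOKENIZER and MODELS switched to the Llama 3.1 checkpoint, the benchmark is launched the same way as before. A sketch; it assumes the pool is already serving and that Hugging Face credentials for the gated Llama weights are available to the job:

```bash
# Apply the updated benchmark manifest; the job reads MODELS/TOKENIZER from its env.
kubectl apply -f config/manifests/benchmark/benchmark.yaml
# Follow progress via the benchmark pod's logs (exact pod name will vary).
kubectl get pods | grep benchmark
```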

config/manifests/gateway/patch_policy.yaml (+1 −1)

@@ -99,7 +99,7 @@ spec:
 - backendRefs:
   - group: ""
     kind: Service
-    name: vllm-llama2-7b-epp
+    name: vllm-llama3-8b-instruct-epp
     port: 9002
   processingMode:
     allowModeOverride: true

config/manifests/inferencemodel.yaml (+7 −7)

@@ -3,12 +3,12 @@ kind: InferenceModel
 metadata:
   name: inferencemodel-sample
 spec:
-  modelName: tweet-summary
-  criticality: Critical
+  modelName: food-review
+  criticality: Standard
   poolRef:
-    name: vllm-llama2-7b
+    name: vllm-llama3-8b-instruct
   targetModels:
-  - name: tweet-summary-1
+  - name: food-review-1
     weight: 100
 
 ---
@@ -17,10 +17,10 @@ kind: InferenceModel
 metadata:
   name: inferencemodel-base-model
 spec:
-  modelName: meta-llama/Llama-2-7b-hf
+  modelName: meta-llama/Llama-3.1-8B-Instruct
   criticality: Critical
   poolRef:
-    name: vllm-llama2-7b
+    name: vllm-llama3-8b-instruct
 
 ---
 apiVersion: inference.networking.x-k8s.io/v1alpha2
@@ -31,4 +31,4 @@ spec:
   modelName: Qwen/Qwen2.5-1.5B-Instruct
   criticality: Critical
   poolRef:
-    name: vllm-llama2-7b
+    name: vllm-llama3-8b-instruct
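
The renamed `food-review` model can be exercised exactly the way the updated e2e script does (see hack/test-e2e.sh below); a sketch assuming the gateway address is exported as `$IP:$PORT`:

```bash
# The client sends the public modelName; the endpoint picker rewrites it to
# the weighted target model (food-review-1) before it reaches vLLM.
curl -i "$IP:$PORT/v1/completions" \
  -H 'Content-Type: application/json' \
  -d '{"model": "food-review", "prompt": "Write as if you were a critic: San Francisco", "max_tokens": 100, "temperature": 0}'
```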

config/manifests/inferencepool.yaml (+10 −10)

@@ -2,22 +2,22 @@ apiVersion: inference.networking.x-k8s.io/v1alpha2
 kind: InferencePool
 metadata:
   labels:
-  name: vllm-llama2-7b
+  name: vllm-llama3-8b-instruct
 spec:
   targetPortNumber: 8000
   selector:
-    app: vllm-llama2-7b
+    app: vllm-llama3-8b-instruct
   extensionRef:
-    name: vllm-llama2-7b-epp
+    name: vllm-llama3-8b-instruct-epp
 ---
 apiVersion: v1
 kind: Service
 metadata:
-  name: vllm-llama2-7b-epp
+  name: vllm-llama3-8b-instruct-epp
   namespace: default
 spec:
   selector:
-    app: vllm-llama2-7b-epp
+    app: vllm-llama3-8b-instruct-epp
   ports:
   - protocol: TCP
     port: 9002
@@ -27,27 +27,27 @@ spec:
 apiVersion: apps/v1
 kind: Deployment
 metadata:
-  name: vllm-llama2-7b-epp
+  name: vllm-llama3-8b-instruct-epp
   namespace: default
   labels:
-    app: vllm-llama2-7b-epp
+    app: vllm-llama3-8b-instruct-epp
 spec:
   replicas: 1
   selector:
     matchLabels:
-      app: vllm-llama2-7b-epp
+      app: vllm-llama3-8b-instruct-epp
   template:
     metadata:
       labels:
-        app: vllm-llama2-7b-epp
+        app: vllm-llama3-8b-instruct-epp
     spec:
       containers:
       - name: epp
        image: us-central1-docker.pkg.dev/k8s-staging-images/gateway-api-inference-extension/epp:main
        imagePullPolicy: Always
        args:
        - -poolName
-       - "vllm-llama2-7b"
+       - "vllm-llama3-8b-instruct"
        - -v
        - "4"
        - --zap-encoder
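
Rolling out the renamed pool is a plain apply; a sketch assuming the inference-extension CRDs are installed and the `default` namespace from the manifest:

```bash
kubectl apply -f config/manifests/inferencepool.yaml
# The endpoint picker should come up under its new name.
kubectl get deployment vllm-llama3-8b-instruct-epp -n default
kubectl get service vllm-llama3-8b-instruct-epp -n default
```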

config/manifests/vllm/cpu-deployment.yaml (+7 −7)

@@ -1,16 +1,16 @@
 apiVersion: apps/v1
 kind: Deployment
 metadata:
-  name: vllm-llama2-7b
+  name: vllm-llama3-8b-instruct
 spec:
   replicas: 3
   selector:
     matchLabels:
-      app: vllm-llama2-7b
+      app: vllm-llama3-8b-instruct
   template:
     metadata:
       labels:
-        app: vllm-llama2-7b
+        app: vllm-llama3-8b-instruct
     spec:
       containers:
       - name: lora
@@ -26,8 +26,8 @@ spec:
         - "--max-loras"
         - "4"
         - "--lora-modules"
-        - '{"name": "tweet-summary-0", "path": "SriSanth2345/Qwen-1.5B-Tweet-Generations", "base_model_name": "Qwen/Qwen2.5-1.5B"}'
-        - '{"name": "tweet-summary-1", "path": "SriSanth2345/Qwen-1.5B-Tweet-Generations", "base_model_name": "Qwen/Qwen2.5-1.5B"}'
+        - '{"name": "food-review-0", "path": "SriSanth2345/Qwen-1.5B-Tweet-Generations", "base_model_name": "Qwen/Qwen2.5-1.5B"}'
+        - '{"name": "food-review-1", "path": "SriSanth2345/Qwen-1.5B-Tweet-Generations", "base_model_name": "Qwen/Qwen2.5-1.5B"}'
         env:
         - name: PORT
           value: "8000"
@@ -108,10 +108,10 @@ metadata:
 data:
   configmap.yaml: |
     vLLMLoRAConfig:
-      name: vllm-llama2-7b
+      name: vllm-llama3-8b-instruct
       port: 8000
       ensureExist:
         models:
         - base-model: Qwen/Qwen2.5-1.5B
-          id: tweet-summary-1
+          id: food-review-1
           source: SriSanth2345/Qwen-1.5B-Tweet-Generations
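
Since the CPU deployment still registers its adapters statically via `--lora-modules`, the rename can be verified from vLLM's OpenAI-compatible model listing; a sketch assuming port-forward access to one backend pod:

```bash
# food-review-0 and food-review-1 should be listed alongside the base model.
kubectl port-forward deployment/vllm-llama3-8b-instruct 8000:8000 &
curl -s localhost:8000/v1/models
```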

config/manifests/vllm/gpu-deployment.yaml (+15 −16)

@@ -1,37 +1,34 @@
 apiVersion: apps/v1
 kind: Deployment
 metadata:
-  name: vllm-llama2-7b
+  name: vllm-llama3-8b-instruct
 spec:
   replicas: 3
   selector:
     matchLabels:
-      app: vllm-llama2-7b
+      app: vllm-llama3-8b-instruct
   template:
     metadata:
       labels:
-        app: vllm-llama2-7b
+        app: vllm-llama3-8b-instruct
     spec:
       containers:
-      - name: lora
+      - name: vllm
         image: "vllm/vllm-openai:latest"
         imagePullPolicy: Always
         command: ["python3", "-m", "vllm.entrypoints.openai.api_server"]
         args:
         - "--model"
-        - "meta-llama/Llama-2-7b-hf"
+        - "meta-llama/Llama-3.1-8B-Instruct"
         - "--tensor-parallel-size"
         - "1"
         - "--port"
         - "8000"
         - "--enable-lora"
         - "--max-loras"
-        - "4"
+        - "2"
         - "--max-cpu-loras"
         - "12"
-        - "--lora-modules"
-        - '{"name": "tweet-summary-0", "path": "vineetsharma/qlora-adapter-Llama-2-7b-hf-TweetSumm", "base_model_name": "llama-2"}'
-        - '{"name": "tweet-summary-1", "path": "vineetsharma/qlora-adapter-Llama-2-7b-hf-TweetSumm", "base_model_name": "llama-2"}'
         env:
         # Enabling LoRA support temporarily disables automatic v1, we want to force it on
         # until 0.8.3 vLLM is released.
@@ -238,20 +235,22 @@ spec:
         emptyDir: {}
       - name: config-volume
         configMap:
-          name: vllm-llama2-7b-adapters
+          name: vllm-llama3.1-8b-adapters
 ---
 apiVersion: v1
 kind: ConfigMap
 metadata:
-  name: vllm-llama2-7b-adapters
+  name: vllm-llama3.1-8b-adapters
 data:
   configmap.yaml: |
     vLLMLoRAConfig:
-      name: vllm-llama2-7b
+      name: vllm-llama3.1-8b-instruct
       port: 8000
       ensureExist:
         models:
-        - base-model: meta-llama/Llama-2-7b-hf
-          id: tweet-summary-1
-          source: vineetsharma/qlora-adapter-Llama-2-7b-hf-TweetSumm
-
+        - base-model: meta-llama/Llama-3.1-8B-Instruct
+          id: food-review
+          source: Kawon/llama3.1-food-finetune_v14_r8
+        - base-model: meta-llama/Llama-3.1-8B-Instruct
+          id: cad-fabricator
+          source: redcathode/fabricator
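
Note the GPU deployment no longer passes static `--lora-modules` flags; the adapters now live in the ConfigMap's `ensureExist` list, which the LoRA syncer sidecar (configured elsewhere in this manifest) reconciles against the running server. Per entry, that amounts to roughly the following call — a hand-rolled sketch, not the sidecar's actual code, and vLLM only accepts it when runtime LoRA updating is enabled (e.g. `VLLM_ALLOW_RUNTIME_LORA_UPDATING=True`):

```bash
# Dynamically load one of the new adapters into a running vLLM server.
curl -X POST localhost:8000/v1/load_lora_adapter \
  -H 'Content-Type: application/json' \
  -d '{"lora_name": "food-review", "lora_path": "Kawon/llama3.1-food-finetune_v14_r8"}'
```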

hack/test-e2e.sh (+2 −2)

@@ -124,14 +124,14 @@ if [[ "$CURL_POD" == "true" ]]; then
   while [ $SECONDS -lt $end ]; do
     kubectl exec po/curl -- curl -i "$IP:$PORT/v1/completions" \
       -H 'Content-Type: application/json' \
-      -d '{"model": "tweet-summary","prompt": "Write as if you were a critic: San Francisco","max_tokens": 100,"temperature": 0}'
+      -d '{"model": "food-review","prompt": "Write as if you were a critic: San Francisco","max_tokens": 100,"temperature": 0}'
     sleep 5
   done
 else
   while [ $SECONDS -lt $end ]; do
     curl -i "$IP:$PORT/v1/completions" \
       -H 'Content-Type: application/json' \
-      -d '{"model": "tweet-summary","prompt": "Write as if you were a critic: San Francisco","max_tokens": 100,"temperature": 0}'
+      -d '{"model": "food-review","prompt": "Write as if you were a critic: San Francisco","max_tokens": 100,"temperature": 0}'
     sleep 5
   done
 fi

pkg/epp/datastore/datastore_test.go (+4 −4)

@@ -97,7 +97,7 @@ func TestPool(t *testing.T) {
 
 func TestModel(t *testing.T) {
     chatModel := "chat"
-    tsModel := "tweet-summary"
+    tsModel := "food-review"
     model1ts := testutil.MakeInferenceModel("model1").
         CreationTimestamp(metav1.Unix(1000, 0)).
         ModelName(tsModel).ObjRef()
@@ -126,7 +126,7 @@
         wantModels []*v1alpha2.InferenceModel
     }{
         {
-            name: "Add model1 with tweet-summary as modelName",
+            name: "Add model1 with food-review as modelName",
             op: func(ds Datastore) bool {
                 return ds.ModelSetIfOlder(model1ts)
             },
@@ -161,7 +161,7 @@
             wantModels: []*v1alpha2.InferenceModel{model2ts},
         },
         {
-            name: "Set model1 with the tweet-summary modelName, both models should exist",
+            name: "Set model1 with the food-review modelName, both models should exist",
             existingModels: []*v1alpha2.InferenceModel{model2chat},
             op: func(ds Datastore) bool {
                 return ds.ModelSetIfOlder(model1ts)
@@ -170,7 +170,7 @@
             wantModels: []*v1alpha2.InferenceModel{model2chat, model1ts},
         },
         {
-            name: "Set model1 with the tweet-summary modelName, both models should exist",
+            name: "Set model1 with the food-review modelName, both models should exist",
             existingModels: []*v1alpha2.InferenceModel{model2chat, model1ts},
             op: func(ds Datastore) bool {
                 return ds.ModelSetIfOlder(model1ts)

pkg/epp/handlers/response.go (+2 −2)

@@ -127,7 +127,7 @@ func (s *Server) HandleResponseHeaders(
     "id": "cmpl-573498d260f2423f9e42817bbba3743a",
     "object": "text_completion",
     "created": 1732563765,
-    "model": "meta-llama/Llama-2-7b-hf",
+    "model": "meta-llama/Llama-3.1-8B-Instruct",
     "choices": [
     {
     "index": 0,
@@ -217,7 +217,7 @@ func (s *Server) HandleStreaming(
 }
 
 // Example message if "stream_options": {"include_usage": "true"} is included in the request:
-// data: {"id":"...","object":"text_completion","created":1739400043,"model":"tweet-summary-0","choices":[],
+// data: {"id":"...","object":"text_completion","created":1739400043,"model":"food-review-0","choices":[],
 // "usage":{"prompt_tokens":7,"total_tokens":17,"completion_tokens":10}}
 //
 // data: [DONE]
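
To observe the usage-bearing frame documented in the comment above, a request has to opt in via `stream_options`; a sketch with the same `$IP:$PORT` assumption as the e2e script:

```bash
# -N disables curl buffering so the SSE frames stream through; the final
# data frame before [DONE] carries the usage object HandleStreaming parses.
curl -N "$IP:$PORT/v1/completions" \
  -H 'Content-Type: application/json' \
  -d '{"model": "food-review", "prompt": "San Francisco is", "max_tokens": 10, "stream": true, "stream_options": {"include_usage": true}}'
```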

pkg/epp/handlers/response_test.go (+3 −3)

@@ -31,7 +31,7 @@ const (
     "id": "cmpl-573498d260f2423f9e42817bbba3743a",
     "object": "text_completion",
     "created": 1732563765,
-    "model": "meta-llama/Llama-2-7b-hf",
+    "model": "meta-llama/Llama-3.1-8B-Instruct",
     "choices": [
     {
     "index": 0,
@@ -50,10 +50,10 @@
     }
     `
 
-    streamingBodyWithoutUsage = `data: {"id":"cmpl-41764c93-f9d2-4f31-be08-3ba04fa25394","object":"text_completion","created":1740002445,"model":"tweet-summary-0","choices":[],"usage":null}
+    streamingBodyWithoutUsage = `data: {"id":"cmpl-41764c93-f9d2-4f31-be08-3ba04fa25394","object":"text_completion","created":1740002445,"model":"food-review-0","choices":[],"usage":null}
 `
 
-    streamingBodyWithUsage = `data: {"id":"cmpl-41764c93-f9d2-4f31-be08-3ba04fa25394","object":"text_completion","created":1740002445,"model":"tweet-summary-0","choices":[],"usage":{"prompt_tokens":7,"total_tokens":17,"completion_tokens":10}}
+    streamingBodyWithUsage = `data: {"id":"cmpl-41764c93-f9d2-4f31-be08-3ba04fa25394","object":"text_completion","created":1740002445,"model":"food-review-0","choices":[],"usage":{"prompt_tokens":7,"total_tokens":17,"completion_tokens":10}}
 data: [DONE]
 `
 )
