Commit 1fd8998

Default to streaming mode
1 parent 4aa1019 commit 1fd8998

File tree

4 files changed: +48 -35 lines changed


config/charts/inferencepool/README.md (+13 -3)
````diff
@@ -5,17 +5,27 @@ A chart to deploy an InferencePool and a corresponding EndpointPicker (epp) depl
 
 ## Install
 
-To install an InferencePool named `pool-1` that selects from endpoints with label `app: vllm-llama2-7b` and listening on port `8000`, you can run the following command:
+To install an InferencePool named `vllm-llama2-7b` that selects from endpoints with label `app: vllm-llama2-7b` and listening on port `8000`, you can run the following command:
 
 ```txt
-$ helm install pool-1 ./config/charts/inferencepool \
-  --set inferencePool.name=pool-1 \
+$ helm install vllm-llama2-7b ./config/charts/inferencepool \
+  --set inferencePool.name=vllm-llama2-7b \
   --set inferencePool.selector.app=vllm-llama2-7b \
   --set inferencePool.targetPortNumber=8000
 ```
 
 where `inferencePool.targetPortNumber` is the port on which the vLLM backends serve and `inferencePool.selector` is the label selector used to match the vLLM backends.
 
+To install via the latest published chart in staging (`--version v0` indicates the latest development version), you can run the following command:
+
+```txt
+$ helm install vllm-llama2-7b \
+  --set inferencePool.name=vllm-llama2-7b \
+  --set inferencePool.selector.app=vllm-llama2-7b \
+  --set inferencePool.targetPortNumber=8000 \
+  oci://us-central1-docker.pkg.dev/k8s-staging-images/gateway-api-inference-extension/charts/inferencepool --version v0
+```
+
 ## Uninstall
 
 Run the following command to uninstall the chart:
````
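
If you want to see what the staging chart will render before installing it, `helm template` accepts the same OCI reference; a minimal sketch, assuming Helm >= 3.8 (OCI registry support) and the same release values as above:

```txt
$ helm template vllm-llama2-7b \
    --set inferencePool.name=vllm-llama2-7b \
    --set inferencePool.selector.app=vllm-llama2-7b \
    --set inferencePool.targetPortNumber=8000 \
    oci://us-central1-docker.pkg.dev/k8s-staging-images/gateway-api-inference-extension/charts/inferencepool --version v0
```

The rendered output should include the EPP Deployment with `USE_STREAMING` set to `"true"`, per the template change below.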

config/charts/inferencepool/templates/inferencepool.yaml (+3)
````diff
@@ -49,6 +49,9 @@ spec:
       - "9003"
       - -metricsPort
       - "9090"
+      env:
+      - name: USE_STREAMING
+        value: "true"
       ports:
       - name: grpc
         containerPort: 9002
````
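
Note that the env var is hardcoded in the template rather than exposed as a chart value, so flipping a live deployment back to non-streaming mode means mutating the rendered Deployment directly. A sketch using `kubectl set env` (the Deployment name here is a placeholder; use whatever name the chart rendered for your release):

```txt
$ kubectl set env deployment/<epp-deployment> USE_STREAMING=false
```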

config/manifests/gateway/patch_policy.yaml (+31 -31)
````diff
@@ -54,37 +54,37 @@ spec:
         op: replace
         path: "/virtual_hosts/0/routes/0/route/cluster"
         value: original_destination_cluster
-    # Uncomment the below to enable full duplex streaming
-    # - type: "type.googleapis.com/envoy.config.listener.v3.Listener"
-    #   name: "default/inference-gateway/llm-gw"
-    #   operation:
-    #     op: add
-    #     path: "/default_filter_chain/filters/0/typed_config/http_filters/0/typed_config/processing_mode/request_body_mode"
-    #     value: FULL_DUPLEX_STREAMED
-    # - type: "type.googleapis.com/envoy.config.listener.v3.Listener"
-    #   name: "default/inference-gateway/llm-gw"
-    #   operation:
-    #     op: add
-    #     path: "/default_filter_chain/filters/0/typed_config/http_filters/0/typed_config/processing_mode/request_trailer_mode"
-    #     value: SEND
-    # - type: "type.googleapis.com/envoy.config.listener.v3.Listener"
-    #   name: "default/inference-gateway/llm-gw"
-    #   operation:
-    #     op: add
-    #     path: "/default_filter_chain/filters/0/typed_config/http_filters/0/typed_config/processing_mode/response_body_mode"
-    #     value: FULL_DUPLEX_STREAMED
-    # - type: "type.googleapis.com/envoy.config.listener.v3.Listener"
-    #   name: "default/inference-gateway/llm-gw"
-    #   operation:
-    #     op: replace
-    #     path: "/default_filter_chain/filters/0/typed_config/http_filters/0/typed_config/processing_mode/response_trailer_mode"
-    #     value: SEND
-    # - type: "type.googleapis.com/envoy.config.listener.v3.Listener"
-    #   name: "default/inference-gateway/llm-gw"
-    #   operation:
-    #     op: replace
-    #     path: "/default_filter_chain/filters/0/typed_config/http_filters/0/typed_config/processing_mode/response_header_mode"
-    #     value: SEND
+    # Comment the below to disable full duplex streaming
+    - type: "type.googleapis.com/envoy.config.listener.v3.Listener"
+      name: "default/inference-gateway/llm-gw"
+      operation:
+        op: add
+        path: "/default_filter_chain/filters/0/typed_config/http_filters/0/typed_config/processing_mode/request_body_mode"
+        value: FULL_DUPLEX_STREAMED
+    - type: "type.googleapis.com/envoy.config.listener.v3.Listener"
+      name: "default/inference-gateway/llm-gw"
+      operation:
+        op: add
+        path: "/default_filter_chain/filters/0/typed_config/http_filters/0/typed_config/processing_mode/request_trailer_mode"
+        value: SEND
+    - type: "type.googleapis.com/envoy.config.listener.v3.Listener"
+      name: "default/inference-gateway/llm-gw"
+      operation:
+        op: add
+        path: "/default_filter_chain/filters/0/typed_config/http_filters/0/typed_config/processing_mode/response_body_mode"
+        value: FULL_DUPLEX_STREAMED
+    - type: "type.googleapis.com/envoy.config.listener.v3.Listener"
+      name: "default/inference-gateway/llm-gw"
+      operation:
+        op: replace
+        path: "/default_filter_chain/filters/0/typed_config/http_filters/0/typed_config/processing_mode/response_trailer_mode"
+        value: SEND
+    - type: "type.googleapis.com/envoy.config.listener.v3.Listener"
+      name: "default/inference-gateway/llm-gw"
+      operation:
+        op: replace
+        path: "/default_filter_chain/filters/0/typed_config/http_filters/0/typed_config/processing_mode/response_header_mode"
+        value: SEND
 ---
 apiVersion: gateway.envoyproxy.io/v1alpha1
 kind: EnvoyExtensionPolicy
````
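
Taken together, these JSONPatch operations configure the ext_proc filter's `processing_mode` roughly as follows. This is a reconstruction from the patch paths and values above, not a rendered Envoy config; fields the patches do not touch keep whatever the base configuration sets:

```yaml
# Effective ext_proc processing mode after the patches apply (reconstructed)
processing_mode:
  request_body_mode: FULL_DUPLEX_STREAMED    # op: add
  request_trailer_mode: SEND                 # op: add
  response_header_mode: SEND                 # op: replace
  response_body_mode: FULL_DUPLEX_STREAMED   # op: add
  response_trailer_mode: SEND                # op: replace
```

In Envoy's ext_proc terms, `FULL_DUPLEX_STREAMED` sends body chunks to the EndpointPicker as they arrive instead of buffering them, and that mode requires the corresponding trailer mode to be `SEND`, which is why the trailer modes are switched alongside the body modes.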

config/manifests/inferencepool.yaml (+1 -1)
````diff
@@ -56,7 +56,7 @@ spec:
       - "9003"
       env:
       - name: USE_STREAMING
-        value: "false"
+        value: "true"
       ports:
      - containerPort: 9002
      - containerPort: 9003
````
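
To confirm which mode a deployed EPP actually picked up, one option is to read the variable back off the Deployment spec; a sketch, again with the Deployment name as a placeholder:

```txt
$ kubectl get deployment <epp-deployment> \
    -o jsonpath='{.spec.template.spec.containers[0].env[?(@.name=="USE_STREAMING")].value}'
```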
