diff --git a/mkdocs.yml b/mkdocs.yml
index c9bc30e0..a024c16d 100644
--- a/mkdocs.yml
+++ b/mkdocs.yml
@@ -56,6 +56,7 @@ nav:
   - Guides:
     - User Guides:
       - Getting started: guides/index.md
+      - Adapter Rollout: guides/adapter-rollout.md
       - Implementer's Guide: guides/implementers.md
   - Reference:
     - API Reference: reference/spec.md
diff --git a/pkg/manifests/inferencemodel.yaml b/pkg/manifests/inferencemodel.yaml
index 0085a89d..2a292c16 100644
--- a/pkg/manifests/inferencemodel.yaml
+++ b/pkg/manifests/inferencemodel.yaml
@@ -1,21 +1,12 @@
 apiVersion: inference.networking.x-k8s.io/v1alpha1
 kind: InferenceModel
 metadata:
-  labels:
-    app.kubernetes.io/name: api
-    app.kubernetes.io/managed-by: kustomize
   name: inferencemodel-sample
 spec:
   modelName: tweet-summary
   criticality: Critical
   poolRef:
-    # this is the default val:
-    group: inference.networking.x-k8s.io
-    # this is the default val:
-    kind: InferencePool
     name: vllm-llama2-7b-pool
   targetModels:
-  - name: tweet-summary-0
-    weight: 50
   - name: tweet-summary-1
-    weight: 50
+    weight: 100
diff --git a/pkg/manifests/vllm/deployment-with-syncer.yaml b/pkg/manifests/vllm/deployment-with-syncer.yaml
deleted file mode 100644
index d6110f4b..00000000
--- a/pkg/manifests/vllm/deployment-with-syncer.yaml
+++ /dev/null
@@ -1,145 +0,0 @@
-apiVersion: v1
-kind: Service
-metadata:
-  name: vllm-llama2-7b-pool
-spec:
-  selector:
-    app: vllm-llama2-7b-pool
-  ports:
-    - protocol: TCP
-      port: 8000
-      targetPort: 8000
-  type: ClusterIP
----
-apiVersion: apps/v1
-kind: Deployment
-metadata:
-  name: vllm-llama2-7b-pool
-spec:
-  replicas: 3
-  selector:
-    matchLabels:
-      app: vllm-llama2-7b-pool
-  template:
-    metadata:
-      labels:
-        app: vllm-llama2-7b-pool
-    spec:
-      containers:
-        - name: lora
-          image: "vllm/vllm-openai:latest"
-          imagePullPolicy: Always
-          command: ["python3", "-m", "vllm.entrypoints.openai.api_server"]
-          args:
-            - "--model"
-            - "meta-llama/Llama-2-7b-hf"
-            - "--tensor-parallel-size"
-            - "1"
-            - "--port"
-            - "8000"
-            - "--enable-lora"
-            - "--max-loras"
-            - "4"
-            - "--max-cpu-loras"
-            - "12"
-            - "--lora-modules"
-            - '{"name": "tweet-summary-0", "path": "vineetsharma/qlora-adapter-Llama-2-7b-hf-TweetSumm", "base_model_name": "llama-2"}'
-            - '{"name": "tweet-summary-1", "path": "vineetsharma/qlora-adapter-Llama-2-7b-hf-TweetSumm", "base_model_name": "llama-2"}'
-          env:
-            - name: PORT
-              value: "8000"
-            - name: HUGGING_FACE_HUB_TOKEN
-              valueFrom:
-                secretKeyRef:
-                  name: hf-token
-                  key: token
-            - name: VLLM_ALLOW_RUNTIME_LORA_UPDATING
-              value: "true"
-          ports:
-            - containerPort: 8000
-              name: http
-              protocol: TCP
-          livenessProbe:
-            failureThreshold: 240
-            httpGet:
-              path: /health
-              port: http
-              scheme: HTTP
-            initialDelaySeconds: 5
-            periodSeconds: 5
-            successThreshold: 1
-            timeoutSeconds: 1
-          readinessProbe:
-            failureThreshold: 600
-            httpGet:
-              path: /health
-              port: http
-              scheme: HTTP
-            initialDelaySeconds: 5
-            periodSeconds: 5
-            successThreshold: 1
-            timeoutSeconds: 1
-          resources:
-            limits:
-              nvidia.com/gpu: 1
-            requests:
-              nvidia.com/gpu: 1
-          volumeMounts:
-            - mountPath: /data
-              name: data
-            - mountPath: /dev/shm
-              name: shm
-            - name: adapters
-              mountPath: "/adapters"
-      initContainers:
-        - name: lora-adapter-syncer
-          tty: true
-          stdin: true
-          image: us-central1-docker.pkg.dev/ahg-gke-dev/jobset2/lora-syncer:6dc97be
-          restartPolicy: Always
-          imagePullPolicy: Always
-          env:
-            - name: DYNAMIC_LORA_ROLLOUT_CONFIG
-              value: "/config/configmap.yaml"
-          volumeMounts: # DO NOT USE subPath
-            - name: config-volume
-              mountPath: /config
-      restartPolicy: Always
-      schedulerName: default-scheduler
-      terminationGracePeriodSeconds: 30
-      volumes:
-        - name: data
-          emptyDir: {}
-        - name: shm
-          emptyDir:
-            medium: Memory
-        - name: adapters
-          emptyDir: {}
-        - name: config-volume
-          configMap:
-            name: dynamic-lora-config
-
----
-
-apiVersion: v1
-kind: ConfigMap
-metadata:
-  name: dynamic-lora-config
-data:
-  configmap.yaml: |
-    vLLMLoRAConfig:
-      name: sql-loras-llama
-      port: 8000
-      ensureExist:
-        models:
-        - base-model: meta-llama/Llama-2-7b-hf
-          id: tweet-summary-0
-          source: vineetsharma/qlora-adapter-Llama-2-7b-hf-TweetSumm
-        - base-model: meta-llama/Llama-2-7b-hf
-          id: tweet-summary-1
-          source: vineetsharma/qlora-adapter-Llama-2-7b-hf-TweetSumm
-      ensureNotExist:
-        models:
-        - base-model: meta-llama/Llama-2-7b-hf
-          id: tweet-summary-2
-          source: vineetsharma/qlora-adapter-Llama-2-7b-hf-TweetSumm
\ No newline at end of file
diff --git a/pkg/manifests/vllm/deployment.yaml b/pkg/manifests/vllm/deployment.yaml
index 1d115f4d..a54d99b3 100644
--- a/pkg/manifests/vllm/deployment.yaml
+++ b/pkg/manifests/vllm/deployment.yaml
@@ -1,16 +1,3 @@
-apiVersion: v1
-kind: Service
-metadata:
-  name: vllm-llama2-7b-pool
-spec:
-  selector:
-    app: vllm-llama2-7b-pool
-  ports:
-    - protocol: TCP
-      port: 8000
-      targetPort: 8000
-  type: ClusterIP
----
 apiVersion: apps/v1
 kind: Deployment
 metadata:
@@ -39,7 +26,7 @@ spec:
             - "8000"
             - "--enable-lora"
             - "--max-loras"
-            - "4"
+            - "2"
             - "--max-cpu-loras"
             - "12"
             - "--lora-modules"
@@ -53,6 +40,8 @@ spec:
                 secretKeyRef:
                   name: hf-token
                   key: token
+            - name: VLLM_ALLOW_RUNTIME_LORA_UPDATING
+              value: "true"
           ports:
             - containerPort: 8000
               name: http
@@ -89,6 +78,19 @@ spec:
               name: shm
             - name: adapters
              mountPath: "/adapters"
+      initContainers:
+        - name: lora-adapter-syncer
+          tty: true
+          stdin: true
+          image: us-central1-docker.pkg.dev/k8s-staging-images/gateway-api-inference-extension/lora-syncer:main
+          restartPolicy: Always
+          imagePullPolicy: Always
+          env:
+            - name: DYNAMIC_LORA_ROLLOUT_CONFIG
+              value: "/config/configmap.yaml"
+          volumeMounts: # DO NOT USE subPath, dynamic configmap updates don't work on subPaths
+            - name: config-volume
+              mountPath: /config
       restartPolicy: Always
       schedulerName: default-scheduler
       terminationGracePeriodSeconds: 30
@@ -100,3 +102,22 @@ spec:
             medium: Memory
         - name: adapters
           emptyDir: {}
+        - name: config-volume
+          configMap:
+            name: vllm-llama2-7b-adapters
+---
+apiVersion: v1
+kind: ConfigMap
+metadata:
+  name: vllm-llama2-7b-adapters
+data:
+  configmap.yaml: |
+    vLLMLoRAConfig:
+      name: vllm-llama2-7b
+      port: 8000
+      ensureExist:
+        models:
+        - base-model: meta-llama/Llama-2-7b-hf
+          id: tweet-summary-1
+          source: vineetsharma/qlora-adapter-Llama-2-7b-hf-TweetSumm
+
diff --git a/site-src/guides/adapter-rollout.md b/site-src/guides/adapter-rollout.md
new file mode 100644
index 00000000..9ce8c3a4
--- /dev/null
+++ b/site-src/guides/adapter-rollout.md
@@ -0,0 +1,133 @@
+# Adapter Rollout
+
+The goal of this guide is to demonstrate how to roll out a new adapter version.
+
+## **Prerequisites**
+
+Follow the steps in the [main guide](index.md).
+
+
+## **Safely roll out the v2 adapter**
+
+### Load the new adapter version onto the model servers
+
+This guide leverages the LoRA syncer sidecar to dynamically manage adapters within a vLLM deployment, enabling users to add or remove them through a shared ConfigMap.
+
+
+Modify the LoRA syncer ConfigMap to initiate loading of the new adapter version.
+
+
+```bash
+ kubectl edit configmap vllm-llama2-7b-adapters
+```
+
+Change the ConfigMap to match the following (note the new entry under models):
+
+```yaml
+apiVersion: v1
+kind: ConfigMap
+metadata:
+  name: vllm-llama2-7b-adapters
+data:
+  configmap.yaml: |
+    vLLMLoRAConfig:
+      name: vllm-llama2-7b-adapters
+      port: 8000
+      ensureExist:
+        models:
+        - base-model: meta-llama/Llama-2-7b-hf
+          id: tweet-summary-1
+          source: vineetsharma/qlora-adapter-Llama-2-7b-hf-TweetSumm
+        - base-model: meta-llama/Llama-2-7b-hf
+          id: tweet-summary-2
+          source: mahimairaja/tweet-summarization-llama-2-finetuned
+```
+
+The new adapter version is applied to the model servers live, without requiring a restart.
+
+
+### Direct traffic to the new adapter version
+
+Modify the InferenceModel to configure a canary rollout with traffic splitting. In this example, 10% of traffic for the `tweet-summary` model will be sent to the new ***tweet-summary-2*** adapter.
+
+
+```bash
+ kubectl edit inferencemodel inferencemodel-sample
+```
+
+Change the targetModels list in InferenceModel to match the following:
+
+
+```yaml
+apiVersion: inference.networking.x-k8s.io/v1alpha1
+kind: InferenceModel
+metadata:
+  name: inferencemodel-sample
+spec:
+  modelName: tweet-summary
+  criticality: Critical
+  poolRef:
+    name: vllm-llama2-7b-pool
+  targetModels:
+  - name: tweet-summary-1
+    weight: 90
+  - name: tweet-summary-2
+    weight: 10
+
+```
+
+The above configuration means that roughly one in every ten requests will be sent to the new adapter version. Try it out:
+
+1. Get the gateway IP:
+```bash
+IP=$(kubectl get gateway/inference-gateway -o jsonpath='{.status.addresses[0].value}'); PORT=8081
+```
+
+2. Send a few requests as follows:
+```bash
+curl -i ${IP}:${PORT}/v1/completions -H 'Content-Type: application/json' -d '{
+"model": "tweet-summary",
+"prompt": "Write as if you were a critic: San Francisco",
+"max_tokens": 100,
+"temperature": 0
+}'
+```
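+
+3. Optionally, check how requests are being split between the two adapter versions. The sketch below sends 20 requests and counts which adapter each response reports; it assumes `jq` is installed and that the `model` field in the response reflects the adapter the gateway selected:
+```bash
+for i in $(seq 1 20); do
+  # Each response's "model" field should name the adapter that served it
+  curl -s ${IP}:${PORT}/v1/completions -H 'Content-Type: application/json' -d '{
+  "model": "tweet-summary",
+  "prompt": "Write as if you were a critic: San Francisco",
+  "max_tokens": 10,
+  "temperature": 0
+  }' | jq -r '.model'
+done | sort | uniq -c
+```
+
+With a 90/10 split, most responses should report ***tweet-summary-1*** and only a few ***tweet-summary-2***.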
+
+### Finish the rollout
+
+
+Modify the InferenceModel to direct 100% of the traffic to the latest version of the adapter:
+
+```yaml
+apiVersion: inference.networking.x-k8s.io/v1alpha1
+kind: InferenceModel
+metadata:
+  name: inferencemodel-sample
+spec:
+  modelName: tweet-summary
+  criticality: Critical
+  poolRef:
+    name: vllm-llama2-7b-pool
+  targetModels:
+  - name: tweet-summary-2
+    weight: 100
+```
+
+Unload the older version from the servers by updating the LoRA syncer ConfigMap to list it under the `ensureNotExist` list:
+
+```yaml
+apiVersion: v1
+kind: ConfigMap
+metadata:
+  name: vllm-llama2-7b-adapters
+data:
+  configmap.yaml: |
+    vLLMLoRAConfig:
+      name: vllm-llama2-7b-adapters
+      port: 8000
+      ensureExist:
+        models:
+        - base-model: meta-llama/Llama-2-7b-hf
+          id: tweet-summary-2
+          source: mahimairaja/tweet-summarization-llama-2-finetuned
+      ensureNotExist:
+        models:
+        - base-model: meta-llama/Llama-2-7b-hf
+          id: tweet-summary-1
+          source: vineetsharma/qlora-adapter-Llama-2-7b-hf-TweetSumm
+```
+
+With this, all requests should be served by the new adapter version.
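+
+Optionally, you can confirm that the old adapter was unloaded by listing the models served by one of the pods. This is a sketch; it assumes the `vllm-llama2-7b-pool` deployment from the main guide and that `jq` is installed:
+
+```bash
+# Forward a local port to one of the model server pods
+kubectl port-forward deployment/vllm-llama2-7b-pool 8000:8000 &
+# List the served models (base model plus currently registered LoRA adapters)
+curl -s localhost:8000/v1/models | jq -r '.data[].id'
+# Stop the port-forward
+kill %1
+```
+
+The list should now include `tweet-summary-2` and no longer contain `tweet-summary-1`.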
diff --git a/site-src/guides/dynamic-lora.md b/site-src/guides/dynamic-lora.md
deleted file mode 100644
index ef3c2b0f..00000000
--- a/site-src/guides/dynamic-lora.md
+++ /dev/null
@@ -1,93 +0,0 @@
-# Getting started with Gateway API Inference Extension with Dynamic lora updates on vllm
-
-The goal of this guide is to get a single InferencePool running with vLLM and demonstrate use of dynamic lora updating!
-
-### Requirements
- - Envoy Gateway [v1.2.1](https://gateway.envoyproxy.io/docs/install/install-yaml/#install-with-yaml) or higher
- - A cluster with:
-   - Support for Services of type `LoadBalancer`. (This can be validated by ensuring your Envoy Gateway is up and running). For example, with Kind,
-     you can follow [these steps](https://kind.sigs.k8s.io/docs/user/loadbalancer).
-   - 3 GPUs to run the sample model server. Adjust the number of replicas in `./manifests/vllm/deployment.yaml` as needed.
-
-### Steps
-
-1. **Deploy Sample VLLM Model Server with dynamic lora update enabled and dynamic lora syncer sidecar **
-   [Redeploy the vLLM deployment with Dynamic lora adapter enabled and Lora syncer sidecar and configmap](https://github.com/kubernetes-sigs/gateway-api-inference-extension/blob/main/pkg/manifests/vllm/dynamic-lora-sidecar/deployment.yaml)
-
-Rest of the steps are same as [general setup](https://github.com/kubernetes-sigs/gateway-api-inference-extension/blob/main/site-src/guides/index.md)
-
-
-### Safely rollout v2 adapter
-
-1. Update the LoRA syncer ConfigMap to make the new adapter version available on the model servers.
-
-```yaml
-  apiVersion: v1
-  kind: ConfigMap
-  metadata:
-    name: dynamic-lora-config
-  data:
-    configmap.yaml: |
-      vLLMLoRAConfig:
-        name: sql-loras-llama
-        port: 8000
-        ensureExist:
-          models:
-          - base-model: meta-llama/Llama-2-7b-hf
-            id: tweet-summary-0
-            source: vineetsharma/qlora-adapter-Llama-2-7b-hf-TweetSumm
-          - base-model: meta-llama/Llama-2-7b-hf
-            id: tweet-summary-1
-            source: vineetsharma/qlora-adapter-Llama-2-7b-hf-TweetSumm
-          - base-model: meta-llama/Llama-2-7b-hf
-            id: tweet-summary-2
-            source: vineetsharma/qlora-adapter-Llama-2-7b-hf-TweetSumm
-2. Configure a canary rollout with traffic split using LLMService. In this example, 40% of traffic for tweet-summary model will be sent to the ***tweet-summary-2*** adapter .
-
-```yaml
-model:
-  name: tweet-summary
-  targetModels:
-  targetModelName: tweet-summary-0
-  weight: 20
-  targetModelName: tweet-summary-1
-  weight: 40
-  targetModelName: tweet-summary-2
-  weight: 40
-
-```
-
-3. Finish rollout by setting the traffic to the new version 100%.
-```yaml
-model:
-  name: tweet-summary
-  targetModels:
-  targetModelName: tweet-summary-2
-  weight: 100
-```
-
-4. Remove v1 from dynamic lora configmap.
-```yaml
-  apiVersion: v1
-  kind: ConfigMap
-  metadata:
-    name: dynamic-lora-config
-  data:
-    configmap.yaml: |
-      vLLMLoRAConfig:
-        name: sql-loras-llama
-        port: 8000
-        ensureExist:
-          models:
-          - base-model: meta-llama/Llama-2-7b-hf
-            id: tweet-summary-2
-            source: vineetsharma/qlora-adapter-Llama-2-7b-hf-TweetSumm
-        ensureNotExist:
-          models:
-          - base-model: meta-llama/Llama-2-7b-hf
-            id: tweet-summary-1
-            source: gs://[HUGGING FACE PATH]
-          - base-model: meta-llama/Llama-2-7b-hf
-            id: tweet-summary-0
-            source: gs://[HUGGING FACE PATH]
-```
diff --git a/site-src/guides/index.md b/site-src/guides/index.md
index 2cc971c6..b9c38d87 100644
--- a/site-src/guides/index.md
+++ b/site-src/guides/index.md
@@ -2,16 +2,16 @@
 This quickstart guide is intended for engineers familiar with k8s and model servers (vLLM in this instance). The goal of this guide is to get a first, single InferencePool up and running!
 
-### Requirements
+## **Prerequisites**
  - Envoy Gateway [v1.2.1](https://gateway.envoyproxy.io/docs/install/install-yaml/#install-with-yaml) or higher
  - A cluster with:
    - Support for Services of type `LoadBalancer`. (This can be validated by ensuring your Envoy Gateway is up and running). For example, with Kind,
      you can follow [these steps](https://kind.sigs.k8s.io/docs/user/loadbalancer).
    - 3 GPUs to run the sample model server. Adjust the number of replicas in `./manifests/vllm/deployment.yaml` as needed.
 
-### Steps
+## **Steps**
 
-1. **Deploy Sample Model Server**
+### Deploy Sample Model Server
 
    Create a Hugging Face secret to download the model [meta-llama/Llama-2-7b-hf](https://huggingface.co/meta-llama/Llama-2-7b-hf). Ensure that the token grants access to this model.
    Deploy a sample vLLM deployment with the proper protocol to work with the LLM Instance Gateway.
@@ -20,22 +20,20 @@ This quickstart guide is intended for engineers familiar with k8s and model serv
    kubectl apply -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/pkg/manifests/vllm/deployment.yaml
   ```
 
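+   Before moving on, you can wait for the model server pods to become ready; pulling the image and downloading the base model can take a while. This is a sketch that assumes the `app=vllm-llama2-7b-pool` pod label from the sample deployment:
+
+   ```bash
+   kubectl wait --for=condition=Ready pod -l app=vllm-llama2-7b-pool --timeout=30m
+   ```
+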
+### Install the Inference Extension CRDs
-
-
-1. **Install the Inference Extension CRDs:**
-
-   ```sh
+   ```bash
    kubectl apply -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/releases/download/v0.1.0/manifests.yaml
+   ```
 
-1. **Deploy InferenceModel**
+### Deploy InferenceModel
 
-   Deploy the sample InferenceModel which is configured to load balance traffic between the `tweet-summary-0` and `tweet-summary-1` [LoRA adapters](https://docs.vllm.ai/en/latest/features/lora.html) of the sample model server.
+   Deploy the sample InferenceModel which is configured to direct traffic to the `tweet-summary-1` [LoRA adapter](https://docs.vllm.ai/en/latest/features/lora.html) of the sample model server.
 
    ```bash
    kubectl apply -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/pkg/manifests/inferencemodel.yaml
    ```
-1. **Update Envoy Gateway Config to enable Patch Policy**
+
+### Update Envoy Gateway Config to enable Patch Policy
 
    Our custom LLM Gateway ext-proc is patched into the existing envoy gateway via `EnvoyPatchPolicy`. To enable this feature, we must extend the Envoy Gateway config map. To do this, simply run:
    ```bash
@@ -43,7 +41,8 @@ This quickstart guide is intended for engineers familiar with k8s and model serv
    kubectl rollout restart deployment envoy-gateway -n envoy-gateway-system
    ```
    Additionally, if you would like to enable the admin interface, you can uncomment the admin lines and run this again.
-1. **Deploy Gateway**
+
+### Deploy Gateway
 
    ```bash
    kubectl apply -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/pkg/manifests/gateway/gateway.yaml
    ```
    Confirm that the Gateway was assigned an IP address and reports a Programmed=True status:
    ```bash
    $ kubectl get gateway inference-gateway
@@ -56,26 +55,28 @@ This quickstart guide is intended for engineers familiar with k8s and model serv
    NAME                CLASS               ADDRESS   PROGRAMMED   AGE
    inference-gateway   inference-gateway             True         22s
    ```
-1. **Deploy the Inference Extension and InferencePool**
+### Deploy the Inference Extension and InferencePool
 
    ```bash
    kubectl apply -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/pkg/manifests/ext_proc.yaml
    ```
-1. **Deploy Envoy Gateway Custom Policies**
+### Deploy Envoy Gateway Custom Policies
 
    ```bash
    kubectl apply -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/pkg/manifests/gateway/extension_policy.yaml
    kubectl apply -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/pkg/manifests/gateway/patch_policy.yaml
    ```
    > **_NOTE:_** This is also per InferencePool, and will need to be configured to support the new pool should you wish to experiment further.
-1. **OPTIONALLY**: Apply Traffic Policy
+
+### **OPTIONALLY**: Apply Traffic Policy
 
    For high-traffic benchmarking you can apply this manifest to avoid any defaults that can cause timeouts/errors.
 
    ```bash
    kubectl apply -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/pkg/manifests/gateway/traffic_policy.yaml
    ```
-1. **Try it out**
+
+### Try it out
 
    Wait until the gateway is ready.
 
    ```bash
    IP=$(kubectl get gateway/inference-gateway -o jsonpath='{.status.addresses[0].value}')
    PORT=8081
 
    curl -i ${IP}:${PORT}/v1/completions -H 'Content-Type: application/json' -d '{
    "model": "tweet-summary",
@@ -89,4 +90,4 @@ This quickstart guide is intended for engineers familiar with k8s and model serv
    "max_tokens": 100,
    "temperature": 0
    }'
-   ```
\ No newline at end of file
+   ```
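+
+   The gateway should return an OpenAI-style completions response. To pull out just the generated text, you can pipe the same request through `jq` (a sketch; it assumes `jq` is installed):
+
+   ```bash
+   curl -s ${IP}:${PORT}/v1/completions -H 'Content-Type: application/json' -d '{
+   "model": "tweet-summary",
+   "prompt": "Write as if you were a critic: San Francisco",
+   "max_tokens": 100,
+   "temperature": 0
+   }' | jq -r '.choices[0].text'
+   ```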