
Commit d1669b8

Configure gpu-deployment.yaml to force vLLM v1 with LoRA
Until vLLM 0.8.3 is released, using the LoRA flag disables the automatic v1 opt-in, so we set VLLM_USE_V1=1 explicitly to force it back on.
1 parent: 731f244

File tree: 1 file changed (+4, -0)


config/manifests/vllm/gpu-deployment.yaml

@@ -33,6 +33,10 @@ spec:
         - '{"name": "tweet-summary-0", "path": "vineetsharma/qlora-adapter-Llama-2-7b-hf-TweetSumm", "base_model_name": "llama-2"}'
         - '{"name": "tweet-summary-1", "path": "vineetsharma/qlora-adapter-Llama-2-7b-hf-TweetSumm", "base_model_name": "llama-2"}'
         env:
+        # Enabling LoRA support temporarily disables automatic v1; we want to force it on
+        # until vLLM 0.8.3 is released.
+        - name: VLLM_USE_V1
+          value: "1"
         - name: PORT
           value: "8000"
         - name: HUGGING_FACE_HUB_TOKEN
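
For context, the container's env section after this change reads roughly as below. This is a sketch: the indentation and the surrounding Deployment fields are assumed from a typical vLLM manifest, only the lines shown in the diff are confirmed, and the HUGGING_FACE_HUB_TOKEN value is elided here just as it is in the diff.

    env:
      # Enabling LoRA support temporarily disables automatic v1; we want to force it on
      # until vLLM 0.8.3 is released.
      - name: VLLM_USE_V1
        value: "1"
      - name: PORT
        value: "8000"
      - name: HUGGING_FACE_HUB_TOKEN
        # (value supplied elsewhere in the manifest; omitted in this diff)

Once the Deployment rolls out, the setting can be confirmed with kubectl describe pod <pod-name>, which lists the container's environment variables.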
