
Commit d7fed6c

return error for cache_salt with V0 engine

Signed-off-by: Marko Rosenmueller <[email protected]>

1 parent c18460b · commit d7fed6c

3 files changed: +42 −5 lines changed

docs/source/design/v1/prefix_caching.md

Lines changed: 2 additions & 0 deletions

@@ -92,6 +92,8 @@ To improve privacy in shared environments, vLLM supports isolating prefix cache
 
 With this setup, cache sharing is limited to users or requests that explicitly agree on a common salt, enabling cache reuse within a trust group while isolating others.
 
+> **Note:** Cache isolation is not supported in engine V0.
+
 ## Data Structure
 
 The prefix caching in vLLM v1 is implemented in the KV cache manager. The basic building block is the “Block” data class (simplified):
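
For context on the field this note concerns: cache_salt is passed as a vLLM-specific extra parameter on the chat completions endpoint. The sketch below is illustrative only; it assumes a running V1 server at http://localhost:8000 and uses placeholder model and salt values that are not taken from this commit.

# Illustrative sketch: salting the prefix cache for a chat completion.
# Assumes a vLLM V1 server at localhost:8000; the model name and salt
# below are placeholders.
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")

response = client.chat.completions.create(
    model="meta-llama/Llama-3.1-8B-Instruct",
    messages=[{"role": "user", "content": "Hello!"}],
    # Requests that share the same salt can reuse each other's cached
    # prefixes; requests with different salts are isolated from each other.
    extra_body={"cache_salt": "your-random-256-bit-base64-salt"},
)
print(response.choices[0].message.content)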

tests/entrypoints/openai/test_serving_chat.py

Lines changed: 20 additions & 0 deletions

@@ -273,6 +273,26 @@ def test_serving_chat_could_load_correct_generation_config():
     assert mock_engine.generate.call_args.args[1].temperature == 0.0
     assert mock_engine.generate.call_args.args[1].repetition_penalty == 1.05
 
+
+def test_serving_chat_did_set_correct_cache_salt():
+    mock_model_config = MockModelConfig()
+
+    mock_engine = MagicMock(spec=MQLLMEngineClient)
+    mock_engine.get_tokenizer.return_value = get_tokenizer(MODEL_NAME)
+    mock_engine.errored = False
+
+    # Initialize the serving chat
+    models = OpenAIServingModels(engine_client=mock_engine,
+                                 base_model_paths=BASE_MODEL_PATHS,
+                                 model_config=mock_model_config)
+    serving_chat = OpenAIServingChat(mock_engine,
+                                     mock_model_config,
+                                     models,
+                                     response_role="assistant",
+                                     chat_template=CHAT_TEMPLATE,
+                                     chat_template_content_format="auto",
+                                     request_logger=None)
+
     # Test cache_salt
     req = ChatCompletionRequest(
         model=MODEL_NAME,
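
The added test exercises the V1 path. A complementary check for the V0 rejection is not part of this commit, but could look roughly like the sketch below, assuming pytest, that envs.VLLM_USE_V1 can be monkeypatched for the duration of the test, and that pydantic surfaces the validator's ValueError as a ValidationError:

# Sketch only, not part of this commit: exercising the V0 rejection path.
import pytest
from pydantic import ValidationError

from vllm import envs
from vllm.entrypoints.openai.protocol import ChatCompletionRequest


def test_cache_salt_rejected_with_v0_engine(monkeypatch):
    # Force the V0 code path for this test only (assumption: envs is patchable).
    monkeypatch.setattr(envs, "VLLM_USE_V1", False)
    with pytest.raises(ValidationError, match="cache_salt"):
        ChatCompletionRequest(
            model="placeholder-model",  # placeholder, not from the commit
            messages=[{"role": "user", "content": "hi"}],
            cache_salt="test_salt",
        )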

vllm/entrypoints/openai/protocol.py

Lines changed: 20 additions & 5 deletions

@@ -13,6 +13,7 @@
                       ValidationInfo, field_validator, model_validator)
 from typing_extensions import TypeAlias
 
+from vllm import envs
 from vllm.entrypoints.chat_utils import ChatCompletionMessageParam
 from vllm.logger import init_logger
 from vllm.pooling_params import PoolingParams

@@ -392,7 +393,7 @@ class ChatCompletionRequest(OpenAIBaseModel):
             "environments. The salt should be random, protected from "
             "access by 3rd parties, and long enough to be "
             "unpredictable (e.g., 43 characters base64-encoded, corresponding "
-            "to 256 bit)."))
+            "to 256 bit). Not supported by vLLM engine V0."))
 
     # doc: end-chat-completion-extra-params
 

@@ -704,6 +705,20 @@ def check_generation_prompt(cls, data):
                              "`add_generation_prompt` to True.")
         return data
 
+    @model_validator(mode="before")
+    @classmethod
+    def check_cache_salt_support(cls, data):
+        if data.get("cache_salt") is not None:
+            if not envs.VLLM_USE_V1:
+                raise ValueError(
+                    "Parameter 'cache_salt' is not supported with "
+                    "this instance of vLLM, which uses engine V0.")
+            if not isinstance(data["cache_salt"], str) or len(
+                    data["cache_salt"]) == 0:
+                raise ValueError("Parameter 'cache_salt' must be a "
+                                 "non-empty string if provided.")
+        return data
+
 
 class CompletionRequest(OpenAIBaseModel):
     # Ordered by official OpenAI API documentation

@@ -1599,9 +1614,9 @@ class TranscriptionRequest(OpenAIBaseModel):
 
     # doc: begin-transcription-extra-params
     stream: Optional[bool] = False
-    """Custom field not present in the original OpenAI definition. When set,
+    """Custom field not present in the original OpenAI definition. When set,
     it will enable output to be streamed in a similar fashion as the Chat
-    Completion endpoint.
+    Completion endpoint.
     """
     # Flattened stream option to simplify form data.
     stream_include_usage: Optional[bool] = False

@@ -1619,15 +1634,15 @@ class TranscriptionRequest(OpenAIBaseModel):
     """
 
     top_p: Optional[float] = None
-    """Enables nucleus (top-p) sampling, where tokens are selected from the
+    """Enables nucleus (top-p) sampling, where tokens are selected from the
     smallest possible set whose cumulative probability exceeds `p`.
     """
 
     top_k: Optional[int] = None
     """Limits sampling to the `k` most probable tokens at each step."""
 
     min_p: Optional[float] = None
-    """Filters out tokens with a probability lower than `min_p`, ensuring a
+    """Filters out tokens with a probability lower than `min_p`, ensuring a
     minimum likelihood threshold during sampling.
     """
 