
Default to generation_config from model #12622


Merged · 15 commits · Mar 8, 2025 · Changes shown from 7 commits
vllm/config.py (15 changes: 6 additions & 9 deletions)
```diff
@@ -254,7 +254,7 @@
         override_neuron_config: Optional[Dict[str, Any]] = None,
         override_pooler_config: Optional["PoolerConfig"] = None,
         logits_processor_pattern: Optional[str] = None,
-        generation_config: Optional[str] = None,
+        generation_config: Optional[str] = "auto",
         enable_sleep_mode: bool = False,
         override_generation_config: Optional[Dict[str, Any]] = None,
         model_impl: Union[str, ModelImpl] = ModelImpl.AUTO,
```
```diff
@@ -950,7 +950,7 @@
         return self.multimodal_config

     def try_get_generation_config(self) -> Dict[str, Any]:
-        if self.generation_config is None or self.generation_config == "auto":
+        if self.generation_config in ("auto", "vllm"):
             config = try_get_generation_config(
                 self.hf_config_path or self.model,
                 trust_remote_code=self.trust_remote_code,
@@ -958,7 +958,7 @@
             )
         else:
             config = try_get_generation_config(
                 self.generation_config,
                 trust_remote_code=self.trust_remote_code,
             )
```

Check failure on line 961 (`self.generation_config,`) in vllm/config.py — GitHub Actions / pre-commit: Argument 1 to "try_get_generation_config" has incompatible type "Optional[str]"; expected "str" [arg-type]

```diff
@@ -970,17 +970,14 @@
     def get_diff_sampling_param(self) -> Dict[str, Any]:
         """
         This method returns a dictionary containing the parameters
-        that differ from the default sampling parameters, but only
-        if `generation_config` is set. If `generation_config` is not
-        set, an empty dictionary is returned.
+        that differ from the default sampling parameters. If
+        `generation_config` is `"vllm"`, an empty dictionary is returned.

         Returns:
             Dict[str, Any]: A dictionary with the differing sampling
-                parameters if `generation_config` is set, otherwise an
-                empty dictionary.
+                parameters, if `generation_config` is `"vllm"` an empty dictionary.
         """
-        if self.generation_config is None:
-            # When generation_config is not set
+        if self.generation_config == "vllm":
             config = {}
         else:
             config = self.try_get_generation_config()
```
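The net effect of the `get_diff_sampling_param` change can be shown with a minimal, self-contained sketch. The dict below is a stand-in for a model's `generation_config.json` (the real code loads it via `try_get_generation_config`), and the values are illustrative:

```python
from typing import Any, Dict, Optional

# Stand-in for a model's generation_config.json; values are illustrative.
MODEL_GENERATION_CONFIG = {"temperature": 0.6, "top_p": 0.9}

def get_diff_sampling_param(generation_config: Optional[str]) -> Dict[str, Any]:
    # "vllm" opts out of model-supplied defaults entirely.
    if generation_config == "vllm":
        return {}
    # "auto" (the new default) or a folder path: return the model-supplied
    # parameters that differ from vLLM's built-in sampling defaults.
    return dict(MODEL_GENERATION_CONFIG)

print(get_diff_sampling_param("vllm"))  # {}
print(get_diff_sampling_param("auto"))  # {'temperature': 0.6, 'top_p': 0.9}
```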
vllm/engine/arg_utils.py (12 changes: 6 additions & 6 deletions)
```diff
@@ -1016,13 +1016,13 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser:
         parser.add_argument(
             "--generation-config",
             type=nullable_str,
-            default=None,
+            default="auto",
             help="The folder path to the generation config. "
-            "Defaults to None, no generation config is loaded, vLLM defaults "
-            "will be used. If set to 'auto', the generation config will be "
-            "loaded from model path. If set to a folder path, the generation "
-            "config will be loaded from the specified folder path. If "
-            "`max_new_tokens` is specified in generation config, then "
+            "Defaults to 'auto', the generation config will be loaded from "
+            "model path. If set to 'vllm', no generation config is loaded, "
+            "vLLM defaults will be used. If set to a folder path, the "
+            "generation config will be loaded from the specified folder path. "
+            "If `max_new_tokens` is specified in generation config, then "
             "it sets a server-wide limit on the number of output tokens "
             "for all requests.")
```
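The same three values apply to the offline entrypoint, assuming `LLM(...)` forwards the keyword on to `EngineArgs` as the CLI does; the model name and folder path below are placeholders:

```python
from vllm import LLM

MODEL = "Qwen/Qwen2.5-1.5B-Instruct"  # placeholder model name

# New default "auto": sampling defaults come from the model's generation config.
llm = LLM(model=MODEL)

# Opt out and keep vLLM's built-in defaults:
# llm = LLM(model=MODEL, generation_config="vllm")

# Load a generation config from a custom folder:
# llm = LLM(model=MODEL, generation_config="/path/to/generation/config")
```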
vllm/entrypoints/llm.py (4 changes: 4 additions & 0 deletions)
```diff
@@ -271,6 +271,10 @@ def get_default_sampling_params(self) -> SamplingParams:
         diff_sampling_param = (
             self.llm_engine.model_config.get_diff_sampling_param())
         if diff_sampling_param:
+            source = self.llm_engine.model_config.generation_config
+            source = "model" if source == "auto" else source
+            logger.info("Using default sampling params from %s: %s", source,
+                        diff_sampling_param)
             return SamplingParams.from_optional(**diff_sampling_param)
         return SamplingParams()
```
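With the new default in place, `get_default_sampling_params` now reflects the model's generation config, and the added log line reports where the defaults came from. A short usage sketch (placeholder model name; exact values depend on the model):

```python
from vllm import LLM

llm = LLM(model="Qwen/Qwen2.5-1.5B-Instruct")  # placeholder model name
params = llm.get_default_sampling_params()
# Per the diff above, the engine logs, e.g.:
#   Using default sampling params from model: {'temperature': 0.6, ...}
# and `params` carries the model's defaults rather than vLLM's built-ins.
```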
vllm/entrypoints/openai/serving_chat.py (6 changes: 4 additions & 2 deletions)
```diff
@@ -108,8 +108,10 @@ def __init__(
         self.enable_prompt_tokens_details = enable_prompt_tokens_details
         diff_sampling_param = self.model_config.get_diff_sampling_param()
         if diff_sampling_param:
-            logger.info("Overwriting default chat sampling param with: %s",
-                        diff_sampling_param)
+            source = self.model_config.generation_config
+            source = "model" if source == "auto" else source
+            logger.info("Using default chat sampling params from %s: %s",
+                        source, diff_sampling_param)

     async def create_chat_completion(
         self,
```
vllm/entrypoints/openai/serving_completion.py (7 changes: 4 additions & 3 deletions)
```diff
@@ -53,9 +53,10 @@ def __init__(
             return_tokens_as_token_ids=return_tokens_as_token_ids)
         diff_sampling_param = self.model_config.get_diff_sampling_param()
         if diff_sampling_param:
-            logger.info(
-                "Overwriting default completion sampling param with: %s",
-                diff_sampling_param)
+            source = self.model_config.generation_config
+            source = "model" if source == "auto" else source
+            logger.info("Using default completion sampling params from %s: %s",
+                        source, diff_sampling_param)

     async def create_completion(
         self,
```
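Server-side, the effect is the same for the chat and completion endpoints: parameters the client omits now fall back to the model's generation config rather than vLLM's built-in defaults. A hedged sketch against a locally served model (placeholder model name):

```python
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")
resp = client.chat.completions.create(
    model="Qwen/Qwen2.5-1.5B-Instruct",  # placeholder; whatever `vllm serve` loaded
    messages=[{"role": "user", "content": "Hello!"}],
    # temperature/top_p omitted: the server-side defaults reported by the
    # "Using default chat sampling params from model: ..." log apply.
)
print(resp.choices[0].message.content)
```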