[Frontend] Add backend-specific options for guided decoding #13505

Merged (7 commits, Feb 20, 2025)
Changes from 1 commit
20 changes: 20 additions & 0 deletions tests/entrypoints/llm/test_guided_generate.py
@@ -3,12 +3,14 @@
 import json
 import re
 import weakref
+from unittest import mock

 import jsonschema
 import pytest

 from vllm.distributed import cleanup_dist_env_and_memory
 from vllm.entrypoints.llm import LLM
+from vllm.model_executor import guided_decoding as guided_decoding_module
 from vllm.outputs import RequestOutput
 from vllm.sampling_params import GuidedDecodingParams, SamplingParams

@@ -277,6 +279,24 @@
         guided_options_request=dict(guided_regex=sample_regex))


+@pytest.mark.skip_global_cleanup
+def test_disable_guided_decoding_fallback(sample_regex, llm):
+    sampling_params = SamplingParams(temperature=0.8,
+                                     top_p=0.95,
+                                     guided_decoding=GuidedDecodingParams(
+                                         regex=sample_regex,
+                                         backend="xgrammar"))
+
+    with mock.patch.object(guided_decoding_module,
+                           "VLLM_DISABLE_GUIDED_DECODING_FALLBACK", True):
+        with pytest.raises(
+                ValueError,
+                match="xgrammar does not support regex guided decoding"):
+            llm.generate(prompts="This should fail",
+                         sampling_params=sampling_params,
+                         use_tqdm=True)
+
+
 @pytest.mark.skip_global_cleanup
 @pytest.mark.parametrize("guided_decoding_backend", GUIDED_DECODING_BACKENDS)
 def test_guided_json_object(llm, guided_decoding_backend: str):

[CI check failure on line 294: GitHub Actions / pre-commit, Ruff (SIM117), tests/entrypoints/llm/test_guided_generate.py:290:5: Use a single `with` statement with multiple contexts instead of nested `with` statements]
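The Ruff SIM117 failure flagged above asks for the two nested context managers to be merged into one `with` statement. A minimal standalone sketch of the transformation, using hypothetical stand-in context managers rather than the real `mock.patch.object`/`pytest.raises` ones:

```python
from contextlib import contextmanager


@contextmanager
def patch_flag():
    # Stand-in for mock.patch.object(...) in the test above.
    yield "patched"


@contextmanager
def expect_error():
    # Stand-in for pytest.raises(...) in the test above.
    yield "expecting"


# Nested form flagged by SIM117:
with patch_flag() as a:
    with expect_error() as b:
        nested = (a, b)

# Equivalent single-statement form Ruff suggests:
with patch_flag() as a, expect_error() as b:
    combined = (a, b)

print(nested == combined)  # True
```

Both forms enter and exit the context managers in the same order; the single-statement form just removes one indentation level.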
3 changes: 3 additions & 0 deletions vllm/envs.py
@@ -89,6 +89,7 @@
     VLLM_RAY_PER_WORKER_GPUS: float = 1.0
     VLLM_RAY_BUNDLE_INDICES: str = ""
     VLLM_CUDART_SO_PATH: Optional[str] = None
+    VLLM_DISABLE_GUIDED_DECODING_FALLBACK: bool = False


 def get_default_cache_root():
@@ -585,6 +586,8 @@ def maybe_convert_int(value: Optional[str]) -> Optional[int]:
     # specify the path through environment variable VLLM_CUDART_SO_PATH.
     "VLLM_CUDART_SO_PATH":
     lambda: os.getenv("VLLM_CUDART_SO_PATH", None),
+    "VLLM_DISABLE_GUIDED_DECODING_FALLBACK":
+    lambda: bool(int(os.getenv("VLLM_DISABLE_GUIDED_DECODING_FALLBACK", "0"))),
Comment (Member):

Have we an established pattern on what should be config vs env variables? Why wouldn't this be in `DecodingConfig`? Maybe we could encode "don't fallback" in something like `--guided-decoding-backend=outlines:nofallback` if we were worried about a proliferation of CLI arguments.

Comment (Member):

I'm okay either way, but I do think Mark's suggestion would be nicer. I like calling more attention to the `--guided-decoding-backend` argument if users want to be explicit about their backend.

Comment (Member):

I like the CLI arg approach ... before seeing this comment I was thinking about another "backend" like `xgrammar-only` or something like that. `xgrammar:nofallback` leaves it open to a bit more flexibility to specify additional options if necessary later, like `xgrammar:nofallback,json-any-whitespace` to support the case covered in #12744.
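A rough sketch of how such a `backend:opt1,opt2` spec could be parsed (a hypothetical helper for illustration, not the syntax this PR ultimately implements):

```python
def parse_backend_spec(spec: str) -> tuple[str, list[str]]:
    # Split e.g. "xgrammar:nofallback,json-any-whitespace" into the backend
    # name and its comma-separated option flags.
    backend, _, raw_opts = spec.partition(":")
    options = [opt for opt in raw_opts.split(",") if opt]
    return backend, options


backend, options = parse_backend_spec("xgrammar:nofallback,json-any-whitespace")
print(backend)   # xgrammar
print(options)   # ['nofallback', 'json-any-whitespace']
print(parse_backend_spec("outlines"))  # ('outlines', [])
```

A plain backend name with no colon parses to an empty option list, so existing `--guided-decoding-backend=outlines` style values would keep working unchanged.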

Comment (Member):

> Have we an established pattern on what should be config vs env variables?

Yeah ... I keep thinking about this. It's going to be a big project, but we're due for significant cleanup here. I'd really like a system that supports both config files and command-line args (and fewer env vars, unless an env var is just an alternative way of setting the same options).

... but I have no idea when that's going to feel like the most important thing to work on!

Comment (Contributor):

> Have we an established pattern on what should be config vs env variables?
>
> Yeah ... I keep thinking about this.

Me too. I like to think that environment variables change the 'behavior' of the system, like enabling a deprecated | experimental | workaround feature, while config options are the 'common features' of the system that are up to users to set or tune for their environment.

However, there is also something else relevant to this discussion. When we set options in the config, we have a chance to log the system setup, like the log `Initializing a V0 LLM engine (v%s) with config: [...]`. Sometimes it is tricky to get the exact setup of the system when we get a crash and the only thing we have is a stack trace (which may be truncated as well 😄).

We should probably prefer args over envs, but when it makes sense to use an env var, we could log to users (at least once) that vLLM has this feature on and its implications for the system.

Either way: I like the idea of `--guided-decoding-backend=outlines:nofallback` for this PR. And I'm pretty sure it would be logged during system initialization, which is nice for debugging purposes.

Comment (Collaborator, Author):

Thanks for the discussion everybody 👍

I stuck this in the environment to avoid a proliferation of CLI args, but I love the suggestion of encoding the fallback behavior in the name of the backend. Best of both worlds!

I'll update the implementation

Comment (Collaborator, Author):

Code is updated if y'all wanna take a second look 🙏

 }

 # end-env-vars-definition
64 changes: 36 additions & 28 deletions vllm/model_executor/guided_decoding/__init__.py
@@ -4,6 +4,7 @@

 from typing import TYPE_CHECKING

+from vllm.envs import VLLM_DISABLE_GUIDED_DECODING_FALLBACK
 from vllm.logger import init_logger
 from vllm.model_executor.guided_decoding.utils import (
     convert_lark_to_gbnf, grammar_is_likely_lark,
@@ -22,47 +23,56 @@

 def maybe_backend_fallback(
         guided_params: GuidedDecodingParams) -> GuidedDecodingParams:
+
+    def fallback_or_error(guided_params: GuidedDecodingParams, message: str,
+                          fallback: str) -> None:
+        """Change the backend to the specified fallback with a warning log,
+        or raise a ValueError if guided decoding fallback is not enabled."""
+        if VLLM_DISABLE_GUIDED_DECODING_FALLBACK:
+            raise ValueError(message)
+
+        logger.warning("%s Falling back to use %s instead.", message, fallback)
+        guided_params.backend = fallback
+
     # lm-format-enforce doesn't support grammar, fallback to xgrammar
     if guided_params.backend == "lm-format-enforcer":
         if guided_params.grammar is not None:
-            logger.warning(
-                "lm-format-enforcer does not support grammar guided decoding. "
-                "Falling back to use xgrammar instead.")
-            guided_params.backend = "xgrammar"
+            fallback_or_error(
+                guided_params,
+                "lm-format-enforcer does not support grammar guided decoding.",
+                "xgrammar")

         # lm-format-enforcer doesn't support some JSON schema features
         elif (guided_params.json is not None
               and has_lmf_unsupported_json_features(guided_params.json)):
-            logger.warning(
+            fallback_or_error(
+                guided_params,
                 "lm-format-enforcer does not support advanced JSON schema "
-                "features like patterns or numeric ranges. "
-                "Falling back to use outlines instead.")
-            guided_params.backend = "outlines"
+                "features like patterns or numeric ranges.", "outlines")

     if guided_params.backend == "xgrammar":
         from vllm.model_executor.guided_decoding.xgrammar_decoding import (
             xgr_installed)
         # xgrammar only has x86 wheels for linux, fallback to outlines
         from vllm.platforms import current_platform
         if current_platform.get_cpu_architecture() is not CpuArchEnum.X86:
-            logger.warning("xgrammar is only supported on x86 CPUs. "
-                           "Falling back to use outlines instead.")
-            guided_params.backend = "outlines"
+            fallback_or_error(guided_params,
+                              "xgrammar is only supported on x86 CPUs.",
+                              "outlines")

         # xgrammar doesn't support regex, fallback to outlines
         if guided_params.regex is not None:
-            logger.warning("xgrammar does not support regex guided decoding. "
-                           "Falling back to use outlines instead.")
-            guided_params.backend = "outlines"
+            fallback_or_error(
+                guided_params,
+                "xgrammar does not support regex guided decoding.", "outlines")

         # xgrammar doesn't support some JSON schema features
         elif (guided_params.json is not None
               and has_xgrammar_unsupported_json_features(guided_params.json)):
-            logger.warning(
+            fallback_or_error(
+                guided_params,
                 "xgrammar does not support advanced JSON schema features like "
-                "patterns or numeric ranges. "
-                "Falling back to use outlines instead.")
-            guided_params.backend = "outlines"
+                "enums, patterns or numeric ranges.", "outlines")

     # xgrammar only supports GBNF grammars, so we must convert Lark.
     # We must check if the grammar is likely Lark and if that
Expand All @@ -72,25 +82,23 @@ def maybe_backend_fallback(
try:
convert_lark_to_gbnf(guided_params.grammar)
except Exception:
logger.warning(
fallback_or_error(
guided_params,
"xgrammar does not support Lark grammars and the "
"grammar failed to convert to GBNF. "
"Falling back to use outlines instead.")
guided_params.backend = "outlines"
"grammar failed to convert to GBNF.", "outlines")

# If the xgrammar module cannot be imported successfully,
# we should still allow users to use guided decoding with a fallback.
elif not xgr_installed:
logger.warning("xgrammar module cannot be imported successfully. "
"Falling back to use outlines instead.")
guided_params.backend = "outlines"
fallback_or_error(
guided_params,
"xgrammar module cannot be imported successfully.", "outlines")

if (guided_params.backend == "outlines"
and guided_params.json_object is not None):
# outlines doesn't support json_object, fallback to xgrammar
logger.warning("outlines does not support json_object. "
"Falling back to use xgrammar instead.")
guided_params.backend = "xgrammar"
fallback_or_error(guided_params,
"outlines does not support json_object.", "xgrammar")

return guided_params
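Standalone, the fallback-or-error behavior this diff introduces amounts to the following sketch (simplified names as stand-ins, not the actual vLLM classes or env plumbing):

```python
import logging

logger = logging.getLogger("guided_decoding_sketch")

# Stand-in for the VLLM_DISABLE_GUIDED_DECODING_FALLBACK env flag.
DISABLE_FALLBACK = False


class Params:
    """Minimal stand-in for GuidedDecodingParams."""

    def __init__(self, backend: str):
        self.backend = backend


def fallback_or_error(params: Params, message: str, fallback: str) -> None:
    # When fallback is disabled, surface the incompatibility as an error;
    # otherwise warn and silently switch to the fallback backend.
    if DISABLE_FALLBACK:
        raise ValueError(message)
    logger.warning("%s Falling back to use %s instead.", message, fallback)
    params.backend = fallback


p = Params("xgrammar")
fallback_or_error(p, "xgrammar does not support regex guided decoding.",
                  "outlines")
print(p.backend)  # outlines
```

With the flag set, the same call would raise `ValueError` with the message, which is exactly what the new test above asserts via `pytest.raises`.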
