[V1][Core] Support for Structured Outputs #12388
Merged: WoosukKwon merged 184 commits into vllm-project:main from aarnphm:v1/structured-decoding on Mar 7, 2025.
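For context, a minimal sketch (not taken from this PR's tests) of what a structured-output request looks like through the LLM entrypoint, using vLLM's existing GuidedDecodingParams request API. The model name is a placeholder, and enabling the v1 engine via VLLM_USE_V1=1 is an assumption about the setup at the time of this PR:

    import os

    os.environ["VLLM_USE_V1"] = "1"  # assumption: opt into the v1 engine

    from vllm import LLM, SamplingParams
    from vllm.sampling_params import GuidedDecodingParams

    llm = LLM(model="Qwen/Qwen2.5-1.5B-Instruct")  # placeholder model

    # Constrain generation to a JSON object matching this schema.
    schema = {
        "type": "object",
        "properties": {
            "name": {"type": "string"},
            "age": {"type": "integer"},
        },
        "required": ["name", "age"],
    }
    params = SamplingParams(
        max_tokens=64,
        guided_decoding=GuidedDecodingParams(json=schema),
    )

    outputs = llm.generate("Give me a JSON person record.", params)
    print(outputs[0].outputs[0].text)  # constrained token-by-token to the schema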
Commits (184)
d719c93  feat: initial guided decoding implementation on scheduler (aarnphm)
36bc041  chore: --wip-- (aarnphm)
39068c8  chore: remove lazy loader (aarnphm)
2bb535e  fix: update types and attach bitmask to requests (aarnphm)
420f52f  chore: --wip-- (aarnphm)
a5e9874  merge: branch 'main' of github.com:vllm-project/vllm into v1/structur… (aarnphm)
75e8fb4  merge: branch 'main' of github.com:vllm-project/vllm into v1/structur… (aarnphm)
9daf140  chore: --wip-- cleanup (aarnphm)
299ea58  merge: branch 'main' of github.com:vllm-project/vllm into v1/structur… (aarnphm)
15a4547  feat: base implementation (aarnphm)
49f7b96  fix: update the states within the scheduler (aarnphm)
cd357e5  [CI/Build] Ignore ruff warning up007 (russellb)
9a7b081  Resolve ruff errors (russellb)
2e43e04  chore: manage requests within manager class (aarnphm)
ccde524  Drop grammar getter/setter on Request (russellb)
1587d34  mypy: Fix return type of GPUModelRunner._prepare_inputs() (russellb)
227cc7f  Resolve remaining mypy warnings (russellb)
c0b235d  Finish getting pre-commit to pass (russellb)
49fdce0  Updat michael's suggestions (aarnphm)
e9a2304  chore: update according to Michael's review (aarnphm)
f6720a8  Merge remote-tracking branch 'origin/main' into v1/structured-decoding (russellb)
872c66f  chore: simplify cache implementations (aarnphm)
a8a2f27  Changes to get a test request working (russellb)
3fda148  Resolve mypy error in request (russellb)
d7a64eb  chore: remove debug print (aarnphm)
34c08ac  Enable some v1 structured output tests (russellb)
3b736ce  Validate structured output backend for v1 (russellb)
0bffe39  Merge remote-tracking branch 'origin/main' into v1/structured-decoding (russellb)
9f73ec9  Merge branch 'main' into v1/structured-decoding (aarnphm)
1a258fe  wip fixes for bitmask initialization and communication (russellb)
10f01f5  Clean up some remnants of inaccurate merge conflict resolution (russellb)
a6b07d1  fix: correctly use bitmask batch-wise (aarnphm)
7f255f0  fix: correct types (aarnphm)
9ab107f  chore: validate from decoding_config -> per request (aarnphm)
8d6bd3b  chore: passing vocab_size (aarnphm)
fcb0e85  chore: comment out 0.1.13 features (aarnphm)
3402b2a  Merge branch 'main' into v1/structured-decoding (aarnphm)
e6038f8  Resize bitmask to match the current batch size (russellb)
9830899  set any_whitespace=False for json schema + xgrammar (russellb)
cebe281  --wip--: debugging fsm apply (aarnphm)
862c093  fix: make sure to reset the FSM once we _free_request (aarnphm)
0df21ee  merge: branch 'main' of github.com:vllm-project/vllm into v1/structur… (aarnphm)
0fc85e3  revert: apply grammar bitmask from update states (aarnphm)
d95d1d7  merge: branch 'main' of github.com:vllm-project/vllm into v1/structur… (aarnphm)
62f8025  Merge remote-tracking branch 'origin/main' into v1/structured-decoding (russellb)
6a372ea  Revert changes to v0 guided decoding tests (russellb)
a43afca  create v1 tests_guided_generate for llm entrypoint (russellb)
fb40918  Drop unused Scheduler.guided_decoding_requests (russellb)
b8e016c  Allow grammar compilation to complete (russellb)
c63ca92  Remove some dead committed (russellb)
074b65d  Fix index calculation for guided requests in a batch (russellb)
727dab0  Make guided decoding manager more thread-safe (russellb)
adb50ff  chore: remove prefilled check (aarnphm)
5b818f9  Merge remote-tracking branch 'origin/main' into v1/structured-decoding (russellb)
c85408a  Re-enable line length checks in ruff (russellb)
b34e4a7  Fix a yapf error in main, will be fixed by #13772 (russellb)
0f2a97f  Merge remote-tracking branch 'origin/main' into v1/structured-decoding (russellb)
aabe98b  Prepare the bitmask on the scheduler side instead of gpu worker (russellb)
8895e19  tests: make sample jsonschema xgrammar compatible (russellb)
470b677  Detect unsupported jsonschema features for xgrammar (russellb)
42fe5f8  Make bitmask allocation synchronous (russellb)
ada4790  Fix compat with TP > 1 (russellb)
331a7ff  Make pre-commit happy again (russellb)
0984379  chore: remove reset_bitmask after every steps (aarnphm)
9b62eef  revert: update whitespace (aarnphm)
2f756e5  Add tests/v1/guided_decoding/test_utils.py (russellb)
72adc63  Merge remote-tracking branch 'origin/main' into v1/structured-decoding (russellb)
1be1709  add v1 structured output regex test case (russellb)
0128aff  Restore some code lost in a merge from main (russellb)
9cc90ff  Validate schema is supoprted before sending to threadpool (russellb)
3a8f955  chore: remove unused code (aarnphm)
e772efa  fix: correct typo (aarnphm)
64a2ecf  chore(scheduler): simplify check for use_guided_decoding (aarnphm)
e8f47f3  Move guided decode validation to the engine core_client (russellb)
f3f7d51  test for expected behavior of a choice guided decode request (russellb)
9582f8c  Validate jsonschema features for both str and dict cases (russellb)
acd5ae0  Test for expected behavior of a request with unsupported jsonschema f… (russellb)
4c674ae  Correctly differentiate between jsonschema and json object requests (russellb)
1b40882  Test for correct json object (no schema) request behavior (russellb)
4f551f4  Add test for a request using an EBNF style grammar (russellb)
d132d72  Validate that EBNF grammar can be parsed during early validation (russellb)
b994230  Test for expected behavior of an invalid grammar (russellb)
3cc6437  Add support and test coverage for lark style grammars (russellb)
95be24b  Add support and tests for choice based guided decoding (russellb)
9d1fe71  feat: spec decode compatibility [-------------] (aarnphm)
83a5277  fix: correct lock the matcher for both rollback and advance (aarnphm)
d02e11a  chore: only rollback if there are more than zero processed tokens (aarnphm)
c64daa7  fix: correctly free requests based on accepted tokens (aarnphm)
ad05fe8  Account for differences in scheduler and gpu worker batch ordering (russellb)
7cf6326  Skip non-guided-decode requests when assembling reordered bitmask (russellb)
84bbae1  revert: remove rollback check for now, only advance 1 token (aarnphm)
c10eb6a  Fix accidental re-use of cached grammar matcher (russellb)
0518b70  Use the correct indices for the logits bitmask (russellb)
5f23e8b  Update vllm/v1/core/scheduler_output.py (mgoin)
deb9b36  Apply suggestions from Russell (aarnphm)
4bcee6c  chore: update requests to remove unused function (aarnphm)
7aea044  merge: branch 'main' of github.com:vllm-project/vllm into v1/structur… (aarnphm)
3b49e8e  chore: address comments and renaming for clarity (aarnphm)
2a94f9c  Merge remote-tracking branch 'origin/main' into v1/structured-decoding (russellb)
5600a30  Move validation to process_inputs() (russellb)
2097d41  Reject spec decode + structured output requests (russellb)
d96c3ff  chore: update max_rollback_tokens to matcher (aarnphm)
9cf1a2c  chore: kw_only not available for 3.9 (aarnphm)
d1f7e8e  Revert a change that was a bad merge from main (russellb)
11d4483  Limit grammar compilation threadpool size (russellb)
9f47de1  benchmarks: filter unsupported jsonschemas from xgrammar_bench dataset (russellb)
bf5b844  chore: reduce code change for annotations type (aarnphm)
458a986  Merge remote-tracking branch 'origin/main' into v1/structured-decoding (russellb)
f12b8a4  benchmarks: Make first request honor guided-decoding-ratio (russellb)
e309361  perf: using two deque to always pop lists and preserved order (aarnphm)
05d930b  chore: address comments (aarnphm)
b18c39d  revert: minimal diff (aarnphm)
e8c3de2  revert: incorrect comments should be removed (aarnphm)
ec76e7e  Don't overwrite scheduler_output.grammar_bitmask (russellb)
f9e44b5  Use np.ndarray as the type between scheduler and workers (russellb)
df9c727  chore: add a shell script to run gallery of benchmark (aarnphm)
93a60d8  chore: add options to customize port (aarnphm)
bd27358  fix: make sure to save results as json (aarnphm)
cf43e97  chore: ignore nested JSON (aarnphm)
1377886  fix: allow set ratio (aarnphm)
a9033bc  chore: address woosuk comments (aarnphm)
75162be  fix: annotations casting (aarnphm)
b64630a  merge: branch 'main' of github.com:vllm-project/vllm into v1/structur… (aarnphm)
959b986  perf: use extendleft to put back skipped requests (aarnphm)
9b2e37a  Don't track guided request if it is not scheduled (russellb)
58c4fe6  Retain priority order among requests waiting for FSM compilation (russellb)
2df32a8  Encapsulate logic for filling in the grammar bitmask (russellb)
4d08950  Move more code from scheduler to guided_decoding_manager (russellb)
6d8b9aa  Move numpy imports outside of TYPE_CHECKING block (russellb)
2a3367f  Remove blank line as requested by Woosuk (russellb)
a74e737  Put guided decode type constants in all caps (russellb)
68d689c  Drop reference to rollback tokens, we don't support rollback (russellb)
e7d8fd6  Remove thin apply_bitmask() wrapper (russellb)
8f63db3  Avoid deprecated type annotations for built-in types (russellb)
51bdf22  Add docstring for accept_tokens() method (russellb)
fca387a  Drop an unnecessary todo comment (russellb)
5114a03  Remove unused rollback code (russellb)
6369660  Drop unused return value from populate_cache() (russellb)
4b7add8  Remove an extra dict lookup from the grammar cache (russellb)
11c252f  Make Request.use_guided_decoding a cached property (russellb)
6d99518  Remove an extra dict lookup in the GPU model runner (russellb)
4311643  chore: use fixture for test cases (aarnphm)
0ac6885  Factor out code for applying bitmask into its own method (russellb)
7482c97  Delay converting ndarray to tensor until necessary (russellb)
9de55e5  Remove unnecessary dataclass attributes from Grammar (russellb)
89c741f  chore: update notes for LRU cache (aarnphm)
949f644  chore: update naming to clearer name (aarnphm)
d4a59d5  Fix tests/v1/core/test_scheduler.py to pass again (russellb)
25534d4  Only increment num_processed_tokens if token is accepted by xgrammar (russellb)
56dabf9  Remove unnecessary continue pointed out in review (russellb)
5dd39a4  Remove unnecessary and expensive setup_grammars() loop (russellb)
b2e3c38  Remove unnecessary requests cache in guided decoding manager (russellb)
f564111  v1: standardize on structured output based naming (russellb)
4544cfb  Merge remote-tracking branch 'origin/main' into v1/structured-decoding (russellb)
e71e4cd  Fix benchmark script compat with V1 (russellb)
96424e0  refactor: move to separate request objects (aarnphm)
9e392fe  fix: gated lazy import for RequestStatus (aarnphm)
e2553e8  fix: only construct StructOutputRequest from given core (aarnphm)
470f930  chore: rename to all v1 struct outputs (aarnphm)
cb4b29a  chore: cleanup tests format (aarnphm)
0ed7dea  chore: update notes on this edge case for chunked prefill (aarnphm)
80fdb3c  Apply Nick's suggestion for correct types (aarnphm)
d4dd7aa  chore: renaming to more clearer with StructuredOutput (aarnphm)
5f8c3a7  chore: cleanup to use tuple (aarnphm)
d34c29f  chore: cleanup format (aarnphm)
b2ec4de  merge: branch 'main' of github.com:vllm-project/vllm into v1/structur… (aarnphm)
8243681  fix: push the queue up (aarnphm)
697e119  chore: remove cached_property (aarnphm)
e61518e  chore: update format (aarnphm)
170902a  chore: add CODEOWNERS (aarnphm)
098a864  revert: remove unused params (aarnphm)
0437876  Merge remote-tracking branch 'origin/main' into v1/structured-decoding (russellb)
56bba73  Rename files and variables to use full structured_output name (russellb)
0ae0785  Fix some missed files in my last rename commit (russellb)
ed77db1  Lazily import xgrammar because it initializes cuda as a side effect (russellb)
f769d39  chore: lazy import modules with type hint (aarnphm)
380d922  chore: lazy load np (aarnphm)
eeeca39  chore: finalize rename script (aarnphm)
6ca6b8b  chore: fix missing variables (aarnphm)
a259eca  chore: gated typing imports (aarnphm)
7556160  Merge remote-tracking branch 'origin/main' into v1/structured-decoding (russellb)
515172a  chore: update test state for gpu_model_runner (aarnphm)
915f2a3  chore: enable structured outputs tests (aarnphm)
dbb2024  merge: branch 'main' of github.com:vllm-project/vllm into v1/structur… (aarnphm)
File diff (new file):
@@ -0,0 +1,192 @@

from __future__ import annotations

import copy
import threading
from abc import ABC, abstractmethod
from concurrent.futures import ThreadPoolExecutor
from dataclasses import dataclass
from typing import TYPE_CHECKING, Any, Generic, Literal, TypeVar, get_args

from transformers import PreTrainedTokenizer

from vllm.config import ModelConfig
from vllm.logger import init_logger
from vllm.utils import LazyLoader
from vllm.v1.request import GuidedDecodingKey, Request, RequestStatus

from .grammar import Grammar

if TYPE_CHECKING:
    import xgrammar as xgr
    from transformers import PreTrainedTokenizer
    from typing_extensions import LiteralString

    from vllm.transformers_utils.tokenizer_group import BaseTokenizerGroup

    from .grammar import XGrammar
else:
    xgr = LazyLoader("xgr", globals(), "xgrammar")

logger = init_logger(__name__)

__all__ = ["Grammar", "GuidedDecodingManager"]


@dataclass
class GrammarCache:
    value: Grammar | None
    event: threading.Event


T = TypeVar("T", bound=str)


class GuidedDecodingManager(ABC, Generic[T]):

    @abstractmethod
    def initialize_cache(self, key: GuidedDecodingKey) -> Grammar:
        ...

    def flush(self):
        with self._lock:
            self.grammar_cache.clear()

    def cache(self, request: Request):

        def _executor_loop(request: Request):
            key = request.guided_decoding_key
            with self._lock:
                cache_hit = False
                if key in self.grammar_cache:
                    cache_hit, entry = True, self.grammar_cache[key]
                else:
                    entry = GrammarCache(None, threading.Event())
                    self.grammar_cache[key] = entry

            if cache_hit:
                entry.event.wait()
            else:
                entry.value = self.initialize_cache(key)
                entry.event.set()
            return copy.copy(entry.value) if entry.value else None

        return self.executor.submit(_executor_loop, request)

    def get(self, request: Request):
        with self._lock:
            entry = self.grammar_cache.get(request.guided_decoding_key)
            if entry is None or not entry.event.is_set():
                return None
            return copy.copy(entry.value) if entry.value else None

    def collect(self, request: Request):
        if not request.use_guided_decoding:
            return False
        request.grammar = self.get(request)
        if not request.grammar:
            request.grammar = self.cache(request)
            request.status = RequestStatus.WAITING_FOR_FSM
            return True
        return False

    @classmethod
    def from_backend(cls,
                     backend: LiteralString = "xgrammar",
                     /,
                     *,
                     tokenizer_group: BaseTokenizerGroup,
                     model_config: ModelConfig) -> GuidedDecodingManager[T]:
        manager_cls = cls._registry.get(backend)
        if manager_cls is None:
            raise ValueError(
                f"Backend '{backend}' not found in registry. "
                f"Available backends: {list(cls._registry)}")
        return manager_cls(tokenizer_group=tokenizer_group,
                           model_config=model_config)

    _registry: dict[str, type[GuidedDecodingManager[T]]] = {}
    _backend: T

    def __init__(self, *, tokenizer_group: BaseTokenizerGroup,
                 model_config: ModelConfig):
        self.model_config = model_config
        self.tokenizer = tokenizer_group.get_lora_tokenizer(None)
        self.grammar_cache: dict[GuidedDecodingKey, GrammarCache] = {}
        self.executor = ThreadPoolExecutor()
        self._lock = threading.Lock()

    def __init_subclass__(cls, **kwargs: Any):
        if not hasattr(cls, '__orig_bases__'):
            raise TypeError(
                f"{cls.__qualname__} must be subclass of GuidedDecodingManager")

        backend = None
        for base in cls.__orig_bases__:
            if (origin := get_args(base)) and issubclass(
                    base.__origin__, GuidedDecodingManager):
                backend = get_args(origin[0])[0]
                break

        if backend is None:
            raise TypeError(
                f"Class {cls.__qualname__} must specify backend as a Literal type")

        if backend in cls._registry:
            name = cls._registry[backend].__qualname__
            raise ValueError(
                f"Backend '{backend}' is already registered to {name}")

        # Set the backend value from the Literal type
        cls._backend = backend
        cls._registry[backend] = cls


class XGrammarManager(GuidedDecodingManager[Literal["xgrammar"]]):
    # cache GrammarCompiler instances based on given tokenizer
    _compiler_cache: dict[str, xgr.GrammarCompiler] = {}
    _compiler: xgr.GrammarCompiler | None = None

    def initialize_cache(self, key: GuidedDecodingKey) -> XGrammar:
        request_type, grammar_spec = key
        compiler = XGrammarManager.get_compiler(self.tokenizer)
        if request_type == "json":
            if type(grammar_spec) is not str:
                ctx = compiler.compile_builtin_json_grammar()
            else:
                ctx = compiler.compile_json_schema(grammar_spec)
        elif request_type == "grammar":
            ctx = compiler.compile_grammar(grammar_spec)
        else:
            raise ValueError("grammar is not of valid supported types.")
        return Grammar.from_backend(
            self._backend,
            matcher=xgr.GrammarMatcher(ctx),
            vocab_size=self.model_config.hf_text_config.vocab_size,
            ctx=ctx)

    def flush(self):
        super().flush()
        if self._compiler:
            self._compiler.clear_cache()
        for compiler in self._compiler_cache.values():
            compiler.clear_cache()
        self._compiler_cache.clear()

    @classmethod
    def get_compiler(
            cls,
            tokenizer: PreTrainedTokenizer,
            *,
            max_threads: int = 8,
            # passthrough to TokenizerInfo
            vocab_size: int | None = None,
            stop_token_ids: list[int] | int | None = None
    ) -> xgr.GrammarCompiler:
        cache_key = str(hash(tokenizer))
        if cache_key not in cls._compiler_cache:
            tokenizer_info = xgr.TokenizerInfo.from_huggingface(
                tokenizer,
                stop_token_ids=stop_token_ids,
                vocab_size=vocab_size)
            cls._compiler_cache[cache_key] = xgr.GrammarCompiler(
                tokenizer_info, max_threads=max_threads)
        return cls._compiler_cache[cache_key]