
Commit 420f52f

chore: --wip--
Signed-off-by: Aaron Pham <[email protected]>
1 parent 2bb535e commit 420f52f

File tree

6 files changed: +119 -146 lines changed

vllm/v1/core/guided_decoding/__init__.py

Lines changed: 50 additions & 22 deletions
@@ -1,29 +1,59 @@
 from __future__ import annotations

-import copy, enum
+import copy
 import threading
 from concurrent.futures import ThreadPoolExecutor
 from dataclasses import dataclass
-from typing import TYPE_CHECKING, TypeVar
+from typing import TYPE_CHECKING, Optional

+import torch
 import xgrammar as xgr

-from vllm.config import ModelConfig
-from vllm.logger import init_logger
+from vllm.config import VllmConfig
 from vllm.v1.request import GuidedDecodingKey, Request, RequestStatus

-from .grammar import Grammar
-
 if TYPE_CHECKING:
-    from typing_extensions import Self
-
     from vllm.transformers_utils.tokenizer_group import BaseTokenizerGroup

-    from .grammar import XGrammar
+__all__ = ["Grammar", "GuidedDecodingManager"]
+

-logger = init_logger(__name__)
+class Grammar:
+    # https://xgrammar.mlc.ai/docs/api/python/index.html#xgrammar.GrammarMatcher.find_jump_forward_string for jump-forward decoding

-__all__ = ["Grammar", "GuidedDecodingManager"]
+    def __init__(self, matcher: xgr.GrammarMatcher, vocab_size: int,
+                 ctx: xgr.CompiledGrammar) -> None:
+        self.matcher = matcher
+        self.vocab_size = vocab_size
+        self.ctx = ctx
+
+    def accept_token(self, token: int) -> bool:
+        # NOTE: accept_token determines whether we accept this token
+        # and also updates the machine state
+        return self.matcher.accept_token(token)
+
+    def allocate_bitmask(self, batch_size: int,
+                         vocab_size: int) -> torch.Tensor:
+        return xgr.allocate_token_bitmask(batch_size, vocab_size)
+
+    # this should be run in parallel with model decoding
+    def fill_bitmask(self, bitmask: torch.Tensor, idx: int) -> None:
+        self.matcher.fill_next_token_bitmask(bitmask, idx)
+
+    @staticmethod
+    def apply_bitmask(logits: torch.Tensor, vocab_mask: torch.Tensor) -> None:
+        xgr.apply_token_bitmask_inplace(logits, vocab_mask)
+
+    def reset(self):
+        self.matcher.reset()
+
+    def copy(self):
+        return Grammar(matcher=xgr.GrammarMatcher(self.ctx),
+                       vocab_size=self.vocab_size,
+                       ctx=self.ctx)
+
+    def __copy__(self):
+        return self.copy()


 @dataclass
@@ -74,20 +104,17 @@ def collect(self, request: Request):
             return True
         return False

-    def __init__(self, *, backend: str, tokenizer_group: BaseTokenizerGroup,
-                 model_config: ModelConfig):
-        self._backend = backend
-        self.model_config = model_config
+    def __init__(self, *, vllm_config: VllmConfig,
+                 tokenizer_group: BaseTokenizerGroup):
+        self.vllm_config = vllm_config
         self.tokenizer = tokenizer_group.get_lora_tokenizer(None)
         self.grammar_cache: dict[GuidedDecodingKey, GrammarCache] = {}
         self.executor = ThreadPoolExecutor()
         self._lock = threading.Lock()
-        cls._registry[backend] = cls

-    def initialize_cache(self, key: GuidedDecodingKey) -> Self:
+    def initialize_cache(self, key: GuidedDecodingKey, max_threads: int = 8):
         request_type, grammar_spec = key
-        tokenizer_info = xgr.TokenizerInfo.from_huggingface(
-            tokenizer, stop_token_ids=stop_token_ids, vocab_size=vocab_size)
+        tokenizer_info = xgr.TokenizerInfo.from_huggingface(self.tokenizer)
         compiler = xgr.GrammarCompiler(tokenizer_info, max_threads=max_threads)
         if request_type == "json":
             if type(grammar_spec) is not str:
@@ -98,6 +125,7 @@ def initialize_cache(self, key: GuidedDecodingKey) -> Self:
             ctx = compiler.compile_grammar(grammar_spec)
         else:
             raise ValueError("grammar is not of valid supported types.")
-        return Grammar(matcher=xgr.GrammarMatcher(ctx),
-                       vocab_size=self.model_config.hf_text_config.vocab_size,
-                       ctx=ctx)
+        return Grammar(
+            matcher=xgr.GrammarMatcher(ctx),
+            vocab_size=self.vllm_config.model_config.hf_text_config.vocab_size,
+            ctx=ctx)
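As orientation for the new Grammar wrapper above (not part of the commit): a minimal sketch of how its methods compose during one decoding step, assuming grammar is the object returned by GuidedDecodingManager.initialize_cache and logits is a (1, vocab_size) float tensor. The greedy argmax stand-in for the sampler and the helper name constrained_decode_step are illustrative only.

import torch

def constrained_decode_step(grammar, logits: torch.Tensor) -> int:
    # One bitmask row per request in the batch (batch size 1 here).
    bitmask = grammar.allocate_bitmask(1, grammar.vocab_size)
    # Record which tokens the grammar currently allows into row 0.
    grammar.fill_bitmask(bitmask, 0)
    # Mask disallowed tokens in the logits in place before sampling.
    grammar.apply_bitmask(logits, bitmask)
    token = int(torch.argmax(logits, dim=-1).item())
    # Advance the matcher state; accept_token returns False if the token is rejected.
    assert grammar.accept_token(token)
    return token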

vllm/v1/core/guided_decoding/grammar.py

Lines changed: 0 additions & 54 deletions
This file was deleted.

vllm/v1/core/scheduler.py

Lines changed: 19 additions & 58 deletions
@@ -14,8 +14,7 @@
 from vllm.transformers_utils.tokenizer_group import init_tokenizer_from_configs
 from vllm.v1.core.encoder_cache_manager import (EncoderCacheManager,
                                                  compute_encoder_budget)
-from vllm.v1.core.guided_decoding import GuidedDecodingManager
-from vllm.v1.core.guided_decoding.grammar import Grammar
+from vllm.v1.core.guided_decoding import Grammar
 from vllm.v1.core.kv_cache_manager import KVCacheManager
 from vllm.v1.engine import EngineCoreOutput, EngineCoreOutputs
 from vllm.v1.metrics.stats import SchedulerStats
@@ -40,13 +39,11 @@ def __init__(
         cache_config: CacheConfig,
         parallel_config: ParallelConfig,
         lora_config: Optional[LoRAConfig],
-        decoding_config: DecodingConfig,
     ) -> None:
         self.scheduler_config = scheduler_config
         self.cache_config = cache_config
         self.lora_config = lora_config
         self.model_config = model_config
-        self.decoding_config = decoding_config
         # TODO: Support LoRA.
         assert lora_config is None, "V1 does not support LoRA yet."
         # Scheduling constraints.
@@ -103,21 +100,6 @@ def __init__(
         self.encoder_cache_manager = EncoderCacheManager(
             cache_size=encoder_cache_size)

-        # A request queue for grammar compilation
-        self.grammar: Deque[Request] = deque()
-        # initialize the tokenizer on the scheduler (this is used for constrained decoding)
-        tokenizer_group = init_tokenizer_from_configs(
-            model_config=model_config,
-            scheduler_config=scheduler_config,
-            parallel_config=parallel_config,
-            lora_config=lora_config)
-        tokenizer_group.ping()
-        # setup guided decoding, right now uses xgrammar
-        self.guided_decoding_manager = GuidedDecodingManager(
-            backend=decoding_config.guided_decoding_backend,
-            tokenizer_group=tokenizer_group,
-            model_config=model_config)
-
     def schedule(self) -> "SchedulerOutput":
         # NOTE(woosuk) on the scheduling algorithm:
         # There's no "decoding phase" nor "prefill phase" in the scheduler.
@@ -133,25 +115,6 @@ def schedule(self) -> "SchedulerOutput":
         scheduled_running_reqs: List[Request] = []
         preempted_reqs: List[Request] = []

-        # we need to check the grammar queue for any requests that have finished FSM compilation
-        newly_grammar_reqs: List[Request] = []
-        scheduled_grammar_reqs: Deque[Request] = deque()
-        while self.grammar:
-            request = self.grammar.popleft()
-            try:
-                # When request first added via add_request, then it will be a future call
-                # check timeout and add it directly to previous queue
-                request.grammar = request.grammar.result(timeout=0.05)
-                request.status = RequestStatus.WAITING
-                newly_grammar_reqs.append(request)
-            except futures._base.TimeoutError:
-                scheduled_grammar_reqs.append(request)
-        self.grammar = scheduled_grammar_reqs
-
-        # append all newly ready requests to waiting queue with higher priority
-        for req in newly_grammar_reqs:
-            self.waiting.appendleft(req)
-
         req_to_new_block_ids: Dict[str, List[int]] = {}
         num_scheduled_tokens: Dict[str, int] = {}
         token_budget = self.max_num_scheduled_tokens
@@ -238,13 +201,6 @@ def schedule(self) -> "SchedulerOutput":
                 self.encoder_cache_manager.allocate(request, i)
             encoder_budget = new_encoder_budget

-            # Track if we need guided decoding
-            # Create individual bitmask for requests with grammar
-            if request.grammar is not None:
-                if request.request_id not in guided_decoding_bitmasks:
-                    bitmask = request.grammar.allocate_bitmask(1, vocab_size)
-                    guided_decoding_bitmasks[request.request_id] = bitmask
-
         # Next, schedule the WAITING requests.
         if not preempted_reqs:
             while self.waiting:
@@ -258,7 +214,8 @@ def schedule(self) -> "SchedulerOutput":
                 request = self.waiting[0]

                 # allocate bitmask on request on first round
-                if request.grammar: request.allocate_grammar_bitmask(vocab_size=vocab_size)
+                if request.grammar:
+                    request.allocate_grammar_bitmask(vocab_size=vocab_size)

                 # Get already-cached tokens.
                 computed_blocks, num_computed_tokens = \
@@ -356,8 +313,12 @@ def schedule(self) -> "SchedulerOutput":
         ]
         running_reqs_data = [
             self._make_running_request_data(
-                req, req_to_new_block_ids[req.request_id],
-                req.num_computed_tokens, grammar=req.grammar, grammar_bitmask=req.grammar_bitmask) for req in scheduled_running_reqs
+                req,
+                req_to_new_block_ids[req.request_id],
+                req.num_computed_tokens,
+                grammar=req.grammar,
+                grammar_bitmask=req.grammar_bitmask)
+            for req in scheduled_running_reqs
         ]
         preempted_req_ids = {req.request_id for req in preempted_reqs}

@@ -375,7 +336,6 @@ def schedule(self) -> "SchedulerOutput":
             # It contains the request IDs that are finished in between
             # the previous and the current steps.
             finished_req_ids=self.finished_req_ids,
-            guided_decoding_bitmasks=guided_decoding_bitmasks,
             free_encoder_input_ids=self.encoder_cache_manager.get_freed_ids(),
         )

@@ -398,7 +358,7 @@ def _make_running_request_data(
             req_data.new_block_ids = new_block_ids
             req_data.num_computed_tokens = num_computed_tokens
             req_data.grammar = grammar
-            req_data.grammar_bitmask=grammar_bitmask
+            req_data.grammar_bitmask = grammar_bitmask
         else:
             req_data = RunningRequestData.from_request(request, new_block_ids,
                                                        num_computed_tokens)
@@ -480,6 +440,8 @@ def update_from_output(
         scheduler_output: "SchedulerOutput",
         model_runner_output: "ModelRunnerOutput",
     ) -> EngineCoreOutputs:
+        # concern: batchsize >>>1000
+        # compilation << update
         # NOTE(woosuk): This method doesn't consider speculative decoding.
         sampled_token_ids = model_runner_output.sampled_token_ids
         num_scheduled_tokens = scheduler_output.num_scheduled_tokens
@@ -560,11 +522,7 @@ def _check_stop(self, request: Request) -> bool:

     def add_request(self, request: Request) -> None:
         self.requests[request.request_id] = request
-
-        if self.guided_decoding_manager.collect(request):
-            self.grammar.append(request)
-        else:
-            self.waiting.append(request)
+        self.waiting.append(request)

     def finish_requests(
         self,
@@ -648,7 +606,8 @@ def from_request(
                    sampling_params=request.sampling_params,
                    block_ids=block_ids,
                    num_computed_tokens=num_computed_tokens,
-                   grammar=request.grammar, grammar_bitmask=request.grammar_bitmask)
+                   grammar=request.grammar,
+                   grammar_bitmask=request.grammar_bitmask)


 @dataclass
@@ -671,7 +630,8 @@ def from_request(
         return cls(req_id=request.request_id,
                    block_ids=block_ids,
                    num_computed_tokens=num_computed_tokens,
-                   grammar=request.grammar, grammar_bitmask=request.grammar_bitmask)
+                   grammar=request.grammar,
+                   grammar_bitmask=request.grammar_bitmask)


 @dataclass
@@ -694,7 +654,8 @@ def from_request(
         return cls(req_id=request.request_id,
                    new_block_ids=new_block_ids,
                    num_computed_tokens=num_computed_tokens,
-                   grammar=request.grammar, grammar_bitmask=request.grammar_bitmask)
+                   grammar=request.grammar,
+                   grammar_bitmask=request.grammar_bitmask)


 @dataclass
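Side note (not from the commit): the hunks above remove the grammar-compilation queue from Scheduler.schedule(); the same future-polling idiom reappears, commented out, in EngineCore below. Reduced to a self-contained toy, the pattern is: submit compilation to a ThreadPoolExecutor and poll each Future with a short timeout so scheduling never blocks on compilation. All names here (compile_grammar, pending, ready) are placeholders.

from collections import deque
from concurrent.futures import Future, ThreadPoolExecutor, TimeoutError
from typing import Deque, List

executor = ThreadPoolExecutor()

def compile_grammar(spec: str) -> str:
    # stands in for GuidedDecodingManager.initialize_cache(key)
    return f"compiled({spec})"

pending: Deque[Future] = deque(
    executor.submit(compile_grammar, spec) for spec in ("json-schema-a", "regex-b"))
ready: List[str] = []
still_pending: Deque[Future] = deque()
while pending:
    fut = pending.popleft()
    try:
        # Take the result only if compilation already finished (50 ms grace period).
        ready.append(fut.result(timeout=0.05))
    except TimeoutError:
        # Not done yet; keep it queued for the next scheduling step.
        still_pending.append(fut)
pending = still_pending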

vllm/v1/engine/core.py

Lines changed: 26 additions & 0 deletions
@@ -16,6 +16,7 @@
 from vllm.transformers_utils.config import (
     maybe_register_config_serialize_by_value)
 from vllm.utils import get_exception_traceback, zmq_socket_ctx
+from vllm.v1.core.guided_decoding import GuidedDecodingManager
 from vllm.v1.core.kv_cache_utils import get_kv_cache_config
 from vllm.v1.core.scheduler import Scheduler
 from vllm.v1.engine import (EngineCoreOutputs, EngineCoreProfile,
@@ -67,6 +68,28 @@ def __init__(
         self.mm_input_mapper_server = MMInputMapperServer(
             vllm_config.model_config)

+        # initialize the tokenizer on the scheduler (this is used for constrained decoding)
+        tokenizer_group = init_tokenizer_from_configs(
+            model_config=vllm_config.model_config,
+            scheduler_config=vllm_config.scheduler_config,
+            parallel_config=vllm_config.parallel_config,
+            lora_config=vllm_config.lora_config)
+        tokenizer_group.ping()
+        # setup guided decoding, right now uses xgrammar
+        self.guided_decoding_manager = GuidedDecodingManager(
+            vllm_config=vllm_config, tokenizer_group=tokenizer_group)
+
+        # while self.grammar:
+        #     request = self.grammar.popleft()
+        #     try:
+        #         # When request first added via add_request, then it will be a future call
+        #         # check timeout and add it directly to previous queue
+        #         request.grammar = request.grammar.result(timeout=0.05)
+        #         request.status = RequestStatus.WAITING
+        #         newly_grammar_reqs.append(request)
+        #     except futures._base.TimeoutError:
+        #         scheduled_grammar_reqs.append(request)
+
     def _initialize_kv_caches(self,
                               vllm_config: VllmConfig) -> Tuple[int, int]:
         start = time.time()
@@ -127,6 +150,9 @@ def step(self) -> EngineCoreOutputs:

         scheduler_output = self.scheduler.schedule()
         output = self.model_executor.execute_model(scheduler_output)
+        # update FSM async here
+        # two broadcast (bitmask + calculate) <-- manager
+        # copy CPU -> CPU IPC (concat multiple bitmask?)
         engine_core_outputs = self.scheduler.update_from_output(
             scheduler_output, output)
         return engine_core_outputs
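The "update FSM async here" notes above only hint at the intended flow. One possible reading (an assumption, not the commit's implementation): after execute_model returns sampled tokens, advance each request's grammar matcher off the critical path, e.g. on the manager's thread pool, so bitmask filling for the next step sees up-to-date matcher state. advance_grammars and its arguments are hypothetical.

from concurrent.futures import ThreadPoolExecutor

executor = ThreadPoolExecutor()

def advance_grammars(requests, sampled_token_ids):
    # Submit matcher updates so they overlap with the rest of the engine step.
    futures = []
    for req, token in zip(requests, sampled_token_ids):
        if getattr(req, "grammar", None) is not None:
            # accept_token advances the FSM so the next fill_bitmask is correct.
            futures.append(executor.submit(req.grammar.accept_token, token))
    # Join before the next schedule() needs fresh bitmasks.
    return [f.result() for f in futures]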
