[BugFix] Lazily import XgrammarBackend to avoid early cuda init (#15171)

njhill · web-flow · commit c47aafa37c75 · 2025-03-20T01:30:43.000Z
Signed-off-by: Nick Hill <nhill@redhat.com>
diff --git a/vllm/v1/structured_output/__init__.py b/vllm/v1/structured_output/__init__.py
@@ -9,7 +9,6 @@
 from vllm.logger import init_logger
 from vllm.v1.structured_output.backend_types import (StructuredOutputBackend,
                                                      StructuredOutputGrammar)
-from vllm.v1.structured_output.backend_xgrammar import XgrammarBackend
 
 if TYPE_CHECKING:
     import numpy as np
@@ -47,6 +46,9 @@ def grammar_init(self, request: Request) -> None:
         if self.backend is None:
             backend_name = request.sampling_params.guided_decoding.backend_name
             if backend_name == "xgrammar":
+                from vllm.v1.structured_output.backend_xgrammar import (
+                    XgrammarBackend)
+
                 self.backend = XgrammarBackend(self.vllm_config)
             else:
                 raise ValueError(