[BUGFIX] Move scores to float32 in case of running xgrammar on cpu (vllm-project#12152)

madamczyk-intel · Isotr0py · commit f8a0f11de4d1 · 2025-02-02T21:35:02.000+08:00
Signed-off-by: Michal Adamczyk &lt;madamczyk@habana.ai&gt;
Signed-off-by: Isotr0py &lt;2037008807@qq.com&gt;
diff --git a/vllm/model_executor/guided_decoding/xgrammar_decoding.py b/vllm/model_executor/guided_decoding/xgrammar_decoding.py
@@ -298,16 +298,19 @@ def __call__(self, input_ids: list[int],
         # token_bitmask is a CPU tensor for use with accept_token and
         # fill_next_token_bitmask so we move it to the device of scores
         device_type = scores.device.type
+        dtype = scores.dtype
         if device_type != "cuda":
-            scores = scores.to("cpu").unsqueeze(0)
+            # xgrammar on cpu only supports float32 scores
+            # see: https://github.com/mlc-ai/xgrammar/blob/c1b64920cad24f44f235778c1c00bb52d57da01a/python/xgrammar/kernels/apply_token_bitmask_inplace_cpu.py#L22
+            scores = scores.to("cpu").float().unsqueeze(0)
 
         # Note: In this method, if the tensors have different dimensions
         # on CPU device fails, but on GPU it runs without error. Hence the
         # unsqueeze above for scores, to match the token bitmask shape
         xgr.apply_token_bitmask_inplace(scores,
                                         self.token_bitmask.to(scores.device))
         if device_type != "cuda":
-            scores = scores.to(device_type).squeeze()
+            scores = scores.to(dtype).to(device_type).squeeze()
 
         return scores