 import torch
 from torch import Generator
 
-from vllm.v1.sample.ops.topk_topp_sampler import apply_top_k_top_p
+import pytest
+
+from vllm.platforms import current_platform
+from vllm.v1.sample.ops.topk_topp_sampler import (
+    is_flashinfer_available,
+    apply_top_k_top_p,
+)
+
+if is_flashinfer_available:
+    from flashinfer.sampling import (
+        top_k_renorm_probs,
+        top_p_renorm_probs,
+    )
 
 DEVICE = "cuda"
 
 BATCH_SIZE = 1024
 VOCAB_SIZE = 128 * 1024
 
+FLASHINFER_ENABLED = current_platform.is_cuda() and is_flashinfer_available
+
 
 def test_topk_impl_equivalance():
@@ -35,3 +48,62 @@ def test_topk_impl_equivalance():
     result2 = apply_top_k_top_p(logits=logits.clone(), k=k, p=no_op_top_p)
 
     assert torch.allclose(result1, result2)
+
+
+def test_flashinfer_sampler():
+    '''
+    This test verifies that the FlashInfer top-k and top-p sampling
+    implementation produces the same results as the Python implementation.
+
+    NOTE: FlashInfer does not directly expose an interface for fused top-k and
+    top-p probability renormalization (it does provide fused sampling, but
+    sampling results cannot be compared due to randomness), so instead we
+    compare the probabilities renormalized sequentially by top-k and then
+    top-p between the two implementations.
+    '''
+
+    if not FLASHINFER_ENABLED:
+        pytest.skip(
+            "FlashInfer not installed or not available on this platform.")
+
+    with torch.device(DEVICE):
+        generator = Generator(device=DEVICE).manual_seed(42)
+
+        # Generate random logits
+        logits = torch.rand((BATCH_SIZE, VOCAB_SIZE), generator=generator)
+
+        # Generate various top-k and top-p values
+        k_values = torch.randint(1, 1000, (BATCH_SIZE,), generator=generator)
+        p_values = torch.rand(
+            (BATCH_SIZE,), generator=generator) * 0.5 + 0.5  # in [0.5, 1.0]
+
+        # Sometimes disable top-k (k=vocab_size)
+        k_values.masked_fill_(
+            torch.randint(0, 2, (BATCH_SIZE,), generator=generator,
+                          dtype=torch.bool),
+            VOCAB_SIZE)
+
+        # Sometimes disable top-p (p=1.0)
+        p_values.masked_fill_(
+            torch.randint(0, 2, (BATCH_SIZE,), generator=generator,
+                          dtype=torch.bool),
+            1.0)
+
+        python_logits = apply_top_k_top_p(
+            logits=logits.clone(),
+            k=k_values,
+            p=p_values,
+        )
+        python_probs = torch.softmax(python_logits, dim=-1)
+
+        # FlashInfer only exposes renorm interfaces for probs, so convert first
+        flashinfer_probs = torch.softmax(logits.clone(), dim=-1)
+        flashinfer_probs = top_k_renorm_probs(
+            probs=flashinfer_probs,
+            top_k=k_values,
+        )
+        flashinfer_probs = top_p_renorm_probs(
+            probs=flashinfer_probs,
+            top_p=p_values,
+        )
+
+        # Compare the results
+        assert torch.allclose(python_probs, flashinfer_probs, atol=1e-5), \
+            "FlashInfer and Python sampling implementations do not match!"