
Commit 50ef555

fix recompilation issue on sampling graph; add new tpu sampler
Signed-off-by: NickLucche <[email protected]>
1 parent 10e1a04 commit 50ef555

File tree: 5 files changed (+184, -18 lines)

vllm/v1/sample/ops/topk_topp_sampler.py

Lines changed: 3 additions & 1 deletion
@@ -89,7 +89,9 @@ def forward_tpu(
         p: Optional[torch.Tensor],
     ) -> torch.Tensor:
         # TODO Placeholder for TPU optimized topk/p kernel
-        return self.forward_native(logits, generators, k, p)
+        # logits = apply_top_k_top_p(logits, k, p)
+        probs = logits.softmax(dim=-1, dtype=torch.float32)
+        return random_sample(probs, generators)
 
 
 def apply_top_k_top_p(
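Note: forward_tpu no longer delegates to forward_native; it now skips the top-k/top-p masking (left commented out pending a TPU-optimized kernel) and samples directly from the softmax probabilities. Below is a minimal, self-contained sketch of the exponential-noise categorical sampling that a helper like random_sample performs; the function name and the per-request generator handling here are illustrative, not vLLM's exact implementation.

import torch

def sample_from_probs(probs: torch.Tensor,
                      generators: dict[int, torch.Generator]) -> torch.Tensor:
    # Dividing probabilities by Exp(1) noise and taking the argmax draws from
    # the categorical distribution without a sort, so the traced graph keeps
    # a fixed shape regardless of per-request sampling parameters.
    q = torch.empty_like(probs)
    q.exponential_()
    # Seeded requests overwrite their noise row using their own generator.
    for i, generator in generators.items():
        q[i].exponential_(generator=generator)
    return probs.div(q).argmax(dim=-1).view(-1)

# Example: two requests over a 4-token vocabulary, request 1 seeded.
probs = torch.tensor([[0.1, 0.2, 0.3, 0.4],
                      [0.7, 0.1, 0.1, 0.1]])
print(sample_from_probs(probs, {1: torch.Generator().manual_seed(0)}))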

vllm/v1/sample/tpu/__init__.py

Whitespace-only changes.

vllm/v1/sample/tpu_metadata.py renamed to vllm/v1/sample/tpu/metadata.py

Lines changed: 1 addition & 3 deletions
@@ -8,8 +8,6 @@
 from vllm.v1.sample.metadata import SamplingMetadata
 
 
-# TODO (NickLucche) keep in sync with SamplingMetadata until we can drop
-# this class and support most options.
 @dataclass
 class TPUSupportedSamplingMetadata:
     # This class exposes a more xla-friendly interface than SamplingMetadata
@@ -158,4 +156,4 @@ def _get_default_params_values():
         # frequency_penalties=(0.0, torch.float32),
         # presence_penalties=(0.0, torch.float32),
         # repetition_penalties=(0.0, torch.float32),
-    )
+    )

vllm/v1/sample/tpu/sampler.py

Lines changed: 154 additions & 0 deletions
@@ -0,0 +1,154 @@
+# SPDX-License-Identifier: Apache-2.0
+"""Sampler layer implementing TPU supported operations."""
+
+import torch
+import torch.nn as nn
+
+from vllm.v1.outputs import LogprobsTensors, SamplerOutput
+from vllm.v1.sample.ops.topk_topp_sampler import TopKTopPSampler
+from vllm.v1.sample.tpu.metadata import TPUSupportedSamplingMetadata
+
+_SAMPLING_EPS = 1e-5
+
+
+class Sampler(nn.Module):
+
+    def __init__(self):
+        super().__init__()
+        self.topk_topp_sampler = TopKTopPSampler()
+
+    def forward(
+        self,
+        logits: torch.Tensor,
+        sampling_metadata: TPUSupportedSamplingMetadata,
+    ) -> SamplerOutput:
+        # NOTE(woosuk): Use the original logits (before any penalties or
+        # temperature scaling) for the top-k logprobs.
+        # This is different from the V0 sampler, which uses the logits that
+        # are used for sampling (after penalties and temperature scaling).
+
+        # Use float32 for the logits.
+        logits = logits.to(torch.float32)
+        # Sample the next token.
+        sampled = self.sample(logits, sampling_metadata)
+
+        # Use int32 to reduce the tensor size.
+        sampled = sampled.to(torch.int32)
+
+        # These are GPU tensors.
+        sampler_output = SamplerOutput(
+            # The sampled tokens are expanded to 2D tensor with shape
+            # [num_requests, 1], where each row represents one generated
+            # token per request.
+            sampled_token_ids=sampled.unsqueeze(-1),
+            logprobs_tensors=None,
+        )
+        return sampler_output
+
+    def apply_temperature(
+        self,
+        logits: torch.Tensor,
+        temp: torch.Tensor,
+    ) -> torch.Tensor:
+        # Use in-place division to avoid creating a new tensor.
+        return logits.div_(temp.unsqueeze(dim=1))
+
+    def greedy_sample(self, logits: torch.Tensor) -> torch.Tensor:
+        return logits.argmax(dim=-1).view(-1)
+
+    def sample(
+        self,
+        logits: torch.Tensor,
+        sampling_metadata: TPUSupportedSamplingMetadata,
+    ) -> torch.Tensor:
+        greedy_sampled = self.greedy_sample(logits)
+
+        assert sampling_metadata.temperature is not None
+
+        # Apply temperature.
+        logits = self.apply_temperature(logits, sampling_metadata.temperature)
+
+        # Apply min_p.
+        if sampling_metadata.min_p is not None:
+            logits = self.apply_min_p(logits, sampling_metadata.min_p)
+
+        # Apply top_k and/or top_p.
+        random_sampled = self.topk_topp_sampler(
+            logits,
+            sampling_metadata.generators,
+            sampling_metadata.top_k,
+            sampling_metadata.top_p,
+        )
+
+        sampled = torch.where(sampling_metadata.temperature < _SAMPLING_EPS,
+                              greedy_sampled, random_sampled)
+        return sampled
+
+    def compute_logprobs(self, logits: torch.Tensor) -> torch.Tensor:
+        return logits.log_softmax(dim=-1, dtype=torch.float32)
+
+    def gather_logprobs(
+        self,
+        logprobs: torch.Tensor,
+        num_logprobs: int,
+        token_ids: torch.Tensor,
+    ) -> LogprobsTensors:
+        """
+        Gather logprobs for topk and sampled/prompt token.
+
+        Args:
+            logprobs: (num tokens) x (vocab) tensor
+            num_logprobs: minimum number of logprobs to
+                          retain per token
+            token_ids: prompt tokens (if prompt logprobs)
+                       or sampled tokens (if sampled
+                       logprobs); 1D token ID tensor
+                       with (num tokens) elements
+
+        Returns:
+            Top-k int indices tensor, (num tokens) x (num_logprobs + 1)
+            Top-k float logprobs tensor, (num tokens) x (num_logprobs + 1)
+            Sampled token rank tensor, (num tokens)
+        """
+        # Find the topK values.
+        topk_logprobs, topk_indices = torch.topk(logprobs,
+                                                 num_logprobs,
+                                                 dim=-1)
+
+        # Get the logprob of the prompt or sampled token.
+        token_ids = token_ids.unsqueeze(-1)
+        token_logprobs = logprobs.gather(-1, token_ids)
+
+        # Compute the ranks of the actual token.
+        token_ranks = (logprobs >= token_logprobs).sum(-1)
+
+        # Concatenate together with the topk.
+        indices = torch.cat((token_ids, topk_indices), dim=1)
+        logprobs = torch.cat((token_logprobs, topk_logprobs), dim=1)
+
+        # Use int32 to reduce the tensor size.
+        indices = indices.to(torch.int32)
+
+        return LogprobsTensors(indices, logprobs, token_ranks)
+
+    def apply_min_p(
+        self,
+        logits: torch.Tensor,
+        min_p: torch.Tensor,
+    ) -> torch.Tensor:
+        """
+        Filters logits using adaptive probability thresholding.
+        """
+        # Convert logits to probability distribution
+        probability_values = torch.nn.functional.softmax(logits, dim=-1)
+        # Calculate maximum probabilities per sequence
+        max_probabilities = torch.amax(probability_values,
+                                       dim=-1,
+                                       keepdim=True)
+        # Reshape min_p for broadcasting
+        adjusted_min_p = min_p.unsqueeze(1) * max_probabilities
+        # Identify valid tokens using threshold comparison
+        valid_token_mask = probability_values >= adjusted_min_p
+        # Apply mask using boolean indexing (xla friendly)
+        logits.masked_fill_(~valid_token_mask, -float("inf"))
+        return logits
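The design worth noting in this new Sampler: both the greedy and the random branch are computed on every step, and torch.where selects per request based on the temperature tensor. There is no Python-level branching on tensor values, so a single compiled graph serves any mix of greedy and sampled requests. Below is a compressed, self-contained sketch of that pattern, using torch.multinomial in place of the TopKTopPSampler for brevity; it is an illustration of the idea, not the committed code.

import torch

_SAMPLING_EPS = 1e-5

def blended_sample(logits: torch.Tensor,
                   temperature: torch.Tensor) -> torch.Tensor:
    # Greedy branch: plain argmax over the unscaled logits.
    greedy = logits.argmax(dim=-1)
    # Random branch: temperature-scaled softmax, then one categorical draw.
    # clamp_min avoids dividing by zero on greedy (temperature == 0) rows.
    scaled = logits / temperature.clamp_min(_SAMPLING_EPS).unsqueeze(-1)
    random = torch.multinomial(scaled.softmax(dim=-1), num_samples=1).squeeze(-1)
    # Per-request selection; no control flow depends on tensor values.
    return torch.where(temperature < _SAMPLING_EPS, greedy, random)

# Request 0 is greedy (temperature 0), request 1 samples at temperature 0.8.
logits = torch.randn(2, 32)
temperature = torch.tensor([0.0, 0.8])
print(blended_sample(logits, temperature))

Computing the branch that is ultimately discarded costs a little extra arithmetic per step, but it keeps the set of compiled graphs fixed across batches, which is the usual trade-off XLA-friendly code makes.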

vllm/v1/worker/tpu_model_runner.py

Lines changed: 26 additions & 14 deletions
@@ -23,16 +23,16 @@
 from vllm.sampling_params import SamplingType
 from vllm.sequence import IntermediateTensors
 from vllm.utils import LayerBlockType, cdiv, is_pin_memory_available
-from vllm.v1.attention.backends.pallas import (PallasAttentionBackend,
+from vllm.v1.attention.backends.pallas import (NUM_KV_PAGES_PER_BLOCK,
+                                               PallasAttentionBackend,
                                                PallasMetadata)
 from vllm.v1.core.encoder_cache_manager import compute_encoder_budget
 from vllm.v1.kv_cache_interface import (FullAttentionSpec, KVCacheConfig,
                                         KVCacheSpec)
 from vllm.v1.outputs import (EMPTY_MODEL_RUNNER_OUTPUT, LogprobsTensors,
                              ModelRunnerOutput, SamplerOutput)
-from vllm.v1.sample.metadata import SamplingMetadata
-from vllm.v1.sample.sampler import Sampler
-from vllm.v1.sample.tpu_metadata import TPUSupportedSamplingMetadata
+from vllm.v1.sample.tpu.metadata import TPUSupportedSamplingMetadata
+from vllm.v1.sample.tpu.sampler import Sampler as TPUSampler
 from vllm.v1.utils import bind_kv_cache
 from vllm.v1.worker.gpu_input_batch import CachedRequestState, InputBatch
 
@@ -142,9 +142,10 @@ def __init__(
                                             dtype=torch.int64,
                                             device="cpu")
         self.slot_mapping_np = self.slot_mapping_cpu.numpy()
-
+        padded_max_num_blocks_per_req = _get_padded_number(
+            self.max_num_blocks_per_req, NUM_KV_PAGES_PER_BLOCK)
         self.block_table_cpu = torch.zeros(
-            (self.max_num_tokens, self.max_num_blocks_per_req),
+            (self.max_num_tokens, padded_max_num_blocks_per_req),
             dtype=self.input_batch.block_table.get_cpu_tensor().dtype,
             device="cpu")
 
@@ -595,8 +596,6 @@ def execute_model(
         tpu_sampling_metadata = TPUSupportedSamplingMetadata.\
             from_sampling_metadata(sampling_metadata, logits_indices,
                                    num_reqs, self.device)
-        # Make mypy happy
-        sampling_metadata = cast(SamplingMetadata, tpu_sampling_metadata)
         # Run the decoder
         with set_forward_context(attn_metadata, self.vllm_config):
             hidden_states = self.model(
@@ -605,9 +604,8 @@ def execute_model(
                 kv_caches=self.kv_caches,
                 inputs_embeds=inputs_embeds,
             )
-        xm.mark_step()  # break model graph
         selected_token_ids = self.model.sample_from_hidden(
-            hidden_states, sampling_metadata)
+            hidden_states, tpu_sampling_metadata)
         # Remove padding on cpu and keep dynamic op outside of xla graph.
         selected_token_ids = selected_token_ids.cpu()[:num_reqs]
 
@@ -780,11 +778,21 @@ def capture_model(self) -> None:
         # n_tokens x max_num_reqs. Graph is really small so this is fine.
         while True:
             num_reqs_to_sample = MIN_NUM_SEQS
-            dummy_hidden = torch.randn((num_tokens, hsize), device=device)
+            dummy_hidden = torch.randn((num_tokens, hsize),
+                                       device=device,
+                                       dtype=torch.bfloat16)
             while True:
-                # To allow sampling, trace with all supported sampling args.
+                # Default metadata is an all_greedy setup. But since the
+                # `do_argmax` flag is a tensor, we still compile the full graph
+                meta = self.input_batch.sampling_metadata
+                indices = torch.zeros(
+                    num_reqs_to_sample,
+                    dtype=torch.int32,
+                    device=device,
+                )
                 sampling_meta = TPUSupportedSamplingMetadata.\
-                    get_default_sampling_params(num_reqs_to_sample, device)
+                    from_sampling_metadata(meta, indices,
+                                           num_reqs_to_sample, device)
                 logger.info(" -- num_tokens: %d, num_seqs: %d", num_tokens,
                             num_reqs_to_sample)
                 self.model.sample_from_hidden(dummy_hidden, sampling_meta)
@@ -843,7 +851,7 @@ class ModelWrapperV1(nn.Module):
     def __init__(self, model: nn.Module):
        super().__init__()
        self.model = model
-        self.sampler = Sampler()
+        self.sampler = TPUSampler()
 
     def sample(
         self, logits: torch.Tensor,
@@ -912,6 +920,10 @@ def get_input_embeddings(self, *args, **kwargs):
         return self.model.get_input_embeddings(*args, **kwargs)
 
 
+def _get_padded_number(n: int, multiple: int) -> int:
+    return ((n + multiple - 1) // multiple) * multiple
+
+
 def _get_padded_token_len(x: int) -> int:
     if x <= 16:
         return 16
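For reference, the new _get_padded_number helper rounds the block table's per-request width up to a multiple of NUM_KV_PAGES_PER_BLOCK, presumably so the shape handed to the Pallas attention backend stays fixed. A quick illustration of the rounding (16 is an arbitrary stand-in below, not the real NUM_KV_PAGES_PER_BLOCK value):

def _get_padded_number(n: int, multiple: int) -> int:
    # Round n up to the nearest multiple.
    return ((n + multiple - 1) // multiple) * multiple

assert _get_padded_number(1, 16) == 16
assert _get_padded_number(16, 16) == 16
assert _get_padded_number(17, 16) == 32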
