@@ -36,7 +36,7 @@
 from vllm.v1.outputs import (EMPTY_MODEL_RUNNER_OUTPUT, LogprobsTensors,
                              ModelRunnerOutput)
 from vllm.v1.sample.tpu.metadata import TPUSupportedSamplingMetadata
-from vllm.v1.sample.tpu.sampler import MAX_TOP_LOGPROBS_TO_GATHER, Sampler as TPUSampler
+from vllm.v1.sample.tpu.sampler import Sampler as TPUSampler
 from vllm.v1.utils import bind_kv_cache
 from vllm.v1.worker.gpu_input_batch import CachedRequestState, InputBatch
 
@@ -790,7 +790,7 @@ def execute_model(
         # Remove padding on cpu and keep dynamic op outside of xla graph.
         selected_token_ids = selected_token_ids.cpu()[:num_reqs]
         logprobs_lists = logprobs.tolists() \
-            if tpu_sampling_metadata.max_num_logprobs else None
+            if tpu_sampling_metadata.logprobs else None
 
         # Update the cache state concurrently. Code above will not block until
         # we use `selected_token_ids`. Add mark_step if post-processing changes
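The `tolists()` conversion, like the padding removal above it, is deliberately done after `.cpu()`: the dynamically sized slice and the Python list conversion never enter the XLA graph, which only ever sees padded, static shapes. A toy illustration of that host/device boundary (the shapes and names below are made up for the example):

```python
import torch

PADDED_REQS = 8   # device-side batch is padded to a static size
num_reqs = 3      # true batch size, known only on the host

device_tokens = torch.zeros(PADDED_REQS, dtype=torch.long)  # stand-in for an XLA tensor
# Move to host first, then do the dynamic slice and list conversion there,
# so the traced graph keeps a fixed shape regardless of num_reqs.
host_tokens = device_tokens.cpu()[:num_reqs]
token_list = host_tokens.tolist()
```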
@@ -893,7 +893,7 @@ def load_model(self) -> None:
         xm.mark_step()
         xm.wait_device_ops()
         self.model = model
-        self.sampler = TPUSampler()
+        self.sampler = TPUSampler(self.model_config.max_logprobs)
 
     @torch.no_grad()
     def _dummy_run(self, num_tokens: int) -> None:
@@ -1104,16 +1104,17 @@ def _precompile_sample_from_logits(self) -> None:
             # because some operations in the sampler require it to be static.
             for all_greedy in [False, True]:
                 generate_params_if_all_greedy = not all_greedy
-                sampling_metadata = (
-                    TPUSupportedSamplingMetadata.from_input_batch(
-                        self.input_batch,
-                        num_reqs,
-                        self.device,
-                        generate_params_if_all_greedy,
-                    ))
-                print("COMPILING", sampling_metadata.max_num_logprobs)
-                sampling_metadata.all_greedy = all_greedy
-                self.sample_from_logits(dummy_logits, sampling_metadata)
+                for top_logprobs in [False, True]:
+                    sampling_metadata = (
+                        TPUSupportedSamplingMetadata.from_input_batch(
+                            self.input_batch,
+                            num_reqs,
+                            self.device,
+                            generate_params_if_all_greedy,
+                        ))
+                    sampling_metadata.logprobs = top_logprobs
+                    sampling_metadata.all_greedy = all_greedy
+                    self.sample_from_logits(dummy_logits, sampling_metadata)
             logger.info(" -- num_seqs: %d", num_reqs)
         xm.wait_device_ops()
         end = time.perf_counter()
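The new inner loop exists because `sample_from_logits` now branches on `sampling_metadata.logprobs` in host Python, and each branch traces to a different XLA graph; warming all four `(all_greedy, logprobs)` combinations here keeps compilation out of the serving path. A minimal sketch of the pattern, detached from the runner (`precompile_variant` is a hypothetical stand-in, not a vLLM API):

```python
from itertools import product

def precompile_variant(all_greedy: bool, want_logprobs: bool) -> None:
    # Stand-in for self.sample_from_logits(dummy_logits, sampling_metadata):
    # tracing once per flag combination caches one XLA graph per Python branch.
    ...

# Every boolean read on the host (rather than inside the traced tensor ops)
# multiplies the number of graphs, so all combinations are compiled up front.
for all_greedy, want_logprobs in product([False, True], repeat=2):
    precompile_variant(all_greedy, want_logprobs)
```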
@@ -1256,14 +1257,19 @@ def sample_from_logits(
         """
         Sample with xla-friendly function. This function is to be traced
         separately from `forward` for lighter compilation overhead.
+        Optionally (in a separate graph) returns top-logprobs too, by gathering
+        a fixed maximum number of logprobs for the whole batch, 20 by default.
         """
         if sampling_metadata.all_greedy:
             out_tokens = torch.argmax(logits, dim=-1, keepdim=True)
-            # TODO skip if not needed and compile for it
-            logprobs = self.sampler.compute_logprobs(logits)
-            logprobs_tensors = self.sampler.gather_logprobs(logprobs,
-                MAX_TOP_LOGPROBS_TO_GATHER,
-                token_ids=out_tokens.squeeze(-1))
+            if sampling_metadata.logprobs:
+                logprobs = self.sampler.compute_logprobs(logits)
+                logprobs_tensors = self.sampler.gather_logprobs(
+                    logprobs,
+                    self.model_config.max_logprobs,
+                    token_ids=out_tokens.squeeze(-1))
+            else:
+                logprobs_tensors = None
         else:
             sampler_out = self.sampler(logits, sampling_metadata)
             out_tokens = sampler_out.sampled_token_ids
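On the greedy path, `compute_logprobs` plus `gather_logprobs` boil down to a log-softmax followed by a fixed-width top-k and a gather of the chosen token's logprob, all with static shapes so the graph does not depend on how many logprobs each request asked for. A rough, self-contained approximation of that step in plain PyTorch (not the actual `TPUSampler` methods):

```python
import torch

def gather_top_logprobs(logits: torch.Tensor, sampled: torch.Tensor,
                        max_logprobs: int) -> tuple[torch.Tensor, torch.Tensor]:
    """Approximate the greedy-path logprob gathering with static shapes.

    logits:  [num_reqs, vocab_size]
    sampled: [num_reqs] token ids picked by argmax
    Returns (token_ids, logprobs), each [num_reqs, max_logprobs + 1], with the
    sampled token in column 0 and the batch-wide top-k after it.
    """
    logprobs = logits.log_softmax(dim=-1, dtype=torch.float32)
    topk_vals, topk_ids = logprobs.topk(max_logprobs, dim=-1)
    sampled_lp = logprobs.gather(-1, sampled.unsqueeze(-1))
    return (torch.cat([sampled.unsqueeze(-1), topk_ids], dim=-1),
            torch.cat([sampled_lp, topk_vals], dim=-1))

# Per-request `logprobs=k` limits can then be applied on the host from these
# fixed-width tensors, after they leave the device.
```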