@@ -684,34 +684,26 @@ def execute_model(
         inputs_embeds = None
         num_reqs = self.input_batch.num_reqs
 
-        # Temporary debug pathway.
+        with set_forward_context(attn_metadata, self.vllm_config):
+            hidden_states = self.model(
+                input_ids=input_ids,
+                positions=self.position_ids,
+                kv_caches=self.kv_caches,
+                inputs_embeds=inputs_embeds,
+            )
+        # Temporary debug pathway for sampling.
         if self._disable_sampler:
-            with set_forward_context(attn_metadata, self.vllm_config):
-                hidden_states = self.model(
-                    input_ids=input_ids,
-                    positions=self.position_ids,
-                    kv_caches=self.kv_caches,
-                    inputs_embeds=inputs_embeds,
-                )
             selected_token_ids = self.model.compute_logits_no_sampler(
                 hidden_states, logits_indices)
-            selected_token_ids = selected_token_ids.cpu()[:num_reqs]
         else:
             # NOTE (NickLucche) here we sync with TPU: sampling params tensors
             # are copied to device in chunks of pre-compiled padded shape to
             # avoid recompilations.
             tpu_sampling_metadata = TPUSupportedSamplingMetadata.\
                 from_input_batch(self.input_batch, logits_indices)
-            with set_forward_context(attn_metadata, self.vllm_config):
-                hidden_states = self.model(
-                    input_ids=input_ids,
-                    positions=self.position_ids,
-                    kv_caches=self.kv_caches,
-                    inputs_embeds=inputs_embeds,
-                )
-            selected_token_ids = self.model.sample_from_hidden(
-                hidden_states, tpu_sampling_metadata)
-            selected_token_ids = selected_token_ids.cpu()[:num_reqs]
+            selected_token_ids = self.model.sample_from_hidden(
+                hidden_states, tpu_sampling_metadata)
+        selected_token_ids = selected_token_ids.cpu()[:num_reqs]
 
         # Update the cache state concurrently. Code above will not block until
         # we use `selected_token_ids`. Add mark_step if post-processing changes
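The NOTE in the else branch refers to avoiding XLA recompilation: per-request sampling-parameter tensors are padded up to one of a small set of pre-compiled batch shapes before being copied to the TPU, so a new batch size never forces a fresh compile. A minimal sketch of that idea, assuming a hypothetical helper and bucket list (this is not the actual TPUSupportedSamplingMetadata.from_input_batch implementation):

    import torch

    # Hypothetical padded batch sizes assumed to have been pre-compiled by XLA.
    PRECOMPILED_BATCH_SIZES = (8, 16, 32, 64)

    def pad_to_precompiled_shape(params: torch.Tensor,
                                 pad_value: float = 0.0) -> torch.Tensor:
        """Pad a per-request sampling-params tensor to the smallest
        pre-compiled batch size, so the device graph sees a fixed shape."""
        num_reqs = params.shape[0]
        target = next(s for s in PRECOMPILED_BATCH_SIZES if s >= num_reqs)
        padded = torch.full((target, *params.shape[1:]),
                            pad_value,
                            dtype=params.dtype)
        padded[:num_reqs] = params
        return padded  # copy this fixed-shape tensor to device, slice results later

The same reasoning explains the shared `.cpu()[:num_reqs]` slice after the branch: outputs come back in the padded shape, and only the first num_reqs rows correspond to real requests.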