
Commit a454748

[TPU][V1] Refine tpu_model_runner to mitigate future recompilation issues (#16275)
Signed-off-by: Chengji Yao <[email protected]>
Parent: 1bff42c

4 files changed: +165 -124 lines changed


Diff for: tests/tpu/test_compilation.py

+10 -4

@@ -44,23 +44,29 @@ def test_tpu_compilation():
     assert generated_text.startswith(answer)
 
     compiled_codes = sorted(
-        glob.glob(os.path.join(temp_dir, "__transformed_code*.py")))
+        glob.glob(os.path.join(temp_dir, "__transformed_code*for_forward.py")))
 
     for i, compiled_code in enumerate(compiled_codes):
         print("{} file: {}".format(i + 1, compiled_code))
 
     # We should only trigger Dynamo compilation 2 times:
     # 1. Forward pass without kv_caches
     # 2. Forward pass with kv_caches
-    # Check we have 4 compiled codes
+    # Check we have 2 compiled codes
     assert len(compiled_codes) == 2
 
     kv_cache_prefix = "kv_cache"
     attn_prefix = "ragged_paged_attention"
 
+    def extract_compiled_index(s):
+        parts = s.replace(".", "_").split("_")
+        numbers = [int(part) for part in parts if part.isdigit()]
+        return numbers[0]
+
     # Check all the compilations are as expected
-    compiled_fns = sorted(
-        glob.glob(os.path.join(temp_dir, "__compiled_fn*Captured*.py")))
+    compiled_fns = sorted(glob.glob(
+        os.path.join(temp_dir, "__compiled_fn*Captured*.py")),
+                          key=lambda s: extract_compiled_index(s))
 
     for i, compiled_fn in enumerate(compiled_fns):
         print("{} file: {}".format(i + 1, compiled_fn))

Diff for: tests/v1/tpu/worker/test_tpu_model_runner.py

+19 -6

@@ -7,9 +7,9 @@
 from vllm.sampling_params import SamplingParams
 from vllm.v1.core.sched.output import (CachedRequestData, NewRequestData,
                                        SchedulerOutput)
-from vllm.v1.worker.tpu_model_runner import (TPUModelRunner,
-                                             _get_padded_token_len,
-                                             _get_paddings)
+from vllm.v1.worker.tpu_model_runner import (
+    TPUModelRunner, _get_padded_num_reqs_with_upper_limit,
+    _get_padded_token_len, _get_req_paddings, _get_token_paddings)
 
 # Mock torch_xla module since it may not be available in the test environments
 torch_xla_patcher = mock.patch.dict(
@@ -296,16 +296,29 @@ def test_update_states_request_unscheduled(model_runner):
 def test_get_paddings():
     min_token_size, max_token_size, padding_gap = 16, 512, 64
     expected_paddings = [16, 32, 64, 128, 192, 256, 320, 384, 448, 512]
-    actual_paddings = _get_paddings(min_token_size, max_token_size,
-                                    padding_gap)
+    actual_paddings = _get_token_paddings(min_token_size, max_token_size,
+                                          padding_gap)
     assert actual_paddings == expected_paddings
 
 
 def test_get_padded_token_len():
     min_token_size, max_token_size, padding_gap = 16, 512, 64
-    paddings = _get_paddings(min_token_size, max_token_size, padding_gap)
+    paddings = _get_token_paddings(min_token_size, max_token_size, padding_gap)
     assert _get_padded_token_len(paddings, 1) == 16
     assert _get_padded_token_len(paddings, 16) == 16
     assert _get_padded_token_len(paddings, 20) == 32
     assert _get_padded_token_len(paddings, 300) == 320
     assert _get_padded_token_len(paddings, 512) == 512
+
+
+def test_get_padded_num_reqs_with_upper_limit():
+    assert _get_padded_num_reqs_with_upper_limit(3, 32) == 8
+    assert _get_padded_num_reqs_with_upper_limit(9, 32) == 16
+    assert _get_padded_num_reqs_with_upper_limit(19, 32) == 32
+    assert _get_padded_num_reqs_with_upper_limit(17, 28) == 28
+
+
+def test_get_req_paddings():
+    assert _get_req_paddings(1, 32) == [8, 16, 32]
+    assert _get_req_paddings(8, 32) == [8, 16, 32]
+    assert _get_req_paddings(8, 36) == [8, 16, 32, 36]
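The tests above pin down the padding buckets the runner uses to keep XLA input shapes stable. Purely as an illustration, here is one set of helpers consistent with those asserts; it is inferred from the expected values, not copied from vllm/v1/worker/tpu_model_runner.py, whose actual implementation may differ:

# Sketch inferred from the test expectations above; not the vLLM implementation.
MIN_NUM_SEQS = 8  # assumed lower bound for request padding

def _get_padded_num_reqs_with_upper_limit(x: int, upper_limit: int) -> int:
    # Round up to the next power of two (at least MIN_NUM_SEQS), capped at the limit.
    res = MIN_NUM_SEQS if x <= MIN_NUM_SEQS else 1 << (x - 1).bit_length()
    return min(res, upper_limit)

def _get_req_paddings(min_req: int, max_req: int) -> list[int]:
    # Power-of-two request buckets, always ending exactly at max_req.
    paddings = []
    num = _get_padded_num_reqs_with_upper_limit(min_req, max_req)
    while num < max_req:
        paddings.append(num)
        num *= 2
    paddings.append(max_req)
    return paddings

def _get_token_paddings(min_token_size: int, max_token_size: int,
                        padding_gap: int) -> list[int]:
    # Exponential buckets up to padding_gap, then linear steps of padding_gap.
    paddings, num = [], min_token_size
    while num <= padding_gap:
        paddings.append(num)
        num *= 2
    while num <= max_token_size:
        paddings.append(num)
        num += padding_gap
    return paddings

def _get_padded_token_len(paddings: list[int], x: int) -> int:
    # Smallest pre-compiled bucket that fits x tokens.
    return next(p for p in paddings if p >= x)

Padding request and token counts to a small fixed set of buckets is what lets the runner pre-compile one graph per bucket instead of recompiling for every new batch size.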

Diff for: vllm/v1/sample/tpu/metadata.py

+35 -42

@@ -3,7 +3,6 @@
 from typing import Optional
 
 import torch
-import torch_xla.core.xla_model as xm
 
 from vllm.v1.worker.gpu_input_batch import InputBatch
 
@@ -24,15 +23,15 @@ class TPUSupportedSamplingMetadata:
     # This class exposes a more xla-friendly interface than SamplingMetadata
     # on TPU, in particular all arguments should be traceable and no optionals
     # are allowed, to avoid graph recompilation on Nones.
-    temperature: torch.Tensor
+    temperature: torch.Tensor = None
 
-    min_p: torch.Tensor
+    min_p: torch.Tensor = None
     # Still too slow on forward_native!
     top_k: torch.Tensor = None
     top_p: torch.Tensor = None
 
     # Greedy sampling flag for compiling single xla graph.
-    all_greedy: torch.Tensor = None
+    all_greedy: bool = True
 
     # Generator not supported by xla
     generators: dict[int,
@@ -57,64 +56,58 @@ class TPUSupportedSamplingMetadata:
 
     allowed_token_ids_mask = None
     bad_words_token_ids = None
-    indices_do_sample: torch.Tensor = None
 
     @classmethod
     def from_input_batch(
-            cls, input_batch: InputBatch,
-            indices_do_sample: torch.Tensor) -> "TPUSupportedSamplingMetadata":
+        cls,
+        input_batch: InputBatch,
+        padded_num_reqs: int,
+        xla_device: torch.device,
+        generate_params_if_all_greedy: bool = False
+    ) -> "TPUSupportedSamplingMetadata":
         """
         Copy sampling tensors slices from `input_batch` to on device tensors.
 
         `InputBatch._make_sampling_metadata` causes recompilation on XLA as it
         slices dynamic shapes on device tensors. This impl moves the dynamic
-        ops to CPU and produces tensors of fixed `padded_num_reqs` size. It
-        also reuses the on-device persistent tensors managed in `input_batch`
-        to reduce waste.
-
-        `indices_do_sample` contains the indices to be fed to the Sampler,
-        normally one per request, here padded to the closest pre-compiled shape
-        We expect sampling params tensors to be padded to the same fixed shape.
-
-        Eg. 3 requests, tensors padded to 4
-            temperature: [0.7, 0.2, 0.9]=>[0.7, 0.2, 0.9, 0.0]
-            sample indices: [4, 10, 11]=>indices_do_sample: [4, 10, 11, 0]
+        ops to CPU and produces tensors of fixed `padded_num_reqs` size.
+
+        Args:
+            input_batch: The input batch containing sampling parameters.
+            padded_num_reqs: The padded number of requests.
+            xla_device: The XLA device.
+            generate_params_if_all_greedy: If True, generate sampling parameters
+                even if all requests are greedy. this is useful for cases where
+                we want to pre-compile a graph with sampling parameters, even if
+                they are not strictly needed for greedy decoding.
         """
+        # Early return to avoid unnecessary cpu to tpu copy
+        if (input_batch.all_greedy is True
+                and generate_params_if_all_greedy is False):
+            return cls(all_greedy=True)
+
         num_reqs = input_batch.num_reqs
-        padded_num_reqs = len(indices_do_sample)
 
-        def copy_slice(cpu_tensor: torch.Tensor, tpu_tensor: torch.Tensor,
-                       fill_val) -> torch.Tensor:
-            # Copy slice from CPU to corresponding TPU pre-allocated tensor.
+        def fill_slice(cpu_tensor: torch.Tensor, fill_val) -> torch.Tensor:
            # Pad value is the default one.
            cpu_tensor[num_reqs:padded_num_reqs] = fill_val
-            # Subtle compilation: len(tpu_tensor) must be >= `padded_num_reqs`
-            tpu_tensor[:padded_num_reqs] = cpu_tensor[:padded_num_reqs]
 
-        # NOTE NickLucche The sync CPU-TPU graph we produce here must be
-        # consistent. We can't have flags to skip copies or we'll end up
-        # recompiling.
-        copy_slice(input_batch.temperature_cpu_tensor, input_batch.temperature,
+        fill_slice(input_batch.temperature_cpu_tensor,
                    DEFAULT_SAMPLING_PARAMS["temperature"])
         # TODO Temporarily disabled until sampling options are enabled
-        # copy_slice(input_batch.top_p_cpu_tensor, input_batch.top_p)
-        # copy_slice(input_batch.top_k_cpu_tensor, input_batch.top_k)
-        copy_slice(input_batch.min_p_cpu_tensor, input_batch.min_p,
+        # fill_slice(input_batch.top_p_cpu_tensor)
+        # fill_slice(input_batch.top_k_cpu_tensor)
+        fill_slice(input_batch.min_p_cpu_tensor,
                    DEFAULT_SAMPLING_PARAMS["min_p"])
 
-        xm.mark_step()
-        xm.wait_device_ops()
-
        # Slice persistent device tensors to a fixed pre-compiled padded shape.
        return cls(
-            temperature=input_batch.temperature[:padded_num_reqs],
-            # Scalar tensor for xla-friendly tracing.
-            all_greedy=torch.tensor(input_batch.all_greedy,
-                                    dtype=torch.bool,
-                                    device=input_batch.device),
+            temperature=input_batch.temperature_cpu_tensor[:padded_num_reqs].
+            to(xla_device),
+            all_greedy=input_batch.all_greedy,
            # TODO enable more and avoid returning None values
            top_p=None,  # input_batch.top_p[:padded_num_reqs],
            top_k=None,  # input_batch.top_k[:padded_num_reqs],
-            min_p=input_batch.min_p[:padded_num_reqs],
-            generators=input_batch.generators,
-            indices_do_sample=indices_do_sample)
+            min_p=input_batch.min_p_cpu_tensor[:padded_num_reqs].to(
+                xla_device),
+            generators=input_batch.generators)
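Taken together, this diff drops `indices_do_sample` and the explicit `xm.mark_step()` / `xm.wait_device_ops()` sync: the caller now supplies the padded request count and the XLA device, padding happens on CPU tensors, and a fully greedy batch skips the CPU-to-TPU copy entirely. A hedged sketch of a call site follows; `input_batch` and `max_num_reqs` stand in for the runner's own state, and the real call in tpu_model_runner.py may differ:

# Illustrative caller; not runnable on its own since `input_batch` and
# `max_num_reqs` are placeholders for the runner's state.
import torch_xla.core.xla_model as xm
from vllm.v1.sample.tpu.metadata import TPUSupportedSamplingMetadata
from vllm.v1.worker.tpu_model_runner import _get_padded_num_reqs_with_upper_limit

xla_device = xm.xla_device()
padded_num_reqs = _get_padded_num_reqs_with_upper_limit(
    input_batch.num_reqs, max_num_reqs)
sampling_metadata = TPUSupportedSamplingMetadata.from_input_batch(
    input_batch, padded_num_reqs, xla_device)

if sampling_metadata.all_greedy:
    # Early-return path: no sampling tensors were copied to the TPU.
    ...
else:
    # temperature/min_p are fixed-size [padded_num_reqs] tensors already on
    # the XLA device, so the sampling graph traces with a stable shape.
    ...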
