
Commit 28ad766

[V1] TPU - Add tensor parallel support via Ray
Signed-off-by: Alexander Matveev <[email protected]>
1 parent: 4167252

File tree

4 files changed: +24 -3 lines changed

tests/entrypoints/llm/test_accuracy.py

Lines changed: 8 additions & 0 deletions

@@ -42,6 +42,10 @@ def run_test(more_args=None):
     ), f"Expected: {EXPECTED_VALUE} | Measured: {measured_value}"


+# TODO: [AlexM] Fix it with new CI/CD tests
+TPU_TP_TEST_STR = ""  # "tensor_parallel_size=4"
+
+
 @pytest.mark.skipif(not current_platform.is_cuda()
                     and not current_platform.is_tpu(),
                     reason="V1 is currently only supported on CUDA and TPU")
@@ -56,6 +60,10 @@ def test_lm_eval_accuracy_v1_engine(monkeypatch):
         # Limit compilation time for TPU V1
         more_args = "max_num_seqs=64"

+        # Add TP test (if provided)
+        if TPU_TP_TEST_STR:
+            more_args += ",{}".format(TPU_TP_TEST_STR)
+
     run_test(more_args)
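
The test builds engine options as a comma-separated key=value string. For context, a minimal sketch of how such a string could be split back into keyword arguments (parse_more_args is a hypothetical helper for illustration, not something the test defines):

# Hypothetical helper (not part of the commit): parse a comma-separated
# "key=value" string, like the one the test builds, into a kwargs dict,
# casting integer-looking values.
def parse_more_args(more_args: str) -> dict:
    kwargs = {}
    for pair in more_args.split(","):
        key, _, value = pair.partition("=")
        kwargs[key] = int(value) if value.isdigit() else value
    return kwargs

# Example: the string the test would build with the TP option enabled.
assert parse_more_args("max_num_seqs=64,tensor_parallel_size=4") == {
    "max_num_seqs": 64,
    "tensor_parallel_size": 4,
}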

vllm/executor/ray_distributed_executor.py

Lines changed: 6 additions & 1 deletion

@@ -54,9 +54,14 @@ class RayDistributedExecutor(DistributedExecutorBase):
     def _init_executor(self) -> None:
         self.forward_dag: Optional[ray.dag.CompiledDAG] = None
         if envs.VLLM_USE_V1:
-            # v1 always uses the compiled DAG and SPMD worker.
+            # V1 uses SPMD worker and compiled DAG
             os.environ["VLLM_USE_RAY_SPMD_WORKER"] = "1"
             os.environ["VLLM_USE_RAY_COMPILED_DAG"] = "1"
+
+            # For TPU, avoid compiling NVIDIA's NCCL
+            if current_platform.is_tpu():
+                os.environ["VLLM_USE_RAY_COMPILED_DAG_NCCL_CHANNEL"] = "0"
+
         # If the env var is set, it uses the Ray's compiled DAG API
         # which optimizes the control plane overhead.
         # Run vLLM with VLLM_USE_RAY_COMPILED_DAG=1 to enable it.
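
Putting the executor change in context: a tensor-parallel V1 run on TPU would be launched through Ray roughly as below. This is a minimal usage sketch, assuming a TPU host with four chips; the model name is a placeholder, and the comments restate what this commit's code does rather than adding guarantees:

import os
os.environ["VLLM_USE_V1"] = "1"  # select the V1 engine

from vllm import LLM, SamplingParams

# With tensor_parallel_size > 1 and the Ray backend, vLLM routes execution
# through RayDistributedExecutor, which (per this commit) enables the SPMD
# worker and compiled DAG, and on TPU disables the NCCL channel via
# VLLM_USE_RAY_COMPILED_DAG_NCCL_CHANNEL=0.
llm = LLM(
    model="Qwen/Qwen2-1.5B-Instruct",  # placeholder model
    tensor_parallel_size=4,
    distributed_executor_backend="ray",
    max_num_seqs=64,  # limits TPU compilation time, as in the test above
)
outputs = llm.generate(["Hello, TPU!"], SamplingParams(max_tokens=16))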

vllm/executor/ray_utils.py

Lines changed: 8 additions & 2 deletions

@@ -6,11 +6,13 @@
 from typing import TYPE_CHECKING, Dict, List, Optional, Tuple, Union

 import msgspec
+import torch

 import vllm.platforms
 from vllm.config import ParallelConfig
 from vllm.executor.msgspec_utils import decode_hook, encode_hook
 from vllm.logger import init_logger
+from vllm.platforms import current_platform
 from vllm.sequence import ExecuteModelRequest, IntermediateTensors
 from vllm.utils import get_ip
 from vllm.worker.worker_base import WorkerWrapperBase
@@ -106,10 +108,14 @@ def setup_device_if_necessary(self):
         # on a background thread, so we need to reset torch's current
         # device.
         # We can remove this API after it is fixed in compiled graph.
-        import torch
         assert self.worker is not None, "Worker is not initialized"
         if not self.compiled_dag_cuda_device_set:
-            torch.cuda.set_device(self.worker.device)
+            if current_platform.is_tpu():
+                # TODO: [AlexM] Verify if set_device is necessary here
+                pass
+            else:
+                torch.cuda.set_device(self.worker.device)
+
             self.compiled_dag_cuda_device_set = True

     def execute_model_ray(
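
The TODO in setup_device_if_necessary leaves the TPU branch as a deliberate no-op. If an explicit device pin ever proves necessary on TPU, it would presumably go through torch_xla rather than torch.cuda; a sketch of the dispatch shape, with the torch_xla path shown only as a commented-out assumption:

import torch
from vllm.platforms import current_platform

def set_worker_device(device: torch.device) -> None:
    """Sketch of the platform dispatch used in setup_device_if_necessary."""
    if current_platform.is_tpu():
        # No-op in this commit (see the TODO above). If a pin were needed,
        # it might go through torch_xla (unverified assumption):
        # import torch_xla.core.xla_model as xm
        # device = xm.xla_device()
        pass
    else:
        torch.cuda.set_device(device)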

vllm/v1/worker/tpu_model_runner.py

Lines changed: 2 additions & 0 deletions

@@ -18,6 +18,7 @@
 from vllm.logger import init_logger
 from vllm.model_executor.model_loader import get_model
 from vllm.sampling_params import SamplingType
+from vllm.sequence import IntermediateTensors
 from vllm.utils import LayerBlockType, cdiv, is_pin_memory_available
 from vllm.v1.attention.backends.pallas import (NUM_KV_PAGES_PER_BLOCK,
                                                NUM_QUERIES_PER_BLOCK,
@@ -430,6 +431,7 @@ def _prepare_inputs(self, scheduler_output: "SchedulerOutput"):
     def execute_model(
         self,
         scheduler_output: "SchedulerOutput",
+        intermediate_tensors: Optional[IntermediateTensors] = None,
     ) -> ModelRunnerOutput:
         # Update cached state
         self._update_states(scheduler_output)
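
The new intermediate_tensors parameter brings the TPU runner's execute_model signature in line with the GPU model runners, where a non-None value carries hidden states between pipeline-parallel stages; on a single-stage TPU setup it simply stays None. A minimal sketch of that calling convention (run_stage is illustrative, not vLLM API):

from typing import Optional

from vllm.sequence import IntermediateTensors

def run_stage(runner, scheduler_output,
              prev: Optional[IntermediateTensors] = None):
    # Illustrative only: a later pipeline stage would receive the previous
    # stage's hidden states here. Without pipeline parallelism, prev is
    # None, which is why the TPU runner can accept the argument and
    # ignore it for now.
    return runner.execute_model(scheduler_output,
                                intermediate_tensors=prev)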
