fix

ruisearch42 · ruisearch42 · commit 064a94dda15c · 2025-05-20T21:57:42.000Z
Signed-off-by: Rui Qiao <ruisearch42@gmail.com>
diff --git a/tests/v1/test_async_llm_dp.py b/tests/v1/test_async_llm_dp.py
@@ -19,8 +19,9 @@
     model="ibm-research/PowerMoE-3b",
     enforce_eager=True,
     disable_log_requests=True,
-    tensor_parallel_size=int(os.getenv("TP_SIZE", 1)),
+    tensor_parallel_size=int(os.getenv("TP_SIZE", 2)),
     data_parallel_size=int(os.getenv("DP_SIZE", 2)),
+    data_parallel_address="172.31.15.128",
 )
 
 if not current_platform.supports_v1(engine_args.create_model_config()):
@@ -62,10 +63,10 @@ async def generate(engine: AsyncLLM,
     "output_kind",
     [
         RequestOutputKind.DELTA,
-        RequestOutputKind.FINAL_ONLY,
+        # RequestOutputKind.FINAL_ONLY,
     ],
 )
-@pytest.mark.parametrize("data_parallel_backend", ["mp", "ray"])
+@pytest.mark.parametrize("data_parallel_backend", ["ray"])
 @pytest.mark.asyncio
 async def test_load(output_kind: RequestOutputKind,
                     data_parallel_backend: str):
diff --git a/vllm/config.py b/vllm/config.py
@@ -1847,6 +1847,10 @@ def __post_init__(self) -> None:
                                      "please install Ray with `pip install "
                                      "ray`.") from ray_utils.ray_import_err
                 backend = "ray"
+            elif self.data_parallel_backend == "ray":
+                logger.info("Using ray distributed inference because "
+                            "data_parallel_backend is ray")
+                backend = "ray"
             elif ray_found:
                 if self.placement_group:
                     backend = "ray"
diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py
@@ -904,10 +904,10 @@ def __init__(
         executor_fail_callback = lambda: input_queue.put_nowait(
             (EngineCoreRequestType.EXECUTOR_FAILED, b''))
 
-        input_addresses: list[str] = addresses["input_address"]
-        output_addresses: list[str] = addresses["output_address"]
-        coord_in_addr: Optional[str] = addresses.get("coord_in_addr")
-        coord_out_addr: Optional[str] = addresses.get("coord_out_addr")
+        input_addresses: list[str] = addresses["input_addresses"]
+        output_addresses: list[str] = addresses["output_addresses"]
+        coord_in_addr: Optional[str] = addresses.get("coord_in_address")
+        coord_out_addr: Optional[str] = addresses.get("coord_out_address")
         self.client_count = len(output_addresses)
         self.coordinator = coord_out_addr is not None
 
@@ -921,14 +921,17 @@ def __init__(
         # Counts forward-passes of the model so that we can synchronize
         # finished with DP peers every N steps.
         self.counter = 0
+        self.current_wave = 0
 
         # Initialize engine core and model.
         EngineCore.__init__(self, vllm_config, executor_class, log_stats,
                             executor_fail_callback)
 
+        self.engine_index = engine_index
         self.step_fn = (self.step if self.batch_queue is None else
                         self.step_with_batch_queue)
         self.engines_running = False
+        self.last_counts = (0, 0)
 
         # Background Threads and Queues for IO. These enable us to
         # overlap ZMQ socket IO with GPU since they release the GIL,
diff --git a/vllm/v1/engine/core_client.py b/vllm/v1/engine/core_client.py
@@ -1071,7 +1071,9 @@ def __init__(
         executor_class: type[Executor],
         log_stats: bool,
         client_addresses: Optional[dict[str, str]] = None,
+        client_index: int = 0,
     ):
+        self.client_index = client_index
         self.current_wave = 0
         self.engines_running = False
         self.reqs_in_flight: dict[str, CoreEngine] = {}
@@ -1085,6 +1087,14 @@ def __init__(
         sync_ctx = zmq.Context(io_threads=2)
         self.ctx = zmq.asyncio.Context(sync_ctx)
 
+        # List of [waiting, running] pair per engine.
+        self.lb_engines: list[list[int]] = []
+        self.first_req_sock_addr = get_open_zmq_inproc_path()
+        self.first_req_send_socket = make_zmq_socket(self.ctx,
+                                                     self.first_req_sock_addr,
+                                                     zmq.PAIR,
+                                                     bind=True)
+
         # This will ensure resources created so far are closed
         # when the client is garbage collected, even if an
         # exception is raised mid-construction.