Skip to content

Ray DP scale out #18428

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 40 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
40 commits
Select commit Hold shift + click to select a range
8802521
[V1] DP scale-out (2/N): Decouple engine process management and comms
njhill Apr 2, 2025
e869380
Headless mode
njhill Apr 3, 2025
1ca3d15
Wire data_parallel_address arg
njhill Apr 4, 2025
a551183
Some code cleanup
njhill Apr 4, 2025
a662169
Fix offline DP compatibility
njhill Apr 4, 2025
b29dcf4
Merge remote-tracking branch 'refs/remotes/origin/main' into decouple…
njhill Apr 7, 2025
8126f72
Address some review comments
njhill Apr 7, 2025
8fdc6f5
Address other minor review comments
njhill Apr 7, 2025
9c90ad4
Merge remote-tracking branch 'origin/main' into decouple-engines
njhill Apr 17, 2025
80f9c98
Merge remote-tracking branch 'origin/main' into decouple-engines
njhill Apr 17, 2025
efa8ad8
Fix merge error, address @russellb's ipv6 review comment
njhill Apr 17, 2025
30ab14b
Handle ipv6 URIs in all places
njhill Apr 18, 2025
acc5af3
Fix head node with no engines, don't require dp size on other nodes
njhill Apr 19, 2025
1649d7d
Merge remote-tracking branch 'refs/remotes/origin/main' into decouple…
njhill Apr 23, 2025
4fbf90e
Merge remote-tracking branch 'origin/main' into decouple-engines
njhill Apr 23, 2025
86a0453
Merge remote-tracking branch 'refs/remotes/origin/main' into decouple…
njhill Apr 26, 2025
e70545c
Merge remote-tracking branch 'origin/main' into decouple-engines
njhill Apr 27, 2025
24b2e1e
Merge remote-tracking branch 'refs/remotes/origin/main' into decouple…
njhill May 1, 2025
c76e8e5
[Perf] API-server scaleout with all-to-all server-engine comms
njhill Apr 4, 2025
742b532
Fix engine init num_gpu_blocks logging
njhill May 1, 2025
6340c87
Improve load balancing
njhill May 5, 2025
877f195
small fixes
njhill May 6, 2025
f7a909e
Merge remote-tracking branch 'origin/main' into decouple-engines
njhill May 11, 2025
12da06b
Merge remote-tracking branch 'refs/remotes/njhill/decouple-engines' i…
njhill May 11, 2025
42c30bf
Fix test_startup_failure
njhill May 12, 2025
3904d10
Fix mock config related test failure
njhill May 12, 2025
cece58a
Merge remote-tracking branch 'origin/main' into decouple-engines
njhill May 12, 2025
811d8f4
Merge remote-tracking branch 'njhill/decouple-engines' into all-to-all
njhill May 12, 2025
02f7263
Merge remote-tracking branch 'origin/main' into decouple-engines
njhill May 12, 2025
77b7821
Merge remote-tracking branch 'njhill/decouple-engines' into all-to-all
njhill May 12, 2025
e1400f7
Merge remote-tracking branch 'refs/remotes/origin/main' into decouple…
njhill May 13, 2025
1d89c90
Merge remote-tracking branch 'refs/remotes/njhill/decouple-engines' i…
njhill May 13, 2025
5403440
Merge remote-tracking branch 'origin/main' into all-to-all
njhill May 13, 2025
1bf3a63
[Misc][DP] Fix AsyncLLM metrics for multi-API server deployments
njhill May 16, 2025
97a6e76
Graceful lifecycle management for API server and DP coordinator proces…
kouroshHakha May 18, 2025
b345724
Merge remote-tracking branch 'origin/main' into all-to-all
njhill May 18, 2025
a0c835e
Disable MM cache for api_server_count > 1
njhill May 18, 2025
f7afac6
Fix typing in test
njhill May 18, 2025
34c5eb9
Fix Process typing
njhill May 19, 2025
c55cbb8
[V1] Support DP with Ray
ruisearch42 May 12, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
268 changes: 268 additions & 0 deletions tests/entrypoints/test_api_server_process_manager.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,268 @@
# SPDX-License-Identifier: Apache-2.0

import multiprocessing
import socket
import threading
import time
from typing import Optional
from unittest.mock import patch

import pytest

from vllm.v1.utils import (APIServerProcessManager,
wait_for_completion_or_failure)

# Global variables to control worker behavior
WORKER_RUNTIME_SECONDS = 0.5


# Mock implementation of run_api_server_worker
def mock_run_api_server_worker(listen_address, sock, args, client_config=None):
    """Stand-in for run_api_server_worker that sleeps for a bounded time.

    Mirrors the real worker's signature so APIServerProcessManager can spawn
    it as ``target_server_fn``; how long it stays "alive" is controlled by
    the module-level WORKER_RUNTIME_SECONDS global.
    """
    print(f"Mock worker started with client_config: {client_config}")
    # Simulate a server process that runs for a fixed duration, then exits 0.
    runtime = WORKER_RUNTIME_SECONDS
    time.sleep(runtime)
    print("Mock worker completed successfully")


@pytest.fixture
def api_server_args():
    """Provide keyword arguments for constructing an APIServerProcessManager.

    Yields (rather than returns) so the listening socket can be closed at
    teardown — the original returned the dict and leaked one file
    descriptor per test that used the fixture.
    """
    sock = socket.socket()
    try:
        yield {
            "target_server_fn": mock_run_api_server_worker,
            "listen_address": "localhost:8000",
            "sock": sock,
            # Simple string to avoid pickling issues when spawning workers.
            "args": "test_args",
            "num_servers": 3,
            "input_addresses": [
                "tcp://127.0.0.1:5001",
                "tcp://127.0.0.1:5002",
                "tcp://127.0.0.1:5003",
            ],
            "output_addresses": [
                "tcp://127.0.0.1:6001",
                "tcp://127.0.0.1:6002",
                "tcp://127.0.0.1:6003",
            ],
            "stats_update_address": "tcp://127.0.0.1:7000",
        }
    finally:
        sock.close()


@pytest.mark.parametrize("with_stats_update", [True, False])
def test_api_server_process_manager_init(api_server_args, with_stats_update):
    """Test initializing the APIServerProcessManager.

    Verifies that the manager spawns ``num_servers`` (3) worker processes,
    that they stay alive for at least half the mock worker runtime, and that
    ``close()`` terminates all of them. Parametrized to cover both presence
    and absence of the optional ``stats_update_address`` argument.
    """
    # Set the worker runtime to ensure tests complete in reasonable time
    global WORKER_RUNTIME_SECONDS
    WORKER_RUNTIME_SECONDS = 0.5

    # Copy the args to avoid mutating the shared fixture dict
    # (we may pop a key below).
    args = api_server_args.copy()

    if not with_stats_update:
        args.pop("stats_update_address")
    manager = APIServerProcessManager(**args)

    try:
        # Verify the manager was initialized correctly
        # (one process per requested server).
        assert len(manager.processes) == 3

        # Verify all processes are running
        for proc in manager.processes:
            assert proc.is_alive()

        print("Waiting for processes to run...")
        time.sleep(WORKER_RUNTIME_SECONDS / 2)

        # They should still be alive at this point
        # (half the mock runtime has elapsed).
        for proc in manager.processes:
            assert proc.is_alive()

    finally:
        # Always clean up the processes
        print("Cleaning up processes...")
        manager.close()

        # Give processes time to terminate
        time.sleep(0.2)

        # Verify all processes were terminated
        for proc in manager.processes:
            assert not proc.is_alive()


@patch("vllm.entrypoints.cli.serve.run_api_server_worker",
       mock_run_api_server_worker)
def test_wait_for_completion_or_failure(api_server_args):
    """Test that wait_for_completion_or_failure works with failures.

    Runs wait_for_completion_or_failure in a background thread, kills one
    API-server worker, and checks that the watcher raises (with the dead
    process's exit code in the message) and tears down the remaining workers.
    """
    global WORKER_RUNTIME_SECONDS
    # Long enough that workers are still alive when we terminate one.
    WORKER_RUNTIME_SECONDS = 1.0

    # Create the manager
    manager = APIServerProcessManager(**api_server_args)

    try:
        assert len(manager.processes) == 3

        # Create a result capture for the thread
        # (exceptions raised in a Thread target are otherwise lost).
        result: dict[str, Optional[Exception]] = {"exception": None}

        def run_with_exception_capture():
            try:
                wait_for_completion_or_failure(api_server_manager=manager)
            except Exception as e:
                result["exception"] = e

        # Start a thread to run wait_for_completion_or_failure
        # (daemon so a hang here can't block interpreter exit).
        wait_thread = threading.Thread(target=run_with_exception_capture,
                                       daemon=True)
        wait_thread.start()

        # Let all processes run for a short time
        time.sleep(0.2)

        # All processes should still be running
        assert all(proc.is_alive() for proc in manager.processes)

        # Now simulate a process failure
        print("Simulating process failure...")
        manager.processes[0].terminate()

        # Wait for the wait_for_completion_or_failure
        # to detect and handle the failure
        # This should trigger it to terminate all other processes
        wait_thread.join(timeout=1.0)

        # The wait thread should have exited
        assert not wait_thread.is_alive()

        # Verify that an exception was raised with appropriate error message
        assert result["exception"] is not None
        assert "died with exit code" in str(result["exception"])

        # All processes should now be terminated
        for i, proc in enumerate(manager.processes):
            assert not proc.is_alive(), f"Process {i} should not be alive"

    finally:
        manager.close()
        time.sleep(0.2)


@pytest.mark.timeout(30)
def test_normal_completion(api_server_args):
    """Test that wait_for_completion_or_failure works in normal completion.

    With a very short worker runtime, all workers exit on their own; the
    watcher is then expected to return immediately without raising.
    """
    global WORKER_RUNTIME_SECONDS
    WORKER_RUNTIME_SECONDS = 0.1

    # Create the manager
    manager = APIServerProcessManager(**api_server_args)

    try:
        # Wait for all worker processes to finish on their own.
        # NOTE: rebuild the list each pass instead of calling
        # `remaining_processes.remove(proc)` inside the `for` loop —
        # mutating a list while iterating it skips elements.
        remaining_processes = list(manager.processes)
        while remaining_processes:
            remaining_processes = [
                proc for proc in remaining_processes if proc.is_alive()
            ]
            time.sleep(0.1)

        # Verify all processes have terminated
        for i, proc in enumerate(manager.processes):
            assert not proc.is_alive(
            ), f"Process {i} still alive after terminate()"

        # Now call wait_for_completion_or_failure
        # since all processes have already
        # terminated, it should return immediately
        # with no error
        wait_for_completion_or_failure(api_server_manager=manager)

    finally:
        # Clean up just in case
        manager.close()
        time.sleep(0.2)


@pytest.mark.timeout(30)
def test_external_process_monitoring(api_server_args):
    """Test that wait_for_completion_or_failure handles additional processes.

    Spawns an extra process (standing in for a DP coordinator), wraps it in a
    mock coordinator object, then terminates it and checks that the watcher
    raises with the external process's name and tears down the API servers.
    """
    global WORKER_RUNTIME_SECONDS
    # Long runtime so API-server workers stay alive until torn down.
    WORKER_RUNTIME_SECONDS = 100

    # Create and start the external process
    # (simulates local_engine_manager or coordinator).
    # The mock worker takes three required positional args; without `args=`
    # the spawned child would die immediately with a TypeError instead of
    # staying alive until the explicit terminate() below.
    # NOTE: under "spawn" the child re-imports this module, so it sleeps the
    # module-default WORKER_RUNTIME_SECONDS (0.5s) — still longer than the
    # 0.2s we wait before terminating it.
    spawn_context = multiprocessing.get_context("spawn")
    external_proc = spawn_context.Process(
        target=mock_run_api_server_worker,
        args=("localhost:0", None, "external_args"),
        name="MockExternalProcess")
    external_proc.start()

    # Create the class to simulate a coordinator
    class MockCoordinator:

        def __init__(self, proc):
            self.proc = proc

        def close(self):
            if self.proc.is_alive():
                self.proc.terminate()
                self.proc.join(timeout=0.5)

    # Create a mock coordinator with the external process
    mock_coordinator = MockCoordinator(external_proc)

    # Create the API server manager
    manager = APIServerProcessManager(**api_server_args)

    try:
        # Verify manager initialization
        assert len(manager.processes) == 3

        # Create a result capture for the thread
        # (exceptions raised in a Thread target are otherwise lost).
        result: dict[str, Optional[Exception]] = {"exception": None}

        def run_with_exception_capture():
            try:
                wait_for_completion_or_failure(api_server_manager=manager,
                                               coordinator=mock_coordinator)
            except Exception as e:
                result["exception"] = e

        # Start a thread to run wait_for_completion_or_failure
        wait_thread = threading.Thread(target=run_with_exception_capture,
                                       daemon=True)
        wait_thread.start()

        # Terminate the external process to trigger a failure
        time.sleep(0.2)
        external_proc.terminate()

        # Wait for the thread to detect the failure
        wait_thread.join(timeout=1.0)

        # The wait thread should have completed
        assert not wait_thread.is_alive(
        ), "wait_for_completion_or_failure thread still running"

        # Verify that an exception was raised with appropriate error message
        assert result["exception"] is not None, "No exception was raised"
        error_message = str(result["exception"])
        assert "died with exit code" in error_message, \
            f"Unexpected error message: {error_message}"
        assert "MockExternalProcess" in error_message, \
            f"Error doesn't mention external process: {error_message}"

        # Verify that all API server processes were terminated as a result
        for i, proc in enumerate(manager.processes):
            assert not proc.is_alive(
            ), f"API server process {i} was not terminated"

    finally:
        # Clean up
        manager.close()
        mock_coordinator.close()
        time.sleep(0.2)
1 change: 0 additions & 1 deletion tests/v1/core/test_kv_cache_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,6 @@ def make_request(request_id,
multi_modal_placeholders=mm_positions,
sampling_params=SamplingParams(max_tokens=17),
eos_token_id=100,
arrival_time=0,
lora_request=None,
cache_salt=cache_salt,
)
Expand Down
1 change: 0 additions & 1 deletion tests/v1/core/test_prefix_caching.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,6 @@ def make_request(request_id,
sampling_params=SamplingParams(max_tokens=17,
prompt_logprobs=prompt_logprobs),
eos_token_id=100,
arrival_time=0,
lora_request=None,
cache_salt=cache_salt,
)
Expand Down
3 changes: 1 addition & 2 deletions tests/v1/core/test_scheduler.py
Original file line number Diff line number Diff line change
Expand Up @@ -138,7 +138,6 @@ def create_requests(num_requests: int,
multi_modal_placeholders=mm_position,
multi_modal_hashes=None,
eos_token_id=EOS_TOKEN_ID,
arrival_time=0,
)
requests.append(request)
return requests
Expand Down Expand Up @@ -732,7 +731,7 @@ def test_schedule_spec_decoding_stats(spec_tokens, output_tokens, expected):
prompt_logprobs_dict={},
)
engine_core_outputs = scheduler.update_from_output(output,
model_runner_output)
model_runner_output)[0]

for i in range(len(requests)):
running_req = scheduler.running[i]
Expand Down
16 changes: 12 additions & 4 deletions tests/v1/test_async_llm_dp.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,8 +19,9 @@
model="ibm-research/PowerMoE-3b",
enforce_eager=True,
disable_log_requests=True,
tensor_parallel_size=int(os.getenv("TP_SIZE", 1)),
tensor_parallel_size=int(os.getenv("TP_SIZE", 2)),
data_parallel_size=int(os.getenv("DP_SIZE", 2)),
data_parallel_address="172.31.15.128",
)

if not current_platform.supports_v1(engine_args.create_model_config()):
Expand Down Expand Up @@ -59,14 +60,22 @@ async def generate(engine: AsyncLLM,


@pytest.mark.parametrize(
"output_kind", [RequestOutputKind.DELTA, RequestOutputKind.FINAL_ONLY])
"output_kind",
[
RequestOutputKind.DELTA,
# RequestOutputKind.FINAL_ONLY,
],
)
@pytest.mark.parametrize("data_parallel_backend", ["ray"])
@pytest.mark.asyncio
async def test_load(output_kind: RequestOutputKind):
async def test_load(output_kind: RequestOutputKind,
data_parallel_backend: str):

with ExitStack() as after:

prompt = "This is a test of data parallel"

engine_args.data_parallel_backend = data_parallel_backend
engine = AsyncLLM.from_engine_args(engine_args)
after.callback(engine.shutdown)

Expand All @@ -82,7 +91,6 @@ async def test_load(output_kind: RequestOutputKind):
asyncio.create_task(
generate(engine, request_id, prompt, output_kind,
NUM_EXPECTED_TOKENS)))

# Confirm that we got all the EXPECTED tokens from the requests.
done, pending = await asyncio.wait(tasks,
return_when=asyncio.FIRST_EXCEPTION)
Expand Down
6 changes: 6 additions & 0 deletions vllm/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -1693,6 +1693,8 @@ class ParallelConfig:
"""Port for data parallel messaging."""
data_parallel_master_port: int = 29500
"""Port of the data parallel master."""
data_parallel_backend: str = "mp"
"""Backend to use for data parallel, either "mp" or "ray"."""
enable_expert_parallel: bool = False
"""Use expert parallelism instead of tensor parallelism for MoE layers."""
max_parallel_loading_workers: Optional[int] = None
Expand Down Expand Up @@ -1856,6 +1858,10 @@ def __post_init__(self) -> None:
"please install Ray with `pip install "
"ray`.") from ray_utils.ray_import_err
backend = "ray"
elif self.data_parallel_backend == "ray":
logger.info("Using ray distributed inference because "
"data_parallel_backend is ray")
backend = "ray"
elif ray_found:
if self.placement_group:
backend = "ray"
Expand Down
Loading