
Commit a491d6f

Authored Dec 23, 2024
[V1] TP Ray executor (#11107)
Signed-off-by: Rui Qiao <[email protected]>
1 parent 32aa205 · commit a491d6f

File tree: 5 files changed, +617 −3 lines changed

 

‎tests/basic_correctness/test_basic_correctness.py

+1 −1

@@ -130,7 +130,7 @@ def test_models_distributed(
     # Import VLLM_USE_V1 dynamically to handle patching
     from vllm.envs import VLLM_USE_V1
     if VLLM_USE_V1 and distributed_executor_backend != "mp":
-        pytest.skip(f"Skip {distributed_executor_backend} for V1")
+        os.environ["VLLM_ENABLE_V1_MULTIPROCESSING"] = "0"

     dtype = "half"
     max_tokens = 5

‎vllm/v1/engine/llm_engine.py

+6 −1

@@ -21,6 +21,7 @@
 from vllm.v1.engine.detokenizer import Detokenizer
 from vllm.v1.engine.processor import Processor
 from vllm.v1.executor.abstract import Executor
+from vllm.v1.executor.ray_utils import initialize_ray_cluster

 logger = init_logger(__name__)

@@ -110,7 +111,11 @@ def _get_executor_cls(cls, vllm_config: VllmConfig) -> Type[Executor]:
         executor_class: Type[Executor]
         distributed_executor_backend = (
             vllm_config.parallel_config.distributed_executor_backend)
-        if distributed_executor_backend == "mp":
+        if distributed_executor_backend == "ray":
+            initialize_ray_cluster(vllm_config.parallel_config)
+            from vllm.v1.executor.ray_executor import RayExecutor
+            executor_class = RayExecutor
+        elif distributed_executor_backend == "mp":
             from vllm.v1.executor.multiproc_executor import MultiprocExecutor
             executor_class = MultiprocExecutor
         else:
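
With the selection logic above, opting into the new executor from user code is just an engine argument. A minimal sketch (not part of this commit; it assumes a host with at least two GPUs, Ray installed, and a small model name used purely for illustration):

import os

# Opt into the V1 engine before vllm reads its environment flags.
os.environ["VLLM_USE_V1"] = "1"

from vllm import LLM, SamplingParams

# distributed_executor_backend="ray" makes _get_executor_cls pick RayExecutor;
# tensor_parallel_size controls how many one-GPU Ray workers are created.
llm = LLM(model="facebook/opt-125m",
          tensor_parallel_size=2,
          distributed_executor_backend="ray")
print(llm.generate(["Hello"], SamplingParams(max_tokens=5)))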

‎vllm/v1/executor/ray_executor.py

+339
@@ -0,0 +1,339 @@
import os
from collections import defaultdict
from itertools import islice, repeat
from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple

import vllm.envs as envs
from vllm.config import VllmConfig
from vllm.logger import init_logger
from vllm.utils import get_distributed_init_method, get_ip, get_open_port
from vllm.v1.executor.abstract import Executor
from vllm.v1.executor.ray_utils import RayWorkerWrapper, ray
from vllm.v1.outputs import ModelRunnerOutput

if ray is not None:
    from ray.util.scheduling_strategies import PlacementGroupSchedulingStrategy

if TYPE_CHECKING:
    from ray.util.placement_group import PlacementGroup

logger = init_logger(__name__)


class RayExecutor(Executor):

    def __init__(self, vllm_config: VllmConfig) -> None:
        self.vllm_config = vllm_config
        self.parallel_config = vllm_config.parallel_config
        self.model_config = vllm_config.model_config
        self.forward_dag: Optional[ray.dag.CompiledDAG] = None

        # Disable Ray usage stats collection.
        ray_usage = os.environ.get("RAY_USAGE_STATS_ENABLED", "0")
        if ray_usage != "1":
            os.environ["RAY_USAGE_STATS_ENABLED"] = "0"

        placement_group = self.parallel_config.placement_group
        # Create the parallel GPU workers.
        self._init_workers_ray(placement_group)

    def _init_workers_ray(self, placement_group: "PlacementGroup",
                          **ray_remote_kwargs):
        # A list of workers to run a model.
        self.workers: List[RayWorkerWrapper] = []
        if self.parallel_config.ray_workers_use_nsight:
            ray_remote_kwargs = self._configure_ray_workers_use_nsight(
                ray_remote_kwargs)

        # Create the workers.
        driver_ip = get_ip()
        for bundle_id, bundle in enumerate(placement_group.bundle_specs):
            if not bundle.get("GPU", 0):
                # Skip bundles that don't have GPUs,
                # as each worker needs one GPU.
                continue
            scheduling_strategy = PlacementGroupSchedulingStrategy(
                placement_group=placement_group,
                placement_group_capture_child_tasks=True,
                placement_group_bundle_index=bundle_id,
            )

            worker = ray.remote(
                num_cpus=0,
                num_gpus=1,
                scheduling_strategy=scheduling_strategy,
                **ray_remote_kwargs,
            )(RayWorkerWrapper).remote(vllm_config=self.vllm_config)
            self.workers.append(worker)

        logger.debug("workers: %s", self.workers)
        worker_ips = [
            ray.get(worker.get_node_ip.remote())  # type: ignore[attr-defined]
            for worker in self.workers
        ]
        ip_counts: Dict[str, int] = {}
        for ip in worker_ips:
            ip_counts[ip] = ip_counts.get(ip, 0) + 1

        worker_to_ip = dict(zip(self.workers, worker_ips))

        def sort_by_driver_then_worker_ip(worker):
            """
            Sort the workers based on 3 properties:
            1. If the worker is on the same node as the driver (vllm engine),
                it should be placed first.
            2. Then, if the worker is on a node with fewer workers, it should
                be placed first.
            3. Finally, if the work is on a node with smaller IP address, it
                should be placed first. This is simply a tiebreaker to make
                sure the workers are sorted in a deterministic way.
            """
            ip = worker_to_ip[worker]
            return (ip != driver_ip, ip_counts[ip], ip)

        # After sorting, the workers on the same node will be
        # close to each other, and the workers on the driver
        # node will be placed first.
        self.workers = sorted(self.workers, key=sort_by_driver_then_worker_ip)

        # Get the set of GPU IDs used on each node.
        worker_node_and_gpu_ids = self._run_workers("get_node_and_gpu_ids")

        node_workers = defaultdict(list)  # node id -> list of worker ranks
        node_gpus = defaultdict(list)  # node id -> list of gpu ids

        for i, (node_id, gpu_ids) in enumerate(worker_node_and_gpu_ids):
            node_workers[node_id].append(i)
            # `gpu_ids` can be a list of strings or integers.
            # convert them to integers for consistency.
            # NOTE: gpu_ids can be larger than 9 (e.g. 16 GPUs),
            # string sorting is not sufficient.
            # see https://github.com/vllm-project/vllm/issues/5590
            gpu_ids = [int(x) for x in gpu_ids]
            node_gpus[node_id].extend(gpu_ids)

        for node_id, gpu_ids in node_gpus.items():
            node_gpus[node_id] = sorted(gpu_ids)

        all_ips = set(worker_ips)
        n_ips = len(all_ips)
        n_nodes = len(node_workers)

        if n_nodes != n_ips:
            raise RuntimeError(
                f"Every node should have a unique IP address. Got {n_nodes}"
                f" nodes with node ids {list(node_workers.keys())} and "
                f"{n_ips} unique IP addresses {all_ips}. Please check your"
                " network configuration. If you set `VLLM_HOST_IP` or "
                "`HOST_IP` environment variable, make sure it is unique for"
                " each node.")

        # Set environment variables for the driver and workers.
        all_args_to_update_environment_variables = [({
            "CUDA_VISIBLE_DEVICES":
            ",".join(map(str, node_gpus[node_id])),
            "VLLM_TRACE_FUNCTION":
            str(envs.VLLM_TRACE_FUNCTION),
            "VLLM_USE_V1":
            str(int(envs.VLLM_USE_V1)),
            **({
                "VLLM_ATTENTION_BACKEND": envs.VLLM_ATTENTION_BACKEND
            } if envs.VLLM_ATTENTION_BACKEND is not None else {})
        }, ) for (node_id, _) in worker_node_and_gpu_ids]

        self._env_vars_for_all_workers = (
            all_args_to_update_environment_variables)

        self._run_workers("update_environment_variables",
                          all_args=self._get_env_vars_to_be_updated())

        if len(node_gpus) == 1:
            # in single node case, we don't need to get the IP address.
            # the loopback address is sufficient
            # NOTE: a node may have several IP addresses, one for each
            # network interface. `get_ip()` might return any of them,
            # while they might not work for communication inside the node
            # if the network setup is complicated. Using the loopback address
            # solves this issue, as it always works for communication inside
            # the node.
            driver_ip = "127.0.0.1"
        distributed_init_method = get_distributed_init_method(
            driver_ip, get_open_port())

        # Initialize the actual workers inside worker wrapper.
        init_worker_all_kwargs = [
            self._get_worker_kwargs(
                local_rank=node_workers[node_id].index(rank),
                rank=rank,
                distributed_init_method=distributed_init_method,
            ) for rank, (node_id, _) in enumerate(worker_node_and_gpu_ids)
        ]
        self._run_workers("init_worker", all_kwargs=init_worker_all_kwargs)
        self._run_workers("initialize")
        self._run_workers("load_model")

    def _configure_ray_workers_use_nsight(self,
                                          ray_remote_kwargs) -> Dict[str, Any]:
        # If nsight profiling is enabled, we need to set the profiling
        # configuration for the ray workers as runtime env.
        runtime_env = ray_remote_kwargs.setdefault("runtime_env", {})
        runtime_env.update({
            "nsight": {
                "t": "cuda,cudnn,cublas",
                "o": "'worker_process_%p'",
                "cuda-graph-trace": "node",
            }
        })

        return ray_remote_kwargs

    def _get_env_vars_to_be_updated(self):
        return self._env_vars_for_all_workers

    def _get_worker_kwargs(
            self,
            local_rank: int = 0,
            rank: int = 0,
            distributed_init_method: Optional[str] = None) -> Dict[str, Any]:
        """
        Return worker init args for a given rank.
        """
        if distributed_init_method is None:
            distributed_init_method = get_distributed_init_method(
                get_ip(), get_open_port())
        return dict(
            vllm_config=self.vllm_config,
            local_rank=local_rank,
            rank=rank,
            distributed_init_method=distributed_init_method,
        )

    def determine_num_available_blocks(self) -> Tuple[int, int]:
        """
        Determine the number of available KV blocks.

        This invokes `determine_num_available_blocks` on each worker and takes
        the min of the results, guaranteeing that the selected cache sizes are
        compatible with all workers.

        Returns:
            - tuple[num_gpu_blocks, num_cpu_blocks]
        """
        # Get the maximum number of blocks that can be allocated on GPU and CPU.
        num_blocks = self._run_workers("determine_num_available_blocks")

        # Since we use a shared centralized controller, we take the minimum
        # number of blocks across all workers to make sure all the memory
        # operators can be applied to all workers.
        num_gpu_blocks = min(b[0] for b in num_blocks)
        num_cpu_blocks = min(b[1] for b in num_blocks)

        return num_gpu_blocks, num_cpu_blocks

    def initialize(self, num_gpu_blocks: int) -> None:
        """
        Initialize the KV cache in all workers.
        """
        # NOTE: This is logged in the executor because there can be >1 worker
        # with other executors. We could log in the engine level, but work
        # remains to abstract away the device for non-GPU configurations.
        logger.info("# GPU blocks: %d", num_gpu_blocks)
        self._run_workers("initialize_cache", num_gpu_blocks)
        self._run_workers("compile_or_warm_up_model")

    def _run_workers(
        self,
        method: str,
        *args,
        all_args: Optional[List[Tuple[Any, ...]]] = None,
        all_kwargs: Optional[List[Dict[str, Any]]] = None,
        **kwargs,
    ) -> Any:
        """
        Runs the given method on all workers. Can be used in the following
        ways:

        Args:
        - args/kwargs: All workers share the same args/kwargs
        - all_args/all_kwargs: args/kwargs for each worker are specified
          individually
        """
        count = len(self.workers)
        all_worker_args = repeat(args, count) if all_args is None \
            else islice(all_args, 0, None)
        all_worker_kwargs = repeat(kwargs, count) if all_kwargs is None \
            else islice(all_kwargs, 0, None)

        ray_worker_refs = [
            worker.execute_method.remote(  # type: ignore[attr-defined]
                method, *worker_args, **worker_kwargs)
            for (worker, worker_args, worker_kwargs
                 ) in zip(self.workers, all_worker_args, all_worker_kwargs)
        ]
        return ray.get(ray_worker_refs)

    def execute_model(
        self,
        scheduler_output,
    ) -> ModelRunnerOutput:
        if self.forward_dag is None:
            self.forward_dag = self._compiled_ray_dag()
        # Only the first worker (with rank 0) returns the execution result.
        # Others return None.
        output = ray.get(self.forward_dag.execute(scheduler_output))[0]
        return output

    def profile(self, is_start=True):
        raise NotImplementedError

    def shutdown(self):
        if hasattr(self, "forward_dag") and self.forward_dag is not None:
            self.forward_dag.teardown()
            import ray
            for worker in self.workers:
                ray.kill(worker)
            self.forward_dag = None

    def check_health(self) -> None:
        logger.debug("Called check_health.")

    def _check_ray_compiled_graph_installation(self):
        import pkg_resources
        from packaging import version

        required_version = version.parse("2.39")
        current_version = version.parse(
            pkg_resources.get_distribution("ray").version)
        if current_version < required_version:
            raise ValueError(f"Ray version {required_version} is "
                             f"required, but found {current_version}")

        import importlib.util
        raycg = importlib.util.find_spec("ray.experimental.compiled_dag_ref")
        if raycg is None:
            raise ValueError("Ray Compiled Graph is not installed. "
                             "Run `pip install ray[adag]` to install it.")

        cupy_spec = importlib.util.find_spec("cupy")
        if cupy_spec is None and envs.VLLM_USE_RAY_COMPILED_DAG_NCCL_CHANNEL:
            raise ValueError(
                "cupy is not installed but required since "
                "VLLM_USE_RAY_COMPILED_DAG_NCCL_CHANNEL is set."
                "Run `pip install ray[adag]` and check cupy installation.")

    def _compiled_ray_dag(self):
        assert self.parallel_config.use_ray
        self._check_ray_compiled_graph_installation()
        from ray.dag import InputNode, MultiOutputNode

        with InputNode() as input_batches:
            outputs = [
                worker.execute_model.bind(  # type: ignore[attr-defined]
                    input_batches) for worker in self.workers
            ]
            forward_dag = MultiOutputNode(outputs)

        return forward_dag.experimental_compile()

    def __del__(self):
        self.shutdown()
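
The execute_model path above compiles a Ray Compiled Graph (aDAG) once via _compiled_ray_dag and then re-executes it every step instead of issuing fresh remote calls. A self-contained sketch of that pattern with a toy CPU actor (not vLLM code; assumes a Ray version with Compiled Graph support, e.g. the 2.39+ the installation check above requires, installed with the adag extras):

import ray
from ray.dag import InputNode, MultiOutputNode

@ray.remote
class EchoWorker:
    # Toy stand-in for RayWorkerWrapper.execute_model.
    def run(self, batch):
        return f"processed {batch}"

ray.init()
workers = [EchoWorker.remote() for _ in range(2)]

# Bind every worker's method to the same input, as _compiled_ray_dag does
# with execute_model, and gather all outputs with MultiOutputNode.
with InputNode() as batch:
    dag = MultiOutputNode([w.run.bind(batch) for w in workers])

compiled = dag.experimental_compile()
# execute() pushes one input through the whole graph; ray.get returns the
# per-worker outputs as a list.
print(ray.get(compiled.execute("step-0")))
compiled.teardown()
ray.shutdown()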

‎vllm/v1/executor/ray_utils.py

+271
@@ -0,0 +1,271 @@
import time
from collections import defaultdict
from typing import TYPE_CHECKING, Dict, List, Optional, Tuple

from vllm.config import ParallelConfig
from vllm.logger import init_logger
from vllm.platforms import current_platform
from vllm.utils import get_ip
from vllm.v1.outputs import ModelRunnerOutput
from vllm.worker.worker_base import WorkerWrapperBase

if TYPE_CHECKING:
    from vllm.v1.core.scheduler import SchedulerOutput

logger = init_logger(__name__)
PG_WAIT_TIMEOUT = 60

try:
    import ray
    from ray.util import placement_group_table
    from ray.util.placement_group import PlacementGroup
    try:
        from ray._private.state import available_resources_per_node
    except ImportError:
        # Ray 2.9.x doesn't expose `available_resources_per_node`
        from ray._private.state import state as _state
        available_resources_per_node = _state._available_resources_per_node

    class RayWorkerWrapper(WorkerWrapperBase):

        def __init__(self, *args, **kwargs) -> None:
            super().__init__(*args, **kwargs)
            # Since the compiled DAG runs a main execution
            # in a different thread that calls cuda.set_device.
            # The flag indicates is set_device is called on
            # that thread. It will be removed soon.
            self.compiled_dag_cuda_device_set = False

        def get_node_ip(self) -> str:
            return get_ip()

        def get_node_and_gpu_ids(self) -> Tuple[str, List[int]]:
            node_id = ray.get_runtime_context().get_node_id()
            gpu_ids = ray.get_gpu_ids()
            return node_id, gpu_ids

        def setup_device_if_necessary(self):
            # TODO(swang): This is needed right now because Ray CG executes
            # on a background thread, so we need to reset torch's current
            # device.
            # We can remove this API after it is fixed in compiled graph.
            import torch
            assert self.worker is not None, "Worker is not initialized"
            if not self.compiled_dag_cuda_device_set:
                torch.cuda.set_device(self.worker.device)
                self.compiled_dag_cuda_device_set = True

        def execute_model(
            self,
            scheduler_output: "SchedulerOutput",
        ) -> ModelRunnerOutput:
            self.setup_device_if_necessary()
            assert self.worker is not None, "Worker is not initialized"
            output = self.worker.model_runner.execute_model(scheduler_output)
            return output

    ray_import_err = None

except ImportError as e:
    ray = None  # type: ignore
    ray_import_err = e
    RayWorkerWrapper = None  # type: ignore


def ray_is_available() -> bool:
    """Returns True if Ray is available."""
    return ray is not None


def assert_ray_available():
    """
    Raise an exception if Ray is not available.
    """
    if ray is None:
        raise ValueError("Failed to import Ray, please install Ray with "
                         "`pip install ray`.") from ray_import_err


def _verify_bundles(placement_group: "PlacementGroup",
                    parallel_config: ParallelConfig, device_str: str):
    """
    Verify a given placement group has bundles located in the right place.

    There are 2 rules.
    - Warn if all tensor parallel workers cannot fit in a single node.
    - Fail if driver node is not included in a placement group.

    Args:
        placement_group: The placement group to verify.
        parallel_config: The parallel configuration.
        device_str: The required device.
    """
    assert ray.is_initialized(), (
        "Ray is not initialized although distributed-executor-backend is ray.")
    pg_data = placement_group_table(placement_group)
    # bundle_idx -> node_id
    bundle_to_node_ids = pg_data["bundles_to_node_id"]
    # bundle_idx -> bundle (e.g., {"GPU": 1})
    bundles = pg_data["bundles"]
    # node_id -> List of bundle (e.g., {"GPU": 1})
    node_id_to_bundle: Dict[str, List[Dict[str, float]]] = defaultdict(list)

    for bundle_idx, node_id in bundle_to_node_ids.items():
        node_id_to_bundle[node_id].append(bundles[bundle_idx])
    driver_node_id = ray.get_runtime_context().get_node_id()

    if driver_node_id not in node_id_to_bundle:
        raise RuntimeError(
            f"driver node id {driver_node_id} is not included in a placement "
            f"group {placement_group.id}. Node id -> bundles "
            f"{node_id_to_bundle}. "
            "You don't have enough GPUs available in a current node. Check "
            "`ray status` to see if you have available GPUs in a node "
            f"{driver_node_id} before starting an vLLM engine.")

    for node_id, bundles in node_id_to_bundle.items():
        if len(bundles) < parallel_config.tensor_parallel_size:
            logger.warning(
                "tensor_parallel_size=%d "
                "is bigger than a reserved number of %ss (%d "
                "%ss) in a node %s. Tensor parallel workers can be "
                "spread out to 2+ nodes which can degrade the performance "
                "unless you have fast interconnect across nodes, like "
                "Infiniband. To resolve this issue, make sure you have more "
                "than %d GPUs available at each node.",
                parallel_config.tensor_parallel_size, device_str, len(bundles),
                device_str, node_id, parallel_config.tensor_parallel_size)


def _wait_until_pg_ready(current_placement_group: "PlacementGroup"):
    """Wait until a placement group is ready.

    It prints the informative log messages if the placement group is
    not created within time.

    """
    # Wait until PG is ready - this will block until all
    # requested resources are available, and will timeout
    # if they cannot be provisioned.
    placement_group_specs = current_placement_group.bundle_specs

    s = time.time()
    pg_ready_ref = current_placement_group.ready()
    wait_interval = 10
    while time.time() - s < PG_WAIT_TIMEOUT:
        ready, _ = ray.wait([pg_ready_ref], timeout=wait_interval)
        if len(ready) > 0:
            break

        # Exponential backoff for warning print.
        wait_interval *= 2
        logger.info(
            "Waiting for creating a placement group of specs for "
            "%d seconds. specs=%s. Check "
            "`ray status` to see if you have enough resources.",
            int(time.time() - s), placement_group_specs)

    try:
        ray.get(pg_ready_ref, timeout=0)
    except ray.exceptions.GetTimeoutError:
        raise ValueError(
            "Cannot provide a placement group of "
            f"{placement_group_specs=} within {PG_WAIT_TIMEOUT} seconds. See "
            "`ray status` to make sure the cluster has enough resources."
        ) from None


def initialize_ray_cluster(
    parallel_config: ParallelConfig,
    ray_address: Optional[str] = None,
):
    """Initialize the distributed cluster with Ray.

    it will connect to the Ray cluster and create a placement group
    for the workers, which includes the specification of the resources
    for each distributed worker.

    Args:
        parallel_config: The configurations for parallel execution.
        ray_address: The address of the Ray cluster. If None, uses
            the default Ray cluster address.
    """
    assert_ray_available()

    # Connect to a ray cluster.
    if current_platform.is_rocm() or current_platform.is_xpu():
        # Try to connect existing ray instance and create a new one if not found
        try:
            ray.init("auto")
        except ConnectionError:
            logger.warning(
                "No existing RAY instance detected. "
                "A new instance will be launched with current node resources.")
            ray.init(address=ray_address,
                     ignore_reinit_error=True,
                     num_gpus=parallel_config.world_size)
    else:
        ray.init(address=ray_address, ignore_reinit_error=True)

    if parallel_config.placement_group:
        # Placement group is already set.
        return

    device_str = "GPU" if not current_platform.is_tpu() else "TPU"
    # Create placement group for worker processes
    current_placement_group = ray.util.get_current_placement_group()
    if current_placement_group:
        # We are in a placement group
        bundles = current_placement_group.bundle_specs
        # Verify that we can use the placement group.
        device_bundles = 0
        for bundle in bundles:
            bundle_devices = bundle.get(device_str, 0)
            if bundle_devices > 1:
                raise ValueError(
                    "Placement group bundle cannot have more than 1 "
                    f"{device_str}.")
            if bundle_devices:
                device_bundles += 1
        if parallel_config.world_size > device_bundles:
            raise ValueError(
                f"The number of required {device_str}s exceeds the total "
                f"number of available {device_str}s in the placement group."
                f"Required number of devices: {parallel_config.world_size}. "
                f"Total number of devices: {device_bundles}.")
    else:
        num_devices_in_cluster = ray.cluster_resources().get(device_str, 0)
        if parallel_config.world_size > num_devices_in_cluster:
            raise ValueError(
                f"The number of required {device_str}s exceeds the total "
                f"number of available {device_str}s in the placement group.")
        # Create a new placement group
        placement_group_specs: List[Dict[str, float]] = ([{
            device_str: 1.0
        } for _ in range(parallel_config.world_size)])

        # vLLM engine is also a worker to execute model with an accelerator,
        # so it requires to have the device in a current node. Check if
        # the current node has at least one device.
        current_ip = get_ip()
        current_node_id = ray.get_runtime_context().get_node_id()
        current_node_resource = available_resources_per_node()[current_node_id]
        if current_node_resource.get(device_str, 0) < 1:
            raise ValueError(
                f"Current node has no {device_str} available. "
                f"{current_node_resource=}. vLLM engine cannot start without "
                f"{device_str}. Make sure you have at least 1 {device_str} "
                f"available in a node {current_node_id=} {current_ip=}.")
        # This way, at least bundle is required to be created in a current
        # node.
        placement_group_specs[0][f"node:{current_ip}"] = 0.001

        # By default, Ray packs resources as much as possible.
        current_placement_group = ray.util.placement_group(
            placement_group_specs, strategy="PACK")
        _wait_until_pg_ready(current_placement_group)

    assert current_placement_group is not None
    _verify_bundles(current_placement_group, parallel_config, device_str)
    # Set the placement group in the parallel config
    parallel_config.placement_group = current_placement_group
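
The placement-group setup above reduces to a standard Ray pattern: reserve one 1-GPU bundle per worker, pin the first bundle to the driver's node, pack bundles onto as few nodes as possible, and block until the reservation is granted. A stripped-down sketch of that pattern (driver pinning omitted; world_size is a hypothetical tensor-parallel degree, and a cluster with GPUs is needed for the group to actually become ready):

import ray
from ray.util import placement_group, placement_group_table

ray.init()
world_size = 2  # hypothetical tensor-parallel degree

# One 1-GPU bundle per worker, packed onto as few nodes as possible.
specs = [{"GPU": 1.0} for _ in range(world_size)]
pg = placement_group(specs, strategy="PACK")

# Block until every bundle is reserved; initialize_ray_cluster above does the
# same, but polls with a timeout and logs progress via _wait_until_pg_ready.
ray.get(pg.ready(), timeout=60)
print(placement_group_table(pg)["state"])  # "CREATED" once the bundles exist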

‎vllm/v1/worker/gpu_worker.py

−1

@@ -202,7 +202,6 @@ def execute_model(
     ) -> ModelRunnerOutput:
         output = self.model_runner.execute_model(scheduler_output)
         return output if self.rank == 0 else None
-        return output

     def profile(self, is_start: bool = True):
         if self.profiler is None:
