
Commit 9194383

njhill and sahilsuneja1 authored and committed
Changes pending from vllm-project/vllm#2898
Co-authored-by: Sahil Suneja <[email protected]>
Signed-off-by: Joe Runde <[email protected]>
1 parent 790513a commit 9194383

16 files changed (+431, -81 lines)

Dockerfile

Lines changed: 1 addition & 2 deletions
@@ -70,7 +70,7 @@ ADD . /vllm-workspace/
 COPY --from=build /workspace/vllm/*.so /vllm-workspace/vllm/
 # ignore build dependencies installation because we are using pre-complied extensions
 RUN rm pyproject.toml
-RUN --mount=type=cache,target=/root/.cache/pip VLLM_USE_PRECOMPILED=1 pip install . --verbose
+RUN --mount=type=cache,target=/root/.cache/pip VLLM_USE_PRECOMPILED=1 pip install .[ray] --verbose
 #################### TEST IMAGE ####################


@@ -80,7 +80,6 @@ RUN --mount=type=cache,target=/root/.cache/pip VLLM_USE_PRECOMPILED=1 pip instal
 # In the future it would be nice to get a container with pytorch and cuda without duplicating cuda
 FROM nvidia/cuda:12.1.0-runtime-ubuntu22.04 AS vllm-base

-# libnccl required for ray
 RUN apt-get update -y \
     && apt-get install -y python3-pip

requirements-rocm.txt

Lines changed: 0 additions & 1 deletion
@@ -2,7 +2,6 @@ ninja # For faster builds.
 typing-extensions>=4.8.0
 starlette
 psutil
-ray >= 2.9
 sentencepiece # Required for LLaMA tokenizer.
 numpy
 tokenizers>=0.15.0

requirements.txt

Lines changed: 0 additions & 1 deletion
@@ -1,6 +1,5 @@
 ninja # For faster builds.
 psutil
-ray >= 2.9
 sentencepiece # Required for LLaMA tokenizer.
 numpy
 torch == 2.1.2

setup.py

Lines changed: 9 additions & 1 deletion
@@ -6,13 +6,14 @@
 import sys
 import warnings
 from pathlib import Path
-from typing import List, Set
+from typing import List, Set, Dict

 from packaging.version import parse, Version
 import setuptools
 import torch
 import torch.utils.cpp_extension as torch_cpp_ext
 from torch.utils.cpp_extension import BuildExtension, CUDAExtension, CUDA_HOME, ROCM_HOME
+from typing import Optional

 ROOT_DIR = os.path.dirname(__file__)
 # This is a temporary directory to store third-party packages.

@@ -475,6 +476,12 @@ def get_requirements() -> List[str]:
     return requirements


+def get_ray_requirement() -> Optional[Dict[str, List[str]]]:
+    if _is_neuron():
+        return None
+    return {"ray": ["ray >= 2.9"]}
+
+
 package_data = {
     "vllm": ["py.typed", "model_executor/layers/fused_moe/configs/*.json"]
 }

@@ -508,6 +515,7 @@ def get_requirements() -> List[str]:
                                                "examples", "tests")),
     python_requires=">=3.8",
     install_requires=get_requirements(),
+    extras_requires=get_ray_requirement(),
     ext_modules=ext_modules,
     cmdclass={"build_ext": build_ext} if not _is_neuron() else {},
     distclass=BinaryDistribution,

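For context, here is a minimal, hypothetical setup.py sketch (not part of this commit) showing how an extras mapping of the shape returned by get_ray_requirement() is consumed by setuptools. The canonical setuptools keyword for this mapping is extras_require, and the package name example-pkg is made up for illustration.

# Standalone sketch, assuming setuptools is installed.
from setuptools import setup

extras = {"ray": ["ray >= 2.9"]}  # same shape as get_ray_requirement()

setup(
    name="example-pkg",            # hypothetical package name
    version="0.0.1",
    install_requires=["numpy"],    # always installed
    extras_require=extras,         # pulled in only via: pip install example-pkg[ray]
)
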
tests/engine/test_local_worker.py

Lines changed: 66 additions & 0 deletions
@@ -0,0 +1,66 @@
+import pytest
+import torch
+import multiprocessing as mp
+from vllm import LLM, SamplingParams
+
+TENSOR_PARALLEL_SIZE = 2
+MAX_GENERATION_TOKENS = 256
+
+
+def llm_generate(result_queue, prompt_token_ids, worker_use_ray=False):
+    try:
+        llm = LLM(model="facebook/opt-350m",
+                  tensor_parallel_size=TENSOR_PARALLEL_SIZE,
+                  worker_use_ray=worker_use_ray)
+
+        output = llm.generate(
+            prompt_token_ids=prompt_token_ids,
+            sampling_params=SamplingParams(max_tokens=MAX_GENERATION_TOKENS))
+    except BaseException as e:
+        output = e
+
+    result_queue.put(output)
+
+
+def run_llm(prompt_token_ids, worker_use_ray=False):
+    result_queue = mp.Queue()
+    proc = mp.Process(target=llm_generate,
+                      args=(result_queue, prompt_token_ids, worker_use_ray))
+    proc.start()
+    result = result_queue.get()
+    proc.join()
+    if isinstance(result, BaseException):
+        raise result
+    return result
+
+
+def get_prompts():
+    # https://github.com/vllm-project/vllm/issues/367#issuecomment-1629872996
+    batch_size = 32
+    dim = 120
+    max_token_id = 32000
+    torch.manual_seed(42)
+    batch = torch.randint(max_token_id, (batch_size, dim))
+    prompt_token_ids = [tokens.tolist() for tokens in batch]
+    return prompt_token_ids
+
+
+@pytest.mark.skip("Requires multiple GPUs")
+def test_local_worker():
+    # Similar to tests/lora/test_llama.py
+    # Cannot use as it will initialize torch.cuda too early...
+    # if torch.cuda.device_count() < 2:
+    #     pytest.skip(f"Not enough GPUs for tensor parallelism {2}")
+
+    prompt_token_ids = get_prompts()
+    output1 = run_llm(prompt_token_ids, worker_use_ray=False)
+    output2 = run_llm(prompt_token_ids, worker_use_ray=True)
+    assert len(output1) == len(output2)
+
+    completion_token_ids1 = [item.outputs[0].token_ids for item in output1]
+    completion_token_ids2 = [item.outputs[0].token_ids for item in output2]
+    assert completion_token_ids1 == completion_token_ids2
+
+
+if __name__ == "__main__":
+    test_local_worker()

vllm/__init__.py

Lines changed: 2 additions & 2 deletions
@@ -19,7 +19,7 @@ def _configure_system():
 from vllm.engine.arg_utils import AsyncEngineArgs, EngineArgs # noqa: E402
 from vllm.engine.async_llm_engine import AsyncLLMEngine # noqa: E402
 from vllm.engine.llm_engine import LLMEngine # noqa: E402
-from vllm.engine.ray_utils import initialize_cluster # noqa: E402
+from vllm.engine.ray_utils import initialize_ray_cluster # noqa: E402
 from vllm.entrypoints.llm import LLM # noqa: E402
 from vllm.outputs import CompletionOutput, RequestOutput # noqa: E402
 from vllm.sampling_params import SamplingParams # noqa: E402

@@ -35,5 +35,5 @@ def _configure_system():
     "EngineArgs",
     "AsyncLLMEngine",
     "AsyncEngineArgs",
-    "initialize_cluster",
+    "initialize_ray_cluster",
 ]

vllm/config.py

Lines changed: 12 additions & 10 deletions
@@ -1,3 +1,4 @@
+import importlib.util
 from typing import Optional, Union, ClassVar
 from dataclasses import dataclass
 import os

@@ -376,9 +377,9 @@ class ParallelConfig:
     Args:
         pipeline_parallel_size: Number of pipeline parallel groups.
         tensor_parallel_size: Number of tensor parallel groups.
-        worker_use_ray: Whether to use Ray for model workers. Will be set to
+        worker_use_ray: Whether to use Ray for model workers. Will default to
             True if either pipeline_parallel_size or tensor_parallel_size is
-            greater than 1.
+            greater than 1 and Ray is installed.
         max_parallel_loading_workers: Maximum number of multiple batches
             when load model sequentially. To avoid RAM OOM when using tensor
             parallel and large models.

@@ -392,7 +393,7 @@ def __init__(
         self,
         pipeline_parallel_size: int,
         tensor_parallel_size: int,
-        worker_use_ray: bool,
+        worker_use_ray: Optional[bool] = None,
         max_parallel_loading_workers: Optional[int] = None,
         disable_custom_all_reduce: bool = False,
         ray_workers_use_nsight: bool = False,

@@ -412,9 +413,10 @@ def __init__(
         self.ray_workers_use_nsight = ray_workers_use_nsight

         self.world_size = pipeline_parallel_size * self.tensor_parallel_size
-        # Ray worker is not supported for Neuron backend.
-        if self.world_size > 1 and not is_neuron():
-            self.worker_use_ray = True
+        if self.worker_use_ray is None:
+            ray_found = importlib.util.find_spec("ray") is not None
+            self.worker_use_ray = ray_found and self.world_size > 1
+
         self._verify_args()

     def _verify_args(self) -> None:

@@ -498,12 +500,12 @@ class DeviceConfig:
     def __init__(self, device: str = "auto") -> None:
         if device == "auto":
             # Automated device type detection
-            if torch.cuda.is_available():
-                self.device_type = "cuda"
-            elif is_neuron():
+            if is_neuron():
                 self.device_type = "neuron"
             else:
-                raise RuntimeError("No supported device detected.")
+                # We don't call torch.cuda.is_available() here to
+                # avoid initializing CUDA before workers are forked
+                self.device_type = "cuda"
         else:
             # Device type is assigned explicitly
             self.device_type = device

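The ParallelConfig change above relies on importlib.util.find_spec to detect Ray without importing it (and so without touching CUDA). A minimal standalone sketch of that pattern follows; the ray_available() helper is hypothetical and only illustrates the check.

import importlib.util

def ray_available() -> bool:
    # find_spec() only looks the module up on sys.path; it does not import it.
    return importlib.util.find_spec("ray") is not None

# Mirrors the new default: use Ray only when it is installed and world_size > 1.
world_size = 2  # pipeline_parallel_size * tensor_parallel_size
worker_use_ray = ray_available() and world_size > 1
print(worker_use_ray)
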
vllm/engine/arg_utils.py

Lines changed: 7 additions & 5 deletions
@@ -20,7 +20,7 @@ class EngineArgs:
     kv_cache_dtype: str = 'auto'
     seed: int = 0
     max_model_len: Optional[int] = None
-    worker_use_ray: bool = False
+    worker_use_ray: Optional[bool] = None
     pipeline_parallel_size: int = 1
     tensor_parallel_size: int = 1
     max_parallel_loading_workers: Optional[int] = None

@@ -149,10 +149,12 @@ def add_cli_args(
                             help='model context length. If unspecified, '
                             'will be automatically derived from the model.')
         # Parallel arguments
-        parser.add_argument('--worker-use-ray',
-                            action='store_true',
-                            help='use Ray for distributed serving, will be '
-                            'automatically set when using more than 1 GPU')
+        parser.add_argument(
+            '--worker-use-ray',
+            action=argparse.BooleanOptionalAction,
+            default=None,
+            help='use Ray for distributed serving, will default '
+            'to true when ray is installed and more than 1 GPU is used')
         parser.add_argument('--pipeline-parallel-size',
                             '-pp',
                             type=int,

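The reworked CLI flag is tri-state: argparse.BooleanOptionalAction (available since Python 3.9) registers both --worker-use-ray and --no-worker-use-ray, and leaving the flag off keeps the None default so ParallelConfig can decide later. A small standalone sketch, separate from the actual vLLM parser:

import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--worker-use-ray',
                    action=argparse.BooleanOptionalAction,
                    default=None)

print(parser.parse_args([]).worker_use_ray)                        # None
print(parser.parse_args(['--worker-use-ray']).worker_use_ray)      # True
print(parser.parse_args(['--no-worker-use-ray']).worker_use_ray)   # False
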
vllm/engine/async_llm_engine.py

Lines changed: 12 additions & 6 deletions
@@ -9,7 +9,7 @@
 from vllm.config import ModelConfig
 from vllm.engine.arg_utils import AsyncEngineArgs
 from vllm.engine.llm_engine import LLMEngine
-from vllm.engine.ray_utils import initialize_cluster, ray
+from vllm.engine.ray_utils import initialize_ray_cluster, ray
 from vllm.logger import init_logger
 from vllm.outputs import RequestOutput
 from vllm.sampling_params import SamplingParams

@@ -287,9 +287,15 @@ async def _run_workers_async(
         coros.append(asyncio.get_event_loop().run_in_executor(
             None, partial(driver_executor, *driver_args, **driver_kwargs)))

-        # Run the ray workers asynchronously.
-        for worker in self.workers:
-            coros.append(worker.execute_method.remote(method, *args, **kwargs))
+        # Run the workers asynchronously.
+        if self.parallel_config.worker_use_ray:
+            for worker in self.workers:
+                coros.append(
+                    worker.execute_method.remote(method, *args, **kwargs))
+        else:
+            for worker in self.workers:
+                coros.append(
+                    worker.execute_method_async(method, *args, **kwargs))

         all_outputs = await asyncio.gather(*coros)
         return all_outputs

@@ -674,8 +680,8 @@ def from_engine_args(cls,
         engine_configs = engine_args.create_engine_configs()
         parallel_config = engine_configs[2]
         # Initialize the cluster.
-        placement_group = initialize_cluster(parallel_config,
-                                             engine_args.engine_use_ray)
+        placement_group = initialize_ray_cluster(parallel_config,
+                                                 engine_args.engine_use_ray)
         # Create the async LLM engine.
         engine = cls(parallel_config.worker_use_ray,
                      engine_args.engine_use_ray,

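The else branch added to _run_workers_async awaits each local worker's execute_method_async coroutine instead of a Ray remote call, then gathers everything with one asyncio.gather. Below is a self-contained sketch of that dispatch pattern; the LocalWorker class is hypothetical and only stands in for the real vLLM worker.

import asyncio

class LocalWorker:
    # Stand-in for a worker running in the same process as the driver.
    async def execute_method_async(self, method, *args, **kwargs):
        return getattr(self, method)(*args, **kwargs)

    def ping(self, tag):
        return f"pong:{tag}"

async def run_workers_async(workers, method, *args, **kwargs):
    # Same shape as the diff: build one coroutine per worker, await them together.
    coros = [w.execute_method_async(method, *args, **kwargs) for w in workers]
    return await asyncio.gather(*coros)

print(asyncio.run(run_workers_async([LocalWorker(), LocalWorker()], "ping", "x")))
# -> ['pong:x', 'pong:x']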