
Commit 9194383

njhill and sahilsuneja1 authored and committed
Changes pending from vllm-project/vllm#2898
Co-authored-by: Sahil Suneja <[email protected]>
Signed-off-by: Joe Runde <[email protected]>
1 parent 790513a commit 9194383

16 files changed (+431, -81 lines)

Dockerfile

Lines changed: 1 addition & 2 deletions
@@ -70,7 +70,7 @@ ADD . /vllm-workspace/
 COPY --from=build /workspace/vllm/*.so /vllm-workspace/vllm/
 # ignore build dependencies installation because we are using pre-complied extensions
 RUN rm pyproject.toml
-RUN --mount=type=cache,target=/root/.cache/pip VLLM_USE_PRECOMPILED=1 pip install . --verbose
+RUN --mount=type=cache,target=/root/.cache/pip VLLM_USE_PRECOMPILED=1 pip install .[ray] --verbose
 #################### TEST IMAGE ####################


@@ -80,7 +80,6 @@ RUN --mount=type=cache,target=/root/.cache/pip VLLM_USE_PRECOMPILED=1 pip instal
 # In the future it would be nice to get a container with pytorch and cuda without duplicating cuda
 FROM nvidia/cuda:12.1.0-runtime-ubuntu22.04 AS vllm-base

-# libnccl required for ray
 RUN apt-get update -y \
     && apt-get install -y python3-pip

requirements-rocm.txt

Lines changed: 0 additions & 1 deletion
@@ -2,7 +2,6 @@ ninja # For faster builds.
 typing-extensions>=4.8.0
 starlette
 psutil
-ray >= 2.9
 sentencepiece # Required for LLaMA tokenizer.
 numpy
 tokenizers>=0.15.0

requirements.txt

Lines changed: 0 additions & 1 deletion
@@ -1,6 +1,5 @@
 ninja # For faster builds.
 psutil
-ray >= 2.9
 sentencepiece # Required for LLaMA tokenizer.
 numpy
 torch == 2.1.2

setup.py

Lines changed: 9 additions & 1 deletion
@@ -6,13 +6,14 @@
 import sys
 import warnings
 from pathlib import Path
-from typing import List, Set
+from typing import List, Set, Dict

 from packaging.version import parse, Version
 import setuptools
 import torch
 import torch.utils.cpp_extension as torch_cpp_ext
 from torch.utils.cpp_extension import BuildExtension, CUDAExtension, CUDA_HOME, ROCM_HOME
+from typing import Optional

 ROOT_DIR = os.path.dirname(__file__)
 # This is a temporary directory to store third-party packages.

@@ -475,6 +476,12 @@ def get_requirements() -> List[str]:
     return requirements


+def get_ray_requirement() -> Optional[Dict[str, List[str]]]:
+    if _is_neuron():
+        return None
+    return {"ray": ["ray >= 2.9"]}
+
+
 package_data = {
     "vllm": ["py.typed", "model_executor/layers/fused_moe/configs/*.json"]
 }

@@ -508,6 +515,7 @@ def get_requirements() -> List[str]:
                                                "examples", "tests")),
     python_requires=">=3.8",
     install_requires=get_requirements(),
+    extras_requires=get_ray_requirement(),
     ext_modules=ext_modules,
     cmdclass={"build_ext": build_ext} if not _is_neuron() else {},
     distclass=BinaryDistribution,

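For context, here is a minimal, hypothetical setup.py sketch (not part of this commit) showing how an extras mapping of the shape returned by get_ray_requirement() is consumed by setuptools. The canonical setuptools keyword for this mapping is extras_require, and the package name example-pkg is made up for illustration.

# Standalone sketch, assuming setuptools is installed.
from setuptools import setup

extras = {"ray": ["ray >= 2.9"]}  # same shape as get_ray_requirement()

setup(
    name="example-pkg",            # hypothetical package name
    version="0.0.1",
    install_requires=["numpy"],    # always installed
    extras_require=extras,         # pulled in only via: pip install example-pkg[ray]
)
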
tests/engine/test_local_worker.py

Lines changed: 66 additions & 0 deletions
@@ -0,0 +1,66 @@
+import pytest
+import torch
+import multiprocessing as mp
+from vllm import LLM, SamplingParams
+
+TENSOR_PARALLEL_SIZE = 2
+MAX_GENERATION_TOKENS = 256
+
+
+def llm_generate(result_queue, prompt_token_ids, worker_use_ray=False):
+    try:
+        llm = LLM(model="facebook/opt-350m",
+                  tensor_parallel_size=TENSOR_PARALLEL_SIZE,
+                  worker_use_ray=worker_use_ray)
+
+        output = llm.generate(
+            prompt_token_ids=prompt_token_ids,
+            sampling_params=SamplingParams(max_tokens=MAX_GENERATION_TOKENS))
+    except BaseException as e:
+        output = e
+
+    result_queue.put(output)
+
+
+def run_llm(prompt_token_ids, worker_use_ray=False):
+    result_queue = mp.Queue()
+    proc = mp.Process(target=llm_generate,
+                      args=(result_queue, prompt_token_ids, worker_use_ray))
+    proc.start()
+    result = result_queue.get()
+    proc.join()
+    if isinstance(result, BaseException):
+        raise result
+    return result
+
+
+def get_prompts():
+    # https://github.com/vllm-project/vllm/issues/367#issuecomment-1629872996
+    batch_size = 32
+    dim = 120
+    max_token_id = 32000
+    torch.manual_seed(42)
+    batch = torch.randint(max_token_id, (batch_size, dim))
+    prompt_token_ids = [tokens.tolist() for tokens in batch]
+    return prompt_token_ids
+
+
+@pytest.mark.skip("Requires multiple GPUs")
+def test_local_worker():
+    # Similar to tests/lora/test_llama.py
+    # Cannot use as it will initialize torch.cuda too early...
+    # if torch.cuda.device_count() < 2:
+    #     pytest.skip(f"Not enough GPUs for tensor parallelism {2}")
+
+    prompt_token_ids = get_prompts()
+    output1 = run_llm(prompt_token_ids, worker_use_ray=False)
+    output2 = run_llm(prompt_token_ids, worker_use_ray=True)
+    assert len(output1) == len(output2)
+
+    completion_token_ids1 = [item.outputs[0].token_ids for item in output1]
+    completion_token_ids2 = [item.outputs[0].token_ids for item in output2]
+    assert completion_token_ids1 == completion_token_ids2
+
+
+if __name__ == "__main__":
+    test_local_worker()

vllm/__init__.py

Lines changed: 2 additions & 2 deletions
@@ -19,7 +19,7 @@ def _configure_system():
 from vllm.engine.arg_utils import AsyncEngineArgs, EngineArgs # noqa: E402
 from vllm.engine.async_llm_engine import AsyncLLMEngine # noqa: E402
 from vllm.engine.llm_engine import LLMEngine # noqa: E402
-from vllm.engine.ray_utils import initialize_cluster # noqa: E402
+from vllm.engine.ray_utils import initialize_ray_cluster # noqa: E402
 from vllm.entrypoints.llm import LLM # noqa: E402
 from vllm.outputs import CompletionOutput, RequestOutput # noqa: E402
 from vllm.sampling_params import SamplingParams # noqa: E402

@@ -35,5 +35,5 @@ def _configure_system():
     "EngineArgs",
     "AsyncLLMEngine",
     "AsyncEngineArgs",
-    "initialize_cluster",
+    "initialize_ray_cluster",
 ]

vllm/config.py

Lines changed: 12 additions & 10 deletions
@@ -1,3 +1,4 @@
+import importlib.util
 from typing import Optional, Union, ClassVar
 from dataclasses import dataclass
 import os

@@ -376,9 +377,9 @@ class ParallelConfig:
     Args:
         pipeline_parallel_size: Number of pipeline parallel groups.
         tensor_parallel_size: Number of tensor parallel groups.
-        worker_use_ray: Whether to use Ray for model workers. Will be set to
+        worker_use_ray: Whether to use Ray for model workers. Will default to
             True if either pipeline_parallel_size or tensor_parallel_size is
-            greater than 1.
+            greater than 1 and Ray is installed.
         max_parallel_loading_workers: Maximum number of multiple batches
             when load model sequentially. To avoid RAM OOM when using tensor
             parallel and large models.

@@ -392,7 +393,7 @@ def __init__(
         self,
         pipeline_parallel_size: int,
         tensor_parallel_size: int,
-        worker_use_ray: bool,
+        worker_use_ray: Optional[bool] = None,
         max_parallel_loading_workers: Optional[int] = None,
         disable_custom_all_reduce: bool = False,
         ray_workers_use_nsight: bool = False,

@@ -412,9 +413,10 @@ def __init__(
         self.ray_workers_use_nsight = ray_workers_use_nsight

         self.world_size = pipeline_parallel_size * self.tensor_parallel_size
-        # Ray worker is not supported for Neuron backend.
-        if self.world_size > 1 and not is_neuron():
-            self.worker_use_ray = True
+        if self.worker_use_ray is None:
+            ray_found = importlib.util.find_spec("ray") is not None
+            self.worker_use_ray = ray_found and self.world_size > 1
+
         self._verify_args()

     def _verify_args(self) -> None:

@@ -498,12 +500,12 @@ class DeviceConfig:
     def __init__(self, device: str = "auto") -> None:
         if device == "auto":
             # Automated device type detection
-            if torch.cuda.is_available():
-                self.device_type = "cuda"
-            elif is_neuron():
+            if is_neuron():
                 self.device_type = "neuron"
             else:
-                raise RuntimeError("No supported device detected.")
+                # We don't call torch.cuda.is_available() here to
+                # avoid initializing CUDA before workers are forked
+                self.device_type = "cuda"
         else:
             # Device type is assigned explicitly
             self.device_type = device

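The ParallelConfig change above relies on importlib.util.find_spec to detect Ray without importing it (and so without touching CUDA). A minimal standalone sketch of that pattern follows; the ray_available() helper is hypothetical and only illustrates the check.

import importlib.util

def ray_available() -> bool:
    # find_spec() only looks the module up on sys.path; it does not import it.
    return importlib.util.find_spec("ray") is not None

# Mirrors the new default: use Ray only when it is installed and world_size > 1.
world_size = 2  # pipeline_parallel_size * tensor_parallel_size
worker_use_ray = ray_available() and world_size > 1
print(worker_use_ray)
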
vllm/engine/arg_utils.py

Lines changed: 7 additions & 5 deletions
@@ -20,7 +20,7 @@ class EngineArgs:
     kv_cache_dtype: str = 'auto'
     seed: int = 0
     max_model_len: Optional[int] = None
-    worker_use_ray: bool = False
+    worker_use_ray: Optional[bool] = None
     pipeline_parallel_size: int = 1
     tensor_parallel_size: int = 1
     max_parallel_loading_workers: Optional[int] = None

@@ -149,10 +149,12 @@ def add_cli_args(
                             help='model context length. If unspecified, '
                             'will be automatically derived from the model.')
         # Parallel arguments
-        parser.add_argument('--worker-use-ray',
-                            action='store_true',
-                            help='use Ray for distributed serving, will be '
-                            'automatically set when using more than 1 GPU')
+        parser.add_argument(
+            '--worker-use-ray',
+            action=argparse.BooleanOptionalAction,
+            default=None,
+            help='use Ray for distributed serving, will default '
+            'to true when ray is installed and more than 1 GPU is used')
         parser.add_argument('--pipeline-parallel-size',
                             '-pp',
                             type=int,

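The reworked CLI flag is tri-state: argparse.BooleanOptionalAction (available since Python 3.9) registers both --worker-use-ray and --no-worker-use-ray, and leaving the flag off keeps the None default so ParallelConfig can decide later. A small standalone sketch, separate from the actual vLLM parser:

import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--worker-use-ray',
                    action=argparse.BooleanOptionalAction,
                    default=None)

print(parser.parse_args([]).worker_use_ray)                        # None
print(parser.parse_args(['--worker-use-ray']).worker_use_ray)      # True
print(parser.parse_args(['--no-worker-use-ray']).worker_use_ray)   # False
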
vllm/engine/async_llm_engine.py

Lines changed: 12 additions & 6 deletions
@@ -9,7 +9,7 @@
 from vllm.config import ModelConfig
 from vllm.engine.arg_utils import AsyncEngineArgs
 from vllm.engine.llm_engine import LLMEngine
-from vllm.engine.ray_utils import initialize_cluster, ray
+from vllm.engine.ray_utils import initialize_ray_cluster, ray
 from vllm.logger import init_logger
 from vllm.outputs import RequestOutput
 from vllm.sampling_params import SamplingParams

@@ -287,9 +287,15 @@ async def _run_workers_async(
         coros.append(asyncio.get_event_loop().run_in_executor(
             None, partial(driver_executor, *driver_args, **driver_kwargs)))

-        # Run the ray workers asynchronously.
-        for worker in self.workers:
-            coros.append(worker.execute_method.remote(method, *args, **kwargs))
+        # Run the workers asynchronously.
+        if self.parallel_config.worker_use_ray:
+            for worker in self.workers:
+                coros.append(
+                    worker.execute_method.remote(method, *args, **kwargs))
+        else:
+            for worker in self.workers:
+                coros.append(
+                    worker.execute_method_async(method, *args, **kwargs))

         all_outputs = await asyncio.gather(*coros)
         return all_outputs

@@ -674,8 +680,8 @@ def from_engine_args(cls,
         engine_configs = engine_args.create_engine_configs()
         parallel_config = engine_configs[2]
         # Initialize the cluster.
-        placement_group = initialize_cluster(parallel_config,
-                                             engine_args.engine_use_ray)
+        placement_group = initialize_ray_cluster(parallel_config,
+                                                 engine_args.engine_use_ray)
         # Create the async LLM engine.
         engine = cls(parallel_config.worker_use_ray,
                      engine_args.engine_use_ray,

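The else branch added to _run_workers_async awaits each local worker's execute_method_async coroutine instead of a Ray remote call, then gathers everything with one asyncio.gather. Below is a self-contained sketch of that dispatch pattern; the LocalWorker class is hypothetical and only stands in for the real vLLM worker.

import asyncio

class LocalWorker:
    # Stand-in for a worker running in the same process as the driver.
    async def execute_method_async(self, method, *args, **kwargs):
        return getattr(self, method)(*args, **kwargs)

    def ping(self, tag):
        return f"pong:{tag}"

async def run_workers_async(workers, method, *args, **kwargs):
    # Same shape as the diff: build one coroutine per worker, await them together.
    coros = [w.execute_method_async(method, *args, **kwargs) for w in workers]
    return await asyncio.gather(*coros)

print(asyncio.run(run_workers_async([LocalWorker(), LocalWorker()], "ping", "x")))
# -> ['pong:x', 'pong:x']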