1
1
import asyncio
2
+ import os
2
3
from typing import Any , Callable , List , Optional , Union
3
4
4
5
import cloudpickle
10
11
from vllm .logger import init_logger
11
12
from vllm .model_executor .layers .sampler import SamplerOutput
12
13
from vllm .sequence import ExecuteModelRequest
13
- from vllm .utils import (_run_task_with_lock , get_distributed_init_method ,
14
- get_ip , get_open_port , make_async , run_method )
14
+ from vllm .utils import (_run_task_with_lock , cuda_device_count_stateless ,
15
+ get_distributed_init_method , get_ip , get_open_port ,
16
+ make_async , run_method , update_environment_variables )
15
17
from vllm .worker .worker_base import WorkerWrapperBase
16
18
17
19
logger = init_logger (__name__ )
@@ -22,7 +24,39 @@ class MultiprocessingDistributedExecutor(DistributedExecutorBase):
22
24
23
25
uses_ray : bool = False
24
26
27
def _check_cuda(self) -> None:
    """Check that enough local CUDA devices exist for the parallel
    configuration. Separate from _init_executor to reduce the number of
    indented blocks.

    Raises:
        RuntimeError: if ``tensor_parallel_size`` or ``world_size``
            exceeds the number of locally visible CUDA devices.
    """
    parallel_config = self.parallel_config
    world_size = parallel_config.world_size
    tensor_parallel_size = parallel_config.tensor_parallel_size

    cuda_device_count = cuda_device_count_stateless()
    # Check TP first so the more common TP-only misconfiguration
    # produces the TP-specific message.
    if tensor_parallel_size > cuda_device_count:
        raise RuntimeError(
            f"please set tensor_parallel_size ({tensor_parallel_size}) "
            f"to less than max local gpu count ({cuda_device_count})")

    if world_size > cuda_device_count:
        # NOTE: fixed doubled word ("less than than") in the original
        # error message.
        raise RuntimeError(
            f"please ensure that world_size ({world_size}) "
            f"is less than max local gpu count ({cuda_device_count})")

    # Set CUDA_VISIBLE_DEVICES for the driver; child worker processes
    # inherit the environment, so they see the same device set.
    if "CUDA_VISIBLE_DEVICES" not in os.environ:
        update_environment_variables({
            "CUDA_VISIBLE_DEVICES": (",".join(map(str, range(world_size))))
        })
53
+
25
54
def _init_executor (self ) -> None :
55
+
56
+ from vllm .platforms import current_platform
57
+ if current_platform .is_cuda_alike ():
58
+ self ._check_cuda ()
59
+
26
60
# Create the parallel GPU workers.
27
61
world_size = self .parallel_config .world_size
28
62
tensor_parallel_size = self .parallel_config .tensor_parallel_size
0 commit comments