"""vLLM: a high-throughput and memory-efficient inference engine for LLMs"""
+ import os
+
+ import torch

from vllm.engine.arg_utils import AsyncEngineArgs, EngineArgs
from vllm.engine.async_llm_engine import AsyncLLMEngine
from .version import __version__, __version_tuple__

+ # set some common config/environment variables that should be set
+ # for all processes created by vllm and all processes
+ # that interact with vllm workers.
+ # they are executed whenever `import vllm` is called.

- def configure_as_vllm_process():
-     """
-     set some common config/environment variables that should be set
-     for all processes created by vllm and all processes
-     that interact with vllm workers.
-     """
-     import os
-
-     import torch
-
-     # see https://github.com/NVIDIA/nccl/issues/1234
-     os.environ['NCCL_CUMEM_ENABLE'] = '0'
-
-     # see https://github.com/vllm-project/vllm/issues/10480
-     os.environ['TORCHINDUCTOR_COMPILE_THREADS'] = '1'
-     # see https://github.com/vllm-project/vllm/issues/10619
-     torch._inductor.config.compile_threads = 1
-
-     from vllm.platforms import current_platform
-
-     if current_platform.is_xpu():
-         # see https://github.com/pytorch/pytorch/blob/43c5f59/torch/_dynamo/config.py#L158
-         torch._dynamo.config.disable = True
-     elif current_platform.is_hpu():
-         # NOTE(kzawora): PT HPU lazy backend (PT_HPU_LAZY_MODE = 1)
-         # does not support torch.compile
-         # Eager backend (PT_HPU_LAZY_MODE = 0) must be selected for
-         # torch.compile support
-         is_lazy = os.environ.get('PT_HPU_LAZY_MODE', '1') == '1'
-         if is_lazy:
-             torch._dynamo.config.disable = True
-             # NOTE(kzawora) multi-HPU inference with HPUGraphs (lazy-only)
-             # requires enabling lazy collectives
-             # see https://docs.habana.ai/en/latest/PyTorch/Inference_on_PyTorch/Inference_Using_HPU_Graphs.html # noqa: E501
-             os.environ['PT_HPU_ENABLE_LAZY_COLLECTIVES'] = 'true'
+ # see https://github.com/NVIDIA/nccl/issues/1234
+ os.environ['NCCL_CUMEM_ENABLE'] = '0'

+ # see https://github.com/vllm-project/vllm/issues/10480
+ os.environ['TORCHINDUCTOR_COMPILE_THREADS'] = '1'
+ # see https://github.com/vllm-project/vllm/issues/10619
+ torch._inductor.config.compile_threads = 1

__all__ = [
    "__version__",
@@ -80,5 +58,4 @@ def configure_as_vllm_process():
    "AsyncEngineArgs",
    "initialize_ray_cluster",
    "PoolingParams",
-     "configure_as_vllm_process",
]
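
After this diff, the environment setup is a side effect of importing the package rather than an explicit call. A minimal sketch of how caller code changes (assuming vllm is installed; the assertions only illustrate the import-time effect):

# Before this change, processes interacting with vllm workers had to opt in:
#     import vllm
#     vllm.configure_as_vllm_process()
#
# After it, a plain import applies the settings shown in the diff:
import os

import vllm  # noqa: F401 -- imported for its side effects

assert os.environ['NCCL_CUMEM_ENABLE'] == '0'
assert os.environ['TORCHINDUCTOR_COMPILE_THREADS'] == '1'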