File tree 1 file changed +15
-2
lines changed
1 file changed +15
-2
lines changed Original file line number Diff line number Diff line change 8
8
# that interact with vllm workers.
9
9
# they are executed whenever `import vllm` is called.
10
10
11
- # see https://github.com/NVIDIA/nccl/issues/1234
12
- os.environ['NCCL_CUMEM_ENABLE'] = '0'
11
+ if not os.path.exists('/dev/nvidia-caps-imex-channels'):
12
+ # normally, we disable NCCL_CUMEM_ENABLE because it
13
+ # will cost 1~2 GiB GPU memory with cudagraph+allreduce,
14
+ # see https://github.com/NVIDIA/nccl/issues/1234
15
+ # for more details.
16
+ # However, NCCL requires NCCL_CUMEM_ENABLE to work with
17
+ # multi-node NVLink, typically on GB200-NVL72 systems.
18
+ # The ultimate way to detect multi-node NVLink is to use
19
+ # NVML APIs, which are too expensive to call here.
20
+ # As an approximation, we check the existence of
21
+ # /dev/nvidia-caps-imex-channels, used by
22
+ # multi-node NVLink to communicate across nodes.
23
+ # This will still cost some GPU memory, but it is worthwhile
24
+ # because we can get very fast cross-node bandwidth with NVLink.
25
+     os.environ['NCCL_CUMEM_ENABLE'] = '0'
13
26
14
27
# see https://github.com/vllm-project/vllm/pull/15951
15
28
# it avoids unintentional cuda initialization from torch.cuda.is_available()
You can’t perform that action at this time.
0 commit comments