Commit 6c4dbe2

[BugFix] Pop instead of del CUDA_VISIBLE_DEVICES (#12962)
Signed-off-by: Hollow Man <[email protected]>
1 parent 21f5d50 · commit 6c4dbe2
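
The change is the same in every file: del os.environ["CUDA_VISIBLE_DEVICES"] becomes os.environ.pop("CUDA_VISIBLE_DEVICES", None). del raises a KeyError when the variable is not set (for example, when the launcher never exported it to the process), while pop with a default removes the key if it is present and is a no-op otherwise. A minimal standalone sketch of the difference (not taken from the patched files):

import os

# Make sure the variable is absent to show the failure mode.
os.environ.pop("CUDA_VISIBLE_DEVICES", None)

try:
    del os.environ["CUDA_VISIBLE_DEVICES"]  # raises when the key is missing
except KeyError:
    print("del raised KeyError")

os.environ.pop("CUDA_VISIBLE_DEVICES", None)  # safe: no-op when missing
print("pop returned without error")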

File tree

4 files changed: +9 −9 lines changed


examples/offline_inference/rlhf.py

Lines changed: 1 addition & 1 deletion
@@ -92,7 +92,7 @@ def __init__(self, *args, **kwargs):
         # a hack to make the script work.
         # stop ray from manipulating CUDA_VISIBLE_DEVICES
         # at the top-level
-        del os.environ["CUDA_VISIBLE_DEVICES"]
+        os.environ.pop("CUDA_VISIBLE_DEVICES", None)
         super().__init__(*args, **kwargs)
examples/offline_inference/rlhf_colocate.py

Lines changed: 1 addition & 1 deletion
@@ -59,7 +59,7 @@ def __init__(self, *args, bundle_indices: list, **kwargs):
         # a hack to make the script work.
         # stop ray from manipulating CUDA_VISIBLE_DEVICES
         # at the top-level
-        del os.environ["CUDA_VISIBLE_DEVICES"]
+        os.environ.pop("CUDA_VISIBLE_DEVICES", None)
         # every worker will use 0.4 GPU, so that we can schedule
         # 2 instances on the same GPUs.
         os.environ["VLLM_RAY_PER_WORKER_GPUS"] = "0.4"

tests/distributed/test_comm_ops.py

Lines changed: 5 additions & 5 deletions
@@ -22,7 +22,7 @@ def all_reduce_test_worker(tp_size: int, pp_size: int, rank: int,
     # it is important to delete the CUDA_VISIBLE_DEVICES environment variable
     # so that each worker can see all the GPUs
     # they will be able to set the device to the correct GPU
-    del os.environ["CUDA_VISIBLE_DEVICES"]
+    os.environ.pop("CUDA_VISIBLE_DEVICES", None)
     device = torch.device(f"cuda:{rank}")
     torch.cuda.set_device(device)
     init_test_distributed_environment(tp_size, pp_size, rank,
@@ -44,7 +44,7 @@ def all_gather_test_worker(tp_size: int, pp_size: int, rank: int,
     # it is important to delete the CUDA_VISIBLE_DEVICES environment variable
     # so that each worker can see all the GPUs
     # they will be able to set the device to the correct GPU
-    del os.environ["CUDA_VISIBLE_DEVICES"]
+    os.environ.pop("CUDA_VISIBLE_DEVICES", None)
     device = torch.device(f"cuda:{rank}")
     torch.cuda.set_device(device)
     init_test_distributed_environment(tp_size, pp_size, rank,
@@ -72,7 +72,7 @@ def broadcast_tensor_dict_test_worker(tp_size: int, pp_size: int, rank: int,
     # it is important to delete the CUDA_VISIBLE_DEVICES environment variable
     # so that each worker can see all the GPUs
     # they will be able to set the device to the correct GPU
-    del os.environ["CUDA_VISIBLE_DEVICES"]
+    os.environ.pop("CUDA_VISIBLE_DEVICES", None)
     device = torch.device(f"cuda:{rank}")
     torch.cuda.set_device(device)
     init_test_distributed_environment(tp_size, pp_size, rank,
@@ -108,7 +108,7 @@ def broadcast_tensor_dict_test_worker(tp_size: int, pp_size: int, rank: int,
 @ray.remote(num_gpus=1, max_calls=1)
 def send_recv_tensor_dict_test_worker(tp_size: int, pp_size: int, rank: int,
                                       distributed_init_port: str):
-    del os.environ["CUDA_VISIBLE_DEVICES"]
+    os.environ.pop("CUDA_VISIBLE_DEVICES", None)
     device = torch.device(f"cuda:{rank}")
     torch.cuda.set_device(device)
     init_test_distributed_environment(tp_size, pp_size, rank,
@@ -148,7 +148,7 @@ def send_recv_tensor_dict_test_worker(tp_size: int, pp_size: int, rank: int,
 @ray.remote(num_gpus=1, max_calls=1)
 def send_recv_test_worker(tp_size: int, pp_size: int, rank: int,
                           distributed_init_port: str):
-    del os.environ["CUDA_VISIBLE_DEVICES"]
+    os.environ.pop("CUDA_VISIBLE_DEVICES", None)
     device = torch.device(f"cuda:{rank}")
     torch.cuda.set_device(device)
     init_test_distributed_environment(tp_size, pp_size, rank,
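
Each test worker above follows the same pattern: Ray reserves one GPU per remote task and narrows CUDA_VISIBLE_DEVICES to it, so the worker clears the variable before selecting its own device by rank. A minimal standalone sketch of that pattern, assuming Ray, PyTorch, and a multi-GPU host (the worker name and the two-GPU launch are illustrative, not from the test suite):

import os

import ray
import torch


@ray.remote(num_gpus=1, max_calls=1)
def example_worker(rank: int) -> str:
    # Ray restricts CUDA_VISIBLE_DEVICES to the GPU it reserved for this task;
    # pop it (safely, even if unset) so the worker sees all GPUs and can pick
    # the one matching its rank.
    os.environ.pop("CUDA_VISIBLE_DEVICES", None)
    device = torch.device(f"cuda:{rank}")
    torch.cuda.set_device(device)
    return str(device)


if __name__ == "__main__":
    ray.init()
    print(ray.get([example_worker.remote(rank) for rank in range(2)]))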

tests/distributed/test_custom_all_reduce.py

Lines changed: 2 additions & 2 deletions
@@ -24,7 +24,7 @@

 @ray.remote(num_gpus=1, max_calls=1)
 def graph_allreduce(tp_size, pp_size, rank, distributed_init_port):
-    del os.environ["CUDA_VISIBLE_DEVICES"]
+    os.environ.pop("CUDA_VISIBLE_DEVICES", None)
     device = torch.device(f"cuda:{rank}")
     torch.cuda.set_device(device)
     init_test_distributed_environment(tp_size, pp_size, rank,
@@ -80,7 +80,7 @@ def graph_allreduce(tp_size, pp_size, rank, distributed_init_port):

 @ray.remote(num_gpus=1, max_calls=1)
 def eager_allreduce(tp_size, pp_size, rank, distributed_init_port):
-    del os.environ["CUDA_VISIBLE_DEVICES"]
+    os.environ.pop("CUDA_VISIBLE_DEVICES", None)
     device = torch.device(f"cuda:{rank}")
     torch.cuda.set_device(device)
     init_test_distributed_environment(tp_size, pp_size, rank,
