Skip to content

Commit c948d32

Browse files
jglaser authored and liuzijing2014 committed
Use custom address for listening socket (vllm-project#15988)
Signed-off-by: Jens Glaser <[email protected]>
1 parent a6bfdea commit c948d32

File tree

1 file changed

+21
-1
lines changed

1 file changed

+21
-1
lines changed

vllm/distributed/utils.py

Lines changed: 21 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -7,6 +7,7 @@
77
import dataclasses
88
import datetime
99
import pickle
10+
import socket
1011
import time
1112
from collections import deque
1213
from typing import Any, Deque, Dict, Optional, Sequence, Tuple
@@ -123,6 +124,10 @@ class StatelessProcessGroup:
123124
rank: int
124125
world_size: int
125126
store: torch._C._distributed_c10d.Store
127+
128+
# stores a reference to the socket so that the file descriptor stays alive
129+
socket: Optional[socket.socket]
130+
126131
data_expiration_seconds: int = 3600 # 1 hour
127132

128133
# dst rank -> counter
@@ -234,18 +239,33 @@ def create(
234239
can call `StatelessProcessGroup.create` to form a group, and then process A, B,
235240
C, and D can call `StatelessProcessGroup.create` to form another group.
236241
""" # noqa
242+
launch_server = rank == 0
243+
if launch_server:
244+
# listen on the specified interface (instead of 0.0.0.0)
245+
listen_socket = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
246+
listen_socket.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
247+
listen_socket.bind((host, port))
248+
listen_socket.listen()
249+
listen_fd = listen_socket.fileno()
250+
else:
251+
listen_socket = None
252+
listen_fd = None
253+
237254
store = TCPStore(
238255
host_name=host,
239256
port=port,
240257
world_size=world_size,
241-
is_master=(rank == 0),
258+
is_master=launch_server,
242259
timeout=datetime.timedelta(seconds=store_timeout),
260+
use_libuv=False, # for now: github.com/pytorch/pytorch/pull/150215
261+
master_listen_fd=listen_fd,
243262
)
244263

245264
return StatelessProcessGroup(
246265
rank=rank,
247266
world_size=world_size,
248267
store=store,
268+
socket=listen_socket,
249269
data_expiration_seconds=data_expiration_seconds)
250270

251271

0 commit comments

Comments
 (0)