File tree 3 files changed +26
-2
lines changed 3 files changed +26
-2
lines changed Original file line number Diff line number Diff line change 21
21
from vllm .logger import init_logger
22
22
from vllm .sampling_params import SamplingParams
23
23
from vllm .usage .usage_lib import UsageContext
24
- from vllm .utils import FlexibleArgumentParser , random_uuid
24
+ from vllm .utils import FlexibleArgumentParser , random_uuid , set_ulimit
25
25
from vllm .version import __version__ as VLLM_VERSION
26
26
27
27
logger = init_logger ("vllm.entrypoints.api_server" )
@@ -119,6 +119,8 @@ async def run_server(args: Namespace,
119
119
logger .info ("vLLM API server version %s" , VLLM_VERSION )
120
120
logger .info ("args: %s" , args )
121
121
122
+ set_ulimit ()
123
+
122
124
app = await init_app (args , llm_engine )
123
125
assert engine is not None
124
126
Original file line number Diff line number Diff line change 68
68
from vllm .logger import init_logger
69
69
from vllm .usage .usage_lib import UsageContext
70
70
from vllm .utils import (FlexibleArgumentParser , get_open_zmq_ipc_path ,
71
- is_valid_ipv6_address )
71
+ is_valid_ipv6_address , set_ulimit )
72
72
from vllm .version import __version__ as VLLM_VERSION
73
73
74
74
TIMEOUT_KEEP_ALIVE = 5 # seconds
@@ -727,6 +727,10 @@ async def run_server(args, **uvicorn_kwargs) -> None:
727
727
sock_addr = (args .host or "" , args .port )
728
728
sock = create_server_socket (sock_addr )
729
729
730
+ # workaround to avoid footguns where uvicorn drops requests with too
731
+ # many concurrent requests active
732
+ set_ulimit ()
733
+
730
734
def signal_handler (* _ ) -> None :
731
735
# Interrupt server on sigterm while initializing
732
736
raise KeyboardInterrupt ("terminated" )
Original file line number Diff line number Diff line change 12
12
import ipaddress
13
13
import os
14
14
import re
15
+ import resource
15
16
import signal
16
17
import socket
17
18
import subprocess
@@ -1818,3 +1819,20 @@ def memory_profiling(
1818
1819
result .non_torch_increase_in_bytes = current_cuda_memory_bytes - baseline_memory_in_bytes - weights_memory_in_bytes - diff .torch_memory_in_bytes # noqa
1819
1820
result .profile_time = diff .timestamp
1820
1821
result .non_kv_cache_memory_in_bytes = result .non_torch_increase_in_bytes + result .torch_peak_increase_in_bytes + result .weights_memory_in_bytes # noqa
1822
+
1823
+
1824
+ # Adapted from: https://github.com/sgl-project/sglang/blob/f46f394f4d4dbe4aae85403dec006199b34d2840/python/sglang/srt/utils.py#L630 # noqa: E501
1825
def set_ulimit(target_soft_limit: int = 65535) -> None:
    """Best-effort raise of the soft limit on open file descriptors.

    Reads the current ``RLIMIT_NOFILE`` (soft, hard) pair and, when the soft
    limit is below ``target_soft_limit``, tries to raise the soft limit to
    that value while leaving the hard limit untouched. A failure to raise
    the limit is logged as a warning and never propagated.

    Args:
        target_soft_limit: Desired minimum soft limit for open file
            descriptors. Nothing is changed when the current soft limit is
            already at least this value.
    """
    resource_type = resource.RLIMIT_NOFILE
    current_soft, current_hard = resource.getrlimit(resource_type)

    # Already high enough: never lower an existing limit.
    if current_soft >= target_soft_limit:
        return

    try:
        resource.setrlimit(resource_type, (target_soft_limit, current_hard))
    except (ValueError, OSError) as e:
        # ValueError: the requested soft limit exceeds the hard limit.
        # OSError: the underlying setrlimit system call failed.
        # Either way this is only a convenience, so warn and carry on.
        logger.warning(
            "Found ulimit of %s and failed to automatically increase "
            "with error %s. This can cause fd limit errors like "
            "`OSError: [Errno 24] Too many open files`. Consider "
            "increasing with ulimit -n", current_soft, e)
You can’t perform that action at this time.
0 commit comments