
Commit e51929e

Improve configs - SchedulerConfig (#16533)
Signed-off-by: Harry Mellor <[email protected]>
1 parent dc1b4a6 commit e51929e

4 files changed: +283, -222 lines changed


vllm/config.py

Lines changed: 102 additions & 55 deletions
@@ -1522,6 +1522,9 @@ def __post_init__(self):
             self.ignore_patterns = ["original/**/*"]
 
 
+DistributedExecutorBackend = Literal["ray", "mp", "uni", "external_launcher"]
+
+
 @config
 @dataclass
 class ParallelConfig:
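
The hunk above swaps a plain `str` annotation for a module-level `Literal` alias. As a rough illustration of why that pattern is useful (this sketch is not vLLM code; the helper name is made up), the alias lets type checkers flag invalid values and lets runtime code enumerate the allowed ones:

from typing import Literal, get_args

DistributedExecutorBackend = Literal["ray", "mp", "uni", "external_launcher"]

def check_backend(value: str) -> None:
    # get_args() recovers the allowed strings from the Literal alias, so
    # runtime validation stays in sync with the static annotation.
    allowed = get_args(DistributedExecutorBackend)
    if value not in allowed:
        raise ValueError(f"backend must be one of {allowed}, got {value!r}")

check_backend("mp")      # passes
# check_backend("slurm") # would raise ValueError
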
@@ -1563,7 +1566,7 @@ class ParallelConfig:
     placement_group: Optional["PlacementGroup"] = None
     """ray distributed model workers placement group."""
 
-    distributed_executor_backend: Optional[Union[str,
+    distributed_executor_backend: Optional[Union[DistributedExecutorBackend,
                                                  type["ExecutorBase"]]] = None
     """Backend to use for distributed model
     workers, either "ray" or "mp" (multiprocessing). If the product
@@ -1687,7 +1690,7 @@ def __post_init__(self) -> None:
             # current node and we aren't in a ray placement group.
 
             from vllm.executor import ray_utils
-            backend = "mp"
+            backend: DistributedExecutorBackend = "mp"
             ray_found = ray_utils.ray_is_available()
             if current_platform.is_neuron():
                 # neuron uses single process to control multiple devices
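
With the annotation in place, `backend` is constrained to the same literal values a user can pass through the engine arguments. A minimal usage sketch (assuming a host with at least two GPUs; the model name is only an example):

from vllm import LLM

llm = LLM(
    model="facebook/opt-125m",
    tensor_parallel_size=2,
    # One of the DistributedExecutorBackend literals: "ray", "mp", "uni" or
    # "external_launcher". If left unset, ParallelConfig.__post_init__ picks
    # "mp" or "ray" as shown in the hunk above.
    distributed_executor_backend="mp",
)
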
@@ -1755,92 +1758,124 @@ def _verify_args(self) -> None:
                 "worker_extension_cls must be a string (qualified class name).")
 
 
+SchedulerPolicy = Literal["fcfs", "priority"]
+
+
+@config
 @dataclass
 class SchedulerConfig:
     """Scheduler configuration."""
 
-    runner_type: str = "generate"  # The runner type to launch for the model.
+    runner_type: RunnerType = "generate"
+    """The runner type to launch for the model."""
 
-    # Maximum number of tokens to be processed in a single iteration.
-    max_num_batched_tokens: int = field(default=None)  # type: ignore
+    max_num_batched_tokens: int = None  # type: ignore
+    """Maximum number of tokens to be processed in a single iteration.
+
+    This config has no static default. If left unspecified by the user, it will
+    be set in `EngineArgs.create_engine_config` based on the usage context."""
 
-    # Maximum number of sequences to be processed in a single iteration.
-    max_num_seqs: int = 128
+    max_num_seqs: int = None  # type: ignore
+    """Maximum number of sequences to be processed in a single iteration.
+
+    This config has no static default. If left unspecified by the user, it will
+    be set in `EngineArgs.create_engine_config` based on the usage context."""
 
-    # Maximum length of a sequence (including prompt and generated text).
-    max_model_len: int = 8192
+    max_model_len: int = None  # type: ignore
+    """Maximum length of a sequence (including prompt and generated text). This
+    is primarily set in `ModelConfig` and that value should be manually
+    duplicated here."""
 
-    # Maximum number of sequences that can be partially prefilled concurrently
     max_num_partial_prefills: int = 1
+    """For chunked prefill, the maximum number of sequences that can be
+    partially prefilled concurrently."""
 
-    # Maximum number of "very long prompt" sequences that can be prefilled
-    # concurrently (long is defined by long_prefill_threshold)
     max_long_partial_prefills: int = 1
+    """For chunked prefill, the maximum number of prompts longer than
+    long_prefill_token_threshold that will be prefilled concurrently. Setting
+    this less than max_num_partial_prefills will allow shorter prompts to jump
+    the queue in front of longer prompts in some cases, improving latency."""
 
-    # calculate context length that determines which sequences are
-    # considered "long"
     long_prefill_token_threshold: int = 0
+    """For chunked prefill, a request is considered long if the prompt is
+    longer than this number of tokens."""
 
-    # The number of slots to allocate per sequence per
-    # step, beyond the known token ids. This is used in speculative
-    # decoding to store KV activations of tokens which may or may not be
-    # accepted.
     num_lookahead_slots: int = 0
+    """The number of slots to allocate per sequence per
+    step, beyond the known token ids. This is used in speculative
+    decoding to store KV activations of tokens which may or may not be
+    accepted.
+
+    NOTE: This will be replaced by speculative config in the future; it is
+    present to enable correctness tests until then."""
 
-    # Apply a delay (of delay factor multiplied by previous
-    # prompt latency) before scheduling next prompt.
     delay_factor: float = 0.0
+    """Apply a delay (of delay factor multiplied by previous
+    prompt latency) before scheduling next prompt."""
 
-    # If True, prefill requests can be chunked based
-    # on the remaining max_num_batched_tokens.
-    enable_chunked_prefill: bool = False
+    enable_chunked_prefill: bool = None  # type: ignore
+    """If True, prefill requests can be chunked based
+    on the remaining max_num_batched_tokens."""
 
     is_multimodal_model: bool = False
+    """True if the model is multimodal."""
+
+    # TODO (ywang96): Make this configurable.
+    max_num_encoder_input_tokens: int = field(init=False)
+    """Multimodal encoder compute budget, only used in V1.
+
+    NOTE: This is not currently configurable. It will be overridden by
+    max_num_batched_tokens in case max multimodal embedding size is larger."""
+
+    # TODO (ywang96): Make this configurable.
+    encoder_cache_size: int = field(init=False)
+    """Multimodal encoder cache size, only used in V1.
+
+    NOTE: This is not currently configurable. It will be overridden by
+    max_num_batched_tokens in case max multimodal embedding size is larger."""
 
-    # NOTE: The following multimodal encoder budget will be initialized to
-    # max_num_batched_tokens and overridden in case max multimodal embedding
-    # size is larger.
-    # TODO (ywang96): Make these configurable.
-    # Multimodal encoder compute budget, only used in V1
-    max_num_encoder_input_tokens: int = field(default=None)  # type: ignore
-
-    # Multimodal encoder cache size, only used in V1
-    encoder_cache_size: int = field(default=None)  # type: ignore
-
-    # Whether to perform preemption by swapping or
-    # recomputation. If not specified, we determine the mode as follows:
-    # We use recomputation by default since it incurs lower overhead than
-    # swapping. However, when the sequence group has multiple sequences
-    # (e.g., beam search), recomputation is not currently supported. In
-    # such a case, we use swapping instead.
     preemption_mode: Optional[str] = None
+    """Whether to perform preemption by swapping or
+    recomputation. If not specified, we determine the mode as follows:
+    We use recomputation by default since it incurs lower overhead than
+    swapping. However, when the sequence group has multiple sequences
+    (e.g., beam search), recomputation is not currently supported. In
+    such a case, we use swapping instead."""
 
     num_scheduler_steps: int = 1
+    """Maximum number of forward steps per scheduler call."""
 
-    multi_step_stream_outputs: bool = False
+    multi_step_stream_outputs: bool = True
+    """If False, then multi-step will stream outputs at the end of all steps"""
 
-    # Private API. If used, scheduler sends delta data to
-    # workers instead of an entire data. It should be enabled only
-    # when SPMD worker architecture is enabled. I.e.,
-    # VLLM_USE_RAY_SPMD_WORKER=1
     send_delta_data: bool = False
-
-    # The scheduling policy to use. "fcfs" (default) or "priority".
-    policy: str = "fcfs"
+    """Private API. If used, scheduler sends delta data to
+    workers instead of an entire data. It should be enabled only
+    when SPMD worker architecture is enabled. I.e.,
+    VLLM_USE_RAY_SPMD_WORKER=1"""
+
+    policy: SchedulerPolicy = "fcfs"
+    """The scheduling policy to use:\n
+    - "fcfs" means first come first served, i.e. requests are handled in order
+    of arrival.\n
+    - "priority" means requests are handled based on given priority (lower
+    value means earlier handling) and time of arrival deciding any ties."""
 
     chunked_prefill_enabled: bool = field(init=False)
+    """True if chunked prefill is enabled."""
 
-    # If set to true and chunked prefill is enabled, we do not want to
-    # partially schedule a multimodal item. Only used in V1
-    # This ensures that if a request has a mixed prompt
-    # (like text tokens TTTT followed by image tokens IIIIIIIIII) where only
-    # some image tokens can be scheduled (like TTTTIIIII, leaving IIIII),
-    # it will be scheduled as TTTT in one step and IIIIIIIIII in the next.
     disable_chunked_mm_input: bool = False
+    """If set to true and chunked prefill is enabled, we do not want to
+    partially schedule a multimodal item. Only used in V1.
+    This ensures that if a request has a mixed prompt
+    (like text tokens TTTT followed by image tokens IIIIIIIIII) where only
+    some image tokens can be scheduled (like TTTTIIIII, leaving IIIII),
+    it will be scheduled as TTTT in one step and IIIIIIIIII in the next."""
 
-    # scheduler class or path. "vllm.core.scheduler.Scheduler" (default)
-    # or "mod.custom_class".
     scheduler_cls: Union[str, type[object]] = "vllm.core.scheduler.Scheduler"
+    """The scheduler class to use. "vllm.core.scheduler.Scheduler" is the
+    default scheduler. Can be a class directly or the path to a class of form
+    "mod.custom_class"."""
 
     def compute_hash(self) -> str:
         """
@@ -1862,6 +1897,18 @@ def compute_hash(self) -> str:
         return hash_str
 
     def __post_init__(self) -> None:
+        if self.max_model_len is None:
+            self.max_model_len = 8192
+            logger.warning(
+                "max_model_len is not set. Defaulting to arbitrary value "
+                "of %d.", self.max_model_len)
+
+        if self.max_num_seqs is None:
+            self.max_num_seqs = 128
+            logger.warning(
+                "max_num_seqs is not set. Defaulting to arbitrary value "
+                "of %d.", self.max_num_seqs)
+
         if self.max_num_batched_tokens is None:
             if self.enable_chunked_prefill:
                 if self.num_scheduler_steps > 1:
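
The additions above follow a None-sentinel pattern: fields with no static default are declared as `None` and filled in after construction, with a warning. A standalone sketch of the same pattern (the class and logger here are illustrative, not vLLM's):

import logging
from dataclasses import dataclass

logger = logging.getLogger(__name__)

@dataclass
class TinySchedulerConfig:
    max_model_len: int = None  # type: ignore  # no static default

    def __post_init__(self) -> None:
        if self.max_model_len is None:
            self.max_model_len = 8192
            logger.warning(
                "max_model_len is not set. Defaulting to arbitrary value "
                "of %d.", self.max_model_len)

cfg = TinySchedulerConfig()                    # warns, falls back to 8192
cfg = TinySchedulerConfig(max_model_len=4096)  # no warning
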
