@@ -1522,6 +1522,9 @@ def __post_init__(self):
        self.ignore_patterns = ["original/**/*"]


+DistributedExecutorBackend = Literal["ray", "mp", "uni", "external_launcher"]
+
+
@config
@dataclass
class ParallelConfig:
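
A minimal sketch (not part of the diff) of what the new DistributedExecutorBackend
Literal alias buys: a static type checker such as mypy can now reject backend
strings outside the allowed set, and typing.get_args() exposes the same values for
a runtime check. The pick_backend helper below is hypothetical, for illustration
only.

    from typing import Literal, Optional, get_args

    DistributedExecutorBackend = Literal["ray", "mp", "uni", "external_launcher"]

    def pick_backend(name: Optional[str]) -> DistributedExecutorBackend:
        # Runtime guard mirroring what the static Literal type enforces.
        if name in get_args(DistributedExecutorBackend):
            return name  # type: ignore[return-value]
        raise ValueError(f"unknown distributed executor backend: {name!r}")

    backend: DistributedExecutorBackend = "mp"  # accepted by mypy
    # backend = "torchrun"  # rejected by mypy: not one of the allowed literals
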
@@ -1563,7 +1566,7 @@ class ParallelConfig:
    placement_group: Optional["PlacementGroup"] = None
    """ray distributed model workers placement group."""

-    distributed_executor_backend: Optional[Union[str,
+    distributed_executor_backend: Optional[Union[DistributedExecutorBackend,
                                                 type["ExecutorBase"]]] = None
    """Backend to use for distributed model
    workers, either "ray" or "mp" (multiprocessing). If the product
@@ -1687,7 +1690,7 @@ def __post_init__(self) -> None:
            # current node and we aren't in a ray placement group.

            from vllm.executor import ray_utils
-            backend = "mp"
+            backend: DistributedExecutorBackend = "mp"
            ray_found = ray_utils.ray_is_available()
            if current_platform.is_neuron():
                # neuron uses single process to control multiple devices
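
For context, a hedged usage sketch (not part of the diff): the backend defaulted to
here is the same value users can pass explicitly through the engine arguments; the
exact constructor keywords depend on the vLLM version in use.

    from vllm import LLM

    # Explicitly select the Ray backend instead of letting __post_init__ fall
    # back to "mp"; "uni" and "external_launcher" are the other allowed values.
    llm = LLM(
        model="facebook/opt-125m",
        tensor_parallel_size=2,
        distributed_executor_backend="ray",
    )
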
@@ -1755,92 +1758,124 @@ def _verify_args(self) -> None:
                "worker_extension_cls must be a string (qualified class name).")


+SchedulerPolicy = Literal["fcfs", "priority"]
+
+
+@config
@dataclass
class SchedulerConfig:
    """Scheduler configuration."""

-    runner_type: str = "generate"  # The runner type to launch for the model.
+    runner_type: RunnerType = "generate"
+    """The runner type to launch for the model."""

-    # Maximum number of tokens to be processed in a single iteration.
-    max_num_batched_tokens: int = field(default=None)  # type: ignore
+    max_num_batched_tokens: int = None  # type: ignore
+    """Maximum number of tokens to be processed in a single iteration.
+
+    This config has no static default. If left unspecified by the user, it will
+    be set in `EngineArgs.create_engine_config` based on the usage context."""

-    # Maximum number of sequences to be processed in a single iteration.
-    max_num_seqs: int = 128
+    max_num_seqs: int = None  # type: ignore
+    """Maximum number of sequences to be processed in a single iteration.
+
+    This config has no static default. If left unspecified by the user, it will
+    be set in `EngineArgs.create_engine_config` based on the usage context."""

-    # Maximum length of a sequence (including prompt and generated text).
-    max_model_len: int = 8192
+    max_model_len: int = None  # type: ignore
+    """Maximum length of a sequence (including prompt and generated text). This
+    is primarily set in `ModelConfig` and that value should be manually
+    duplicated here."""

-    # Maximum number of sequences that can be partially prefilled concurrently
    max_num_partial_prefills: int = 1
+    """For chunked prefill, the maximum number of sequences that can be
+    partially prefilled concurrently."""

-    # Maximum number of "very long prompt" sequences that can be prefilled
-    # concurrently (long is defined by long_prefill_threshold)
    max_long_partial_prefills: int = 1
+    """For chunked prefill, the maximum number of prompts longer than
+    long_prefill_token_threshold that will be prefilled concurrently. Setting
+    this less than max_num_partial_prefills will allow shorter prompts to jump
+    the queue in front of longer prompts in some cases, improving latency."""

-    # calculate context length that determines which sequences are
-    # considered "long"
    long_prefill_token_threshold: int = 0
+    """For chunked prefill, a request is considered long if the prompt is
+    longer than this number of tokens."""

-    # The number of slots to allocate per sequence per
-    # step, beyond the known token ids. This is used in speculative
-    # decoding to store KV activations of tokens which may or may not be
-    # accepted.
    num_lookahead_slots: int = 0
+    """The number of slots to allocate per sequence per
+    step, beyond the known token ids. This is used in speculative
+    decoding to store KV activations of tokens which may or may not be
+    accepted.
+
+    NOTE: This will be replaced by speculative config in the future; it is
+    present to enable correctness tests until then."""

-    # Apply a delay (of delay factor multiplied by previous
-    # prompt latency) before scheduling next prompt.
    delay_factor: float = 0.0
+    """Apply a delay (of delay factor multiplied by previous
+    prompt latency) before scheduling next prompt."""

-    # If True, prefill requests can be chunked based
-    # on the remaining max_num_batched_tokens.
-    enable_chunked_prefill: bool = False
+    enable_chunked_prefill: bool = None  # type: ignore
+    """If True, prefill requests can be chunked based
+    on the remaining max_num_batched_tokens."""

    is_multimodal_model: bool = False
+    """True if the model is multimodal."""
+
+    # TODO (ywang96): Make this configurable.
+    max_num_encoder_input_tokens: int = field(init=False)
+    """Multimodal encoder compute budget, only used in V1.
+
+    NOTE: This is not currently configurable. It will be overridden by
+    max_num_batched_tokens in case max multimodal embedding size is larger."""
+
+    # TODO (ywang96): Make this configurable.
+    encoder_cache_size: int = field(init=False)
+    """Multimodal encoder cache size, only used in V1.
+
+    NOTE: This is not currently configurable. It will be overridden by
+    max_num_batched_tokens in case max multimodal embedding size is larger."""

-    # NOTE: The following multimodal encoder budget will be initialized to
-    # max_num_batched_tokens and overridden in case max multimodal embedding
-    # size is larger.
-    # TODO (ywang96): Make these configurable.
-    # Multimodal encoder compute budget, only used in V1
-    max_num_encoder_input_tokens: int = field(default=None)  # type: ignore
-
-    # Multimodal encoder cache size, only used in V1
-    encoder_cache_size: int = field(default=None)  # type: ignore
-
-    # Whether to perform preemption by swapping or
-    # recomputation. If not specified, we determine the mode as follows:
-    # We use recomputation by default since it incurs lower overhead than
-    # swapping. However, when the sequence group has multiple sequences
-    # (e.g., beam search), recomputation is not currently supported. In
-    # such a case, we use swapping instead.
    preemption_mode: Optional[str] = None
+    """Whether to perform preemption by swapping or
+    recomputation. If not specified, we determine the mode as follows:
+    We use recomputation by default since it incurs lower overhead than
+    swapping. However, when the sequence group has multiple sequences
+    (e.g., beam search), recomputation is not currently supported. In
+    such a case, we use swapping instead."""

    num_scheduler_steps: int = 1
+    """Maximum number of forward steps per scheduler call."""

-    multi_step_stream_outputs: bool = False
+    multi_step_stream_outputs: bool = True
+    """If False, then multi-step will stream outputs at the end of all steps."""

-    # Private API. If used, scheduler sends delta data to
-    # workers instead of an entire data. It should be enabled only
-    # when SPMD worker architecture is enabled. I.e.,
-    # VLLM_USE_RAY_SPMD_WORKER=1
    send_delta_data: bool = False
-
-    # The scheduling policy to use. "fcfs" (default) or "priority".
-    policy: str = "fcfs"
+    """Private API. If used, scheduler sends delta data to
+    workers instead of the entire data. It should be enabled only
+    when SPMD worker architecture is enabled. I.e.,
+    VLLM_USE_RAY_SPMD_WORKER=1"""
+
+    policy: SchedulerPolicy = "fcfs"
+    """The scheduling policy to use:\n
+    - "fcfs" means first come first served, i.e. requests are handled in order
+    of arrival.\n
+    - "priority" means requests are handled based on given priority (lower
+    value means earlier handling), with time of arrival deciding any ties."""

    chunked_prefill_enabled: bool = field(init=False)
+    """True if chunked prefill is enabled."""

-    # If set to true and chunked prefill is enabled, we do not want to
-    # partially schedule a multimodal item. Only used in V1
-    # This ensures that if a request has a mixed prompt
-    # (like text tokens TTTT followed by image tokens IIIIIIIIII) where only
-    # some image tokens can be scheduled (like TTTTIIIII, leaving IIIII),
-    # it will be scheduled as TTTT in one step and IIIIIIIIII in the next.
    disable_chunked_mm_input: bool = False
+    """If set to true and chunked prefill is enabled, we do not want to
+    partially schedule a multimodal item. Only used in V1.
+    This ensures that if a request has a mixed prompt
+    (like text tokens TTTT followed by image tokens IIIIIIIIII) where only
+    some image tokens can be scheduled (like TTTTIIIII, leaving IIIII),
+    it will be scheduled as TTTT in one step and IIIIIIIIII in the next."""

-    # scheduler class or path. "vllm.core.scheduler.Scheduler" (default)
-    # or "mod.custom_class".
    scheduler_cls: Union[str, type[object]] = "vllm.core.scheduler.Scheduler"
+    """The scheduler class to use. "vllm.core.scheduler.Scheduler" is the
+    default scheduler. Can be a class directly or the path to a class of form
+    "mod.custom_class"."""

    def compute_hash(self) -> str:
        """
@@ -1862,6 +1897,18 @@ def compute_hash(self) -> str:
        return hash_str

    def __post_init__(self) -> None:
+        if self.max_model_len is None:
+            self.max_model_len = 8192
+            logger.warning(
+                "max_model_len is not set. Defaulting to arbitrary value "
+                "of %d.", self.max_model_len)
+
+        if self.max_num_seqs is None:
+            self.max_num_seqs = 128
+            logger.warning(
+                "max_num_seqs is not set. Defaulting to arbitrary value "
+                "of %d.", self.max_num_seqs)
+
        if self.max_num_batched_tokens is None:
            if self.enable_chunked_prefill:
                if self.num_scheduler_steps > 1:
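
The added fallbacks follow a common "None sentinel plus __post_init__ default"
pattern: None means the user did not set the field, so engine-arg processing can
still pick a context-dependent value before the config is finalized. A
self-contained sketch of the pattern (TinySchedulerConfig is made up for
illustration, not vLLM code):

    import logging
    from dataclasses import dataclass
    from typing import Optional

    logger = logging.getLogger(__name__)

    @dataclass
    class TinySchedulerConfig:
        # None means "not set by the user"; a real value is filled in later.
        max_num_seqs: Optional[int] = None

        def __post_init__(self) -> None:
            if self.max_num_seqs is None:
                # Upstream code is expected to set this; fall back to an
                # arbitrary default and warn loudly.
                self.max_num_seqs = 128
                logger.warning("max_num_seqs is not set. Defaulting to %d.",
                               self.max_num_seqs)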