
Commit 273cb3b

[Doc] Fix top-level API links/docs (#18621)
Signed-off-by: DarkLight1337 <[email protected]>
1 parent 8ddd1cf commit 273cb3b

9 files changed (+88, -103 lines)


vllm/benchmarks/datasets.py

Lines changed: 12 additions & 10 deletions
@@ -129,16 +129,17 @@ def get_random_lora_request(

     Args:
         tokenizer (PreTrainedTokenizerBase): The base tokenizer to use if no
-            LoRA is selected. max_loras (Optional[int]): The maximum number of
-            LoRAs available. If None, LoRA is not used. lora_path
-            (Optional[str]): Path to the LoRA parameters on disk. If None, LoRA
-            is not used.
+            LoRA is selected.
+        max_loras (Optional[int]): The maximum number of LoRAs available.
+            If `None`, LoRA is not used.
+        lora_path (Optional[str]): Path to the LoRA parameters on disk.
+            If `None`, LoRA is not used.

     Returns:
-        tuple[Optional[LoRARequest], AnyTokenizer]: A tuple where the first
-            element is a LoRARequest (or None if not applicable) and the second
-            element is the tokenizer associated with the LoRA request (or the
-            base tokenizer).
+        A tuple with the following elements:
+        - A new [LoRARequest][] (or `None` if not applicable).
+        - The tokenizer associated with the LoRA request
+          (or the base tokenizer).
     """
     if max_loras is None or lora_path is None:
         return None, tokenizer
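For context, this is how the documented helper behaves; a minimal sketch assuming the argument order shown in the docstring (the full signature is not part of this hunk):

```python
from transformers import AutoTokenizer

from vllm.benchmarks.datasets import get_random_lora_request

base_tokenizer = AutoTokenizer.from_pretrained("facebook/opt-125m")

# With max_loras/lora_path left as None, no LoRA is selected and the
# base tokenizer comes back unchanged (the early return shown above).
lora_request, tokenizer = get_random_lora_request(
    base_tokenizer, max_loras=None, lora_path=None)
assert lora_request is None
assert tokenizer is base_tokenizer
```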
@@ -167,7 +168,7 @@ def sample(self, tokenizer: PreTrainedTokenizerBase,

         Args:
             tokenizer (PreTrainedTokenizerBase): The tokenizer to be used
-                 for processing the dataset's text.
+                for processing the dataset's text.
             num_requests (int): The number of sample requests to generate.

         Returns:
@@ -184,7 +185,8 @@ def maybe_oversample_requests(self, requests: list[SampleRequest],

         Args:
             requests (List[SampleRequest]): The current list of sampled
-                requests. num_requests (int): The target number of requests.
+                requests.
+            num_requests (int): The target number of requests.
         """
         if len(requests) < num_requests:
             random.seed(self.random_seed)
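The seeded `random.seed` call above is the start of the oversampling path; a standalone sketch of the idea, assuming the padding draw (not shown in this hunk) samples with replacement:

```python
import random

def maybe_oversample_requests_sketch(requests: list, num_requests: int,
                                     random_seed: int = 0) -> list:
    """Pad `requests` up to `num_requests` by re-drawing from it at random."""
    if len(requests) < num_requests:
        random.seed(random_seed)  # reproducible padding, as in the diff
        requests = requests + random.choices(
            requests, k=num_requests - len(requests))
    return requests
```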

vllm/config.py

Lines changed: 1 addition & 1 deletion
@@ -4552,7 +4552,7 @@ def contains_object_print(text):
        text (str): The text to check

    Returns:
-        bool: True if a match is found, False otherwise
+        result (bool): `True` if a match is found, `False` otherwise.
    """
    pattern = r'at 0x[a-fA-F0-9]{2,16}>'
    match = re.search(pattern, text)
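Since the hunk cuts off before the final return, here is a runnable sketch of the whole function; the closing `return match is not None` is an assumption consistent with the documented boolean result:

```python
import re

def contains_object_print(text: str) -> bool:
    # "at 0x..." with 2-16 hex digits followed by ">" is the tail of a
    # default object repr such as "<Foo object at 0x7f0d1c2b3a40>".
    pattern = r'at 0x[a-fA-F0-9]{2,16}>'
    match = re.search(pattern, text)
    return match is not None

assert contains_object_print("<Foo object at 0x7f0d1c2b3a40>")
assert not contains_object_print("temperature=0.8, top_p=0.95")
```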

vllm/distributed/kv_transfer/kv_connector/v1/base.py

Lines changed: 5 additions & 4 deletions
@@ -210,10 +210,11 @@ def get_num_new_matched_tokens(
            computed tokens for this request

        Returns:
-            * the number of tokens that can be loaded from the
-              external KV cache beyond what is already computed.
-            * true if external KV cache tokens will be loaded
-              asynchronously (between scheduler steps).
+            A tuple with the following elements:
+            - The number of tokens that can be loaded from the
+              external KV cache beyond what is already computed.
+            - `True` if external KV cache tokens will be loaded
+              asynchronously (between scheduler steps).
        """
        pass

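To make the contract concrete, a hedged sketch of what an implementation in a connector subclass might return; the `request` fields and the `self._lookup` helper are hypothetical stand-ins, and only the return shape follows the docstring:

```python
def get_num_new_matched_tokens(self, request,
                               num_computed_tokens: int) -> tuple[int, bool]:
    # Hypothetical: ask the external KV cache how many prompt tokens it holds.
    num_external = self._lookup.num_cached_tokens(request.prompt_token_ids)
    # Only tokens beyond the locally computed prefix count as new.
    num_new = max(0, num_external - num_computed_tokens)
    # In this sketch, any external tokens are loaded between scheduler steps.
    return num_new, num_new > 0
```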

vllm/distributed/kv_transfer/kv_pipe/pynccl_pipe.py

Lines changed: 20 additions & 20 deletions
@@ -118,11 +118,11 @@ def _make_metadata(self, tensor: Optional[torch.Tensor]) -> Metadata:
        """
        Create the metadata as a dictionary based on the input tensor.

-        Parameters:
-            - tensor: The input tensor or None if no tensor is provided.
+        Args:
+            tensor: The input tensor or None if no tensor is provided.

        Returns:
-            - metadata: A dictionary with the following keys:
+            metadata: A dictionary with the following keys:
                - "dtype": The data type of the tensor or None.
                - "shape": The shape of the tensor or None.
        """
@@ -135,13 +135,13 @@ def _prepare_recv_buffer(self, metadata: Metadata) -> torch.Tensor:
        """
        Create a buffer to receive the tensor based on the provided metadata.

-        Parameters:
-            - metadata: A dictionary with keys "dtype" and "shape", describing
-              the tensor's data type and shape.
+        Args:
+            metadata: A dictionary with keys "dtype" and "shape",
+                describing the tensor's data type and shape.

        Returns:
-            - buffer: A tensor of the specified type and shape, allocated on
-              self.device.
+            buffer: A tensor of the specified type and shape,
+                allocated on `self.device`.
        """
        return torch.empty(metadata["shape"],
                           dtype=metadata["dtype"],
@@ -151,8 +151,8 @@ def _send_metadata(self, metadata: Metadata):
        """
        Send the metadata dictionary to the target rank.

-        Parameters:
-            - metadata: A dictionary with keys "dtype" and "shape".
+        Args:
+            metadata: A dictionary with keys "dtype" and "shape".
        """
        self.group.send_obj(metadata, self.target_rank_for_send)


@@ -161,8 +161,8 @@ def _recv_metadata(self) -> Metadata:
        Receive the metadata dictionary from the target rank.

        Returns:
-            - metadata: A dictionary with keys "dtype" and "shape" describing
-              the tensor.
+            metadata: A dictionary with keys "dtype" and "shape"
+                describing the tensor.
        """
        return self.group.recv_obj(self.target_rank_for_recv)


@@ -171,9 +171,9 @@ def _send_impl(self, tensor: Optional[torch.Tensor]) -> None:
        The actual implementation of sending the tensor and its metadata to the
        target rank.

-        Parameters:
-            - tensor: The input tensor to be sent, or None if no tensor is
-              being sent.
+        Args:
+            tensor: The input tensor to be sent, or `None` if no tensor is
+                being sent.
        """
        metadata = self._make_metadata(tensor)
        self._send_metadata(metadata)
@@ -187,7 +187,7 @@ def _recv_impl(self) -> Optional[torch.Tensor]:
        the target rank.

        Returns:
-            - buffer: The received tensor, or None if no tensor is received.
+            buffer: The received tensor, or `None` if no tensor is received.
        """
        metadata = self._recv_metadata()
        if metadata["dtype"] is None:
@@ -227,8 +227,8 @@ def send_tensor(self, tensor: Optional[torch.Tensor]) -> None:
        Sends a tensor and its metadata to the destination rank in a
        non-blocking way.

-        Parameters:
-            - tensor: The tensor to send, or None if no tensor is being sent.
+        Args:
+            tensor: The tensor to send, or `None` if no tensor is being sent.
        """
        if self.transport_thread is None:
            self.transport_thread = ThreadPoolExecutor(max_workers=1)
@@ -250,8 +250,8 @@ def recv_tensor(self) -> Optional[torch.Tensor]:
        """
        Receives a tensor and its metadata from the source rank. Blocking call.

        Returns:
-            - tensor: The received tensor, or None if no tensor is received.
+            tensor: The received tensor, or `None` if no tensor is received.
        """
        if self.transport_thread is None:
            self.transport_thread = ThreadPoolExecutor(max_workers=1)
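Taken together, the pipe's wire protocol is metadata first, then payload. A self-contained sketch of both directions; `send_obj`/`recv_obj` mirror the calls shown above, while the tensor-level `send`/`recv` methods are assumed stand-ins for the communicator's point-to-point ops:

```python
from typing import Optional

import torch

def send_tensor_sketch(group, dst: int,
                       tensor: Optional[torch.Tensor]) -> None:
    if tensor is None:
        # "No tensor" is encoded as all-None metadata.
        group.send_obj({"dtype": None, "shape": None}, dst)
        return
    group.send_obj({"dtype": tensor.dtype, "shape": tensor.shape}, dst)
    group.send(tensor, dst)  # payload follows its metadata

def recv_tensor_sketch(group, src: int,
                       device: str = "cpu") -> Optional[torch.Tensor]:
    metadata = group.recv_obj(src)
    if metadata["dtype"] is None:
        return None
    buffer = torch.empty(metadata["shape"], dtype=metadata["dtype"],
                         device=device)
    group.recv(buffer, src)  # fill the pre-allocated buffer in place
    return buffer
```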

vllm/engine/llm_engine.py

Lines changed: 15 additions & 25 deletions
@@ -130,26 +130,16 @@ class LLMEngine:
    iteration-level scheduling and efficient memory management to maximize the
    serving throughput.

-    The {class}`~vllm.LLM` class wraps this class for offline batched inference
-    and the {class}`AsyncLLMEngine` class wraps this class for online serving.
+    The [LLM][vllm.LLM] class wraps this class for offline batched inference
+    and the [AsyncLLMEngine][] class wraps this class for online serving.

-    The config arguments are derived from {class}`~vllm.EngineArgs`. (See
-    {ref}`engine-args`)
+    The config arguments are derived from [EngineArgs][vllm.EngineArgs]. (See
+    [engine-args][])

    Args:
-        model_config: The configuration related to the LLM model.
-        cache_config: The configuration related to the KV cache memory
-            management.
-        parallel_config: The configuration related to distributed execution.
-        scheduler_config: The configuration related to the request scheduler.
-        device_config: The configuration related to the device.
-        lora_config (Optional): The configuration related to serving multi-LoRA.
-        speculative_config (Optional): The configuration related to speculative
-            decoding.
+        vllm_config: The configuration for initializing and running vLLM.
        executor_class: The model executor class for managing distributed
            execution.
-        prompt_adapter_config (Optional): The configuration related to serving
-            prompt adapters.
        log_stats: Whether to log statistics.
        usage_context: Specified entry point, used for usage info collection.
    """
@@ -695,11 +685,12 @@ def add_request(

        Args:
            request_id: The unique ID of the request.
-            prompt: The prompt to the LLM. See {class}`~vllm.inputs.PromptType`
+            prompt: The prompt to the LLM. See
+                [PromptType][vllm.inputs.PromptType]
                for more details about the format of each input.
            params: Parameters for sampling or pooling.
-                {class}`~vllm.SamplingParams` for text generation.
-                {class}`~vllm.PoolingParams` for pooling.
+                [SamplingParams][vllm.SamplingParams] for text generation.
+                [PoolingParams][vllm.PoolingParams] for pooling.
            arrival_time: The arrival time of the request. If None, we use
                the current monotonic time.
            lora_request: The LoRA request to add.
@@ -711,10 +702,11 @@ def add_request(
        Details:
            - Set arrival_time to the current time if it is None.
            - Set prompt_token_ids to the encoded prompt if it is None.
-            - Create `n` number of {class}`~vllm.Sequence` objects.
-            - Create a {class}`~vllm.SequenceGroup` object
-              from the list of {class}`~vllm.Sequence`.
-            - Add the {class}`~vllm.SequenceGroup` object to the scheduler.
+            - Create `n` number of [Sequence][vllm.Sequence] objects.
+            - Create a [SequenceGroup][vllm.SequenceGroup] object
+              from the list of [Sequence][vllm.Sequence].
+            - Add the [SequenceGroup][vllm.SequenceGroup] object to the
+              scheduler.

        Example:
            >>> # initialize engine
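A compact, hedged version of the docstring's own example, showing a request flowing through the scheduler:

```python
from vllm import EngineArgs, LLMEngine, SamplingParams

engine = LLMEngine.from_engine_args(EngineArgs(model="facebook/opt-125m"))
engine.add_request("0", "What is an LLM?", SamplingParams(temperature=0.0))

# Each step() performs one scheduling iteration; drain until done.
while engine.has_unfinished_requests():
    for output in engine.step():
        if output.finished:
            print(output.outputs[0].text)
```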
@@ -861,9 +853,7 @@ def abort_request(self, request_id: Union[str, Iterable[str]]) -> None:
            request_id: The ID(s) of the request to abort.

        Details:
-            - Refer to the
-              {meth}`~vllm.core.scheduler.Scheduler.abort_seq_group`
-              from class {class}`~vllm.core.scheduler.Scheduler`.
+            - Refer to [vllm.core.scheduler.Scheduler.abort_seq_group][].

        Example:
            >>> # initialize engine and add a request with request_id
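Continuing the engine sketch above: aborting takes a single ID or an iterable of IDs, matching the `Union[str, Iterable[str]]` signature:

```python
# Queue a request, then cancel it before it completes.
engine.add_request("1", "Tell me a very long story", SamplingParams())
engine.abort_request("1")
assert not engine.has_unfinished_requests()
```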
