Commit 41dddab

Add option to disable duplicates in topk (vllm-project#464)
The current implementation of the optimized top-p/top-k calculation for the scalar case handles duplicates of the value sitting on the k-th border. Unfortunately, analyzing those duplicates requires a synchronization with the CPU, which makes multi-step scheduling useless when combined with top-p/top-k. This PR makes duplicate handling optional via `VLLM_HANDLE_TOPK_DUPLICATES` (`false` by default): when the variable is not set, duplicate handling is skipped and the CPU synchronization is avoided. It also removes the synchronization that was previously done in the Sampler, by saving scalar values of `top_k` and `top_p` instead of reading them back from device tensors. This should give a performance gain for all benchmarks using these sampling parameters, especially together with multi-step scheduling. Skipping duplicate handling may cause small accuracy differences; the best solution would be to handle duplicates without synchronizing with the CPU, but that is not a trivial problem, so I will try to provide such a solution later.
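
For anyone who wants the old, more accurate behaviour back, here is a minimal sketch of enabling the flag from Python (the model name and prompt are placeholders; exporting the variable in the shell before launching the server works the same way):

import os

# VLLM_HANDLE_TOPK_DUPLICATES is parsed when the sampler module is imported,
# so it has to be set before vLLM is imported ("0"/unset skips duplicate handling).
os.environ["VLLM_HANDLE_TOPK_DUPLICATES"] = "1"

from vllm import LLM, SamplingParams

llm = LLM(model="facebook/opt-125m")  # placeholder model
params = SamplingParams(temperature=0.8, top_p=0.9, top_k=50)
outputs = llm.generate(["Hello, my name is"], params)
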
1 parent e818cf3 commit 41dddab

4 files changed (+34, -19 lines)

README_GAUDI.md

Lines changed: 1 addition & 0 deletions
@@ -277,6 +277,7 @@ INFO 08-02 17:38:43 hpu_executor.py:91] init_cache_engine took 37.92 GiB of devi
 - block size min (`VLLM_DECODE_BLOCK_BUCKET_MIN`): `block_size`
 - block size step (`VLLM_DECODE_BLOCK_BUCKET_STEP`): `block_size`
 - block size max (`VLLM_DECODE_BLOCK_BUCKET_MAX`): `max(128, (max_num_seqs*max_model_len)/block_size)`
+- ``VLLM_HANDLE_TOPK_DUPLICATES``: if ``true``, will handle duplicates that are outside of top-k, ``false`` by default
 
 Additionally, there are HPU PyTorch Bridge environment variables impacting vLLM execution:

docs/source/getting_started/gaudi-installation.rst

Lines changed: 1 addition & 1 deletion
@@ -378,7 +378,7 @@ Environment variables
 - sequence length min (``VLLM_DECODE_BLOCK_BUCKET_MIN``): ``block_size``
 - sequence length step (``VLLM_DECODE_BLOCK_BUCKET_STEP``): ``block_size``
 - sequence length max (``VLLM_DECODE_BLOCK_BUCKET_MAX``): ``max(128, (max_num_seqs*max_model_len)/block_size)``
-
+- ``VLLM_HANDLE_TOPK_DUPLICATES``: if ``true``, will handle duplicates that are outside of top-k, ``false`` by default
 
 Additionally, there are HPU PyTorch Bridge environment variables impacting vLLM execution:

vllm/model_executor/layers/sampler.py

Lines changed: 23 additions & 16 deletions
@@ -1,6 +1,7 @@
 """A layer that samples the next tokens from the model's outputs."""
 import itertools
 import math
+import os
 import warnings
 from dataclasses import dataclass
 from importlib.util import find_spec
@@ -195,19 +196,16 @@ def _init_sampling_tensors(
         self._sampling_tensors = None
 
         # Initialize new sampling tensors
-        (sampling_tensors, do_penalties, do_top_p_top_k,
-         do_min_p) = SamplingTensors.from_sampling_metadata(
+        (sampling_tensors, do_penalties, do_top_p_top_k, do_min_p,
+         top_k_scalar, top_p_scalar) = SamplingTensors.from_sampling_metadata(
             sampling_metadata, vocab_size, logits.device, logits.dtype)
 
         self._sampling_tensors = sampling_tensors
         self._do_penalties = do_penalties
         self._do_top_p_top_k = do_top_p_top_k
         self._do_min_p = do_min_p
-        self._top_p_scalar = sampling_tensors.top_ps[0]
-        self._top_k_scalar = sampling_tensors.top_ks[0]
-        scalar_p = torch.all(sampling_tensors.top_ps == self._top_p_scalar)
-        scalar_k = torch.all(sampling_tensors.top_ks == self._top_k_scalar)
-        self._scalar_p_and_k = torch.logical_and(scalar_p, scalar_k)
+        self._top_k_scalar = top_k_scalar
+        self._top_p_scalar = top_p_scalar
 
         self._apply_top_k_top_p_opt = ApplyToppTopkScalar(5)
@@ -270,10 +268,10 @@ def forward(
 
         if do_top_p_top_k and flashinfer_top_k_top_p_sampling is None:
             # If we have a scalar p and k, we can use the optimized version.
-            if self._scalar_p_and_k.any():
+            if self._top_k_scalar and self._top_p_scalar:
                 logits = self._apply_top_k_top_p_opt(logits,
-                                                     self._top_p_scalar.item(),
-                                                     self._top_k_scalar.item())
+                                                     self._top_p_scalar,
+                                                     self._top_k_scalar)
             else:
                 logits = _apply_top_k_top_p(logits, sampling_tensors.top_ps,
                                             sampling_tensors.top_ks)
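
The forward-path change above is what removes the per-step synchronization: the old code pulled `top_p`/`top_k` out of device tensors with `.item()`, which blocks the host until the device catches up, while the new code receives plain Python scalars. A rough standalone illustration of the two patterns (on a CPU-only build both are effectively free; the stall matters on HPU/GPU):

import torch

# Old pattern: the per-request values live in a device tensor; reading one
# back with .item() forces the host to wait for the device.
top_ks_tensor = torch.full((4,), 50)      # stand-in for sampling_tensors.top_ks
k_old = int(top_ks_tensor[0].item())      # device -> host copy (sync point)

# New pattern: the scalar comes from the Python-side request list inside
# SamplingTensors.from_sampling_metadata, so the hot path never reads a tensor back.
request_top_ks = [50, 50, 50, 50]
k_new = request_top_ks[0] if all(k == request_top_ks[0] for k in request_top_ks) else None

assert k_old == k_new == 50
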
@@ -386,8 +384,13 @@ class ApplyToppTopkScalar:
     The main logic of this is in __call__
     This is a class instead of a function, just to keep track of
     the monotonic non-decreasing state _padded_k
+
+    To enable the duplicates that are outside of kth border,
+    set VLLM_HANDLE_TOPK_DUPLICATES to 1 or true.
     """
     _padded_k = 0
+    _handle_duplicates = os.getenv('VLLM_HANDLE_TOPK_DUPLICATES',
+                                   '0').lower() in ['1', 'true']
 
     def __init__(self, increment: int):
         self._increment = increment
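
One caveat worth noting: `_handle_duplicates` is a class attribute, so the environment variable is parsed once, when `sampler.py` is imported, and changing it later in the process has no effect. A tiny sketch of the same parsing rule in isolation:

import os

def handle_duplicates_enabled() -> bool:
    # Mirrors the class-attribute parsing: only "1" or "true"
    # (case-insensitive) enable duplicate handling.
    return os.getenv("VLLM_HANDLE_TOPK_DUPLICATES", "0").lower() in ("1", "true")

os.environ.pop("VLLM_HANDLE_TOPK_DUPLICATES", None)
assert handle_duplicates_enabled() is False        # default: skip duplicates
os.environ["VLLM_HANDLE_TOPK_DUPLICATES"] = "True"
assert handle_duplicates_enabled() is True
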
@@ -397,12 +400,15 @@ def __call__(self, logits: torch.Tensor, p: float, k: int):
             ApplyToppTopkScalar._padded_k = min(k + self._increment,
                                                 logits.shape[1])
 
-        vals, idx = torch.topk(logits, k=ApplyToppTopkScalar._padded_k, \
-                               dim=1, sorted=True)
+        vals, idx = torch.topk(logits,
+                               k=ApplyToppTopkScalar._padded_k,
+                               dim=1,
+                               sorted=True)
 
         # this "if" checks if we have bucketed so much that
         # we have padded k upto shape of logits
-        if ApplyToppTopkScalar._padded_k != logits.shape[1]:
+        if self._handle_duplicates and \
+            ApplyToppTopkScalar._padded_k != logits.shape[1]:
             smallest_of_top_k = vals[:, k - 1]
             num_duplicates_of_smallest_of_topk = torch.sum(
                 logits == smallest_of_top_k.unsqueeze(1), 1)
@@ -427,9 +433,10 @@ def __call__(self, logits: torch.Tensor, p: float, k: int):
                     ApplyToppTopkScalar._padded_k + incr, logits.shape[1])
 
                 # recompute topk with expanded padded_k
-                vals, idx = torch.topk(logits, \
-                                       k=ApplyToppTopkScalar._padded_k, \
-                                       dim=1, sorted=True)
+                vals, idx = torch.topk(logits,
+                                       k=ApplyToppTopkScalar._padded_k,
+                                       dim=1,
+                                       sorted=True)
 
         idx = torch.fliplr(idx)
         vals = torch.fliplr(vals)
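
To see what the guarded branch is for, consider a row whose k-th value is tied with values just outside the padded window: without growing `_padded_k`, some of the tied candidates can never be returned by `topk`. A simplified standalone toy example (it skips the increment rounding the real code does; the host-side read of the duplicate count is exactly the synchronization the new flag avoids):

import torch

logits = torch.tensor([[5.0, 4.0, 3.0, 3.0, 3.0, 1.0]])
k = 3
padded_k = 4                                 # k + increment, still < vocab size

vals, idx = torch.topk(logits, k=padded_k, dim=1, sorted=True)
smallest_of_top_k = vals[:, k - 1]           # the value on the k-th border (3.0)
num_dups = (logits == smallest_of_top_k.unsqueeze(1)).sum(1)

# Reading the duplicate count on the host is the synchronization point.
if int(num_dups.max()) - 1 > padded_k - k:
    padded_k = min(padded_k + int(num_dups.max()) - 1, logits.shape[1])
    vals, idx = torch.topk(logits, k=padded_k, dim=1, sorted=True)

print(padded_k)  # 6: the window grew so all three tied 3.0 values are visible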

vllm/model_executor/sampling_metadata.py

Lines changed: 9 additions & 2 deletions
@@ -389,7 +389,8 @@ def from_sampling_metadata(
         vocab_size: int,
         device: torch.device,
         dtype: torch.dtype,
-    ) -> Tuple["SamplingTensors", bool, bool, bool]:
+    ) -> Tuple["SamplingTensors", bool, bool, bool, Optional[int],
+               Optional[float]]:
         prompt_tokens: List[array] = []
         output_tokens: List[array] = []
         top_ks: List[int] = []
@@ -476,6 +477,11 @@ def from_sampling_metadata(
                 prompt_tokens.append(seq_data.prompt_token_ids_array)
                 output_tokens.append(seq_data.output_token_ids_array)
 
+        top_k_scalar = top_ks[0] if do_top_p_top_k and all(
+            k == top_ks[0] for k in top_ks) else None
+        top_p_scalar = top_ps[0] if do_top_p_top_k and all(
+            p == top_ps[0] for p in top_ps) else None
+
         sampling_tensors = SamplingTensors.from_lists(
             temperatures,
             top_ps,
@@ -490,7 +496,8 @@ def from_sampling_metadata(
             device,
             dtype,
         )
-        return (sampling_tensors, do_penalties, do_top_p_top_k, do_min_p)
+        return (sampling_tensors, do_penalties, do_top_p_top_k, do_min_p,
+                top_k_scalar, top_p_scalar)
 
     @classmethod
     def from_lists(
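
The scalar detection itself is a pure Python comparison over the per-request lists, so it adds no device work; a standalone sketch of the same logic and its two outcomes:

from typing import List, Optional, Tuple

def detect_scalars(top_ks: List[int], top_ps: List[float],
                   do_top_p_top_k: bool) -> Tuple[Optional[int], Optional[float]]:
    # A scalar is only reported when every request in the batch uses the same value.
    top_k_scalar = top_ks[0] if do_top_p_top_k and all(
        k == top_ks[0] for k in top_ks) else None
    top_p_scalar = top_ps[0] if do_top_p_top_k and all(
        p == top_ps[0] for p in top_ps) else None
    return top_k_scalar, top_p_scalar

print(detect_scalars([50, 50, 50], [0.9, 0.9, 0.9], True))   # (50, 0.9) -> optimized path
print(detect_scalars([50, 20, 50], [0.9, 0.9, 0.9], True))   # (None, 0.9) -> fallback path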
