@@ -86,8 +86,8 @@ def single_decode_with_kv_cache(
         ``NONE``/``ROPE_LLAMA`` (LLAMA style rotary embedding) /``ALIBI``.
         Defaults to ``NONE``.
     logits_cap : bool
-        Whether to apply logits cap to attention scores.
-        If ``True``, the attention scores will be capped according to formula (proposed in
+        Whether to apply logits cap to pre-attention logits.
+        If ``True``, the logits will be capped according to the formula (proposed in
         Grok-1): :math:`30 \times \mathrm{tanh}(x / 30)`, where :math:`x` is the input logits.
         Defaults to ``False``.
     q_scale : Optional[float]
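For reference, the capping formula in the new wording, :math:`30 \times \mathrm{tanh}(x / 30)`, is a smooth saturation of the pre-attention logits. A minimal PyTorch sketch (the standalone helper below is illustrative only, not part of this diff or of the library's API):

```python
import torch

def soft_cap_logits(x: torch.Tensor, cap: float = 30.0) -> torch.Tensor:
    # Grok-1 style logits cap: approximately the identity for |x| << cap,
    # smoothly saturating for large-magnitude logits, so every capped
    # value lies in the open interval (-cap, cap).
    return cap * torch.tanh(x / cap)
```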
@@ -199,8 +199,8 @@ def batch_decode_with_padded_kv_cache(
         ``NONE``/``ROPE_LLAMA`` (LLAMA style rotary embedding) /``ALIBI``.
         Defaults to ``NONE``.
     logits_cap : bool
-        Whether to apply logits cap to attention scores.
-        If ``True``, the attention scores will be capped according to formula (proposed in
+        Whether to apply logits cap to pre-attention logits.
+        If ``True``, the logits will be capped according to the formula (proposed in
         Grok-1): :math:`30 \times \mathrm{tanh}(x / 30)`, where :math:`x` is the input logits.
         Defaults to ``False``.
     q_scale : Optional[float]
@@ -312,8 +312,8 @@ def batch_decode_with_padded_kv_cache_return_lse(
         ``NONE``/``ROPE_LLAMA`` (LLAMA style rotary embedding) /``ALIBI``.
         Defaults to ``NONE``.
     logits_cap : bool
-        Whether to apply logits cap to attention scores.
-        If ``True``, the attention scores will be capped according to formula (proposed in
+        Whether to apply logits cap to pre-attention logits.
+        If ``True``, the logits will be capped according to the formula (proposed in
         Grok-1): :math:`30 \times \mathrm{tanh}(x / 30)`, where :math:`x` is the input logits.
         Defaults to ``False``.
     q_scale : Optional[float]
@@ -592,8 +592,8 @@ def begin_forward(
         ``NONE``/``ROPE_LLAMA`` (LLAMA style rotary embedding) /``ALIBI``.
         Defaults to ``NONE``.
     logits_cap: bool
-        Whether to apply logits cap to attention scores.
-        If ``True``, the attention scores will be capped according to formula (proposed in
+        Whether to apply logits cap to pre-attention logits.
+        If ``True``, the logits will be capped according to the formula (proposed in
         Grok-1): :math:`30 \times \mathrm{tanh}(x / 30)`, where :math:`x` is the input logits.
         Defaults to ``False``.
     data_type : Union[str, torch.dtype]
@@ -704,8 +704,8 @@ def forward(
         ``NONE``/``ROPE_LLAMA`` (LLAMA style rotary embedding) /``ALIBI``.
         Defaults to ``NONE``.
     logits_cap: bool
-        Whether to apply logits cap to attention scores.
-        If ``True``, the attention scores will be capped according to formula (proposed in
+        Whether to apply logits cap to pre-attention logits.
+        If ``True``, the logits will be capped according to the formula (proposed in
         Grok-1): :math:`30 \times \mathrm{tanh}(x / 30)`, where :math:`x` is the input logits.
         Defaults to ``False``.
     q_scale : Optional[float]
@@ -789,8 +789,8 @@ def forward_return_lse(
         ``NONE``/``ROPE_LLAMA`` (LLAMA style rotary embedding) /``ALIBI``.
         Defaults to ``NONE``.
     logits_cap: bool
-        Whether to apply logits cap to attention scores.
-        If ``True``, the attention scores will be capped according to formula (proposed in
+        Whether to apply logits cap to pre-attention logits.
+        If ``True``, the logits will be capped according to the formula (proposed in
         Grok-1): :math:`30 \times \mathrm{tanh}(x / 30)`, where :math:`x` is the input logits.
         Defaults to ``False``.
     q_scale : Optional[float]