
Commit 93e1a26

[Refactor] Unify JIT/Customization/AOT mode (#748)
This PR implements #706 to unify the codebase for (1) JIT compilation of default attention, (2) JIT compilation of customized attention, and (3) AOT compilation of default attention, and adds support for customized attention in batch prefill/decode (both fa2 and fa3 templates). More specifically:

1. All template files are stored in standalone Jinja files instead of embedded Python strings.
2. All attention modes share the same codebase. Default attentions are instantiated as special forms of customized attention where the additional parameters are hard-coded.
3. The names of optional additional tensor parameters should start with `maybe_`.
4. For the FA3 template, additional parameters are set in an `AdditionalParams` structure that is passed to MainloopParams, so that we can avoid passing the entire kernel parameter class, many of whose members duplicate MainloopParams and EpilogueParams.
5. Customized batch prefill/decode examples are added and tested.
6. We change the argument order of the PyTorch bindings to unify the customized attention and default attention interfaces. The APIs exposed to users are unchanged.

cc @hyhieu @merrymercy for visibility.

## Milestones

* [x] JIT default attention
* [x] JIT customized attention
* [x] AOT default attention
* [x] Check all unittests.
* [ ] C++ tests/benchmarks

## C++/Python Interface of PyTorch Bindings

### Single Decode/Prefill Attention Kernels

#### Decode C++ interface:

```cpp
#define AOT_ADDITIONAL_FUNC_PARAMS , std::optional<at::Tensor> maybe_alibi_slopes, \
  float logits_soft_cap, float sm_scale, float rope_scale, float rope_theta

void single_decode_with_kv_cache(at::Tensor q, at::Tensor k, at::Tensor v, at::Tensor tmp,
                                 at::Tensor o, unsigned int layout,
                                 int window_left ADDITIONAL_FUNC_PARAMS, int64_t cuda_stream);
```

#### Decode python interface:

```python
def single_decode_with_kv_cache(q: torch.Tensor, k: torch.Tensor, v: torch.Tensor,
                                tmp: torch.Tensor, o: torch.Tensor, layout: int,
                                window_left: int, *args, cuda_stream: int = 0) -> None:
    pass
```

For default attention, `*args` is expanded to `maybe_alibi_slopes, logits_soft_cap, sm_scale, rope_scale, rope_theta`.

#### Prefill C++ interface:

```cpp
#define AOT_ADDITIONAL_FUNC_PARAMS , std::optional<at::Tensor> maybe_custom_mask, std::optional<at::Tensor> maybe_alibi_slopes, \
  float logits_soft_cap, float sm_scale, float rope_scale, float rope_theta

void single_prefill_with_kv_cache(at::Tensor q, at::Tensor k, at::Tensor v, at::Tensor tmp,
                                  at::Tensor o, std::optional<at::Tensor> maybe_lse,
                                  unsigned int mask_mode_code, unsigned int layout,
                                  int32_t window_left ADDITIONAL_FUNC_PARAMS, int64_t cuda_stream);
```

#### Prefill python interface:

```python
def single_prefill_with_kv_cache(q: torch.Tensor, k: torch.Tensor, v: torch.Tensor,
                                 tmp: torch.Tensor, o: torch.Tensor,
                                 maybe_lse: Optional[torch.Tensor], mask_mode_code: int,
                                 layout: int, window_left: int, *args,
                                 cuda_stream: int = 0) -> None:
    pass
```

For default attention, `*args` is expanded to `maybe_custom_mask, maybe_alibi_slopes, logits_soft_cap, sm_scale, rope_scale, rope_theta` for the fa2 template, and `logits_soft_cap, sm_scale` for the fa3 template.
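To make the calling convention concrete, here is a rough sketch of how a caller might invoke the single prefill binding with the default fa2 attention arguments. The tensor shapes, the scratch-buffer size, and the `_kernels` module handle are illustrative assumptions, not part of this PR:

```python
import torch

# Hypothetical single-request prefill, NHD layout (shapes are illustrative).
qo_len, kv_len, num_heads, head_dim = 128, 512, 32, 128
q = torch.randn(qo_len, num_heads, head_dim, dtype=torch.float16, device="cuda")
k = torch.randn(kv_len, num_heads, head_dim, dtype=torch.float16, device="cuda")
v = torch.randn(kv_len, num_heads, head_dim, dtype=torch.float16, device="cuda")
tmp = torch.empty(16 * 1024 * 1024, dtype=torch.uint8, device="cuda")  # scratch buffer
o = torch.empty_like(q)

# For default fa2 attention, the trailing *args are
# (maybe_custom_mask, maybe_alibi_slopes, logits_soft_cap, sm_scale, rope_scale, rope_theta).
default_args = (None, None, 0.0, head_dim ** -0.5, 1.0, 1e4)

# `_kernels` stands in for whatever module object the JIT/AOT loader returns.
_kernels.single_prefill_with_kv_cache(
    q, k, v, tmp, o,
    None,  # maybe_lse
    0,     # mask_mode_code: no mask
    0,     # layout: NHD
    -1,    # window_left: sliding window disabled
    *default_args,
    cuda_stream=torch.cuda.current_stream().cuda_stream,
)
```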
### Batch Decode/Prefill Attention Kernels

#### Decode C++ interface:

```cpp
#define AOT_ADDITIONAL_FUNC_PARAMS , std::optional<at::Tensor> maybe_alibi_slopes, \
  float logits_soft_cap, float sm_scale, float rope_scale, float rope_theta

std::vector<int64_t> BatchDecodeWithPagedKVCachePlan(
    at::Tensor float_workspace_buffer, at::Tensor int_workspace_buffer,
    at::Tensor page_locked_int_workspace_buffer, at::Tensor indptr, unsigned int batch_size,
    unsigned int num_qo_heads, unsigned int num_kv_heads, unsigned int page_size,
    bool enable_cuda_graph, bool use_logits_soft_cap, unsigned int head_dim,
    at::Tensor empty_q_data, at::Tensor empty_kv_data, int64_t cuda_stream);

void BatchDecodeWithPagedKVCacheRun(
    at::Tensor float_workspace_buffer, at::Tensor int_workspace_buffer,
    std::vector<int64_t> plan_info_vec, at::Tensor q, at::Tensor paged_k_cache,
    at::Tensor paged_v_cache, at::Tensor paged_kv_indptr, at::Tensor paged_kv_indices,
    at::Tensor paged_kv_last_page_len, at::Tensor o, std::optional<at::Tensor> maybe_lse,
    unsigned int kv_layout_code, int window_left ADDITIONAL_FUNC_PARAMS, int64_t cuda_stream);
```

#### Decode python interface:

```python
def batch_decode_with_paged_kv_cache_plan(
    float_workspace_buffer: torch.Tensor,
    int_workspace_buffer: torch.Tensor,
    page_locked_int_workspace_buffer: torch.Tensor,
    indptr: torch.Tensor,
    batch_size: int,
    num_qo_heads: int,
    num_kv_heads: int,
    page_size: int,
    enable_cuda_graph: bool,
    use_logits_soft_cap: bool,
    head_dim: int,
    empty_q_data: torch.Tensor,
    empty_kv_data: torch.Tensor,
    cuda_stream: int) -> List[int]:
    pass

def batch_decode_with_paged_kv_cache_run(
    float_workspace_buffer: torch.Tensor,
    int_workspace_buffer: torch.Tensor,
    plan_info_vec: List[int],
    q: torch.Tensor,
    paged_k_cache: torch.Tensor,
    paged_v_cache: torch.Tensor,
    paged_kv_indptr: torch.Tensor,
    paged_kv_indices: torch.Tensor,
    paged_kv_last_page_len: torch.Tensor,
    o: torch.Tensor,
    maybe_lse: Optional[torch.Tensor],
    kv_layout_code: int,
    window_left: int,
    *args,
    cuda_stream: int) -> None:
    pass
```

For default attention, `*args` is expanded to `maybe_alibi_slopes, logits_soft_cap, sm_scale, rope_scale, rope_theta`.
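Below is a minimal sketch of how the plan/run split fits together for batch decode, assuming a small hypothetical batch. Buffer sizes, tensor shapes, and the `_kernels` module handle are illustrative assumptions, not prescribed by this PR:

```python
import torch

# Hypothetical paged-KV batch: 4 requests, 12 KV pages total, page_size 16.
batch_size, page_size, num_qo_heads, num_kv_heads, head_dim = 4, 16, 32, 8, 128
device = "cuda"
float_workspace = torch.empty(128 * 1024 * 1024, dtype=torch.uint8, device=device)
int_workspace = torch.empty(8 * 1024 * 1024, dtype=torch.uint8, device=device)
pinned_int_workspace = torch.empty(8 * 1024 * 1024, dtype=torch.uint8, pin_memory=True)

paged_kv_indptr = torch.tensor([0, 3, 7, 8, 12], dtype=torch.int32, device=device)
paged_kv_indices = torch.arange(12, dtype=torch.int32, device=device)
paged_kv_last_page_len = torch.tensor([5, 16, 3, 10], dtype=torch.int32, device=device)

q = torch.randn(batch_size, num_qo_heads, head_dim, dtype=torch.float16, device=device)
paged_k_cache = torch.randn(12, page_size, num_kv_heads, head_dim,
                            dtype=torch.float16, device=device)
paged_v_cache = torch.randn_like(paged_k_cache)
o = torch.empty_like(q)
stream = torch.cuda.current_stream().cuda_stream

# Plan once per batch layout; plan_info_vec is opaque scheduling metadata.
# `_kernels` stands in for the compiled PyTorch-binding module.
plan_info_vec = _kernels.batch_decode_with_paged_kv_cache_plan(
    float_workspace, int_workspace, pinned_int_workspace, paged_kv_indptr,
    batch_size, num_qo_heads, num_kv_heads, page_size,
    False, False, head_dim,                    # enable_cuda_graph, use_logits_soft_cap
    torch.empty(0, dtype=torch.float16),       # empty_q_data (conveys the query dtype)
    torch.empty(0, dtype=torch.float16),       # empty_kv_data (conveys the KV dtype)
    stream,
)

# Run forwards plan_info_vec; for default attention the trailing *args are
# maybe_alibi_slopes, logits_soft_cap, sm_scale, rope_scale, rope_theta.
_kernels.batch_decode_with_paged_kv_cache_run(
    float_workspace, int_workspace, plan_info_vec,
    q, paged_k_cache, paged_v_cache,
    paged_kv_indptr, paged_kv_indices, paged_kv_last_page_len,
    o, None,                                   # maybe_lse
    0, -1,                                     # kv_layout_code (NHD), window_left
    None, 0.0, head_dim ** -0.5, 1.0, 1e4,     # *args for default attention
    cuda_stream=stream,
)
```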
#### Prefill C++ interface:

```cpp
#define AOT_ADDITIONAL_FUNC_PARAMS , std::optional<at::Tensor> maybe_custom_mask, std::optional<at::Tensor> maybe_mask_indptr, std::optional<at::Tensor> maybe_alibi_slopes, \
  float logits_soft_cap, float sm_scale, float rope_scale, float rope_theta

std::vector<int64_t> BatchPrefillWithKVCachePlan(
    at::Tensor float_workspace_buffer, at::Tensor int_workspace_buffer,
    at::Tensor page_locked_int_workspace_buffer, at::Tensor qo_indptr, at::Tensor kv_indptr,
    at::Tensor kv_len_arr, unsigned total_num_rows, unsigned int batch_size,
    unsigned int num_qo_heads, unsigned int num_kv_heads, unsigned int page_size,
    bool enable_cuda_graph, unsigned int head_dim, bool causal, int64_t cuda_stream);

void BatchPrefillWithRaggedKVCacheRun(
    at::Tensor float_workspace_buffer, at::Tensor int_workspace_buffer,
    std::vector<int64_t> plan_info_vec, at::Tensor q, at::Tensor k, at::Tensor v,
    at::Tensor qo_indptr, at::Tensor kv_indptr, at::Tensor o,
    std::optional<at::Tensor> maybe_lse, unsigned int mask_mode_code, unsigned int layout,
    int32_t window_left ADDITIONAL_FUNC_PARAMS, int64_t cuda_stream);

void BatchPrefillWithPagedKVCacheRun(
    at::Tensor float_workspace_buffer, at::Tensor int_workspace_buffer,
    std::vector<int64_t> plan_info_vec, at::Tensor q, at::Tensor paged_k_cache,
    at::Tensor paged_v_cache, at::Tensor qo_indptr, at::Tensor paged_kv_indptr,
    at::Tensor paged_kv_indices, at::Tensor paged_kv_last_page_len, at::Tensor o,
    std::optional<at::Tensor> maybe_lse, unsigned int mask_mode_code, unsigned int layout,
    int32_t window_left ADDITIONAL_FUNC_PARAMS, int64_t cuda_stream);
```

#### Prefill python interface:

```python
def batch_prefill_with_kv_cache_plan(
    float_workspace_buffer: torch.Tensor,
    int_workspace_buffer: torch.Tensor,
    page_locked_int_workspace_buffer: torch.Tensor,
    qo_indptr: torch.Tensor,
    kv_indptr: torch.Tensor,
    kv_len_arr: torch.Tensor,
    total_num_rows: int,
    batch_size: int,
    num_qo_heads: int,
    num_kv_heads: int,
    page_size: int,
    enable_cuda_graph: bool,
    head_dim: int,
    causal: bool,
    cuda_stream: int) -> List[int]:
    pass

def batch_prefill_with_ragged_kv_cache_jit_run(
    float_workspace_buffer: torch.Tensor,
    int_workspace_buffer: torch.Tensor,
    plan_info_vec: List[int],
    q: torch.Tensor,
    k: torch.Tensor,
    v: torch.Tensor,
    qo_indptr: torch.Tensor,
    kv_indptr: torch.Tensor,
    o: torch.Tensor,
    maybe_lse: Optional[torch.Tensor],
    mask_mode_code: int,
    layout: int,
    window_left: int,
    *args,
    cuda_stream: int) -> None:
    pass

def batch_prefill_with_paged_kv_cache_jit_run(
    float_workspace_buffer: torch.Tensor,
    int_workspace_buffer: torch.Tensor,
    plan_info_vec: List[int],
    q: torch.Tensor,
    paged_k_cache: torch.Tensor,
    paged_v_cache: torch.Tensor,
    qo_indptr: torch.Tensor,
    paged_kv_indptr: torch.Tensor,
    paged_kv_indices: torch.Tensor,
    paged_kv_last_page_len: torch.Tensor,
    o: torch.Tensor,
    maybe_lse: Optional[torch.Tensor],
    mask_mode_code: int,
    layout: int,
    window_left: int,
    *args,
    cuda_stream: int) -> None:
    pass
```

For default attention, `*args` is expanded to `maybe_custom_mask, maybe_mask_indptr, maybe_alibi_slopes, logits_soft_cap, sm_scale, rope_scale, rope_theta` for the fa2 template, and `logits_soft_cap, sm_scale` for the fa3 template.
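Since the fa2 and fa3 templates expect different trailing arguments, a wrapper has to assemble `*args` accordingly. The helper below is a hypothetical illustration of that dispatch; the function name and default values are not part of this PR:

```python
from typing import Optional, Tuple
import torch

def make_prefill_extra_args(
    use_fa3: bool,
    sm_scale: float,
    logits_soft_cap: float = 0.0,
    maybe_custom_mask: Optional[torch.Tensor] = None,
    maybe_mask_indptr: Optional[torch.Tensor] = None,
    maybe_alibi_slopes: Optional[torch.Tensor] = None,
    rope_scale: float = 1.0,
    rope_theta: float = 1e4,
) -> Tuple:
    """Build the trailing *args for the batch prefill run bindings (default attention)."""
    if use_fa3:
        # fa3 template: only the two scalars are additional parameters.
        return (logits_soft_cap, sm_scale)
    # fa2 template: optional tensors first (None when unused), then the four scalars.
    return (
        maybe_custom_mask,
        maybe_mask_indptr,
        maybe_alibi_slopes,
        logits_soft_cap,
        sm_scale,
        rope_scale,
        rope_theta,
    )

# e.g. run_fn(..., window_left, *make_prefill_extra_args(use_fa3=False, sm_scale=128 ** -0.5),
#             cuda_stream=0)
```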
1 parent 4e8eb18 commit 93e1a26

115 files changed, +5223 -6196 lines changed


CMakeLists.txt

+17-23
```diff
@@ -66,7 +66,7 @@ flashinfer_option(FLASHINFER_TVM_SOURCE_DIR
 flashinfer_option(FLASHINFER_GEN_HEAD_DIMS "Head dims to enable" 64 128 256)
 flashinfer_option(FLASHINFER_GEN_POS_ENCODING_MODES "Pos encodings to enable" 0
                   1 2)
-flashinfer_option(FLASHINFER_GEN_ALLOW_FP16_QK_REDUCTIONS
+flashinfer_option(FLASHINFER_GEN_USE_FP16_QK_REDUCTIONS
                   "QK reductions to enable" "false" "true")
 flashinfer_option(FLASHINFER_GEN_MASK_MODES "Mask modes to enable" 0 1 2)
 
@@ -126,34 +126,28 @@ endif(FLASHINFER_ENABLE_BF16)
 # generate kernel inst
 set(HEAD_DIMS ${FLASHINFER_GEN_HEAD_DIMS})
 set(POS_ENCODING_MODES ${FLASHINFER_GEN_POS_ENCODING_MODES})
-set(ALLOW_FP16_QK_REDUCTIONS ${FLASHINFER_GEN_ALLOW_FP16_QK_REDUCTIONS})
+set(USE_FP16_QK_REDUCTIONS ${FLASHINFER_GEN_USE_FP16_QK_REDUCTIONS})
 set(MASK_MODES ${FLASHINFER_GEN_MASK_MODES})
 
 # log options
 message(STATUS "FLASHINFER_HEAD_DIMS=${HEAD_DIMS}")
 message(STATUS "FLASHINFER_POS_ENCODING_MODES=${POS_ENCODING_MODES}")
-message(
-  STATUS "FLASHINFER_ALLOW_FP16_QK_REDUCTIONS=${ALLOW_FP16_QK_REDUCTIONS}")
+message(STATUS "FLASHINFER_USE_FP16_QK_REDUCTIONS=${USE_FP16_QK_REDUCTIONS}")
 message(STATUS "FLASHINFER_MASK_MODES=${MASK_MODES}")
 
 file(MAKE_DIRECTORY ${PROJECT_SOURCE_DIR}/src/generated)
 
 set(AOT_GENERATE_COMMAND
-    ${Python3_EXECUTABLE}
-    -m aot_build_utils.generate
-    --path ${PROJECT_SOURCE_DIR}/src/generated
-    --head_dims ${HEAD_DIMS}
-    --pos_encoding_modes ${POS_ENCODING_MODES}
-    --allow_fp16_qk_reductions ${ALLOW_FP16_QK_REDUCTIONS}
-    --mask_modes ${MASK_MODES}
-    --enable_f16 ${FLASHINFER_ENABLE_F16}
-    --enable_bf16 ${FLASHINFER_ENABLE_BF16}
-    --enable_fp8_e4m3 ${FLASHINFER_ENABLE_FP8_E4M3}
-    --enable_fp8_e5m2 ${FLASHINFER_ENABLE_FP8_E5M2})
-
-execute_process(
-  COMMAND ${AOT_GENERATE_COMMAND}
-  WORKING_DIRECTORY ${PROJECT_SOURCE_DIR})
+    ${Python3_EXECUTABLE} -m aot_build_utils.generate --path
+    ${PROJECT_SOURCE_DIR}/src/generated --head_dims ${HEAD_DIMS}
+    --pos_encoding_modes ${POS_ENCODING_MODES} --use_fp16_qk_reductions
+    ${USE_FP16_QK_REDUCTIONS} --mask_modes ${MASK_MODES} --enable_f16
+    ${FLASHINFER_ENABLE_F16} --enable_bf16 ${FLASHINFER_ENABLE_BF16}
+    --enable_fp8_e4m3 ${FLASHINFER_ENABLE_FP8_E4M3} --enable_fp8_e5m2
+    ${FLASHINFER_ENABLE_FP8_E5M2})
+
+execute_process(COMMAND ${AOT_GENERATE_COMMAND}
+                WORKING_DIRECTORY ${PROJECT_SOURCE_DIR})
 
 file(GLOB_RECURSE FLASHINFER_GENERATORS
      ${PROJECT_SOURCE_DIR}/aot_build_utils/*.py)
@@ -175,13 +169,13 @@ add_custom_target(dispatch_inc DEPENDS ${DISPATCH_INC_FILE})
 
 add_library(decode_kernels STATIC ${DECODE_KERNELS_SRCS})
 target_include_directories(decode_kernels PRIVATE ${FLASHINFER_INCLUDE_DIR})
-target_compile_options(decode_kernels PRIVATE
-  -Xcompiler=-fPIC --fatbin-options -compress-all)
+target_compile_options(decode_kernels PRIVATE -Xcompiler=-fPIC --fatbin-options
+                                              -compress-all)
 
 add_library(prefill_kernels STATIC ${PREFILL_KERNELS_SRCS})
 target_include_directories(prefill_kernels PRIVATE ${FLASHINFER_INCLUDE_DIR})
-target_compile_options(prefill_kernels PRIVATE
-  -Xcompiler=-fPIC --fatbin-options -compress-all)
+target_compile_options(prefill_kernels PRIVATE -Xcompiler=-fPIC
+                                               --fatbin-options -compress-all)
 
 if(FLASHINFER_DECODE)
   message(STATUS "Compile single decode kernel benchmarks.")
```
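Outside of CMake, the same generation step can be driven by invoking the module directly. The sketch below mirrors the AOT_GENERATE_COMMAND above with illustrative option values (run from the repository root); the values are examples, not project defaults:

```python
import subprocess
import sys

# Mirrors the CMake AOT_GENERATE_COMMAND, using the renamed
# --use_fp16_qk_reductions and --enable_f16 flags.
subprocess.run(
    [
        sys.executable, "-m", "aot_build_utils.generate",
        "--path", "src/generated",
        "--head_dims", "64", "128", "256",
        "--pos_encoding_modes", "0",
        "--use_fp16_qk_reductions", "false",
        "--mask_modes", "0", "1", "2",
        "--enable_f16", "true",
        "--enable_bf16", "true",
        "--enable_fp8_e4m3", "true",
        "--enable_fp8_e5m2", "true",
    ],
    check=True,
)
```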

aot_build_utils/generate.py

+22-16
```diff
@@ -20,6 +20,7 @@
 from typing import List
 
 from . import (
+    generate_aot_default_additional_params_header,
     generate_batch_paged_decode_inst,
     generate_batch_paged_prefill_inst,
     generate_batch_ragged_prefill_inst,
@@ -38,7 +39,7 @@ def write_if_different(path: Path, content: str) -> None:
     path: Path = args.path
     head_dims: List[int] = args.head_dims
     pos_encoding_modes: List[int] = args.pos_encoding_modes
-    allow_fp16_qk_reductions: List[int] = args.allow_fp16_qk_reductions
+    use_fp16_qk_reductions: List[int] = args.use_fp16_qk_reductions
     mask_modes: List[int] = args.mask_modes
     enable_f16: bool = args.enable_f16
     enable_bf16: bool = args.enable_bf16
@@ -54,12 +55,17 @@ def write_if_different(path: Path, content: str) -> None:
             argparse.Namespace(
                 head_dims=head_dims,
                 pos_encoding_modes=pos_encoding_modes,
-                allow_fp16_qk_reductions=allow_fp16_qk_reductions,
+                use_fp16_qk_reductions=use_fp16_qk_reductions,
                 mask_modes=mask_modes,
             )
         ),
     )
 
+    write_if_different(
+        path / "aot_default_additional_params.h",
+        generate_aot_default_additional_params_header.get_aot_default_additional_params_header_str(),
+    )
+
     idtypes = ["i32"]
     prefill_dtypes = []
     decode_dtypes = []
@@ -150,22 +156,22 @@ def write_if_different(path: Path, content: str) -> None:
     for (
         head_dim,
         pos_encoding_mode,
-        allow_fp16_qk_reduction,
+        use_fp16_qk_reduction,
         mask_mode,
     ) in product(
         head_dims,
         pos_encoding_modes,
-        allow_fp16_qk_reductions,
+        use_fp16_qk_reductions,
         mask_modes,
     ):
         for dtype_q, dtype_kv in list(zip(prefill_dtypes, prefill_dtypes)) + list(
             product(prefill_dtypes, fp8_dtypes)
         ):
-            fname = f"single_prefill_head_{head_dim}_posenc_{pos_encoding_mode}_fp16qkred_{allow_fp16_qk_reduction}_mask_{mask_mode}_dtypeq_{dtype_q}_dtypekv_{dtype_kv}_dtypeout_{dtype_q}.cu"
+            fname = f"single_prefill_head_{head_dim}_posenc_{pos_encoding_mode}_fp16qkred_{use_fp16_qk_reduction}_mask_{mask_mode}_dtypeq_{dtype_q}_dtypekv_{dtype_kv}_dtypeout_{dtype_q}.cu"
             content = generate_single_prefill_inst.get_cu_file_str(
                 head_dim,
                 pos_encoding_mode,
-                allow_fp16_qk_reduction,
+                use_fp16_qk_reduction,
                 mask_mode,
                 dtype_q,  # dtype_q
                 dtype_kv,  # dtype_kv
@@ -184,7 +190,7 @@ def write_if_different(path: Path, content: str) -> None:
                 f"posenc_{pos_encoding_mode}_"
                 f"use_swa_{use_sliding_window}_"
                 f"use_logits_cap_{use_logits_soft_cap}_"
-                f"f16qk_{bool(allow_fp16_qk_reduction)}"
+                f"f16qk_{bool(use_fp16_qk_reduction)}"
             )
             write_if_different(path / fname, content)
 
@@ -193,24 +199,24 @@ def write_if_different(path: Path, content: str) -> None:
     for (
         head_dim,
         pos_encoding_mode,
-        allow_fp16_qk_reduction,
+        use_fp16_qk_reduction,
         mask_mode,
         idtype,
     ) in product(
         head_dims,
         pos_encoding_modes,
-        allow_fp16_qk_reductions,
+        use_fp16_qk_reductions,
         mask_modes,
         idtypes,
    ):
         for dtype_q, dtype_kv in list(zip(prefill_dtypes, prefill_dtypes)) + list(
             product(prefill_dtypes, fp8_dtypes)
         ):
-            fname = f"batch_paged_prefill_head_{head_dim}_posenc_{pos_encoding_mode}_fp16qkred_{allow_fp16_qk_reduction}_mask_{mask_mode}_dtypeq_{dtype_q}_dtypekv_{dtype_kv}_dtypeout_{dtype_q}_idtype_{idtype}.cu"
+            fname = f"batch_paged_prefill_head_{head_dim}_posenc_{pos_encoding_mode}_fp16qkred_{use_fp16_qk_reduction}_mask_{mask_mode}_dtypeq_{dtype_q}_dtypekv_{dtype_kv}_dtypeout_{dtype_q}_idtype_{idtype}.cu"
             content = generate_batch_paged_prefill_inst.get_cu_file_str(
                 head_dim,
                 pos_encoding_mode,
-                allow_fp16_qk_reduction,
+                use_fp16_qk_reduction,
                 mask_mode,
                 dtype_q,  # dtype_q
                 dtype_kv,  # dtype_kv
@@ -219,11 +225,11 @@ def write_if_different(path: Path, content: str) -> None:
             )
             write_if_different(path / fname, content)
 
-            fname = f"batch_ragged_prefill_head_{head_dim}_posenc_{pos_encoding_mode}_fp16qkred_{allow_fp16_qk_reduction}_mask_{mask_mode}_dtypeq_{dtype_q}_dtypekv_{dtype_kv}_dtypeout_{dtype_q}_idtype_{idtype}.cu"
+            fname = f"batch_ragged_prefill_head_{head_dim}_posenc_{pos_encoding_mode}_fp16qkred_{use_fp16_qk_reduction}_mask_{mask_mode}_dtypeq_{dtype_q}_dtypekv_{dtype_kv}_dtypeout_{dtype_q}_idtype_{idtype}.cu"
             content = generate_batch_ragged_prefill_inst.get_cu_file_str(
                 head_dim,
                 pos_encoding_mode,
-                allow_fp16_qk_reduction,
+                use_fp16_qk_reduction,
                 mask_mode,
                 dtype_q,  # dtype_q
                 dtype_kv,  # dtype_kv
@@ -246,7 +252,7 @@ def write_if_different(path: Path, content: str) -> None:
                 f"posenc_{pos_encoding_mode}_"
                 f"use_swa_{sliding_window}_"
                 f"use_logits_cap_{logits_soft_cap}_"
-                f"f16qk_{bool(allow_fp16_qk_reduction)}"
+                f"f16qk_{bool(use_fp16_qk_reduction)}"
             )
 
     return (
@@ -273,7 +279,7 @@ def write_if_different(path: Path, content: str) -> None:
         help="Position encoding modes",
     )
     parser.add_argument(
-        "--allow_fp16_qk_reductions",
+        "--use_fp16_qk_reductions",
         type=lambda x: x if isinstance(x, int) else int(x.lower() == "true"),
         required=True,
         nargs="+",
@@ -287,7 +293,7 @@ def write_if_different(path: Path, content: str) -> None:
         help="Mask modes",
     )
     parser.add_argument(
-        "--enable_fp16",
+        "--enable_f16",
        type=lambda x: x if isinstance(x, int) else x.lower() == "true",
        required=True,
        nargs="+",
```
aot_build_utils/generate_aot_default_additional_params_header.py

+148 (new file)

```python
"""
Copyright (c) 2024 by FlashInfer team.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
"""


def generate_macro_entry(
    macro_prefix,
    additional_tensor_names,
    additional_tensor_dtypes,
    additional_scalar_names,
    additional_scalar_dtypes,
    is_sm90_template: bool = False,
) -> str:
    # NOTE(Zihao): mostly copy-paste from generate_additional_params in flashinfer.jit.attention.py
    additional_func_params = "".join(
        [
            (
                f", std::optional<at::Tensor> {var}"
                if var.startswith("maybe")
                else f", at::Tensor {var}"
            )
            for var in additional_tensor_names
        ]
        + [
            f", {dtype} {var}"
            for dtype, var in zip(additional_scalar_dtypes, additional_scalar_names)
        ]
    )
    if is_sm90_template:
        additional_params_setter = " \\\n".join(
            [
                (
                    f"params.additional_params.{var} = {var} ? static_cast<{dtype}*>({var}->data_ptr()): nullptr;"
                    if var.startswith("maybe")
                    else f"params.additional_params.{var} = static_cast<{dtype}*>({var}.data_ptr());"
                )
                for dtype, var in zip(additional_tensor_dtypes, additional_tensor_names)
            ]
            + [
                f"params.additional_params.{var} = {var};"
                for var in additional_scalar_names
            ]
        )
    else:
        additional_params_setter = " \\\n".join(
            [
                (
                    f"params.{var} = {var} ? static_cast<{dtype}*>({var}->data_ptr()): nullptr;"
                    if var.startswith("maybe")
                    else f"params.{var} = static_cast<{dtype}*>({var}.data_ptr());"
                )
                for dtype, var in zip(additional_tensor_dtypes, additional_tensor_names)
            ]
            + [f"params.{var} = {var};" for var in additional_scalar_names]
        )
    return f"""#define {macro_prefix}_ADDITIONAL_FUNC_PARAMS {additional_func_params}

#define {macro_prefix}_ADDITIONAL_PARAMS_SETTER {additional_params_setter}

"""


def get_aot_default_additional_params_header_str() -> str:
    ret = ""

    ret += generate_macro_entry(
        "SINGLE_DECODE",
        ["maybe_alibi_slopes"],  # additional_tensor_names
        ["float"],  # additional_tensor_dtypes
        [
            "logits_soft_cap",
            "sm_scale",
            "rope_rcp_scale",
            "rope_rcp_theta",
        ],  # additional_scalar_names
        ["float", "float", "float", "float"],  # additional_scalar_dtypes
    )

    ret += generate_macro_entry(
        "SINGLE_PREFILL",
        ["maybe_custom_mask", "maybe_alibi_slopes"],
        ["uint8_t", "float"],
        [
            "logits_soft_cap",
            "sm_scale",
            "rope_rcp_scale",
            "rope_rcp_theta",
        ],
        ["float", "float", "float", "float"],
    )

    ret += generate_macro_entry(
        "SINGLE_PREFILL_SM90",
        [],
        [],
        ["logits_soft_cap", "sm_scale"],
        ["float", "float"],
        is_sm90_template=True,
    )

    ret += generate_macro_entry(
        "BATCH_DECODE",
        ["maybe_alibi_slopes"],  # additional_tensor_names
        ["float"],  # additional_tensor_dtypes
        [
            "logits_soft_cap",
            "sm_scale",
            "rope_rcp_scale",
            "rope_rcp_theta",
        ],  # additional_scalar_names
        ["float", "float", "float", "float"],  # additional_scalar_dtypes
    )

    ret += generate_macro_entry(
        "BATCH_PREFILL",
        ["maybe_custom_mask", "maybe_mask_indptr", "maybe_alibi_slopes"],
        ["uint8_t", "int32_t", "float"],
        [
            "logits_soft_cap",
            "sm_scale",
            "rope_rcp_scale",
            "rope_rcp_theta",
        ],
        ["float", "float", "float", "float"],
    )

    ret += generate_macro_entry(
        "BATCH_PREFILL_SM90",
        [],
        [],
        ["logits_soft_cap", "sm_scale"],
        ["float", "float"],
        is_sm90_template=True,
    )

    return ret
```
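A small usage sketch of the generator above (assuming the module lives at `aot_build_utils/generate_aot_default_additional_params_header.py`, as the import in `generate.py` suggests); the output shown in the comments is paraphrased from the f-string template, so exact whitespace may differ:

```python
from aot_build_utils import generate_aot_default_additional_params_header as gen

entry = gen.generate_macro_entry(
    "SINGLE_DECODE",
    ["maybe_alibi_slopes"],
    ["float"],
    ["logits_soft_cap", "sm_scale", "rope_rcp_scale", "rope_rcp_theta"],
    ["float", "float", "float", "float"],
)
print(entry)
# Roughly:
#   #define SINGLE_DECODE_ADDITIONAL_FUNC_PARAMS , std::optional<at::Tensor> maybe_alibi_slopes,
#       float logits_soft_cap, float sm_scale, float rope_rcp_scale, float rope_rcp_theta
#
#   #define SINGLE_DECODE_ADDITIONAL_PARAMS_SETTER \
#       params.maybe_alibi_slopes = maybe_alibi_slopes ? ... : nullptr; \
#       params.logits_soft_cap = logits_soft_cap; ...
```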
