[Runtime] Use preferred host memory (pinned memory) in KV cache (#17036)

MasterJH5574 · web-flow · commit 71f7af7985e2 · 2024-05-29T17:14:17.000-04:00
This PR updates the PagedKVCache with pinned memory support,
which can reduce the copy overhead between CPU and GPU.

This PR also bumps FlashInfer version, which now supports
* specifying kernels to build via cmake,
* pinned memory as host memory.

We also update CMakeLists.txt and config.cmake to include the
FlashInfer compile options. Prior to this PR, the kernels being
built were hardcoded in FlashInfer header files.
diff --git a/3rdparty/flashinfer b/3rdparty/flashinfer
@@ -1 +1 @@
-Subproject commit f978e02565d7157d57803eb4153369e046fc4106
+Subproject commit 7e9cc7ff42ca283c317061a877305d09a395fad2
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -960,13 +960,13 @@ option(USE_FLASHINFER "Build TVM with FlashInfer" OFF)
 if (USE_FLASHINFER STREQUAL "ON")
   message(STATUS "Build with FlashInfer")
   set(FLASHINFER_TVM_BINDING ON)
-  set(FLASHINFER_TVM_HOME ${PROJECT_SOURCE_DIR})
-  set(FLASHINFER_ENABLE_FP8 OFF)
-  set(FLASHINFER_ENABLE_BF16 OFF)
+  set(FLASHINFER_TVM_SOURCE_DIR ${PROJECT_SOURCE_DIR})
   set(FLASHINFER_PREFILL OFF)
   set(FLASHINFER_DECODE OFF)
   set(FLASHINFER_PAGE OFF)
   set(FLASHINFER_CASCADE OFF)
+  set(FLASHINFER_SAMPLING OFF)
+  set(FLASHINFER_NORM OFF)
   add_subdirectory(3rdparty/flashinfer)
 else ()
   message(STATUS "Build without FlashInfer")
diff --git a/cmake/config.cmake b/cmake/config.cmake
@@ -444,6 +444,19 @@ set(USE_GTEST AUTO)
 # Need to have USE_CUDA=ON
 set(USE_CUTLASS OFF)
 
+# Whether to build TVM with FlashInfer (consumed by the USE_FLASHINFER check in CMakeLists.txt).
+set(USE_FLASHINFER OFF)
+# Options for FlashInfer kernel compilation; the GEN_* lists presumably pick the kernel variants to build -- TODO confirm.
+set(FLASHINFER_ENABLE_FP8 OFF)
+set(FLASHINFER_ENABLE_BF16 OFF)
+set(FLASHINFER_GEN_GROUP_SIZES 1 4 6 8)
+set(FLASHINFER_GEN_PAGE_SIZES 16)
+set(FLASHINFER_GEN_HEAD_DIMS 128)
+set(FLASHINFER_GEN_KV_LAYOUTS 0 1)
+set(FLASHINFER_GEN_POS_ENCODING_MODES 0 1)
+set(FLASHINFER_GEN_ALLOW_FP16_QK_REDUCTIONS "false")
+set(FLASHINFER_GEN_CASUALS "false" "true")  # NOTE(review): "CASUALS" looks like a typo of "CAUSALS"; name must match FlashInfer's option -- confirm before renaming.
+
 # Enable to show a summary of TVM options
 set(SUMMARIZE OFF)
 
diff --git a/include/tvm/runtime/ndarray.h b/include/tvm/runtime/ndarray.h
@@ -534,6 +534,23 @@ inline bool NDArray::Load(dmlc::Stream* strm) {
   return true;
 }
 
+/*!
+ * \brief Pick the host device to prefer for host-side buffers of the given device.
+ * - CUDA maps to CUDAHost and ROCm maps to ROCMHost, i.e. pinned host memory,
+ * which reduces copy overhead.
+ * - Any other device type maps to CPU; the result always has device id 0.
+ */
+inline Device GetPreferredHostDevice(Device device) {
+  switch (device.device_type) {
+    case DLDeviceType::kDLCUDA:
+      return Device{DLDeviceType::kDLCUDAHost, 0};
+    case DLDeviceType::kDLROCM:
+      return Device{DLDeviceType::kDLROCMHost, 0};
+    default:  // Fallback to CPU.
+      return Device{DLDeviceType::kDLCPU, 0};
+  }
+}
+
 }  // namespace runtime
 }  // namespace tvm
 
diff --git a/src/runtime/relax_vm/paged_kv_cache.cc b/src/runtime/relax_vm/paged_kv_cache.cc