
Commit e0c6f55

[Build] Avoid building too many extensions (#1624)
1 parent de23687 commit e0c6f55

25 files changed: +206 -272 lines

benchmarks/kernels/benchmark_paged_attention.py

Lines changed: 3 additions & 3 deletions
@@ -4,7 +4,7 @@

 import torch

-from vllm import attention_ops
+from vllm._C import ops

 NUM_BLOCKS = 1024
 PARTITION_SIZE = 512
@@ -98,7 +98,7 @@ def run_benchmark(num_iters: int, profile: bool = False) -> float:

     for _ in range(num_iters):
         if version == "v1":
-            attention_ops.paged_attention_v1(
+            ops.paged_attention_v1(
                 output,
                 query,
                 key_cache,
@@ -112,7 +112,7 @@ def run_benchmark(num_iters: int, profile: bool = False) -> float:
                 alibi_slopes,
             )
         elif version == "v2":
-            attention_ops.paged_attention_v2(
+            ops.paged_attention_v2(
                 output,
                 exp_sums,
                 max_logits,
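The benchmark itself is unchanged; only the module providing the kernels moves, from the old per-family `attention_ops` extension to the `ops` submodule of the single `vllm._C` extension. A minimal before/after sketch of the call site, assuming a CUDA build of vLLM at this commit (argument order taken from csrc/ops.h below):

    # Old layout: one compiled extension per kernel family.
    #   from vllm import attention_ops
    #   attention_ops.paged_attention_v1(output, query, key_cache, value_cache, ...)

    # New layout: a single vllm._C extension exposing an `ops` submodule.
    from vllm._C import ops

    # The positional arguments are untouched; only the import changed
    # (order follows the declaration in csrc/ops.h).
    # ops.paged_attention_v1(output, query, key_cache, value_cache, head_mapping,
    #                        scale, block_tables, context_lens, block_size,
    #                        max_context_len, alibi_slopes)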

csrc/activation.cpp

Lines changed: 0 additions & 28 deletions
This file was deleted.

csrc/attention.cpp

Lines changed: 0 additions & 42 deletions
This file was deleted.

csrc/cache.cpp renamed to csrc/cache.h

Lines changed: 0 additions & 19 deletions
@@ -26,22 +26,3 @@ void gather_cached_kv(
   torch::Tensor& key_cache,
   torch::Tensor& value_cache,
   torch::Tensor& slot_mapping);
-
-PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
-  m.def(
-    "swap_blocks",
-    &swap_blocks,
-    "Swap in (out) the cache blocks from src to dst");
-  m.def(
-    "copy_blocks",
-    &copy_blocks,
-    "Copy the cache blocks from src to dst");
-  m.def(
-    "reshape_and_cache",
-    &reshape_and_cache,
-    "Reshape the key and value tensors and cache them");
-  m.def(
-    "gather_cached_kv",
-    &gather_cached_kv,
-    "Gather key and value from the cache into contiguous QKV tensors");
-}

csrc/cuda_utils.cpp

Lines changed: 0 additions & 13 deletions
This file was deleted.

csrc/cuda_utils.h

Lines changed: 5 additions & 0 deletions
@@ -0,0 +1,5 @@
+#include <torch/extension.h>
+
+int get_device_attribute(
+  int attribute,
+  int device_id);
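This declaration is exposed to Python through the `cuda_utils` submodule defined in csrc/pybind.cpp below. A rough usage sketch, with the assumption (not stated in this diff) that the attribute code is interpreted as a CUDA runtime `cudaDeviceAttr` value:

    from vllm._C import cuda_utils

    # cudaDevAttrMultiProcessorCount in the CUDA runtime enum; the numeric value
    # and its interpretation here are assumptions, not part of this diff.
    CUDA_DEV_ATTR_MULTIPROCESSOR_COUNT = 16

    num_sms = cuda_utils.get_device_attribute(CUDA_DEV_ATTR_MULTIPROCESSOR_COUNT, 0)
    print(f"SM count on device 0: {num_sms}")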

csrc/layernorm.cpp

Lines changed: 0 additions & 24 deletions
This file was deleted.

csrc/ops.h

Lines changed: 75 additions & 0 deletions
@@ -0,0 +1,75 @@
+#include <torch/extension.h>
+
+void paged_attention_v1(
+  torch::Tensor& out,
+  torch::Tensor& query,
+  torch::Tensor& key_cache,
+  torch::Tensor& value_cache,
+  torch::Tensor& head_mapping,
+  float scale,
+  torch::Tensor& block_tables,
+  torch::Tensor& context_lens,
+  int block_size,
+  int max_context_len,
+  const c10::optional<torch::Tensor>& alibi_slopes);
+
+void paged_attention_v2(
+  torch::Tensor& out,
+  torch::Tensor& exp_sums,
+  torch::Tensor& max_logits,
+  torch::Tensor& tmp_out,
+  torch::Tensor& query,
+  torch::Tensor& key_cache,
+  torch::Tensor& value_cache,
+  torch::Tensor& head_mapping,
+  float scale,
+  torch::Tensor& block_tables,
+  torch::Tensor& context_lens,
+  int block_size,
+  int max_context_len,
+  const c10::optional<torch::Tensor>& alibi_slopes);
+
+void rms_norm(
+  torch::Tensor& out,
+  torch::Tensor& input,
+  torch::Tensor& weight,
+  float epsilon);
+
+void fused_add_rms_norm(
+  torch::Tensor& input,
+  torch::Tensor& residual,
+  torch::Tensor& weight,
+  float epsilon);
+
+void rotary_embedding(
+  torch::Tensor& positions,
+  torch::Tensor& query,
+  torch::Tensor& key,
+  int head_size,
+  torch::Tensor& cos_sin_cache,
+  bool is_neox);
+
+void silu_and_mul(
+  torch::Tensor& out,
+  torch::Tensor& input);
+
+void gelu_new(
+  torch::Tensor& out,
+  torch::Tensor& input);
+
+void gelu_fast(
+  torch::Tensor& out,
+  torch::Tensor& input);
+
+torch::Tensor awq_gemm(
+  torch::Tensor _in_feats,
+  torch::Tensor _kernel,
+  torch::Tensor _scaling_factors,
+  torch::Tensor _zeros,
+  int split_k_iters);
+
+void squeezellm_gemm(
+  torch::Tensor vec,
+  torch::Tensor mat,
+  torch::Tensor mul,
+  torch::Tensor lookup_table);
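Every declaration in this header is bound under the `ops` submodule in csrc/pybind.cpp, so Python callers pass pre-allocated output tensors and the kernels write into them in place. A short usage sketch for `rms_norm`, assuming a CUDA build of vLLM; the tensor shapes follow the usual RMSNorm convention and are not dictated by this header:

    import torch

    from vllm._C import ops

    hidden_size = 4096
    x = torch.randn(8, hidden_size, dtype=torch.float16, device="cuda")
    weight = torch.ones(hidden_size, dtype=torch.float16, device="cuda")
    out = torch.empty_like(x)

    # Matches the declaration above: rms_norm(out, input, weight, epsilon).
    ops.rms_norm(out, x, weight, 1e-6)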

csrc/pos_encoding.cpp

Lines changed: 0 additions & 16 deletions
This file was deleted.

csrc/pybind.cpp

Lines changed: 80 additions & 0 deletions
@@ -0,0 +1,80 @@
+#include "cache.h"
+#include "cuda_utils.h"
+#include "ops.h"
+#include <torch/extension.h>
+
+PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
+  // vLLM custom ops
+  pybind11::module ops = m.def_submodule("ops", "vLLM custom operators");
+
+  // Attention ops
+  ops.def(
+    "paged_attention_v1",
+    &paged_attention_v1,
+    "Compute the attention between an input query and the cached keys/values using PagedAttention.");
+  ops.def(
+    "paged_attention_v2",
+    &paged_attention_v2,
+    "PagedAttention V2.");
+
+  // Activation ops
+  ops.def(
+    "silu_and_mul",
+    &silu_and_mul,
+    "Activation function used in SwiGLU.");
+  ops.def(
+    "gelu_new",
+    &gelu_new,
+    "GELU implementation used in GPT-2.");
+  ops.def(
+    "gelu_fast",
+    &gelu_fast,
+    "Approximate GELU implementation.");
+
+  // Layernorm
+  ops.def(
+    "rms_norm",
+    &rms_norm,
+    "Apply Root Mean Square (RMS) Normalization to the input tensor.");
+
+  ops.def(
+    "fused_add_rms_norm",
+    &fused_add_rms_norm,
+    "In-place fused Add and RMS Normalization");
+
+  // Rotary embedding
+  ops.def(
+    "rotary_embedding",
+    &rotary_embedding,
+    "Apply GPT-NeoX or GPT-J style rotary embedding to query and key");
+
+  // Quantization ops
+  ops.def("awq_gemm", &awq_gemm, "Quantized GEMM for AWQ");
+  ops.def("squeezellm_gemm", &squeezellm_gemm, "Quantized GEMM for SqueezeLLM");
+
+  // Cache ops
+  pybind11::module cache_ops = m.def_submodule("cache_ops", "vLLM cache ops");
+  cache_ops.def(
+    "swap_blocks",
+    &swap_blocks,
+    "Swap in (out) the cache blocks from src to dst");
+  cache_ops.def(
+    "copy_blocks",
+    &copy_blocks,
+    "Copy the cache blocks from src to dst");
+  cache_ops.def(
+    "reshape_and_cache",
+    &reshape_and_cache,
+    "Reshape the key and value tensors and cache them");
+  cache_ops.def(
+    "gather_cached_kv",
+    &gather_cached_kv,
+    "Gather key and value from the cache into contiguous QKV tensors");
+
+  // Cuda utils
+  pybind11::module cuda_utils = m.def_submodule("cuda_utils", "vLLM cuda utils");
+  cuda_utils.def(
+    "get_device_attribute",
+    &get_device_attribute,
+    "Gets the specified device attribute.");
+}
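With all bindings consolidated into one `TORCH_EXTENSION_NAME` module, the build produces a single `vllm._C` extension instead of one per kernel family. A quick way to confirm the new layout from Python, assuming the extension built successfully:

    from vllm import _C

    # The three submodules registered above; each should list its kernels.
    for mod in (_C.ops, _C.cache_ops, _C.cuda_utils):
        public = sorted(name for name in dir(mod) if not name.startswith("_"))
        print(mod.__name__, public)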

csrc/quantization.cpp

Lines changed: 0 additions & 19 deletions
This file was deleted.

0 commit comments
