Commit b75f3dc (parent a5ed0b0)
Replace lltm with mymuladd; update to new custom ops APIs
We're replacing lltm with mymuladd(a: Tensor, b: Tensor, c: float), which just computes a*b+c. This simplification lets us focus on the operator registration instead of getting lost in the details of the complicated lltm kernels.

Test Plan:
- tests

ghstack-source-id: dbf9af17fe0d66139320f45e5ed76a6331faefce
Pull Request resolved: #95
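For concreteness, the new op computes elementwise a*b+c. A composite ATen one-liner with the same semantics would look like the sketch below; this is for illustration only, since the point of the commit is to implement it with dedicated hand-written CPU and CUDA kernels instead:

```
#include <ATen/ATen.h>

// Reference semantics of extension_cpp::mymuladd: elementwise a * b + c.
// Illustration only; the commit adds hand-written CPU/CUDA kernels rather
// than composing existing ops like this.
at::Tensor mymuladd_reference(const at::Tensor& a, const at::Tensor& b, double c) {
  return a * b + c;
}
```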

File tree: 8 files changed, +291 −460 lines

README.md (+3 −1)

@@ -2,9 +2,11 @@
 
 An example of writing a C++/CUDA extension for PyTorch. See
 [here](http://pytorch.org/tutorials/advanced/cpp_extension.html) for the accompanying tutorial.
-This repo demonstrates how to write an example `extension_cpp.ops.lltm`
+This repo demonstrates how to write an example `extension_cpp.ops.mymuladd`
 custom op that has both custom CPU and CUDA kernels.
 
+The examples in this repo work with PyTorch 2.4+.
+
 To build:
 ```
 pip install .
extension_cpp/csrc/cuda/lltm_cuda.cu (−183): this file was deleted.

extension_cpp/csrc/cuda/muladd.cu (+85, new file)

#include <torch/extension.h>

#include <cuda.h>
#include <cuda_runtime.h>

namespace extension_cpp {

__global__ void muladd_kernel(int numel, const float* a, const float* b, float c, float* result) {
  int idx = blockIdx.x * blockDim.x + threadIdx.x;
  if (idx < numel) result[idx] = a[idx] * b[idx] + c;
}

at::Tensor mymuladd_cuda(const at::Tensor& a, const at::Tensor& b, double c) {
  TORCH_CHECK(a.sizes() == b.sizes());
  TORCH_CHECK(a.dtype() == at::kFloat);
  TORCH_CHECK(b.dtype() == at::kFloat);
  TORCH_INTERNAL_ASSERT(a.device().type() == at::DeviceType::CUDA);
  TORCH_INTERNAL_ASSERT(b.device().type() == at::DeviceType::CUDA);
  at::Tensor a_contig = a.contiguous();
  at::Tensor b_contig = b.contiguous();
  at::Tensor result = torch::empty(a_contig.sizes(), a_contig.options());
  const float* a_ptr = a_contig.data_ptr<float>();
  const float* b_ptr = b_contig.data_ptr<float>();
  float* result_ptr = result.data_ptr<float>();

  int numel = a_contig.numel();
  // One thread per element; round the grid size up so all elements are covered.
  muladd_kernel<<<(numel+255)/256, 256>>>(numel, a_ptr, b_ptr, c, result_ptr);
  return result;
}

__global__ void mul_kernel(int numel, const float* a, const float* b, float* result) {
  int idx = blockIdx.x * blockDim.x + threadIdx.x;
  if (idx < numel) result[idx] = a[idx] * b[idx];
}

at::Tensor mymul_cuda(const at::Tensor& a, const at::Tensor& b) {
  TORCH_CHECK(a.sizes() == b.sizes());
  TORCH_CHECK(a.dtype() == at::kFloat);
  TORCH_CHECK(b.dtype() == at::kFloat);
  TORCH_INTERNAL_ASSERT(a.device().type() == at::DeviceType::CUDA);
  TORCH_INTERNAL_ASSERT(b.device().type() == at::DeviceType::CUDA);
  at::Tensor a_contig = a.contiguous();
  at::Tensor b_contig = b.contiguous();
  at::Tensor result = torch::empty(a_contig.sizes(), a_contig.options());
  const float* a_ptr = a_contig.data_ptr<float>();
  const float* b_ptr = b_contig.data_ptr<float>();
  float* result_ptr = result.data_ptr<float>();
  int numel = a_contig.numel();
  mul_kernel<<<(numel+255)/256, 256>>>(numel, a_ptr, b_ptr, result_ptr);
  return result;
}

__global__ void add_kernel(int numel, const float* a, const float* b, float* result) {
  int idx = blockIdx.x * blockDim.x + threadIdx.x;
  if (idx < numel) result[idx] = a[idx] + b[idx];
}

void myadd_out_cuda(const at::Tensor& a, const at::Tensor& b, at::Tensor& out) {
  TORCH_CHECK(a.sizes() == b.sizes());
  TORCH_CHECK(b.sizes() == out.sizes());
  TORCH_CHECK(a.dtype() == at::kFloat);
  TORCH_CHECK(b.dtype() == at::kFloat);
  TORCH_CHECK(out.dtype() == at::kFloat);
  TORCH_CHECK(out.is_contiguous());
  TORCH_INTERNAL_ASSERT(a.device().type() == at::DeviceType::CUDA);
  TORCH_INTERNAL_ASSERT(b.device().type() == at::DeviceType::CUDA);
  TORCH_INTERNAL_ASSERT(out.device().type() == at::DeviceType::CUDA);
  at::Tensor a_contig = a.contiguous();
  at::Tensor b_contig = b.contiguous();
  const float* a_ptr = a_contig.data_ptr<float>();
  const float* b_ptr = b_contig.data_ptr<float>();
  float* result_ptr = out.data_ptr<float>();
  int numel = a_contig.numel();
  add_kernel<<<(numel+255)/256, 256>>>(numel, a_ptr, b_ptr, result_ptr);
}

// Registers CUDA implementations for mymuladd, mymul, myadd_out
TORCH_LIBRARY_IMPL(extension_cpp, CUDA, m) {
  m.impl("mymuladd", &mymuladd_cuda);
  m.impl("mymul", &mymul_cuda);
  m.impl("myadd_out", &myadd_out_cuda);
}

}
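These `m.impl` calls register CUDA kernels against operator schemas that must be `m.def`-ed exactly once elsewhere in the extension; in this commit, that happens in a source file not shown in this excerpt. A minimal sketch of what such a schema definition looks like, with the schema strings inferred from the C++ signatures above (so treat them as assumptions, not the commit's exact code):

```
#include <torch/extension.h>

namespace extension_cpp {

// Sketch of the one-time schema definitions that the TORCH_LIBRARY_IMPL
// block above dispatches against. Schema strings are inferred from the
// signatures in muladd.cu; `float` in schema language maps to C++ double.
TORCH_LIBRARY(extension_cpp, m) {
  m.def("mymuladd(Tensor a, Tensor b, float c) -> Tensor");
  m.def("mymul(Tensor a, Tensor b) -> Tensor");
  // `Tensor(a!) out` marks `out` as mutated in place; the op returns nothing.
  m.def("myadd_out(Tensor a, Tensor b, Tensor(a!) out) -> ()");
}

}
```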

extension_cpp/csrc/lltm.cpp (−101): this file was deleted.
