|
#include <torch/extension.h>

#include <ATen/cuda/CUDAContext.h>
#include <c10/cuda/CUDAException.h>

#include <cuda.h>
#include <cuda_runtime.h>

#include <limits>
| 6 | +namespace extension_cpp { |
| 7 | + |
// Fused multiply-add kernel: result[i] = a[i] * b[i] + c.
// Written as a grid-stride loop, so it is correct for any launch
// configuration (including a single block when debugging).
__global__ void muladd_kernel(int numel, const float* a, const float* b, float c, float* result) {
  int stride = gridDim.x * blockDim.x;
  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < numel; i += stride) {
    result[i] = a[i] * b[i] + c;
  }
}
| 12 | + |
// CUDA backend for mymuladd: elementwise result = a * b + c.
//
// Checked preconditions: a and b have identical sizes, both are float32,
// and both live on a CUDA device. Returns a fresh contiguous tensor with
// a's sizes and options.
at::Tensor mymuladd_cuda(const at::Tensor& a, const at::Tensor& b, double c) {
  TORCH_CHECK(a.sizes() == b.sizes());
  TORCH_CHECK(a.dtype() == at::kFloat);
  TORCH_CHECK(b.dtype() == at::kFloat);
  TORCH_INTERNAL_ASSERT(a.device().type() == at::DeviceType::CUDA);
  TORCH_INTERNAL_ASSERT(b.device().type() == at::DeviceType::CUDA);
  at::Tensor a_contig = a.contiguous();
  at::Tensor b_contig = b.contiguous();
  at::Tensor result = torch::empty(a_contig.sizes(), a_contig.options());
  // The kernel indexes with a 32-bit int; reject tensors that would overflow it.
  TORCH_CHECK(a_contig.numel() <= std::numeric_limits<int>::max(),
              "mymuladd_cuda: tensor too large for int32 indexing");
  int numel = static_cast<int>(a_contig.numel());
  // A grid of 0 blocks is a CUDA launch error; an empty tensor needs no work.
  if (numel == 0) return result;
  const float* a_ptr = a_contig.data_ptr<float>();
  const float* b_ptr = b_contig.data_ptr<float>();
  float* result_ptr = result.data_ptr<float>();
  // Launch on PyTorch's current stream so the kernel is ordered with the
  // caller's other queued work; the bare <<<g,b>>> form would use the
  // legacy default stream instead.
  cudaStream_t stream = at::cuda::getCurrentCUDAStream();
  muladd_kernel<<<(numel + 255) / 256, 256, 0, stream>>>(
      numel, a_ptr, b_ptr, static_cast<float>(c), result_ptr);
  C10_CUDA_KERNEL_LAUNCH_CHECK();  // surface launch-configuration errors
  return result;
}
| 30 | + |
// Elementwise product kernel: result[i] = a[i] * b[i].
// Grid-stride loop: valid for any grid/block configuration.
__global__ void mul_kernel(int numel, const float* a, const float* b, float* result) {
  int stride = gridDim.x * blockDim.x;
  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < numel; i += stride) {
    result[i] = a[i] * b[i];
  }
}
| 35 | + |
// CUDA backend for mymul: elementwise result = a * b.
//
// Checked preconditions: a and b have identical sizes, both are float32,
// and both live on a CUDA device. Returns a fresh contiguous tensor with
// a's sizes and options.
at::Tensor mymul_cuda(const at::Tensor& a, const at::Tensor& b) {
  TORCH_CHECK(a.sizes() == b.sizes());
  TORCH_CHECK(a.dtype() == at::kFloat);
  TORCH_CHECK(b.dtype() == at::kFloat);
  TORCH_INTERNAL_ASSERT(a.device().type() == at::DeviceType::CUDA);
  TORCH_INTERNAL_ASSERT(b.device().type() == at::DeviceType::CUDA);
  at::Tensor a_contig = a.contiguous();
  at::Tensor b_contig = b.contiguous();
  at::Tensor result = torch::empty(a_contig.sizes(), a_contig.options());
  // The kernel indexes with a 32-bit int; reject tensors that would overflow it.
  TORCH_CHECK(a_contig.numel() <= std::numeric_limits<int>::max(),
              "mymul_cuda: tensor too large for int32 indexing");
  int numel = static_cast<int>(a_contig.numel());
  // A grid of 0 blocks is a CUDA launch error; an empty tensor needs no work.
  if (numel == 0) return result;
  const float* a_ptr = a_contig.data_ptr<float>();
  const float* b_ptr = b_contig.data_ptr<float>();
  float* result_ptr = result.data_ptr<float>();
  // Launch on PyTorch's current stream (see mymuladd_cuda).
  cudaStream_t stream = at::cuda::getCurrentCUDAStream();
  mul_kernel<<<(numel + 255) / 256, 256, 0, stream>>>(numel, a_ptr, b_ptr, result_ptr);
  C10_CUDA_KERNEL_LAUNCH_CHECK();  // surface launch-configuration errors
  return result;
}
| 52 | + |
// Elementwise addition kernel: result[i] = a[i] + b[i].
// One thread per element; callers must launch at least `numel` threads.
__global__ void add_kernel(int numel, const float* a, const float* b, float* result) {
  int idx = blockIdx.x * blockDim.x + threadIdx.x;
  // Bug fix: the body previously computed a[idx] * b[idx] (copy-paste from
  // mul_kernel), so myadd_out returned the product instead of the sum.
  if (idx < numel) result[idx] = a[idx] + b[idx];
}
| 57 | + |
// CUDA backend for myadd_out: writes a + b elementwise into `out`.
//
// Checked preconditions: a, b, out share the same sizes, all are float32,
// `out` is contiguous, and all three live on a CUDA device. Mutates `out`
// in place; returns nothing.
void myadd_out_cuda(const at::Tensor& a, const at::Tensor& b, at::Tensor& out) {
  TORCH_CHECK(a.sizes() == b.sizes());
  TORCH_CHECK(b.sizes() == out.sizes());
  TORCH_CHECK(a.dtype() == at::kFloat);
  TORCH_CHECK(b.dtype() == at::kFloat);
  TORCH_CHECK(out.dtype() == at::kFloat);
  TORCH_CHECK(out.is_contiguous());
  TORCH_INTERNAL_ASSERT(a.device().type() == at::DeviceType::CUDA);
  TORCH_INTERNAL_ASSERT(b.device().type() == at::DeviceType::CUDA);
  TORCH_INTERNAL_ASSERT(out.device().type() == at::DeviceType::CUDA);
  at::Tensor a_contig = a.contiguous();
  at::Tensor b_contig = b.contiguous();
  // The kernel indexes with a 32-bit int; reject tensors that would overflow it.
  TORCH_CHECK(a_contig.numel() <= std::numeric_limits<int>::max(),
              "myadd_out_cuda: tensor too large for int32 indexing");
  int numel = static_cast<int>(a_contig.numel());
  // A grid of 0 blocks is a CUDA launch error; an empty tensor needs no work.
  if (numel == 0) return;
  const float* a_ptr = a_contig.data_ptr<float>();
  const float* b_ptr = b_contig.data_ptr<float>();
  float* result_ptr = out.data_ptr<float>();
  // Launch on PyTorch's current stream (see mymuladd_cuda).
  cudaStream_t stream = at::cuda::getCurrentCUDAStream();
  add_kernel<<<(numel + 255) / 256, 256, 0, stream>>>(numel, a_ptr, b_ptr, result_ptr);
  C10_CUDA_KERNEL_LAUNCH_CHECK();  // surface launch-configuration errors
}
| 76 | + |
| 77 | + |
// Registers the CUDA implementations of mymuladd, mymul, and myadd_out for
// the `extension_cpp` dispatch library. The operator schemas themselves are
// declared elsewhere (presumably the CPU/registration translation unit —
// TODO confirm); this only binds the CUDA dispatch key to these functions.
TORCH_LIBRARY_IMPL(extension_cpp, CUDA, m) {
  m.impl("mymuladd", &mymuladd_cuda);
  m.impl("mymul", &mymul_cuda);
  m.impl("myadd_out", &myadd_out_cuda);
}
| 84 | + |
| 85 | +} |