# Adding new CMake/scikit-build-core based workflow #85
## New file: `.gitignore`

```gitignore
__pycache__/
.cache
*.i
*.ii
*.gpu
*.ptx
*.cubin
*.fatbin
build/**
driss_torch/lib/**
compile_commands.json
benchmarks/data
```
> Review comment: fix
## New file: `.pre-commit-config.yaml`

```yaml
# See https://pre-commit.com for more information
# See https://pre-commit.com/hooks.html for more hooks
repos:
  - repo: https://github.com/pre-commit/pre-commit-hooks
    rev: v3.2.0
    hooks:
      - id: trailing-whitespace
      - id: end-of-file-fixer
      - id: check-yaml
      - id: check-added-large-files

  - repo: https://github.com/omnilib/ufmt
    rev: v2.1.0
    hooks:
      - id: ufmt
        additional_dependencies:
          - black == 23.3.0
          - usort == 1.0.6
          - ufmt == 2.1.0
          - libcst == 1.0.1

  # missing host field?

  # - repo: https://github.com/pre-commit/mirrors-clang-format
  #   rev: v17.0.5
  #   hooks:
  #     - id: clang-format
```

> Review comment (on the `# missing host field?` line): fix
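For completeness, these hooks can be exercised locally before committing; this is standard pre-commit usage, not part of the diff:

```bash
pip install pre-commit        # if not already in the environment
pre-commit install            # register the git hook in .git/hooks
pre-commit run --all-files    # run every configured hook once over the repo
```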
## New file: `CMakeLists.txt`

```cmake
cmake_minimum_required(VERSION 3.26 FATAL_ERROR)

project(
  ${SKBUILD_PROJECT_NAME}
  VERSION ${SKBUILD_PROJECT_VERSION}
  LANGUAGES CXX CUDA)

# Set the C++ standard for all targets
set(CMAKE_CXX_STANDARD 20) # This might be unsafe since PyTorch uses C++17
set(CMAKE_CXX_STANDARD_REQUIRED ON)

# Enable better clangd support
set(CMAKE_EXPORT_COMPILE_COMMANDS ON)

# find_package(Python) defines Python_EXECUTABLE (not Python3_EXECUTABLE)
find_package(Python REQUIRED COMPONENTS Interpreter Development)
execute_process(
  COMMAND "${Python_EXECUTABLE}" "-c" "import torch;print(torch.utils.cmake_prefix_path)"
  OUTPUT_VARIABLE PT_CMAKE_PREFIX
  COMMAND_ECHO STDOUT
  OUTPUT_STRIP_TRAILING_WHITESPACE
  COMMAND_ERROR_IS_FATAL ANY
)

# Cache CUDA_ARCHITECTURES, which seems to be reset by Torch
set(TMP_STORE_CUDA_ARCHITECTURES "${CMAKE_CUDA_ARCHITECTURES}")

set(CMAKE_PREFIX_PATH ${CMAKE_PREFIX_PATH};${PT_CMAKE_PREFIX})

find_package(Torch REQUIRED CONFIG)

# simple_cuda source files
file(GLOB_RECURSE CU_SOURCES csrc/*.cu)
file(GLOB_RECURSE CPP_SOURCES csrc/*.cpp)
MESSAGE(STATUS "CU_SOURCES: ${CU_SOURCES}")
MESSAGE(STATUS "CPP_SOURCES: ${CPP_SOURCES}")

add_library(${SKBUILD_PROJECT_NAME} SHARED
  ${CU_SOURCES}
  ${CPP_SOURCES}
)

# Set the library output directory; I think this makes Ninja builds work
set_target_properties(${SKBUILD_PROJECT_NAME} PROPERTIES
  LIBRARY_OUTPUT_DIRECTORY "${CMAKE_CURRENT_SOURCE_DIR}/${SKBUILD_PROJECT_NAME}/lib"
)
# Add include directories to the library
target_include_directories(${SKBUILD_PROJECT_NAME} PUBLIC src/include)

# Link the library to the Torch library
target_link_libraries(${SKBUILD_PROJECT_NAME} PRIVATE ${TORCH_LIBRARIES} Python::Python)

# Install the library to the wheel distribution
install(TARGETS ${SKBUILD_PROJECT_NAME}
  LIBRARY DESTINATION ${SKBUILD_PROJECT_NAME}/lib
)
```

Review comments:

- On `set(TMP_STORE_CUDA_ARCHITECTURES ...)`: I don't think I need this; however, PyTorch does deviate from regular CUDA CMake builds.
- On `file(GLOB_RECURSE CU_SOURCES csrc/*.cu)`: needs some conditionals on building the CUDA sources.
- On `MESSAGE(STATUS "CU_SOURCES: ...")`: change the message level to DEBUG.
- On `${CU_SOURCES}` in `add_library`: same here for the conditional.
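The `${SKBUILD_PROJECT_NAME}` and `${SKBUILD_PROJECT_VERSION}` variables above are injected by scikit-build-core from the Python package metadata. The PR page does not show the accompanying `pyproject.toml`, so the following is only a minimal sketch of what such a file typically looks like; the project name is inferred from the `driss_torch/lib/**` entry in the `.gitignore` above, and the version is an illustrative assumption:

```toml
[build-system]
# torch is deliberately not listed in requires: the README below installs
# with --no-build-isolation, so the build reuses the environment's torch.
requires = ["scikit-build-core"]
build-backend = "scikit_build_core.build"

[project]
name = "driss_torch"    # assumed; surfaces as ${SKBUILD_PROJECT_NAME} in CMake
version = "0.1.0"       # assumed; surfaces as ${SKBUILD_PROJECT_VERSION}
```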
## New file: `README.md`

````markdown
# Extension Template

This is a template for creating a new extension.
It contains the basic structure and files needed to create a new extension for PyTorch, written in C++ or CUDA.

### Build and install the extension

```bash
pip install -v --no-build-isolation -e .
```
````
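Once built, the shared library lands in `driss_torch/lib/` (see the CMake `LIBRARY_OUTPUT_DIRECTORY` and the `.gitignore` above), and the custom ops registered later in this diff become callable through the dispatcher. A hypothetical usage sketch; the exact library filename is an assumption (on Linux, `add_library` typically produces `lib<project>.so`):

```python
import torch

# Assumed artifact name; adjust to whatever the build actually produces.
torch.ops.load_library("driss_torch/lib/libdriss_torch.so")

batch, features, state = 4, 10, 8
x = torch.randn(batch, features)
weights = torch.randn(3 * state, features + state)  # three gates, stacked row-wise
bias = torch.randn(3 * state)
old_h = torch.randn(batch, state)
old_cell = torch.randn(batch, state)

# TORCH_LIBRARY(extension_cpp, ...) below exposes the op under this name.
new_h, new_cell, *saved = torch.ops.extension_cpp.lltm_forward(
    x, weights, bias, old_h, old_cell)
print(new_h.shape, new_cell.shape)  # torch.Size([4, 8]) torch.Size([4, 8])
```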
## New file: C++ (CPU) source under `csrc/`

```cpp
#include <torch/extension.h>

#include <vector>

// s'(z) = (1 - s(z)) * s(z)
torch::Tensor d_sigmoid(torch::Tensor z) {
  auto s = torch::sigmoid(z);
  return (1 - s) * s;
}

// tanh'(z) = 1 - tanh^2(z)
torch::Tensor d_tanh(torch::Tensor z) {
  return 1 - z.tanh().pow(2);
}

// elu'(z) = relu'(z) + { alpha * exp(z) if (alpha * (exp(z) - 1)) < 0, else 0}
torch::Tensor d_elu(torch::Tensor z, torch::Scalar alpha = 1.0) {
  auto e = z.exp();
  auto mask = (alpha * (e - 1)) < 0;
  return (z > 0).type_as(z) + mask.type_as(z) * (alpha * e);
}

std::tuple<torch::Tensor, torch::Tensor, torch::Tensor, torch::Tensor,
           torch::Tensor, torch::Tensor, torch::Tensor>
lltm_forward(
    torch::Tensor input,
    torch::Tensor weights,
    torch::Tensor bias,
    torch::Tensor old_h,
    torch::Tensor old_cell) {
  auto X = torch::cat({old_h, input}, /*dim=*/1);

  auto gate_weights = torch::addmm(bias, X, weights.transpose(0, 1));
  auto gates = gate_weights.chunk(3, /*dim=*/1);

  auto input_gate = torch::sigmoid(gates[0]);
  auto output_gate = torch::sigmoid(gates[1]);
  auto candidate_cell = torch::elu(gates[2], /*alpha=*/1.0);

  auto new_cell = old_cell + candidate_cell * input_gate;
  auto new_h = torch::tanh(new_cell) * output_gate;

  return {new_h,
          new_cell,
          input_gate,
          output_gate,
          candidate_cell,
          X,
          gate_weights};
}

std::tuple<torch::Tensor, torch::Tensor, torch::Tensor, torch::Tensor,
           torch::Tensor>
lltm_backward(
    torch::Tensor grad_h,
    torch::Tensor grad_cell,
    torch::Tensor new_cell,
    torch::Tensor input_gate,
    torch::Tensor output_gate,
    torch::Tensor candidate_cell,
    torch::Tensor X,
    torch::Tensor gate_weights,
    torch::Tensor weights) {
  auto d_output_gate = torch::tanh(new_cell) * grad_h;
  auto d_tanh_new_cell = output_gate * grad_h;
  auto d_new_cell = d_tanh(new_cell) * d_tanh_new_cell + grad_cell;

  auto d_old_cell = d_new_cell;
  auto d_candidate_cell = input_gate * d_new_cell;
  auto d_input_gate = candidate_cell * d_new_cell;

  auto gates = gate_weights.chunk(3, /*dim=*/1);
  d_input_gate *= d_sigmoid(gates[0]);
  d_output_gate *= d_sigmoid(gates[1]);
  d_candidate_cell *= d_elu(gates[2]);

  auto d_gates =
      torch::cat({d_input_gate, d_output_gate, d_candidate_cell}, /*dim=*/1);

  auto d_weights = d_gates.t().mm(X);
  auto d_bias = d_gates.sum(/*dim=*/0, /*keepdim=*/true);

  auto d_X = d_gates.mm(weights);
  const auto state_size = grad_h.size(1);
  auto d_old_h = d_X.slice(/*dim=*/1, 0, state_size);
  auto d_input = d_X.slice(/*dim=*/1, state_size);

  return {d_old_h, d_input, d_weights, d_bias, d_old_cell};
}

// Registers _C as an extension module.
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {}

// Defines the operators
TORCH_LIBRARY(extension_cpp, m) {
  m.def("lltm_forward(Tensor input, Tensor weights, Tensor bias, Tensor old_h, Tensor old_cell) -> (Tensor, Tensor, Tensor, Tensor, Tensor, Tensor, Tensor)");
  m.def("lltm_backward(Tensor grad_h, Tensor grad_cell, Tensor new_cell, Tensor input_gate, Tensor output_gate, Tensor candidate_cell, Tensor X, Tensor gate_weights, Tensor weights) -> (Tensor, Tensor, Tensor, Tensor, Tensor)");
}

// Registers CPU implementations for lltm_forward, lltm_backward
TORCH_LIBRARY_IMPL(extension_cpp, CPU, m) {
  m.impl("lltm_forward", &lltm_forward);
  m.impl("lltm_backward", &lltm_backward);
}
```
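The forward op returns every intermediate the backward op needs, so on the Python side the two are usually tied together with an `autograd.Function`, as in the upstream extension-cpp tutorial. The wrapper below is a sketch of that pattern, not part of this diff; it assumes the ops above have been loaded:

```python
import torch

class LLTMFunction(torch.autograd.Function):
    @staticmethod
    def forward(ctx, input, weights, bias, old_h, old_cell):
        outputs = torch.ops.extension_cpp.lltm_forward(
            input, weights, bias, old_h, old_cell)
        new_h, new_cell = outputs[:2]
        # Save exactly what lltm_backward expects, in schema order:
        # new_cell, input_gate, output_gate, candidate_cell, X, gate_weights, weights
        ctx.save_for_backward(*outputs[1:], weights)
        return new_h, new_cell

    @staticmethod
    def backward(ctx, grad_h, grad_cell):
        d_old_h, d_input, d_weights, d_bias, d_old_cell = (
            torch.ops.extension_cpp.lltm_backward(
                grad_h.contiguous(), grad_cell.contiguous(),
                *ctx.saved_tensors))
        # Gradients must line up with forward's argument order.
        return d_input, d_weights, d_bias, d_old_h, d_old_cell
```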
## New file: CUDA source under `csrc/`

```cuda
#include <torch/extension.h>

#include <cuda.h>
#include <cuda_runtime.h>

#include <vector>

namespace {
template <typename scalar_t>
__device__ __forceinline__ scalar_t sigmoid(scalar_t z) {
  return 1.0 / (1.0 + exp(-z));
}

template <typename scalar_t>
__device__ __forceinline__ scalar_t d_sigmoid(scalar_t z) {
  const auto s = sigmoid(z);
  return (1.0 - s) * s;
}

template <typename scalar_t>
__device__ __forceinline__ scalar_t d_tanh(scalar_t z) {
  const auto t = tanh(z);
  return 1 - (t * t);
}

template <typename scalar_t>
__device__ __forceinline__ scalar_t elu(scalar_t z, scalar_t alpha = 1.0) {
  return fmaxf(0.0, z) + fminf(0.0, alpha * (exp(z) - 1.0));
}

template <typename scalar_t>
__device__ __forceinline__ scalar_t d_elu(scalar_t z, scalar_t alpha = 1.0) {
  const auto e = exp(z);
  const auto d_relu = z < 0.0 ? 0.0 : 1.0;
  return d_relu + (((alpha * (e - 1.0)) < 0.0) ? (alpha * e) : 0.0);
}

template <typename scalar_t>
__global__ void lltm_cuda_forward_kernel(
    const torch::PackedTensorAccessor<scalar_t,3,torch::RestrictPtrTraits,size_t> gates,
    const torch::PackedTensorAccessor<scalar_t,2,torch::RestrictPtrTraits,size_t> old_cell,
    torch::PackedTensorAccessor<scalar_t,2,torch::RestrictPtrTraits,size_t> new_h,
    torch::PackedTensorAccessor<scalar_t,2,torch::RestrictPtrTraits,size_t> new_cell,
    torch::PackedTensorAccessor<scalar_t,2,torch::RestrictPtrTraits,size_t> input_gate,
    torch::PackedTensorAccessor<scalar_t,2,torch::RestrictPtrTraits,size_t> output_gate,
    torch::PackedTensorAccessor<scalar_t,2,torch::RestrictPtrTraits,size_t> candidate_cell) {
  // batch index
  const int n = blockIdx.y;
  // column index
  const int c = blockIdx.x * blockDim.x + threadIdx.x;
  if (c < gates.size(2)) {
    input_gate[n][c] = sigmoid(gates[n][0][c]);
    output_gate[n][c] = sigmoid(gates[n][1][c]);
    candidate_cell[n][c] = elu(gates[n][2][c]);
    new_cell[n][c] =
        old_cell[n][c] + candidate_cell[n][c] * input_gate[n][c];
    new_h[n][c] = tanh(new_cell[n][c]) * output_gate[n][c];
  }
}

template <typename scalar_t>
__global__ void lltm_cuda_backward_kernel(
    torch::PackedTensorAccessor<scalar_t,2,torch::RestrictPtrTraits,size_t> d_old_cell,
    torch::PackedTensorAccessor<scalar_t,3,torch::RestrictPtrTraits,size_t> d_gates,
    const torch::PackedTensorAccessor<scalar_t,2,torch::RestrictPtrTraits,size_t> grad_h,
    const torch::PackedTensorAccessor<scalar_t,2,torch::RestrictPtrTraits,size_t> grad_cell,
    const torch::PackedTensorAccessor<scalar_t,2,torch::RestrictPtrTraits,size_t> new_cell,
    const torch::PackedTensorAccessor<scalar_t,2,torch::RestrictPtrTraits,size_t> input_gate,
    const torch::PackedTensorAccessor<scalar_t,2,torch::RestrictPtrTraits,size_t> output_gate,
    const torch::PackedTensorAccessor<scalar_t,2,torch::RestrictPtrTraits,size_t> candidate_cell,
    const torch::PackedTensorAccessor<scalar_t,3,torch::RestrictPtrTraits,size_t> gate_weights) {
  // batch index
  const int n = blockIdx.y;
  // column index
  const int c = blockIdx.x * blockDim.x + threadIdx.x;
  if (c < d_gates.size(2)) {
    const auto d_output_gate = tanh(new_cell[n][c]) * grad_h[n][c];
    const auto d_tanh_new_cell = output_gate[n][c] * grad_h[n][c];
    const auto d_new_cell =
        d_tanh(new_cell[n][c]) * d_tanh_new_cell + grad_cell[n][c];

    d_old_cell[n][c] = d_new_cell;
    const auto d_candidate_cell = input_gate[n][c] * d_new_cell;
    const auto d_input_gate = candidate_cell[n][c] * d_new_cell;

    d_gates[n][0][c] = d_input_gate * d_sigmoid(gate_weights[n][0][c]);
    d_gates[n][1][c] = d_output_gate * d_sigmoid(gate_weights[n][1][c]);
    d_gates[n][2][c] = d_candidate_cell * d_elu(gate_weights[n][2][c]);
  }
}
} // namespace

std::tuple<torch::Tensor, torch::Tensor, torch::Tensor, torch::Tensor,
           torch::Tensor, torch::Tensor, torch::Tensor>
lltm_cuda_forward(
    torch::Tensor input,
    torch::Tensor weights,
    torch::Tensor bias,
    torch::Tensor old_h,
    torch::Tensor old_cell) {
  auto X = torch::cat({old_h, input}, /*dim=*/1);
  auto gate_weights = torch::addmm(bias, X, weights.transpose(0, 1));

  const auto batch_size = old_cell.size(0);
  const auto state_size = old_cell.size(1);

  auto gates = gate_weights.reshape({batch_size, 3, state_size});
  auto new_h = torch::zeros_like(old_cell);
  auto new_cell = torch::zeros_like(old_cell);
  auto input_gate = torch::zeros_like(old_cell);
  auto output_gate = torch::zeros_like(old_cell);
  auto candidate_cell = torch::zeros_like(old_cell);

  // One thread per (batch, state) element; blocks tile the state dimension.
  const int threads = 1024;
  const dim3 blocks((state_size + threads - 1) / threads, batch_size);

  AT_DISPATCH_FLOATING_TYPES(gates.scalar_type(), "lltm_forward_cuda", ([&] {
    lltm_cuda_forward_kernel<scalar_t><<<blocks, threads>>>(
        gates.packed_accessor<scalar_t,3,torch::RestrictPtrTraits,size_t>(),
        old_cell.packed_accessor<scalar_t,2,torch::RestrictPtrTraits,size_t>(),
        new_h.packed_accessor<scalar_t,2,torch::RestrictPtrTraits,size_t>(),
        new_cell.packed_accessor<scalar_t,2,torch::RestrictPtrTraits,size_t>(),
        input_gate.packed_accessor<scalar_t,2,torch::RestrictPtrTraits,size_t>(),
        output_gate.packed_accessor<scalar_t,2,torch::RestrictPtrTraits,size_t>(),
        candidate_cell.packed_accessor<scalar_t,2,torch::RestrictPtrTraits,size_t>());
  }));

  return {new_h, new_cell, input_gate, output_gate, candidate_cell, X, gates};
}

std::tuple<torch::Tensor, torch::Tensor, torch::Tensor, torch::Tensor,
           torch::Tensor>
lltm_cuda_backward(
    torch::Tensor grad_h,
    torch::Tensor grad_cell,
    torch::Tensor new_cell,
    torch::Tensor input_gate,
    torch::Tensor output_gate,
    torch::Tensor candidate_cell,
    torch::Tensor X,
    torch::Tensor gates,
    torch::Tensor weights) {
  auto d_old_cell = torch::zeros_like(new_cell);
  auto d_gates = torch::zeros_like(gates);

  auto grad_h_contig = grad_h.contiguous();
  auto grad_cell_contig = grad_cell.contiguous();

  const auto batch_size = new_cell.size(0);
  const auto state_size = new_cell.size(1);

  const int threads = 1024;
  const dim3 blocks((state_size + threads - 1) / threads, batch_size);

  AT_DISPATCH_FLOATING_TYPES(X.scalar_type(), "lltm_backward_cuda", ([&] {
    lltm_cuda_backward_kernel<scalar_t><<<blocks, threads>>>(
        d_old_cell.packed_accessor<scalar_t,2,torch::RestrictPtrTraits,size_t>(),
        d_gates.packed_accessor<scalar_t,3,torch::RestrictPtrTraits,size_t>(),
        grad_h_contig.packed_accessor<scalar_t,2,torch::RestrictPtrTraits,size_t>(),
        grad_cell_contig.packed_accessor<scalar_t,2,torch::RestrictPtrTraits,size_t>(),
        new_cell.packed_accessor<scalar_t,2,torch::RestrictPtrTraits,size_t>(),
        input_gate.packed_accessor<scalar_t,2,torch::RestrictPtrTraits,size_t>(),
        output_gate.packed_accessor<scalar_t,2,torch::RestrictPtrTraits,size_t>(),
        candidate_cell.packed_accessor<scalar_t,2,torch::RestrictPtrTraits,size_t>(),
        gates.packed_accessor<scalar_t,3,torch::RestrictPtrTraits,size_t>());
  }));

  auto d_gate_weights = d_gates.flatten(1, 2);
  auto d_weights = d_gate_weights.t().mm(X);
  auto d_bias = d_gate_weights.sum(/*dim=*/0, /*keepdim=*/true);

  auto d_X = d_gate_weights.mm(weights);
  auto d_old_h = d_X.slice(/*dim=*/1, 0, state_size);
  auto d_input = d_X.slice(/*dim=*/1, state_size);

  return {d_old_h, d_input, d_weights, d_bias, d_old_cell};
}

// Registers CUDA implementations for lltm_forward, lltm_backward
TORCH_LIBRARY_IMPL(extension_cpp, CUDA, m) {
  m.impl("lltm_forward", &lltm_cuda_forward);
  m.impl("lltm_backward", &lltm_cuda_backward);
}
```
> Review comment: fix
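With both a CPU and a CUDA kernel registered under the same schema, the dispatcher makes a quick parity check possible. Below is a hypothetical smoke test, not part of the diff; it assumes the shared library has been loaded and a GPU is available. It compares only `new_h`/`new_cell`, since the CPU path returns `gate_weights` with shape `(batch, 3 * state)` while the CUDA path returns it reshaped to `(batch, 3, state)`:

```python
import torch

def inputs(device):
    torch.manual_seed(0)  # generate on CPU so both devices see identical data
    batch, features, state = 4, 10, 8
    x = torch.randn(batch, features)
    w = torch.randn(3 * state, features + state)
    b = torch.randn(3 * state)
    h = torch.randn(batch, state)
    c = torch.randn(batch, state)
    return tuple(t.to(device) for t in (x, w, b, h, c))

cpu_h, cpu_c, *_ = torch.ops.extension_cpp.lltm_forward(*inputs("cpu"))
gpu_h, gpu_c, *_ = torch.ops.extension_cpp.lltm_forward(*inputs("cuda"))

# The CPU and CUDA implementations should agree on the primary outputs.
torch.testing.assert_close(cpu_h, gpu_h.cpu(), rtol=1e-4, atol=1e-4)
torch.testing.assert_close(cpu_c, gpu_c.cpu(), rtol=1e-4, atol=1e-4)
```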