-
Notifications
You must be signed in to change notification settings - Fork 698
Make native code portable and add GitHub workflow for building #949
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 31 commits
b263fc6
de10f7e
0c62fa6
aae5ff7
b06590d
03744cb
648e2f5
cba2b1a
6f70a5e
90fa8b1
c815ca0
36b1ef2
44e3f17
6fe8d0c
572225e
7a8676e
771f6db
8dd8d63
8b1ceb7
e11867b
57625db
ada2e9a
e0093e9
23bdf05
87414c3
0ee8f7f
1f35064
816eee0
0528324
8152e21
9aad25a
bcc6780
2951e2c
9867392
b773dfb
45ad394
e2e4874
59a1000
41ddd25
b5b6151
ca5f14a
cd8343e
2456cf3
b460125
7a605e1
28188a5
86b2bd6
01c3f59
e4344b0
e2e8f87
2ba8be3
7ed6a01
eba19a8
3288a0f
fdddb11
b7503c9
fb642a5
2730dd9
2e3a1d8
590a27a
927f716
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,11 @@ | ||
version: 2 | ||
updates: | ||
- package-ecosystem: pip | ||
directory: "/" | ||
schedule: | ||
interval: "weekly" | ||
groups: | ||
major: | ||
update-types: [major] | ||
minor-patch: | ||
update-types: [minor, patch] |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,213 @@ | ||
name: Python package | ||
|
||
on: | ||
push: {} | ||
pull_request: | ||
branches: [ main ] | ||
release: | ||
types: [ published ] | ||
|
||
jobs: | ||
|
||
## | ||
# This job matrix builds the non-CUDA versions of the libraries for all supported platforms. | ||
## | ||
build-shared-libs: | ||
strategy: | ||
matrix: | ||
os: [ubuntu-latest, macos-latest, windows-latest] | ||
arch: [x86_64, aarch64] | ||
exclude: | ||
- os: windows-latest # This probably requres arm64 Windows agents | ||
arch: aarch64 | ||
runs-on: ${{ matrix.os }} # One day, we could run them on native agents. Azure supports this now but it's planned only for Q3 2023 for hosted agents | ||
steps: | ||
# Check out code | ||
- uses: actions/checkout@v4 | ||
# On Linux we use CMake within Docker | ||
- name: Setup cmake | ||
uses: jwlawson/[email protected] | ||
with: | ||
cmake-version: '3.26.x' | ||
- name: Add msbuild to PATH | ||
uses: microsoft/[email protected] | ||
if: ${{ startsWith(matrix.os, 'windows') }} | ||
# Check out dependencies code | ||
- uses: actions/checkout@v4 | ||
name: Check out NVidia cub | ||
with: | ||
repository: nvidia/cub | ||
ref: 1.11.0 | ||
path: dependencies/cub | ||
# Compile C++ code | ||
- name: Build C++ | ||
shell: bash | ||
run: | | ||
set -ex | ||
build_os=${{ matrix.os }} | ||
build_arch=${{ matrix.arch }} | ||
if [ ${build_os:0:6} == ubuntu -a ${build_arch} == aarch64 ]; then | ||
# Allow cross-compile om aarch64 | ||
sudo apt-get install -y gcc-aarch64-linux-gnu binutils-aarch64-linux-gnu | ||
fi | ||
if [ ${build_os:0:5} == macos -a ${build_arch} == aarch64 ]; then | ||
cmake -DCMAKE_OSX_ARCHITECTURES=arm64 -DENABLE_CUDA=OFF . | ||
else | ||
cmake -DENABLE_CUDA=OFF . | ||
fi | ||
if [ ${build_os:0:7} == windows ]; then | ||
pwsh -Command "msbuild bitsandbytes.vcxproj /property:Configuration=Release" | ||
else | ||
make | ||
fi | ||
mkdir -p output/${{ matrix.os }}/${{ matrix.arch }} | ||
( shopt -s nullglob && cp bitsandbytes/*.{so,dylib,dll} output/${{ matrix.os }}/${{ matrix.arch }}/ ) | ||
- name: Upload build artifact | ||
uses: actions/upload-artifact@v4 | ||
with: | ||
name: shared_library_${{ matrix.os }}_${{ matrix.arch }} | ||
path: output/* | ||
retention-days: 7 | ||
## | ||
# This job matrix builds the CUDA versions of the libraries for platforms that support CUDA (Linux x64/aarch64 + Windows x64) | ||
## | ||
build-shared-libs-cuda: | ||
strategy: | ||
matrix: | ||
os: [ubuntu-latest, windows-latest] | ||
arch: [x86_64, aarch64] | ||
cuda_version: ['12.1.0'] | ||
exclude: | ||
- os: windows-latest # This probably requres arm64 Windows agents | ||
arch: aarch64 | ||
runs-on: ${{ matrix.os }} # One day, we could run them on native agents. Azure supports this now but it's planned only for Q3 2023 for hosted agents | ||
steps: | ||
# Check out code | ||
- uses: actions/checkout@v4 | ||
# Linux: We use Docker to build cross platform Cuda (aarch64 is built in emulation) | ||
- name: Set up Docker multiarch | ||
if: startsWith(matrix.os, 'ubuntu') | ||
uses: docker/setup-qemu-action@v2 | ||
# On Linux we use CMake within Docker | ||
- name: Setup cmake | ||
if: ${{ !startsWith(matrix.os, 'linux') }} | ||
uses: jwlawson/[email protected] | ||
with: | ||
cmake-version: '3.26.x' | ||
# Windows: We install Cuda on the agent (slow) | ||
- uses: Jimver/[email protected] | ||
if: startsWith(matrix.os, 'windows') | ||
id: cuda-toolkit | ||
with: | ||
cuda: ${{ matrix.cuda_version }} | ||
method: 'local' | ||
# sub-packages: '["nvcc","cudart","nvrtc_dev","cublas_dev","cusparse_dev","visual_studio_integration"]' | ||
- name: Add msbuild to PATH | ||
uses: microsoft/[email protected] | ||
if: ${{ startsWith(matrix.os, 'windows') }} | ||
# Check out dependencies code | ||
- uses: actions/checkout@v4 | ||
name: Check out NVidia cub | ||
with: | ||
repository: nvidia/cub | ||
ref: 1.11.0 | ||
path: dependencies/cub | ||
# Compile C++ code | ||
- name: Build C++ | ||
shell: bash | ||
run: | | ||
set -ex | ||
build_os=${{ matrix.os }} | ||
build_arch=${{ matrix.arch }} | ||
if [ ${build_os:0:6} == ubuntu ]; then | ||
image=nvidia/cuda:${{ matrix.cuda_version }}-devel-ubuntu22.04 | ||
echo "Using image $image" | ||
docker run --platform linux/$build_arch -i -w /src -v $PWD:/src $image sh -c \ | ||
"apt-get update \ | ||
&& DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends cmake \ | ||
&& cmake -DENABLE_CUDA=ON . \ | ||
&& make" | ||
else | ||
cmake -DENABLE_CUDA=ON . | ||
pwsh -Command "msbuild bitsandbytes.vcxproj /property:Configuration=Release" | ||
fi | ||
mkdir -p output/${{ matrix.os }}/${{ matrix.arch }} | ||
( shopt -s nullglob && cp bitsandbytes/*.{so,dylib,dll} output/${{ matrix.os }}/${{ matrix.arch }}/ ) | ||
- name: Upload build artifact | ||
uses: actions/upload-artifact@v4 | ||
with: | ||
name: shared_library_cuda_${{ matrix.os }}_${{ matrix.arch }} | ||
path: output/* | ||
retention-days: 7 | ||
build-wheels: | ||
needs: | ||
- build-shared-libs | ||
- build-shared-libs-cuda | ||
strategy: | ||
matrix: | ||
os: [ubuntu-latest, macos-latest, windows-latest] | ||
python-version: ["3.9", "3.10", "3.11", "3.12"] | ||
arch: [x86_64, aarch64] | ||
exclude: | ||
- os: windows-latest # This probably requres arm64 Windows agents | ||
arch: aarch64 | ||
runs-on: ${{ matrix.os }} | ||
steps: | ||
# Check out code | ||
- uses: actions/checkout@v4 | ||
# Download shared libraries | ||
- name: Download build artifact | ||
uses: actions/download-artifact@v4 | ||
with: | ||
merge-multiple: true | ||
pattern: "shared_library*_${{ matrix.os }}_${{ matrix.arch }}" | ||
path: output/ | ||
- name: Copy correct platform shared library | ||
shell: bash | ||
run: | | ||
ls -lR output/ | ||
cp output/${{ matrix.os }}/${{ matrix.arch }}/* bitsandbytes/ | ||
# Set up the Python version needed | ||
- name: Set up Python ${{ matrix.python-version }} | ||
uses: actions/setup-python@v5 | ||
with: | ||
python-version: ${{ matrix.python-version }} | ||
cache: pip | ||
- name: Install build package | ||
shell: bash | ||
run: pip install build | ||
- name: Install Python test dependencies | ||
shell: bash | ||
run: pip install -r requirements-ci.txt | ||
# TODO: How to run CUDA tests on GitHub actions? | ||
#- name: Run unit tests | ||
# if: ${{ matrix.arch == 'x86_64' }} # Tests are too slow to run in emulation. Wait for real aarch64 agents | ||
# run: | | ||
# PYTHONPATH=. pytest --log-cli-level=DEBUG tests | ||
- name: Build wheel | ||
shell: bash | ||
run: python -m build . | ||
- name: Upload build artifact | ||
uses: actions/upload-artifact@v4 | ||
with: | ||
name: bdist_wheel_${{ matrix.os }}_${{ matrix.arch }} | ||
path: dist/bitsandbytes-*.whl | ||
retention-days: 7 | ||
publish: | ||
needs: build-wheels | ||
runs-on: ubuntu-latest | ||
steps: | ||
- uses: actions/checkout@v4 | ||
- name: Download build artifact | ||
uses: actions/download-artifact@v4 | ||
with: | ||
path: dist/ | ||
merge-multiple: true | ||
pattern: "bdist_wheel_*" | ||
- run: | | ||
ls -lR dist/ | ||
- name: Publish to PyPi | ||
if: startsWith(github.ref, 'refs/tags') | ||
uses: pypa/gh-action-pypi-publish@release/v1 | ||
with: | ||
password: ${{ secrets.pypi }} |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,121 @@ | ||
cmake_minimum_required(VERSION 3.22.1) | ||
|
||
option(ENABLE_CUDA "Build for CUDA (Nvidia)" OFF) | ||
option(ENABLE_MPS "Build for Metal Performance Shaders (Apple)" OFF) | ||
|
||
if(ENABLE_CUDA) | ||
if(APPLE) | ||
message(FATAL_ERROR "CUDA is not supported on macOS" ) | ||
endif() | ||
option(NO_CUBLASLT "Don't use CUBLAST" OFF) | ||
if(NO_CUBLASLT) | ||
set(CMAKE_CUDA_ARCHITECTURES 50 52 60 61 70 72) | ||
else() | ||
set(CMAKE_CUDA_ARCHITECTURES 75 80 86 89 90) | ||
endif() | ||
endif() | ||
rickardp marked this conversation as resolved.
Show resolved
Hide resolved
|
||
|
||
if(ENABLE_CUDA) | ||
message("Building CUDA support for ${CMAKE_CUDA_ARCHITECTURES}") | ||
# Find CUDA tools if we are compiling with CUDA | ||
find_package(CUDAToolkit REQUIRED) | ||
if(NO_CUBLASLT) | ||
set(LIBSUFFIX "cuda${CUDAToolkit_VERSION_MAJOR}${CUDAToolkit_VERSION_MINOR}_nocublaslt") | ||
else() | ||
set(LIBSUFFIX "cuda${CUDAToolkit_VERSION_MAJOR}${CUDAToolkit_VERSION_MINOR}") | ||
endif() | ||
|
||
project(bitsandbytes LANGUAGES CXX CUDA) | ||
add_compile_definitions(BUILD_CUDA) | ||
set(CMAKE_CUDA_STANDARD 14) | ||
set(CMAKE_CUDA_STANDARD_REQUIRED ON) | ||
set(GPU_SOURCES csrc/ops.cu csrc/kernels.cu) | ||
elseif(ENABLE_MPS) | ||
if(NOT APPLE) | ||
message(FATAL_ERROR "MPS is only supported on macOS" ) | ||
endif() | ||
message("Building MPS support") | ||
set(LIBSUFFIX "mps") | ||
project(bitsandbytes LANGUAGES CXX OBJCXX) | ||
add_compile_definitions(BUILD_MPS) | ||
set(METAL_SOURCES csrc/mps_kernels.metal) | ||
file(MAKE_DIRECTORY "build") | ||
add_custom_command(OUTPUT "bitsandbytes/bitsandbytes.metallib" | ||
COMMAND xcrun metal -c -o "build/bitsandbytes.air" ${METAL_SOURCES} | ||
COMMAND xcrun metallib "build/bitsandbytes.air" -o "bitsandbytes/bitsandbytes.metallib" | ||
DEPENDS "${METAL_SOURCES}" | ||
COMMENT "Compiling Metal kernels" | ||
VERBATIM) | ||
add_custom_target(metallib DEPENDS "bitsandbytes/bitsandbytes.metallib") | ||
set(GPU_SOURCES csrc/mps_ops.mm) | ||
else() | ||
message("Building with CPU only") | ||
set(LIBSUFFIX "cpu") | ||
|
||
project(bitsandbytes LANGUAGES CXX) | ||
set(GPU_SOURCES) | ||
endif() | ||
|
||
if(APPLE) | ||
set(CMAKE_OSX_DEPLOYMENT_TARGET 13.1) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. this is also user facing There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The code fails to compile on a lower target. For my understanding, why would you want to change this without chaning source code? |
||
endif() | ||
set(CMAKE_CXX_STANDARD 14) | ||
set(CXX_STANDARD_REQUIRED C++14) | ||
rickardp marked this conversation as resolved.
Show resolved
Hide resolved
|
||
|
||
if(WIN32) | ||
# Mute warnings | ||
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -diag-suppress=177") | ||
|
||
# Enable fast math on VC++ | ||
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /fp:fast") | ||
rickardp marked this conversation as resolved.
Show resolved
Hide resolved
|
||
|
||
# Export all symbols | ||
set(CMAKE_WINDOWS_EXPORT_ALL_SYMBOLS ON) | ||
endif() | ||
|
||
# Weird MSVC hacks | ||
if(MSVC) | ||
set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} /NODEFAULTLIB:msvcprtd /NODEFAULTLIB:MSVCRTD /NODEFAULTLIB:LIBCMT") | ||
rickardp marked this conversation as resolved.
Show resolved
Hide resolved
|
||
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /arch:AVX2") | ||
set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /arch:AVX2") | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. are you using AVX2 intrinsic? Otherwise this is also a user decision. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Celeron does not support AVX2 for example There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This would also override user settings of There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
The source code uses AVX2 intrinsics. "user" here is developer, i.e. you need to change source code to make use of AVX512. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
That is incorrect. MSVC can compile AVX2 intrinsic without the There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yes, sorry if I was unclear. I just meant that since the AVX2 intrinsics is part of the source code, does it make sense to expose this to the end user compiling from source? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. To clarify, I don't mind removing this or making it overridable. I just want to understand how to design the replacement :) For example, should we add a SIMD option that would allow the end user to switch between scalar, AVX2, AVX512 and (on arm ) Neon? |
||
endif() | ||
|
||
# Add csrc files | ||
add_library(bitsandbytes SHARED | ||
${GPU_SOURCES} | ||
csrc/common.cpp | ||
csrc/cpu_ops.cpp | ||
csrc/pythonInterface.cpp) | ||
|
||
target_include_directories(bitsandbytes PUBLIC | ||
${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES} | ||
${CMAKE_CURRENT_SOURCE_DIR}/csrc | ||
${CMAKE_CURRENT_SOURCE_DIR}/include) | ||
|
||
if(ENABLE_CUDA) | ||
target_include_directories(bitsandbytes PUBLIC ${CUDA_TOOLKIT_ROOT_DIR}/include) | ||
|
||
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --use_fast_math") | ||
rickardp marked this conversation as resolved.
Show resolved
Hide resolved
|
||
|
||
set_target_properties( | ||
bitsandbytes | ||
PROPERTIES | ||
CUDA_SEPARABLE_COMPILATION ON) | ||
|
||
target_link_libraries(bitsandbytes CUDA::cudart CUDA::cublas CUDA::cublasLt CUDA::cusparse) | ||
endif() | ||
if(ENABLE_MPS) | ||
add_dependencies(bitsandbytes metallib) | ||
target_link_libraries(bitsandbytes objc "-framework Foundation" "-framework Metal" "-framework MetalPerformanceShaders" "-framework MetalPerformanceShadersGraph") | ||
endif() | ||
|
||
set_target_properties(bitsandbytes PROPERTIES OUTPUT_NAME "bitsandbytes_${LIBSUFFIX}") | ||
# Set the output name of the CUDA library | ||
if(MSVC) | ||
set_target_properties(bitsandbytes PROPERTIES LIBRARY_OUTPUT_DIRECTORY_RELEASE bitsandbytes) | ||
set_target_properties(bitsandbytes PROPERTIES LIBRARY_OUTPUT_DIRECTORY_DEBUG bitsandbytes) | ||
set_target_properties(bitsandbytes PROPERTIES RUNTIME_OUTPUT_DIRECTORY_RELEASE bitsandbytes) | ||
set_target_properties(bitsandbytes PROPERTIES RUNTIME_OUTPUT_DIRECTORY_DEBUG bitsandbytes) | ||
endif() | ||
|
||
set_target_properties(bitsandbytes PROPERTIES LIBRARY_OUTPUT_DIRECTORY bitsandbytes) |
Uh oh!
There was an error while loading. Please reload this page.