From 3d35f8a84f985e5baa057864c1c5611eac73433b Mon Sep 17 00:00:00 2001 From: Matthew Douglas <38992547+matthewdouglas@users.noreply.github.com> Date: Fri, 23 Feb 2024 21:36:22 -0500 Subject: [PATCH 1/9] (cmake) Fix generation of targets for nvcc --- CMakeLists.txt | 20 +++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 1b9f1854b..f6815222c 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -14,6 +14,7 @@ cmake_minimum_required(VERSION 3.22.1) project(bitsandbytes LANGUAGES CXX) +set(CMAKE_EXPORT_COMPILE_COMMANDS ON) # Define included source files set(CPP_FILES csrc/common.cpp csrc/cpu_ops.cpp csrc/pythonInterface.cpp) @@ -33,7 +34,11 @@ endif() set(BNB_OUTPUT_NAME "bitsandbytes") -message(STATUS "Building with backend ${COMPUTE_BACKEND}") +message(STATUS "Configuring ${PROJECT_NAME} (${CMAKE_BUILD_TYPE})") +message(STATUS "Backend: ${COMPUTE_BACKEND}") +message(STATUS "System: ${CMAKE_SYSTEM_NAME} / ${CMAKE_SYSTEM_PROCESSOR}") +message(STATUS "CXX: ${CMAKE_CXX_COMPILER_ID} ${CMAKE_CXX_COMPILER_VERSION}") +message(STATUS "CMake Generator: ${CMAKE_GENERATOR}") if(${COMPUTE_BACKEND} STREQUAL "cuda") if(APPLE) @@ -103,10 +108,16 @@ if(BUILD_CUDA) message(STATUS "CUDA Capabilities Available: ${POSSIBLE_CAPABILITIES}") message(STATUS "CUDA Capabilities Selected: ${COMPUTE_CAPABILITY}") - foreach(capability ${COMPUTE_CAPABILITY}) - string(APPEND CMAKE_CUDA_FLAGS " -gencode arch=compute_${capability},code=sm_${capability}") - endforeach() + # Use the "real" option to build native cubin for all selections. + # Ensure we build the PTX for the latest version. + # This is similar to the "all" and "all-major" options in CMake >= 23. + # TODO: Consider bumping CMake requirement and using CMAKE_CUDA_ARCHITECTURES=[all | native] by default + list(SORT COMPUTE_CAPABILITYY COMPARE NATURAL) + list(POP_BACK COMPUTE_CAPABILITY _LATEST_CAPABILITY) + list(TRANSFORM COMPUTE_CAPABILITY APPEND "-real" OUTPUT_VARIABLE CMAKE_CUDA_ARCHITECTURES) + list(APPEND CMAKE_CUDA_ARCHITECTURES ${_LATEST_CAPABILITY}) + message(STATUS "CUDA Targets: ${CMAKE_CUDA_ARCHITECTURES}") message(STATUS "CUDA NVCC Flags: ${CMAKE_CUDA_FLAGS}") list(APPEND SRC_FILES ${CUDA_FILES}) @@ -149,7 +160,6 @@ endif() # Weird MSVC hacks if(MSVC) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /arch:AVX2 /fp:fast") - set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /arch:AVX2 /fp:fast") endif() set_source_files_properties(${CPP_FILES} PROPERTIES LANGUAGE CXX) From 9a7d1f3e2baa38ad4229f98ac09fe81f43153f98 Mon Sep 17 00:00:00 2001 From: Matthew Douglas <38992547+matthewdouglas@users.noreply.github.com> Date: Fri, 23 Feb 2024 21:45:52 -0500 Subject: [PATCH 2/9] Typo --- CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index f6815222c..103f8148b 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -112,7 +112,7 @@ if(BUILD_CUDA) # Ensure we build the PTX for the latest version. # This is similar to the "all" and "all-major" options in CMake >= 23. # TODO: Consider bumping CMake requirement and using CMAKE_CUDA_ARCHITECTURES=[all | native] by default - list(SORT COMPUTE_CAPABILITYY COMPARE NATURAL) + list(SORT COMPUTE_CAPABILITY COMPARE NATURAL) list(POP_BACK COMPUTE_CAPABILITY _LATEST_CAPABILITY) list(TRANSFORM COMPUTE_CAPABILITY APPEND "-real" OUTPUT_VARIABLE CMAKE_CUDA_ARCHITECTURES) list(APPEND CMAKE_CUDA_ARCHITECTURES ${_LATEST_CAPABILITY}) From cf53a4914e291a1f461ab055e148070bd72b283d Mon Sep 17 00:00:00 2001 From: Matthew Douglas <38992547+matthewdouglas@users.noreply.github.com> Date: Fri, 23 Feb 2024 22:06:58 -0500 Subject: [PATCH 3/9] (ci) linux + CUDA workflow: make sure we specify target architectures --- .github/workflows/python-package.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml index e48c25cc5..fe4c47cc2 100644 --- a/.github/workflows/python-package.yml +++ b/.github/workflows/python-package.yml @@ -128,7 +128,7 @@ jobs: && cmake -DCOMPUTE_BACKEND=cuda -DNO_CUBLASLT=${NO_CUBLASLT} . \ && cmake --build ." else - cmake -G Ninja -DCOMPUTE_BACKEND=cuda -DNO_CUBLASLT=${NO_CUBLASLT} -DCMAKE_BUILD_TYPE=Release -S . + cmake -G Ninja -DCOMPUTE_BACKEND=cuda -DCOMPUTE_CAPABILITY="50;52;60;61;70;75;80;86;89;90" -DNO_CUBLASLT=${NO_CUBLASLT} -DCMAKE_BUILD_TYPE=Release -S . cmake --build . --config Release fi done From 1d75906cb65b9331d8ddd848bdc87e5614260ad6 Mon Sep 17 00:00:00 2001 From: Matthew Douglas <38992547+matthewdouglas@users.noreply.github.com> Date: Fri, 23 Feb 2024 22:13:26 -0500 Subject: [PATCH 4/9] fix --- .github/workflows/python-package.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml index fe4c47cc2..2230a1c41 100644 --- a/.github/workflows/python-package.yml +++ b/.github/workflows/python-package.yml @@ -125,10 +125,10 @@ jobs: docker run --platform linux/$build_arch -i -w /src -v $PWD:/src $image sh -c \ "apt-get update \ && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends cmake \ - && cmake -DCOMPUTE_BACKEND=cuda -DNO_CUBLASLT=${NO_CUBLASLT} . \ + && cmake -DCOMPUTE_BACKEND=cuda -DCOMPUTE_CAPABILITY="50;52;60;61;70;75;80;86;89;90" -DNO_CUBLASLT=${NO_CUBLASLT} . \ && cmake --build ." else - cmake -G Ninja -DCOMPUTE_BACKEND=cuda -DCOMPUTE_CAPABILITY="50;52;60;61;70;75;80;86;89;90" -DNO_CUBLASLT=${NO_CUBLASLT} -DCMAKE_BUILD_TYPE=Release -S . + cmake -G Ninja -DCOMPUTE_BACKEND=cuda -DNO_CUBLASLT=${NO_CUBLASLT} -DCMAKE_BUILD_TYPE=Release -S . cmake --build . --config Release fi done From 126463045f6c8bd51111e094ce2db83c155dc022 Mon Sep 17 00:00:00 2001 From: Matthew Douglas <38992547+matthewdouglas@users.noreply.github.com> Date: Fri, 23 Feb 2024 22:18:14 -0500 Subject: [PATCH 5/9] fix one more time --- .github/workflows/python-package.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml index 2230a1c41..faa30ca30 100644 --- a/.github/workflows/python-package.yml +++ b/.github/workflows/python-package.yml @@ -125,7 +125,7 @@ jobs: docker run --platform linux/$build_arch -i -w /src -v $PWD:/src $image sh -c \ "apt-get update \ && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends cmake \ - && cmake -DCOMPUTE_BACKEND=cuda -DCOMPUTE_CAPABILITY="50;52;60;61;70;75;80;86;89;90" -DNO_CUBLASLT=${NO_CUBLASLT} . \ + && cmake -DCOMPUTE_BACKEND=cuda -DCOMPUTE_CAPABILITY=\"50;52;60;61;70;75;80;86;89;90\" -DNO_CUBLASLT=${NO_CUBLASLT} . \ && cmake --build ." else cmake -G Ninja -DCOMPUTE_BACKEND=cuda -DNO_CUBLASLT=${NO_CUBLASLT} -DCMAKE_BUILD_TYPE=Release -S . From 70aba795f95a102dd341e22f6e2544481114f04a Mon Sep 17 00:00:00 2001 From: Matthew Douglas <38992547+matthewdouglas@users.noreply.github.com> Date: Mon, 26 Feb 2024 11:02:52 -0500 Subject: [PATCH 6/9] (cmake) Default in CMAKE_CUDA_ARCHITECTURES_ALL when cmake<3.23, make sure we build only selected cubins and only ptx for latest capability --- CMakeLists.txt | 32 +++++++++++++++++++++++++++----- 1 file changed, 27 insertions(+), 5 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 103f8148b..cdc46252e 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -34,11 +34,7 @@ endif() set(BNB_OUTPUT_NAME "bitsandbytes") -message(STATUS "Configuring ${PROJECT_NAME} (${CMAKE_BUILD_TYPE})") -message(STATUS "Backend: ${COMPUTE_BACKEND}") -message(STATUS "System: ${CMAKE_SYSTEM_NAME} / ${CMAKE_SYSTEM_PROCESSOR}") -message(STATUS "CXX: ${CMAKE_CXX_COMPILER_ID} ${CMAKE_CXX_COMPILER_VERSION}") -message(STATUS "CMake Generator: ${CMAKE_GENERATOR}") +message(STATUS "Configuring ${PROJECT_NAME} (Backend: ${COMPUTE_BACKEND})") if(${COMPUTE_BACKEND} STREQUAL "cuda") if(APPLE) @@ -87,6 +83,31 @@ if(BUILD_CUDA) message(FATAL_ERROR "CUDA Version > 12 is not supported") endif() + # CMake < 3.23.0 does not define CMAKE_CUDA_ARCHITECTURES_ALL. + if(CMAKE_VERSION VERSION_LESS "3.23.0") + message(STATUS "CMake < 3.23.0; determining CUDA architectures supported...") + + # 11.x and 12.x both support these at a minimum. + set(CMAKE_CUDA_ARCHITECTURES_ALL 50 52 53 60 61 62 70 72 75 80) + set(CMAKE_CUDA_ARCHITECTURES_ALL_MAJOR 50 60 70 80) + + # CUDA 11.1 adds Ampere support for GA102-GA107. + if (CMAKE_CUDA_COMPILER_TOOLKIT_VERSION VERSION_GREATER_EQUAL "11.1") + list(APPEND CMAKE_CUDA_ARCHITECTURES_ALL 86) + endif() + + # CUDA 11.4 adds Ampere support for GA10B. + if (CMAKE_CUDA_COMPILER_TOOLKIT_VERSION VERSION_GREATER_EQUAL "11.1") + list(APPEND CMAKE_CUDA_ARCHITECTURES_ALL 87) + endif() + + # CUDA 11.8 adds support for Ada and Hopper. + if (CMAKE_CUDA_COMPILER_TOOLKIT_VERSION VERSION_GREATER_EQUAL "11.8") + list(APPEND CMAKE_CUDA_ARCHITECTURES_ALL 89 90) + list(APPEND CMAKE_CUDA_ARCHITECTURES_ALL_MAJOR 90) + endif() + endif() + string(APPEND CMAKE_CUDA_FLAGS " --use_fast_math") if(PTXAS_VERBOSE) # Verbose? Outputs register usage information, and other things... @@ -112,6 +133,7 @@ if(BUILD_CUDA) # Ensure we build the PTX for the latest version. # This is similar to the "all" and "all-major" options in CMake >= 23. # TODO: Consider bumping CMake requirement and using CMAKE_CUDA_ARCHITECTURES=[all | native] by default + list(REMOVE_DUPLICATES COMPUTE_CAPABILITY) list(SORT COMPUTE_CAPABILITY COMPARE NATURAL) list(POP_BACK COMPUTE_CAPABILITY _LATEST_CAPABILITY) list(TRANSFORM COMPUTE_CAPABILITY APPEND "-real" OUTPUT_VARIABLE CMAKE_CUDA_ARCHITECTURES) From cde17cf3d9808ab75e6d18a421a635bde7f57aec Mon Sep 17 00:00:00 2001 From: Matthew Douglas <38992547+matthewdouglas@users.noreply.github.com> Date: Mon, 26 Feb 2024 11:23:32 -0500 Subject: [PATCH 7/9] Fix static lookup for CMAKE_CUDA_ARCHITECTURES_ALL on cmake<3.23 --- CMakeLists.txt | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index cdc46252e..0ca58e044 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -92,17 +92,17 @@ if(BUILD_CUDA) set(CMAKE_CUDA_ARCHITECTURES_ALL_MAJOR 50 60 70 80) # CUDA 11.1 adds Ampere support for GA102-GA107. - if (CMAKE_CUDA_COMPILER_TOOLKIT_VERSION VERSION_GREATER_EQUAL "11.1") + if (CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL "11.1") list(APPEND CMAKE_CUDA_ARCHITECTURES_ALL 86) endif() # CUDA 11.4 adds Ampere support for GA10B. - if (CMAKE_CUDA_COMPILER_TOOLKIT_VERSION VERSION_GREATER_EQUAL "11.1") + if (CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL "11.4") list(APPEND CMAKE_CUDA_ARCHITECTURES_ALL 87) endif() # CUDA 11.8 adds support for Ada and Hopper. - if (CMAKE_CUDA_COMPILER_TOOLKIT_VERSION VERSION_GREATER_EQUAL "11.8") + if (CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL "11.8") list(APPEND CMAKE_CUDA_ARCHITECTURES_ALL 89 90) list(APPEND CMAKE_CUDA_ARCHITECTURES_ALL_MAJOR 90) endif() From 61992b8e1592ccea97e715e728dd81c989b73cfb Mon Sep 17 00:00:00 2001 From: Matthew Douglas <38992547+matthewdouglas@users.noreply.github.com> Date: Mon, 26 Feb 2024 16:39:37 -0500 Subject: [PATCH 8/9] Remove debug setting --- CMakeLists.txt | 1 - 1 file changed, 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 0ca58e044..9a465359a 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -14,7 +14,6 @@ cmake_minimum_required(VERSION 3.22.1) project(bitsandbytes LANGUAGES CXX) -set(CMAKE_EXPORT_COMPILE_COMMANDS ON) # Define included source files set(CPP_FILES csrc/common.cpp csrc/cpu_ops.cpp csrc/pythonInterface.cpp) From f603452fa9c6cab25d927910e70e7c7cf2beefad Mon Sep 17 00:00:00 2001 From: Matthew Douglas <38992547+matthewdouglas@users.noreply.github.com> Date: Mon, 26 Feb 2024 16:46:14 -0500 Subject: [PATCH 9/9] clarification --- CMakeLists.txt | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 9a465359a..7f70a089e 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -130,7 +130,8 @@ if(BUILD_CUDA) # Use the "real" option to build native cubin for all selections. # Ensure we build the PTX for the latest version. - # This is similar to the "all" and "all-major" options in CMake >= 23. + # This behavior of adding a PTX (virtual) target for the highest architecture + # is similar to how the "all" and "all-major" options would behave in CMake >= 3.23. # TODO: Consider bumping CMake requirement and using CMAKE_CUDA_ARCHITECTURES=[all | native] by default list(REMOVE_DUPLICATES COMPUTE_CAPABILITY) list(SORT COMPUTE_CAPABILITY COMPARE NATURAL)