From 3d35f8a84f985e5baa057864c1c5611eac73433b Mon Sep 17 00:00:00 2001
From: Matthew Douglas <38992547+matthewdouglas@users.noreply.github.com>
Date: Fri, 23 Feb 2024 21:36:22 -0500
Subject: [PATCH 1/9] (cmake) Fix generation of targets for nvcc

---
 CMakeLists.txt | 20 +++++++++++++++-----
 1 file changed, 15 insertions(+), 5 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 1b9f1854b..f6815222c 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -14,6 +14,7 @@
 cmake_minimum_required(VERSION 3.22.1)
 
 project(bitsandbytes LANGUAGES CXX)
+set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
 
 # Define included source files
 set(CPP_FILES csrc/common.cpp csrc/cpu_ops.cpp csrc/pythonInterface.cpp)
@@ -33,7 +34,11 @@ endif()
 
 set(BNB_OUTPUT_NAME "bitsandbytes")
 
-message(STATUS "Building with backend ${COMPUTE_BACKEND}")
+message(STATUS "Configuring ${PROJECT_NAME} (${CMAKE_BUILD_TYPE})")
+message(STATUS "Backend: ${COMPUTE_BACKEND}")
+message(STATUS "System: ${CMAKE_SYSTEM_NAME} / ${CMAKE_SYSTEM_PROCESSOR}")
+message(STATUS "CXX: ${CMAKE_CXX_COMPILER_ID} ${CMAKE_CXX_COMPILER_VERSION}")
+message(STATUS "CMake Generator: ${CMAKE_GENERATOR}")
 
 if(${COMPUTE_BACKEND} STREQUAL "cuda")
     if(APPLE)
@@ -103,10 +108,16 @@ if(BUILD_CUDA)
     message(STATUS "CUDA Capabilities Available: ${POSSIBLE_CAPABILITIES}")
     message(STATUS "CUDA Capabilities  Selected: ${COMPUTE_CAPABILITY}")
 
-    foreach(capability ${COMPUTE_CAPABILITY})
-        string(APPEND CMAKE_CUDA_FLAGS " -gencode arch=compute_${capability},code=sm_${capability}")
-    endforeach()
+    # Use the "real" option to build native cubin for all selections.
+    # Ensure we build the PTX for the latest version.
+    # This is similar to the "all" and "all-major" options in CMake >= 23.
+    # TODO: Consider bumping CMake requirement and using CMAKE_CUDA_ARCHITECTURES=[all | native] by default
+    list(SORT COMPUTE_CAPABILITYY COMPARE NATURAL)
+    list(POP_BACK COMPUTE_CAPABILITY _LATEST_CAPABILITY)
+    list(TRANSFORM COMPUTE_CAPABILITY APPEND "-real" OUTPUT_VARIABLE CMAKE_CUDA_ARCHITECTURES)
+    list(APPEND CMAKE_CUDA_ARCHITECTURES ${_LATEST_CAPABILITY})
 
+    message(STATUS "CUDA Targets: ${CMAKE_CUDA_ARCHITECTURES}")
     message(STATUS "CUDA NVCC Flags: ${CMAKE_CUDA_FLAGS}")
 
     list(APPEND SRC_FILES ${CUDA_FILES})
@@ -149,7 +160,6 @@ endif()
 # Weird MSVC hacks
 if(MSVC)
     set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /arch:AVX2 /fp:fast")
-    set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /arch:AVX2 /fp:fast")
 endif()
 
 set_source_files_properties(${CPP_FILES} PROPERTIES LANGUAGE CXX)

From 9a7d1f3e2baa38ad4229f98ac09fe81f43153f98 Mon Sep 17 00:00:00 2001
From: Matthew Douglas <38992547+matthewdouglas@users.noreply.github.com>
Date: Fri, 23 Feb 2024 21:45:52 -0500
Subject: [PATCH 2/9] (cmake) Fix COMPUTE_CAPABILITYY typo in list(SORT)

---
 CMakeLists.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index f6815222c..103f8148b 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -112,7 +112,7 @@ if(BUILD_CUDA)
     # Ensure we build the PTX for the latest version.
     # This is similar to the "all" and "all-major" options in CMake >= 23.
     # TODO: Consider bumping CMake requirement and using CMAKE_CUDA_ARCHITECTURES=[all | native] by default
-    list(SORT COMPUTE_CAPABILITYY COMPARE NATURAL)
+    list(SORT COMPUTE_CAPABILITY COMPARE NATURAL)
     list(POP_BACK COMPUTE_CAPABILITY _LATEST_CAPABILITY)
     list(TRANSFORM COMPUTE_CAPABILITY APPEND "-real" OUTPUT_VARIABLE CMAKE_CUDA_ARCHITECTURES)
     list(APPEND CMAKE_CUDA_ARCHITECTURES ${_LATEST_CAPABILITY})

From cf53a4914e291a1f461ab055e148070bd72b283d Mon Sep 17 00:00:00 2001
From: Matthew Douglas <38992547+matthewdouglas@users.noreply.github.com>
Date: Fri, 23 Feb 2024 22:06:58 -0500
Subject: [PATCH 3/9] (ci) linux + CUDA workflow: make sure we specify target
 architectures

---
 .github/workflows/python-package.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml
index e48c25cc5..fe4c47cc2 100644
--- a/.github/workflows/python-package.yml
+++ b/.github/workflows/python-package.yml
@@ -128,7 +128,7 @@ jobs:
               && cmake -DCOMPUTE_BACKEND=cuda -DNO_CUBLASLT=${NO_CUBLASLT} . \
               && cmake --build ."
           else
-            cmake -G Ninja -DCOMPUTE_BACKEND=cuda -DNO_CUBLASLT=${NO_CUBLASLT} -DCMAKE_BUILD_TYPE=Release -S .
+            cmake -G Ninja -DCOMPUTE_BACKEND=cuda -DCOMPUTE_CAPABILITY="50;52;60;61;70;75;80;86;89;90" -DNO_CUBLASLT=${NO_CUBLASLT} -DCMAKE_BUILD_TYPE=Release -S .
             cmake --build . --config Release
           fi
         done

From 1d75906cb65b9331d8ddd848bdc87e5614260ad6 Mon Sep 17 00:00:00 2001
From: Matthew Douglas <38992547+matthewdouglas@users.noreply.github.com>
Date: Fri, 23 Feb 2024 22:13:26 -0500
Subject: [PATCH 4/9] (ci) Move COMPUTE_CAPABILITY to the Docker cmake call

---
 .github/workflows/python-package.yml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml
index fe4c47cc2..2230a1c41 100644
--- a/.github/workflows/python-package.yml
+++ b/.github/workflows/python-package.yml
@@ -125,10 +125,10 @@ jobs:
             docker run --platform linux/$build_arch -i -w /src -v $PWD:/src $image sh -c \
               "apt-get update \
               && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends cmake \
-              && cmake -DCOMPUTE_BACKEND=cuda -DNO_CUBLASLT=${NO_CUBLASLT} . \
+              && cmake -DCOMPUTE_BACKEND=cuda -DCOMPUTE_CAPABILITY="50;52;60;61;70;75;80;86;89;90" -DNO_CUBLASLT=${NO_CUBLASLT} . \
               && cmake --build ."
           else
-            cmake -G Ninja -DCOMPUTE_BACKEND=cuda -DCOMPUTE_CAPABILITY="50;52;60;61;70;75;80;86;89;90" -DNO_CUBLASLT=${NO_CUBLASLT} -DCMAKE_BUILD_TYPE=Release -S .
+            cmake -G Ninja -DCOMPUTE_BACKEND=cuda -DNO_CUBLASLT=${NO_CUBLASLT} -DCMAKE_BUILD_TYPE=Release -S .
             cmake --build . --config Release
           fi
         done

From 126463045f6c8bd51111e094ce2db83c155dc022 Mon Sep 17 00:00:00 2001
From: Matthew Douglas <38992547+matthewdouglas@users.noreply.github.com>
Date: Fri, 23 Feb 2024 22:18:14 -0500
Subject: [PATCH 5/9] (ci) Escape quotes around COMPUTE_CAPABILITY list

---
 .github/workflows/python-package.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml
index 2230a1c41..faa30ca30 100644
--- a/.github/workflows/python-package.yml
+++ b/.github/workflows/python-package.yml
@@ -125,7 +125,7 @@ jobs:
             docker run --platform linux/$build_arch -i -w /src -v $PWD:/src $image sh -c \
               "apt-get update \
               && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends cmake \
-              && cmake -DCOMPUTE_BACKEND=cuda -DCOMPUTE_CAPABILITY="50;52;60;61;70;75;80;86;89;90" -DNO_CUBLASLT=${NO_CUBLASLT} . \
+              && cmake -DCOMPUTE_BACKEND=cuda -DCOMPUTE_CAPABILITY=\"50;52;60;61;70;75;80;86;89;90\" -DNO_CUBLASLT=${NO_CUBLASLT} . \
               && cmake --build ."
           else
             cmake -G Ninja -DCOMPUTE_BACKEND=cuda -DNO_CUBLASLT=${NO_CUBLASLT} -DCMAKE_BUILD_TYPE=Release -S .

From 70aba795f95a102dd341e22f6e2544481114f04a Mon Sep 17 00:00:00 2001
From: Matthew Douglas <38992547+matthewdouglas@users.noreply.github.com>
Date: Mon, 26 Feb 2024 11:02:52 -0500
Subject: [PATCH 6/9] (cmake) Default in CMAKE_CUDA_ARCHITECTURES_ALL when
 cmake<3.23, make sure we build only selected cubins and only ptx for latest
 capability

---
 CMakeLists.txt | 32 +++++++++++++++++++++++++++-----
 1 file changed, 27 insertions(+), 5 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 103f8148b..cdc46252e 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -34,11 +34,7 @@ endif()
 
 set(BNB_OUTPUT_NAME "bitsandbytes")
 
-message(STATUS "Configuring ${PROJECT_NAME} (${CMAKE_BUILD_TYPE})")
-message(STATUS "Backend: ${COMPUTE_BACKEND}")
-message(STATUS "System: ${CMAKE_SYSTEM_NAME} / ${CMAKE_SYSTEM_PROCESSOR}")
-message(STATUS "CXX: ${CMAKE_CXX_COMPILER_ID} ${CMAKE_CXX_COMPILER_VERSION}")
-message(STATUS "CMake Generator: ${CMAKE_GENERATOR}")
+message(STATUS "Configuring ${PROJECT_NAME} (Backend: ${COMPUTE_BACKEND})")
 
 if(${COMPUTE_BACKEND} STREQUAL "cuda")
     if(APPLE)
@@ -87,6 +83,31 @@ if(BUILD_CUDA)
         message(FATAL_ERROR "CUDA Version > 12 is not supported")
     endif()
 
+    # CMake < 3.23.0 does not define CMAKE_CUDA_ARCHITECTURES_ALL.
+    if(CMAKE_VERSION VERSION_LESS "3.23.0")
+        message(STATUS "CMake < 3.23.0; determining CUDA architectures supported...")
+
+        # 11.x and 12.x both support these at a minimum.
+        set(CMAKE_CUDA_ARCHITECTURES_ALL 50 52 53 60 61 62 70 72 75 80)
+        set(CMAKE_CUDA_ARCHITECTURES_ALL_MAJOR 50 60 70 80)
+
+        # CUDA 11.1 adds Ampere support for GA102-GA107.
+        if (CMAKE_CUDA_COMPILER_TOOLKIT_VERSION VERSION_GREATER_EQUAL "11.1")
+            list(APPEND CMAKE_CUDA_ARCHITECTURES_ALL 86)
+        endif()
+
+        # CUDA 11.4 adds Ampere support for GA10B.
+        if (CMAKE_CUDA_COMPILER_TOOLKIT_VERSION VERSION_GREATER_EQUAL "11.1")
+            list(APPEND CMAKE_CUDA_ARCHITECTURES_ALL 87)
+        endif()
+
+        # CUDA 11.8 adds support for Ada and Hopper.
+        if (CMAKE_CUDA_COMPILER_TOOLKIT_VERSION VERSION_GREATER_EQUAL "11.8")
+            list(APPEND CMAKE_CUDA_ARCHITECTURES_ALL 89 90)
+            list(APPEND CMAKE_CUDA_ARCHITECTURES_ALL_MAJOR 90)
+        endif()
+    endif()
+
     string(APPEND CMAKE_CUDA_FLAGS " --use_fast_math")
     if(PTXAS_VERBOSE)
         # Verbose? Outputs register usage information, and other things...
@@ -112,6 +133,7 @@ if(BUILD_CUDA)
     # Ensure we build the PTX for the latest version.
     # This is similar to the "all" and "all-major" options in CMake >= 23.
     # TODO: Consider bumping CMake requirement and using CMAKE_CUDA_ARCHITECTURES=[all | native] by default
+    list(REMOVE_DUPLICATES COMPUTE_CAPABILITY)
     list(SORT COMPUTE_CAPABILITY COMPARE NATURAL)
     list(POP_BACK COMPUTE_CAPABILITY _LATEST_CAPABILITY)
     list(TRANSFORM COMPUTE_CAPABILITY APPEND "-real" OUTPUT_VARIABLE CMAKE_CUDA_ARCHITECTURES)

From cde17cf3d9808ab75e6d18a421a635bde7f57aec Mon Sep 17 00:00:00 2001
From: Matthew Douglas <38992547+matthewdouglas@users.noreply.github.com>
Date: Mon, 26 Feb 2024 11:23:32 -0500
Subject: [PATCH 7/9] Fix static lookup for CMAKE_CUDA_ARCHITECTURES_ALL on
 cmake<3.23

---
 CMakeLists.txt | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index cdc46252e..0ca58e044 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -92,17 +92,17 @@ if(BUILD_CUDA)
         set(CMAKE_CUDA_ARCHITECTURES_ALL_MAJOR 50 60 70 80)
 
         # CUDA 11.1 adds Ampere support for GA102-GA107.
-        if (CMAKE_CUDA_COMPILER_TOOLKIT_VERSION VERSION_GREATER_EQUAL "11.1")
+        if (CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL "11.1")
             list(APPEND CMAKE_CUDA_ARCHITECTURES_ALL 86)
         endif()
 
         # CUDA 11.4 adds Ampere support for GA10B.
-        if (CMAKE_CUDA_COMPILER_TOOLKIT_VERSION VERSION_GREATER_EQUAL "11.1")
+        if (CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL "11.4")
             list(APPEND CMAKE_CUDA_ARCHITECTURES_ALL 87)
         endif()
 
         # CUDA 11.8 adds support for Ada and Hopper.
-        if (CMAKE_CUDA_COMPILER_TOOLKIT_VERSION VERSION_GREATER_EQUAL "11.8")
+        if (CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL "11.8")
             list(APPEND CMAKE_CUDA_ARCHITECTURES_ALL 89 90)
             list(APPEND CMAKE_CUDA_ARCHITECTURES_ALL_MAJOR 90)
         endif()

From 61992b8e1592ccea97e715e728dd81c989b73cfb Mon Sep 17 00:00:00 2001
From: Matthew Douglas <38992547+matthewdouglas@users.noreply.github.com>
Date: Mon, 26 Feb 2024 16:39:37 -0500
Subject: [PATCH 8/9] Remove CMAKE_EXPORT_COMPILE_COMMANDS debug setting

---
 CMakeLists.txt | 1 -
 1 file changed, 1 deletion(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 0ca58e044..9a465359a 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -14,7 +14,6 @@
 cmake_minimum_required(VERSION 3.22.1)
 
 project(bitsandbytes LANGUAGES CXX)
-set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
 
 # Define included source files
 set(CPP_FILES csrc/common.cpp csrc/cpu_ops.cpp csrc/pythonInterface.cpp)

From f603452fa9c6cab25d927910e70e7c7cf2beefad Mon Sep 17 00:00:00 2001
From: Matthew Douglas <38992547+matthewdouglas@users.noreply.github.com>
Date: Mon, 26 Feb 2024 16:46:14 -0500
Subject: [PATCH 9/9] (cmake) Clarify comment on PTX target selection

---
 CMakeLists.txt | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 9a465359a..7f70a089e 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -130,7 +130,8 @@ if(BUILD_CUDA)
 
     # Use the "real" option to build native cubin for all selections.
     # Ensure we build the PTX for the latest version.
-    # This is similar to the "all" and "all-major" options in CMake >= 23.
+    # This behavior of adding a PTX (virtual) target for the highest architecture
+    # is similar to how the "all" and "all-major" options would behave in CMake >= 3.23.
     # TODO: Consider bumping CMake requirement and using CMAKE_CUDA_ARCHITECTURES=[all | native] by default
     list(REMOVE_DUPLICATES COMPUTE_CAPABILITY)
     list(SORT COMPUTE_CAPABILITY COMPARE NATURAL)