
Commit 908c905

Reanimate the project for A100

1 parent d057b07

7 files changed: +21 -19 lines changed

7 files changed

+21
-19
lines changed

CMakeLists.txt (+3 -7)

@@ -11,25 +11,21 @@ if (PROFILE OR CMAKE_BUILD_TYPE STREQUAL "Debug")
 endif()
 #set(CMAKE_VERBOSE_MAKEFILE on)
 if (NOT DEFINED CUDA_ARCH)
-  set(CUDA_ARCH "61")
+  set(CUDA_ARCH "80")
 endif()
-set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=native -Wall -Werror -DCUDA_ARCH=${CUDA_ARCH} -std=c++11 ${OpenMP_CXX_FLAGS}")
+set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=native -Wall -Werror -DCUDA_ARCH=${CUDA_ARCH} -std=c++17 ${OpenMP_CXX_FLAGS}")
 if (DEBUGINFO)
   set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -g")
 endif()
 set(SOURCE_FILES minhashcuda.cc minhashcuda.h wrappers.h private.h kernel.cu)
 if (NOT DISABLE_PYTHON)
   list(APPEND SOURCE_FILES python.cc)
+  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -I${NUMPY}")
 endif()
 if (CMAKE_BUILD_TYPE STREQUAL "Debug")
   set(NVCC_FLAGS "-G -g")
 endif()
 set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -arch=sm_${CUDA_ARCH} -Xptxas=-v -D_MWAITXINTRIN_H_INCLUDED -D_FORCE_INLINES")
-if (CMAKE_MAJOR_VERSION LESS 4 AND CMAKE_MINOR_VERSION LESS 3)
-  # workaround https://github.com/Kitware/CMake/commit/99abebdea01b9ef73e091db5594553f7b1694a1b
-  message(STATUS "Applied CUDA C++11 workaround on CMake < 3.3")
-  set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} --std c++11")
-endif()
 cuda_add_library(MHCUDA SHARED ${SOURCE_FILES} OPTIONS ${NVCC_FLAGS})
 target_link_libraries(MHCUDA ${CUDA_curand_LIBRARY})
 if(PYTHONLIBS_FOUND)
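Note on the architecture bump: sm_80 is compute capability 8.0 (Ampere), the architecture of the A100, replacing the old Pascal default sm_61. A minimal standalone check (hypothetical, not part of this repository) for whether the visible GPU can actually run such a binary:

#include <cstdio>
#include <cuda_runtime_api.h>

int main() {
  cudaDeviceProp prop;
  if (cudaGetDeviceProperties(&prop, 0) != cudaSuccess) {
    std::fprintf(stderr, "no usable CUDA device\n");
    return 1;
  }
  std::printf("device 0: %s, compute capability %d.%d\n",
              prop.name, prop.major, prop.minor);
  // A binary built with -arch=sm_80 carries SASS and PTX for 8.0 only,
  // so devices older than Ampere (e.g. the previous default, sm_61
  // Pascal) cannot load it.
  return prop.major >= 8 ? 0 : 2;
}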

kernel.cu (+4 -4)

@@ -70,8 +70,8 @@ __global__ void weighted_minhash_cuda(
   const uint32_t sample_offset = sample_index * sample_delta;
   const uint32_t samples = blockDim.x * sample_delta;
   extern __shared__ float shmem[];
-  float *volatile lnmins = &shmem[(threadIdx.y * blockDim.x + sample_index) * 3 * sample_delta];
-  uint2 *volatile dtmins = reinterpret_cast<uint2 *>(lnmins + sample_delta);
+  float *lnmins = &shmem[(threadIdx.y * blockDim.x + sample_index) * 3 * sample_delta];
+  uint2 *dtmins = reinterpret_cast<uint2 *>(lnmins + sample_delta);
   int32_t row = -1;
   for (uint32_t index = 0, border = 0;; index++) {
     if (index >= border) {
@@ -94,7 +94,7 @@ __global__ void weighted_minhash_cuda(
     }
     const float w = logf(weights[index - device_wc_offset]);
     const uint32_t d = cols[index - device_wc_offset];
-    volatile int64_t ci = static_cast<int64_t>(sample_offset) * d_dim + d;
+    int64_t ci = static_cast<int64_t>(sample_offset) * d_dim + d;
     #pragma unroll 4
     for (int s = 0; s < sample_delta; s++, ci += d_dim) {
       // We apply the logarithm trick here: log (a / z) = log a - log z
@@ -172,4 +172,4 @@ MHCUDAResult weighted_minhash(
   return mhcudaSuccess;
 }

-} // extern "C"
\ No newline at end of file
+} // extern "C"
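The dropped volatile qualifiers were presumably a workaround for older nvcc optimizers; the CUDA 11-era compiler required for sm_80 handles these pointers fine without them, and volatile only inhibits register optimization. For orientation, a minimal standalone sketch (hypothetical kernel, not project code) of the shared-memory layout the lines above set up: each thread owns 3 * sample_delta floats' worth of space, split into sample_delta running minima followed by sample_delta uint2 (d, t) records.

#include <cfloat>
#include <cstdint>

__global__ void shmem_layout_demo(int sample_delta) {
  extern __shared__ float shmem[];
  const uint32_t sample_index = threadIdx.x;
  // Each thread's slice starts 3 * sample_delta floats after the previous one.
  float *lnmins =
      &shmem[(threadIdx.y * blockDim.x + sample_index) * 3 * sample_delta];
  // The uint2 records sit right behind the floats; one uint2 is two floats
  // wide, so the slice splits as sample_delta + 2 * sample_delta.
  uint2 *dtmins = reinterpret_cast<uint2 *>(lnmins + sample_delta);
  for (int s = 0; s < sample_delta; s++) {
    lnmins[s] = FLT_MAX;           // smallest hash value seen so far
    dtmins[s] = make_uint2(0, 0);  // which (d, t) produced that minimum
  }
}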

minhashcuda.cc (+2 -2)

@@ -45,7 +45,7 @@ static std::vector<int> setup_devices(uint32_t devices, int verbosity) {
   if (devices == 0) {
     cudaGetDeviceCount(reinterpret_cast<int *>(&devices));
     if (devices == 0) {
-      return std::move(devs);
+      return devs;
     }
     devices = (1u << devices) - 1;
   }
@@ -106,7 +106,7 @@ static std::vector<int> setup_devices(uint32_t devices, int verbosity) {
       }
     }
   }
-  return std::move(devs);
+  return devs;
 }

 static MHCUDAResult print_memory_stats(const std::vector<int> &devs) {
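Dropping std::move here is the standard fix for -Wpessimizing-move: returning a named local by value already permits copy elision (NRVO), while the explicit move suppresses it, and the project's -Wall -Werror turns that warning into a build failure. An illustration (not project code):

#include <utility>
#include <vector>

// Returning the named local by value lets the compiler construct the
// vector directly in the caller's storage (NRVO); at worst it moves.
std::vector<int> devices_by_value() {
  std::vector<int> devs = {0, 1};
  return devs;
}

// Wrapping the return value in std::move defeats copy elision and makes
// GCC/Clang emit -Wpessimizing-move, a hard error under -Werror.
std::vector<int> devices_moved() {
  std::vector<int> devs = {0, 1};
  return std::move(devs);
}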

private.h (+1)

@@ -3,6 +3,7 @@

 #include "minhashcuda.h"
 #include <cmath>
+#include <cstdio>
 #include <tuple>
 #include "wrappers.h"


python.cc (+1)

@@ -1,3 +1,4 @@
+#include <functional>
 #include <memory>
 #include <unordered_map>
 #include <Python.h>

setup.py (+9 -6)

@@ -8,6 +8,9 @@
 import sys
 import sysconfig

+import numpy
+
+
 with open(os.path.join(os.path.dirname(__file__), "README.md")) as f:
     long_description = f.read()

@@ -44,9 +47,11 @@ def get_outputs(self, *args, **kwargs):
     def _build(self, builddir=None):
         syspaths = sysconfig.get_paths()
         check_call(("cmake", "-DCMAKE_BUILD_TYPE=Release",
-                    "-DCUDA_TOOLKIT_ROOT_DIR=%s" % os.getenv(
+                    "-DCUDA_ARCH=" + os.getenv("CUDA_ARCH", "80"),
+                    "-DNUMPY=" + numpy.get_include(),
+                    "-DCUDA_TOOLKIT_ROOT_DIR=" + os.getenv(
                         "CUDA_TOOLKIT_ROOT_DIR",
-                        "must_export_CUDA_TOOLKIT_ROOT_DIR"),
+                        "/usr/local/cuda"),
                     "-DPYTHON_DEFAULT_EXECUTABLE=python3",
                     "-DPYTHON_INCLUDE_DIRS=" + syspaths["include"],
                     "-DPYTHON_EXECUTABLE=" + sys.executable,
@@ -73,7 +78,7 @@ def is_pure(self):
     description="Accelerated Weighted MinHash-ing on GPU",
     long_description=long_description,
     long_description_content_type="text/markdown",
-    version="2.1.1",
+    version="2.2.0",
     license="Apache Software License",
     author="Vadim Markovtsev",
     author_email="[email protected]",
@@ -89,9 +94,7 @@ def is_pure(self):
         "License :: OSI Approved :: Apache Software License",
         "Operating System :: POSIX :: Linux",
         "Topic :: Scientific/Engineering :: Information Analysis",
-        "Programming Language :: Python :: 3.4",
-        "Programming Language :: Python :: 3.5",
-        "Programming Language :: Python :: 3.6"
+        "Programming Language :: Python :: 3.10"
     ]
 )


wrappers.h (+1)

@@ -2,6 +2,7 @@
 #define MHCUDA_WRAPPERS_H

 #include <cuda_runtime_api.h>
+#include <functional>
 #include <memory>
 #include <vector>


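The new explicit includes across private.h, python.cc, and wrappers.h are most likely fallout from the C++17 toolchain bump: newer libstdc++ headers stopped pulling <functional> and <cstdio> in transitively, so any file naming std::function or printf-family calls must include them itself. A hypothetical reproduction (not project code, loosely modeled on what a header like wrappers.h might contain):

#include <cstdio>
#include <functional>
#include <memory>

// A unique_ptr with a std::function deleter names std::function directly,
// which no longer compiles if <functional> arrives only transitively.
using unique_buf = std::unique_ptr<float, std::function<void(float *)>>;

int main() {
  unique_buf buf(new float[16], [](float *p) {
    std::printf("freeing buffer\n");  // and this line needs <cstdio>
    delete[] p;
  });
}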