
Commit 908c905

Reanimate the project for A100

1 parent d057b07

7 files changed: +21 -19 lines changed

7 files changed

+21
-19
lines changed

CMakeLists.txt (+3 -7)

@@ -11,25 +11,21 @@ if (PROFILE OR CMAKE_BUILD_TYPE STREQUAL "Debug")
 endif()
 #set(CMAKE_VERBOSE_MAKEFILE on)
 if (NOT DEFINED CUDA_ARCH)
-  set(CUDA_ARCH "61")
+  set(CUDA_ARCH "80")
 endif()
-set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=native -Wall -Werror -DCUDA_ARCH=${CUDA_ARCH} -std=c++11 ${OpenMP_CXX_FLAGS}")
+set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=native -Wall -Werror -DCUDA_ARCH=${CUDA_ARCH} -std=c++17 ${OpenMP_CXX_FLAGS}")
 if (DEBUGINFO)
   set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -g")
 endif()
 set(SOURCE_FILES minhashcuda.cc minhashcuda.h wrappers.h private.h kernel.cu)
 if (NOT DISABLE_PYTHON)
   list(APPEND SOURCE_FILES python.cc)
+  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -I${NUMPY}")
 endif()
 if (CMAKE_BUILD_TYPE STREQUAL "Debug")
   set(NVCC_FLAGS "-G -g")
 endif()
 set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -arch=sm_${CUDA_ARCH} -Xptxas=-v -D_MWAITXINTRIN_H_INCLUDED -D_FORCE_INLINES")
-if (CMAKE_MAJOR_VERSION LESS 4 AND CMAKE_MINOR_VERSION LESS 3)
-  # workaround https://github.com/Kitware/CMake/commit/99abebdea01b9ef73e091db5594553f7b1694a1b
-  message(STATUS "Applied CUDA C++11 workaround on CMake < 3.3")
-  set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} --std c++11")
-endif()
 cuda_add_library(MHCUDA SHARED ${SOURCE_FILES} OPTIONS ${NVCC_FLAGS})
 target_link_libraries(MHCUDA ${CUDA_curand_LIBRARY})
 if(PYTHONLIBS_FOUND)
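Note on the architecture bump: sm_80 is compute capability 8.0 (Ampere), the architecture of the A100, replacing the old Pascal default sm_61. A minimal standalone check (hypothetical, not part of this repository) for whether the visible GPU can actually run such a binary:

#include <cstdio>
#include <cuda_runtime_api.h>

int main() {
  cudaDeviceProp prop;
  if (cudaGetDeviceProperties(&prop, 0) != cudaSuccess) {
    std::fprintf(stderr, "no usable CUDA device\n");
    return 1;
  }
  std::printf("device 0: %s, compute capability %d.%d\n",
              prop.name, prop.major, prop.minor);
  // A binary built with -arch=sm_80 carries SASS and PTX for 8.0 only,
  // so devices older than Ampere (e.g. the previous default, sm_61
  // Pascal) cannot load it.
  return prop.major >= 8 ? 0 : 2;
}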

kernel.cu (+4 -4)

@@ -70,8 +70,8 @@ __global__ void weighted_minhash_cuda(
   const uint32_t sample_offset = sample_index * sample_delta;
   const uint32_t samples = blockDim.x * sample_delta;
   extern __shared__ float shmem[];
-  float *volatile lnmins = &shmem[(threadIdx.y * blockDim.x + sample_index) * 3 * sample_delta];
-  uint2 *volatile dtmins = reinterpret_cast<uint2 *>(lnmins + sample_delta);
+  float *lnmins = &shmem[(threadIdx.y * blockDim.x + sample_index) * 3 * sample_delta];
+  uint2 *dtmins = reinterpret_cast<uint2 *>(lnmins + sample_delta);
   int32_t row = -1;
   for (uint32_t index = 0, border = 0;; index++) {
     if (index >= border) {
@@ -94,7 +94,7 @@ __global__ void weighted_minhash_cuda(
     }
     const float w = logf(weights[index - device_wc_offset]);
     const uint32_t d = cols[index - device_wc_offset];
-    volatile int64_t ci = static_cast<int64_t>(sample_offset) * d_dim + d;
+    int64_t ci = static_cast<int64_t>(sample_offset) * d_dim + d;
     #pragma unroll 4
     for (int s = 0; s < sample_delta; s++, ci += d_dim) {
       // We apply the logarithm trick here: log (a / z) = log a - log z
@@ -172,4 +172,4 @@ MHCUDAResult weighted_minhash(
   return mhcudaSuccess;
 }

-} // extern "C"
\ No newline at end of file
+} // extern "C"
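The dropped volatile qualifiers were presumably a workaround for older nvcc optimizers; the CUDA 11-era compiler required for sm_80 handles these pointers fine without them, and volatile only inhibits register optimization. For orientation, a minimal standalone sketch (hypothetical kernel, not project code) of the shared-memory layout the lines above set up: each thread owns 3 * sample_delta floats' worth of space, split into sample_delta running minima followed by sample_delta uint2 (d, t) records.

#include <cfloat>
#include <cstdint>

__global__ void shmem_layout_demo(int sample_delta) {
  extern __shared__ float shmem[];
  const uint32_t sample_index = threadIdx.x;
  // Each thread's slice starts 3 * sample_delta floats after the previous one.
  float *lnmins =
      &shmem[(threadIdx.y * blockDim.x + sample_index) * 3 * sample_delta];
  // The uint2 records sit right behind the floats; one uint2 is two floats
  // wide, so the slice splits as sample_delta + 2 * sample_delta.
  uint2 *dtmins = reinterpret_cast<uint2 *>(lnmins + sample_delta);
  for (int s = 0; s < sample_delta; s++) {
    lnmins[s] = FLT_MAX;           // smallest hash value seen so far
    dtmins[s] = make_uint2(0, 0);  // which (d, t) produced that minimum
  }
}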

minhashcuda.cc (+2 -2)

@@ -45,7 +45,7 @@ static std::vector<int> setup_devices(uint32_t devices, int verbosity) {
   if (devices == 0) {
     cudaGetDeviceCount(reinterpret_cast<int *>(&devices));
     if (devices == 0) {
-      return std::move(devs);
+      return devs;
     }
     devices = (1u << devices) - 1;
   }
@@ -106,7 +106,7 @@ static std::vector<int> setup_devices(uint32_t devices, int verbosity) {
       }
     }
   }
-  return std::move(devs);
+  return devs;
 }

 static MHCUDAResult print_memory_stats(const std::vector<int> &devs) {
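Dropping std::move here is the standard fix for -Wpessimizing-move: returning a named local by value already permits copy elision (NRVO), while the explicit move suppresses it, and the project's -Wall -Werror turns that warning into a build failure. An illustration (not project code):

#include <utility>
#include <vector>

// Returning the named local by value lets the compiler construct the
// vector directly in the caller's storage (NRVO); at worst it moves.
std::vector<int> devices_by_value() {
  std::vector<int> devs = {0, 1};
  return devs;
}

// Wrapping the return value in std::move defeats copy elision and makes
// GCC/Clang emit -Wpessimizing-move, a hard error under -Werror.
std::vector<int> devices_moved() {
  std::vector<int> devs = {0, 1};
  return std::move(devs);
}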

private.h (+1)

@@ -3,6 +3,7 @@

 #include "minhashcuda.h"
 #include <cmath>
+#include <cstdio>
 #include <tuple>
 #include "wrappers.h"


python.cc (+1)

@@ -1,3 +1,4 @@
+#include <functional>
 #include <memory>
 #include <unordered_map>
 #include <Python.h>

setup.py (+9 -6)

@@ -8,6 +8,9 @@
 import sys
 import sysconfig

+import numpy
+
+
 with open(os.path.join(os.path.dirname(__file__), "README.md")) as f:
     long_description = f.read()

@@ -44,9 +47,11 @@ def get_outputs(self, *args, **kwargs):
     def _build(self, builddir=None):
         syspaths = sysconfig.get_paths()
         check_call(("cmake", "-DCMAKE_BUILD_TYPE=Release",
-                    "-DCUDA_TOOLKIT_ROOT_DIR=%s" % os.getenv(
+                    "-DCUDA_ARCH=" + os.getenv("CUDA_ARCH", "80"),
+                    "-DNUMPY=" + numpy.get_include(),
+                    "-DCUDA_TOOLKIT_ROOT_DIR=" + os.getenv(
                         "CUDA_TOOLKIT_ROOT_DIR",
-                        "must_export_CUDA_TOOLKIT_ROOT_DIR"),
+                        "/usr/local/cuda"),
                     "-DPYTHON_DEFAULT_EXECUTABLE=python3",
                     "-DPYTHON_INCLUDE_DIRS=" + syspaths["include"],
                     "-DPYTHON_EXECUTABLE=" + sys.executable,
@@ -73,7 +78,7 @@ def is_pure(self):
     description="Accelerated Weighted MinHash-ing on GPU",
     long_description=long_description,
     long_description_content_type="text/markdown",
-    version="2.1.1",
+    version="2.2.0",
     license="Apache Software License",
     author="Vadim Markovtsev",
     author_email="[email protected]",
@@ -89,9 +94,7 @@ def is_pure(self):
         "License :: OSI Approved :: Apache Software License",
         "Operating System :: POSIX :: Linux",
         "Topic :: Scientific/Engineering :: Information Analysis",
-        "Programming Language :: Python :: 3.4",
-        "Programming Language :: Python :: 3.5",
-        "Programming Language :: Python :: 3.6"
+        "Programming Language :: Python :: 3.10"
     ]
 )


wrappers.h (+1)

@@ -2,6 +2,7 @@
 #define MHCUDA_WRAPPERS_H

 #include <cuda_runtime_api.h>
+#include <functional>
 #include <memory>
 #include <vector>


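The new explicit includes across private.h, python.cc, and wrappers.h are most likely fallout from the C++17 toolchain bump: newer libstdc++ headers stopped pulling <functional> and <cstdio> in transitively, so any file naming std::function or printf-family calls must include them itself. A hypothetical reproduction (not project code, loosely modeled on what a header like wrappers.h might contain):

#include <cstdio>
#include <functional>
#include <memory>

// A unique_ptr with a std::function deleter names std::function directly,
// which no longer compiles if <functional> arrives only transitively.
using unique_buf = std::unique_ptr<float, std::function<void(float *)>>;

int main() {
  unique_buf buf(new float[16], [](float *p) {
    std::printf("freeing buffer\n");  // and this line needs <cstdio>
    delete[] p;
  });
}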