
Commit 955d249

Merge branch 'develop'
2 parents 4bae74e + 5d472f6, commit 955d249

9 files changed: +213 -93 lines

CMakeLists.txt
Lines changed: 4 additions & 1 deletion

```diff
@@ -14,14 +14,17 @@ if (NOT DEFINED CUDA_ARCH)
   set(CUDA_ARCH "61")
 endif()
 set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=native -Wall -Werror -DCUDA_ARCH=${CUDA_ARCH} -std=c++11 ${OpenMP_CXX_FLAGS}")
+if (DEBUGINFO)
+  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -g")
+endif()
 set(SOURCE_FILES minhashcuda.cc minhashcuda.h wrappers.h private.h kernel.cu)
 if (NOT DISABLE_PYTHON)
   list(APPEND SOURCE_FILES python.cc)
 endif()
 if (CMAKE_BUILD_TYPE STREQUAL "Debug")
   set(NVCC_FLAGS "-G -g")
 endif()
-set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -arch=compute_${CUDA_ARCH} -Xptxas=-v -D_MWAITXINTRIN_H_INCLUDED -D_FORCE_INLINES")
+set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -arch=sm_${CUDA_ARCH} -Xptxas=-v -D_MWAITXINTRIN_H_INCLUDED -D_FORCE_INLINES")
 if (CMAKE_MAJOR_VERSION LESS 4 AND CMAKE_MINOR_VERSION LESS 3)
   # workaround https://github.com/Kitware/CMake/commit/99abebdea01b9ef73e091db5594553f7b1694a1b
   message(STATUS "Applied CUDA C++11 workaround on CMake < 3.3")
```

README.md
Lines changed: 3 additions & 1 deletion

```diff
@@ -77,7 +77,7 @@ del data
 # Initialize the hasher aka "generator" with 128 hash samples for every row
 gen = libMHCUDA.minhash_cuda_init(m.shape[-1], 128, seed=1, verbosity=1)
 
-# Calculate thr hashes. Can be executed several times with different number of rows
+# Calculate the hashes. Can be executed several times with different number of rows
 hashes = libMHCUDA.minhash_cuda_calc(gen, m)
 
 # Free the resources
@@ -198,3 +198,5 @@ Generator pointer is invalidated.
 License
 -------
 MIT license.
+
+#### README {#ignore_this_doxygen_anchor}
```

doc/Doxyfile
Lines changed: 13 additions & 0 deletions

```diff
@@ -0,0 +1,13 @@
+INPUT = ../
+FILE_PATTERNS = *.h *.cc *.cu *.md
+EXTENSION_MAPPING = cu=C++
+EXTRACT_ALL = YES
+EXTRACT_ANON_NSPACES = YES
+EXCLUDE_PATTERNS = *.py
+DOXYFILE_ENCODING = UTF-8
+PROJECT_NAME = MinHashCUDA
+OUTPUT_LANGUAGE = English
+GENERATE_XML = NO
+GENERATE_LATEX = NO
+GENERATE_HTML = YES
+HTML_OUTPUT = doxyhtml/
```

kernel.cu
Lines changed: 44 additions & 10 deletions

```diff
@@ -2,16 +2,31 @@
 #include <cfloat>
 #include "private.h"
 
+#define MAX_BLOCK_SIZE 1024
+
+/// The number of dimensions. Constant on every device.
 __constant__ uint32_t d_dim;
 
-__global__ void gamma_cuda(uint32_t size, const float *__restrict__ v1, float *v2) {
+/// Calculates the gamma distribution of the specified size from two uniform
+/// distributions.
+/// @param size The number of samples to write.
+/// @param v1 in The first array with uniformly distributed values in [0, 1].
+/// @param v2 in,out The second array with uniformly distributed values in [0, 1].
+///           The output is written to it.
+/// @note v1 and v2 must be independent (e.g., not the same), otherwise you will
+/// get an invalid result.
+__global__ void gamma_cuda(uint32_t size, const float *__restrict__ v1,
+                           float *__restrict__ v2) {
   uint32_t index = blockIdx.x * blockDim.x + threadIdx.x;
   if (index >= size) {
     return;
   }
   v2[index] = -logf(v1[index] * v2[index]);
 }
 
+/// Calculates the natural logarithm of the array.
+/// @param size The length of the array.
+/// @param v in,out The array to read and write.
 __global__ void log_cuda(uint32_t size, float *v) {
   uint32_t index = blockIdx.x * blockDim.x + threadIdx.x;
   if (index >= size) {
```
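The new doc comment says gamma_cuda() turns two independent uniform arrays into gamma-distributed values via v2 = -log(v1 * v2). The reason this works: -log(u) of a uniform u is Exp(1), and the sum of two independent Exp(1) draws is Gamma(2, 1), so -log(u1 * u2) = -log(u1) - log(u2) is one Gamma(2, 1) sample. A minimal host-side sketch of the same transform (hypothetical standalone program, not part of the commit):

```cuda
#include <cmath>
#include <cstdio>
#include <random>

// CPU reference of the gamma_cuda() transform: two independent U(0, 1] samples
// are combined into one Gamma(2, 1) sample via -log(u1 * u2).
int main() {
  std::mt19937 rng(1);
  std::uniform_real_distribution<float> uniform(
      std::nextafter(0.f, 1.f), 1.f);    // keep away from log(0)
  double sum = 0;
  const int n = 1 << 20;
  for (int i = 0; i < n; i++) {
    float u1 = uniform(rng), u2 = uniform(rng);
    sum += -std::log(u1 * u2);           // == -log(u1) - log(u2): two Exp(1) draws
  }
  // Gamma(2, 1) has mean 2, so the empirical mean should land close to it.
  std::printf("empirical mean: %f (expected 2.0)\n", sum / n);
  return 0;
}
```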
```diff
@@ -20,11 +35,24 @@ __global__ void log_cuda(uint32_t size, float *v) {
   v[index] = logf(v[index]);
 }
 
-/*
-weights, cols, rows - CSR format
-plan - execution plan, consists of 2 parts: first is offset table and
-the second is the row indices
-*/
+/// Weighted MinHash kernel. The argument names follow the paper:
+/// http://static.googleusercontent.com/media/research.google.com/en//pubs/archive/36928.pdf
+/// @param rs Gamma(2,1)-random samples. The length must be the product of
+///           number of processed samples (vectors) by the number of dimensions.
+/// @param ln_cs Logarithm over the gamma(2,1) distribution. Same length as rs.
+/// @param betas Uniformly [0, 1] distributed samples. Same length as rs.
+/// @param weights CSR's data.
+/// @param cols CSR's indices.
+/// @param rows CSR's indptrs.
+/// @param plan Execution plan, consists of 2 parts: the first is the offset
+///             table and the second is the row indices
+/// @param sample_delta How many hashes to process in a single thread. Depends
+///                     on the shared memory size.
+/// @param device_row_offset Shard offset in rows. Specific to every device.
+/// @param device_wc_offset Shard offset in weights and cols. Specific to every
+///                         device.
+/// @param hashes The output of size number of vectors x number of hashes for
+///               each x 2.
 __global__ void weighted_minhash_cuda(
     const float *__restrict__ rs, const float *__restrict__ ln_cs,
     const float *__restrict__ betas, const float *__restrict__ weights,
```
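The new @param lines map weights, cols and rows onto the standard CSR triplet (data, indices, indptr). As a reminder of that layout, here is a tiny matrix spelled out in those three arrays; the values are purely illustrative:

```cuda
#include <cstdint>

// A 3x4 sparse matrix in CSR form, using the naming from the doc comment above:
//   row 0: (col 1, 0.5)               -> 1 entry
//   row 1: (col 0, 2.0), (col 3, 1.5) -> 2 entries
//   row 2: (col 2, 4.0)               -> 1 entry
static const float    weights[] = {0.5f, 2.0f, 1.5f, 4.0f};  // CSR data
static const uint32_t cols[]    = {1, 0, 3, 2};              // CSR indices
static const uint32_t rows[]    = {0, 1, 3, 4};              // CSR indptr, length = #rows + 1

// The entries of row r live in the half-open range [rows[r], rows[r + 1]) of
// weights/cols; rows[#rows] is the total number of non-zeros (4 here).
```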
```diff
@@ -66,9 +94,10 @@ __global__ void weighted_minhash_cuda(
     }
     const float w = logf(weights[index - device_wc_offset]);
     const uint32_t d = cols[index - device_wc_offset];
-    int64_t ci = static_cast<int64_t>(sample_offset) * d_dim + d;
+    volatile int64_t ci = static_cast<int64_t>(sample_offset) * d_dim + d;
     #pragma unroll 4
     for (int s = 0; s < sample_delta; s++, ci += d_dim) {
+      // We apply the logarithm trick here: log (a / z) = log a - log z
       float r = rs[ci];
       float beta = betas[ci];
       float t = floorf(w / r + beta);
```
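This loop body is Ioffe's Consistent Weighted Sampling from the paper linked in the doc comment, carried out entirely in log space (the "log (a / z) = log a - log z" trick in the new comment). Below is a hedged CPU sketch of one hash for one sparse row with the same r / ln c / beta naming; it illustrates the scheme, it is not the library's exact code:

```cuda
#include <cmath>
#include <cstdint>
#include <utility>
#include <vector>

// One weighted MinHash value for a single sparse row, following Ioffe (2010):
//   t = floor(ln w_d / r + beta),  ln y = r * (t - beta),  ln a = ln c - ln y - r.
// The dimension with the smallest ln a wins; the hash is the pair (d*, t*).
// rs, ln_cs, betas hold one random sample per dimension, as in the kernel above.
std::pair<uint32_t, int> cws_hash(const std::vector<uint32_t> &cols,
                                  const std::vector<float> &weights,
                                  const std::vector<float> &rs,
                                  const std::vector<float> &ln_cs,
                                  const std::vector<float> &betas) {
  float best_lna = INFINITY;
  std::pair<uint32_t, int> hash(0, 0);
  for (size_t i = 0; i < cols.size(); i++) {
    uint32_t d = cols[i];
    float w = std::log(weights[i]);
    float r = rs[d], beta = betas[d];
    float t = std::floor(w / r + beta);
    float ln_y = r * (t - beta);
    float ln_a = ln_cs[d] - ln_y - r;   // log of a = c / (y * e^r)
    if (ln_a < best_lna) {
      best_lna = ln_a;
      hash = std::make_pair(d, static_cast<int>(t));
    }
  }
  return hash;
}
```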
```diff
@@ -84,22 +113,26 @@
 
 extern "C" {
 
+/// Calls gamma_cuda() kernel.
 cudaError_t gamma_(uint32_t size, const float *v1, float *v2) {
-  dim3 block(1024, 1, 1);
+  dim3 block(MAX_BLOCK_SIZE, 1, 1);
   dim3 grid(size / block.x + 1, 1, 1);
   gamma_cuda<<<grid, block>>>(size, v1, v2);
   RETERR(cudaDeviceSynchronize());
   return cudaSuccess;
 }
 
+/// Calls log_cuda() kernel.
 cudaError_t log_(uint32_t size, float *v) {
-  dim3 block(1024, 1, 1);
+  dim3 block(MAX_BLOCK_SIZE, 1, 1);
   dim3 grid(size / block.x + 1, 1, 1);
   log_cuda<<<grid, block>>>(size, v);
   RETERR(cudaDeviceSynchronize());
   return cudaSuccess;
 }
 
+/// Copies the number of dimensions (size of each sample) to a symbol on each
+/// device.
 MHCUDAResult setup_weighted_minhash(
     uint32_t dim, const std::vector<int> &devs, int verbosity) {
   FOR_EACH_DEV(
```
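Both wrappers size the grid as size / block.x + 1 and let the index >= size guard inside the kernel swallow the overshoot. The explicit ceiling division below is the more common equivalent; the +1 form only differs when size is an exact multiple of the block size, where it launches one extra, fully idle block. This is a hedged standalone sketch, not code from the commit:

```cuda
#include <cstdint>

__global__ void log_inplace(uint32_t size, float *v) {
  uint32_t index = blockIdx.x * blockDim.x + threadIdx.x;
  if (index >= size) {
    return;                         // guard makes over-provisioned blocks harmless
  }
  v[index] = logf(v[index]);
}

// Classic ceil-div launch configuration: for size == k * block it launches
// exactly k blocks, whereas size / block + 1 would launch k + 1. size > 0 assumed.
cudaError_t launch_log_inplace(uint32_t size, float *v) {
  const uint32_t block = 1024;
  const uint32_t grid = (size + block - 1) / block;   // ceil(size / block)
  log_inplace<<<grid, block>>>(size, v);
  return cudaDeviceSynchronize();
}
```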
```diff
@@ -109,6 +142,7 @@ MHCUDAResult setup_weighted_minhash(
   return mhcudaSuccess;
 }
 
+/// Calls the corresponding kernel.
 MHCUDAResult weighted_minhash(
     const udevptrs<float> &rs, const udevptrs<float> &ln_cs,
     const udevptrs<float> &betas, const udevptrs<float> &weights,
```
```diff
@@ -123,7 +157,7 @@ MHCUDAResult weighted_minhash(
     assert(MINHASH_BLOCK_SIZE % spt == 0);
     dim3 block(spt, MINHASH_BLOCK_SIZE / spt, 1);
     dim3 grid(1, grid_sizes[devi], 1);
-    auto shmem = 3 * 4 * MINHASH_BLOCK_SIZE * sample_delta;
+    int shmem = 3 * sizeof(float) * MINHASH_BLOCK_SIZE * sample_delta;
    uint32_t row_offset = (devi > 0)? split[devi - 1] : 0;
    DEBUG("dev #%d: <<<%d, [%d, %d], %d>>>(%u, %u)\n",
          devs[devi], grid.x, block.x, block.y, shmem,
```
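The shmem change replaces the magic 4 with sizeof(float): the third <<<>>> launch parameter is a byte count for the dynamically sized shared memory block. A minimal sketch of that mechanism follows (hypothetical kernel, not from this commit):

```cuda
// The dynamic shared memory buffer is declared extern and sized at launch time,
// in bytes, via the third <<<grid, block, bytes>>> parameter.
__global__ void reverse_in_shared(float *data) {
  extern __shared__ float scratch[];          // backed by the launch-time byte count
  scratch[threadIdx.x] = data[threadIdx.x];
  __syncthreads();
  data[threadIdx.x] = scratch[blockDim.x - 1 - threadIdx.x];
}

int main() {
  const int n = 256;
  float *d_data;
  cudaMalloc(&d_data, n * sizeof(float));
  cudaMemset(d_data, 0, n * sizeof(float));
  size_t shmem = n * sizeof(float);           // bytes, hence sizeof(float), not "4"
  reverse_in_shared<<<1, n, shmem>>>(d_data);
  cudaDeviceSynchronize();
  cudaFree(d_data);
  return 0;
}
```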

minhashcuda.cc
Lines changed: 68 additions & 25 deletions

```diff
@@ -56,6 +56,21 @@ static std::vector<int> setup_devices(uint32_t devices, int verbosity) {
         INFO("failed to validate device %d", dev);
         devs.pop_back();
       }
+      cudaDeviceProp props;
+      auto err = cudaGetDeviceProperties(&props, dev);
+      if (err != cudaSuccess) {
+        INFO("failed to cudaGetDeviceProperties(%d): %s\n",
+             dev, cudaGetErrorString(err));
+        devs.pop_back();
+      }
+      if (props.major != (CUDA_ARCH / 10) || props.minor != (CUDA_ARCH % 10)) {
+        INFO("compute capability mismatch for device %d: wanted %d.%d, have "
+             "%d.%d\n>>>> you may want to build kmcuda with -DCUDA_ARCH=%d "
+             "(refer to \"Building\" in README.md)\n",
+             dev, CUDA_ARCH / 10, CUDA_ARCH % 10, props.major, props.minor,
+             props.major * 10 + props.minor);
+        devs.pop_back();
+      }
     }
     devices >>= 1;
   }
```
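The new check compares each device's compute capability against the CUDA_ARCH value baked in at build time (the -DCUDA_ARCH=${CUDA_ARCH} definition from CMakeLists.txt, now paired with -arch=sm_${CUDA_ARCH}). To find out what to pass as -DCUDA_ARCH for a given machine, a standalone query along these lines works; this is a hedged sketch, not part of the commit:

```cuda
#include <cstdio>

// Prints the compute capability of every visible device in the same
// "major * 10 + minor" encoding that CUDA_ARCH and -arch=sm_XX use
// (e.g. 61 for a Pascal GTX 1080).
int main() {
  int count = 0;
  if (cudaGetDeviceCount(&count) != cudaSuccess || count == 0) {
    std::fprintf(stderr, "no CUDA devices found\n");
    return 1;
  }
  for (int dev = 0; dev < count; dev++) {
    cudaDeviceProp props;
    if (cudaGetDeviceProperties(&props, dev) != cudaSuccess) {
      continue;
    }
    std::printf("device %d: %s -> -DCUDA_ARCH=%d\n",
                dev, props.name, props.major * 10 + props.minor);
  }
  return 0;
}
```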
```diff
@@ -203,27 +218,24 @@ MinhashCudaGenerator *mhcuda_init(
   }
   auto gen = std::unique_ptr<MinhashCudaGenerator>(
       new MinhashCudaGenerator(dim, samples, devs, verbosity));
-  auto res = mhcuda_init_internal(gen.get(), seed, devs);
-  if (res != mhcudaSuccess) {
-    if (status) *status = res;
-    return nullptr;
-  }
+  #define CHECK_SUCCESS(x) do { \
+    auto res = x; \
+    if (res != mhcudaSuccess) { \
+      if (status) *status = res; \
+      return nullptr; \
+    } \
+  } while(false)
+  CHECK_SUCCESS(mhcuda_init_internal(gen.get(), seed, devs));
   if (verbosity > 1) {
-    res = print_memory_stats(devs);
-    if (res != mhcudaSuccess) {
-      if (status) *status = res;
-      return nullptr;
-    }
-  }
-  res = setup_weighted_minhash(dim, devs, verbosity);
-  if (res != mhcudaSuccess) {
-    if (status) *status = res;
-    return nullptr;
+    CHECK_SUCCESS(print_memory_stats(devs));
   }
+  CHECK_SUCCESS(setup_weighted_minhash(dim, devs, verbosity));
   return gen.release();
+  #undef CHECK_SUCCESS
 }
 
-MinhashCudaGeneratorParameters mhcuda_get_parameters(const MinhashCudaGenerator *gen) {
+MinhashCudaGeneratorParameters mhcuda_get_parameters(
+    const MinhashCudaGenerator *gen) {
   if (gen == nullptr) {
     return {};
   }
```
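The repeated "store the result, set *status, return nullptr" blocks are folded into a local CHECK_SUCCESS macro. The do { ... } while(false) wrapper is the usual trick that makes a multi-statement macro behave like a single statement (safe after an unbraced if, and requiring the trailing semicolon). A small self-contained illustration of the same idiom, with hypothetical names rather than this library's code:

```cuda
#include <cstdio>

enum Result { kSuccess, kFailure };

Result step_one() { return kSuccess; }
Result step_two() { return kFailure; }

// Same shape as CHECK_SUCCESS in the diff: evaluate once, bail out of the
// enclosing function on error. do { } while(false) keeps it a single statement.
#define CHECK(x) do { \
    Result res = (x); \
    if (res != kSuccess) { \
      std::fprintf(stderr, "failed: %s\n", #x); \
      return res; \
    } \
  } while (false)

Result run() {
  CHECK(step_one());
  CHECK(step_two());   // fails here and returns, skipping the rest
  std::puts("never reached");
  return kSuccess;
}

#undef CHECK

int main() { return run() == kSuccess ? 0 : 1; }
```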
```diff
@@ -241,9 +253,9 @@ MHCUDAResult mhcuda_retrieve_random_vars(
   auto &devs = gen->devs;
   size_t const_size = gen->dim * gen->samples * sizeof(float);
   CUCH(cudaSetDevice(devs[0]), mhcudaNoSuchDevice);
-  CUCH(cudaMemcpy(rs, gen->rs[0].get(), const_size, cudaMemcpyDeviceToHost),
+  CUCH(cudaMemcpyAsync(rs, gen->rs[0].get(), const_size, cudaMemcpyDeviceToHost),
       mhcudaMemoryCopyError);
-  CUCH(cudaMemcpy(ln_cs, gen->ln_cs[0].get(), const_size, cudaMemcpyDeviceToHost),
+  CUCH(cudaMemcpyAsync(ln_cs, gen->ln_cs[0].get(), const_size, cudaMemcpyDeviceToHost),
       mhcudaMemoryCopyError);
   CUCH(cudaMemcpy(betas, gen->betas[0].get(), const_size, cudaMemcpyDeviceToHost),
       mhcudaMemoryCopyError);
```
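The first two device-to-host copies become cudaMemcpyAsync, so they are merely enqueued; the last copy stays a blocking cudaMemcpy, and because all three run on the same (default) stream it completes only after the queued ones have finished. Worth keeping in mind that async copies only truly overlap when the host buffers are pinned; with pageable memory they degrade to synchronous behavior. A hedged sketch of the pattern with hypothetical buffers:

```cuda
#include <cstdio>

// Enqueue two device-to-host copies, then block once at the end instead of
// serializing on every copy. cudaMallocHost gives pinned host memory, which is
// what lets the async variants overlap with other work.
int main() {
  const size_t n = 1 << 20;
  float *d_a, *d_b, *h_a, *h_b;
  cudaMalloc(&d_a, n * sizeof(float));
  cudaMalloc(&d_b, n * sizeof(float));
  cudaMallocHost(&h_a, n * sizeof(float));   // pinned
  cudaMallocHost(&h_b, n * sizeof(float));   // pinned
  cudaMemcpyAsync(h_a, d_a, n * sizeof(float), cudaMemcpyDeviceToHost);
  cudaMemcpyAsync(h_b, d_b, n * sizeof(float), cudaMemcpyDeviceToHost);
  cudaDeviceSynchronize();                   // both copies are done after this
  std::puts("copies finished");
  cudaFreeHost(h_a); cudaFreeHost(h_b);
  cudaFree(d_a); cudaFree(d_b);
  return 0;
}
```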
```diff
@@ -270,6 +282,20 @@ MHCUDAResult mhcuda_assign_random_vars(
 static std::vector<uint32_t> calc_best_split(
     const uint32_t *rows, uint32_t length, const std::vector<int> &devs,
     const std::vector<uint32_t> &sizes) {
+  // We need to distribute `length` rows into `devs.size()` devices.
+  // The number of items is different in every row.
+  // So we record each 2 possibilities <> the optimal boundary.
+  // 2 devices -> 2 variants
+  // 4 -> 8
+  // 8 -> 128
+  // 10 -> 512
+  // 16 -> 32768
+  // Then the things get tough. The complexity is O(2^(2(n - 1)))
+  // Hopefully, we will not see more GPUs in a single node soon.
+  // We evaluate each variant by the cumulative cost function.
+  // Every call to mhcuda_calc() can grow the buffers a little; the cost function
+  // optimizes for the number of reallocations first and the imbalance second.
+
   uint32_t ideal_split = rows[length] / devs.size();
   std::vector<std::vector<uint32_t>> variants;
   for (size_t devi = 0; devi < devs.size(); devi++) {
```
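The new comment describes the search: every boundary between adjacent devices gets two candidate row positions around the ideal even split, which is consistent with the listed counts of 2^(n-1) variants for n devices, and each variant is then scored by a two-part cost that punishes forced buffer reallocations first and load imbalance second. A toy, hedged sketch of scoring the candidate boundaries for a two-device split, with made-up numbers and simplified names rather than the library's code:

```cuda
#include <cstdint>
#include <cstdio>
#include <tuple>

// Toy version of the cost in calc_best_split() for two devices and one boundary.
// rows[] is the CSR indptr; sizes[] are the item counts each device's buffers can
// already hold; ideal is rows[length] / 2. The first component squares the items
// that would force a reallocation, the second the deviation from an even split.
std::tuple<uint32_t, uint32_t> variant_cost(const uint32_t *rows, uint32_t boundary,
                                            uint32_t length, const uint32_t *sizes,
                                            uint32_t ideal) {
  uint32_t cost_realloc = 0, cost_imbalance = 0;
  uint32_t prev = 0;
  uint32_t ends[2] = {boundary, length};
  for (int dev = 0; dev < 2; dev++) {
    uint32_t rdelta = rows[ends[dev]] - rows[prev];
    uint32_t over = rdelta > sizes[dev] ? rdelta - sizes[dev] : 0;
    uint32_t skew = rdelta > ideal ? rdelta - ideal : ideal - rdelta;
    cost_realloc += over * over;
    cost_imbalance += skew * skew;
    prev = ends[dev];
  }
  return std::make_tuple(cost_realloc, cost_imbalance);
}

int main() {
  const uint32_t rows[] = {0, 4, 9, 15, 20};   // 4 rows, 20 items in total
  const uint32_t sizes[] = {10, 10};           // current buffer capacities
  for (uint32_t boundary = 1; boundary < 4; boundary++) {
    std::tuple<uint32_t, uint32_t> c = variant_cost(rows, boundary, 4, sizes, 10);
    std::printf("boundary after row %u: cost = (%u, %u)\n",
                boundary, std::get<0>(c), std::get<1>(c));
  }
  return 0;
}
```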
```diff
@@ -299,15 +325,31 @@
   }
   assert(!variants.empty());
   std::vector<uint32_t> *best = nullptr;
-  uint32_t min_cost = 0xFFFFFFFFu;
+  struct Cost : public std::tuple<uint32_t, uint32_t> {
+    Cost() = default;
+
+    Cost(const std::tuple<uint32_t, uint32_t>& other)
+        : std::tuple<uint32_t, uint32_t>(other) {}
+
+    Cost& operator+=(const std::tuple<uint32_t, uint32_t>& other) {
+      std::get<0>(*this) += std::get<0>(other);
+      std::get<1>(*this) += std::get<1>(other);
+      return *this;
+    }
+  };
+  Cost min_cost = std::make_tuple(0xFFFFFFFFu, 0xFFFFFFFFu);
   for (auto &v : variants) {
-    uint32_t cost = 0;
+    Cost cost;
     for (size_t i = 0; i < devs.size(); i++) {
       uint32_t row = v[i], prev_row = (i > 0)? v[i - 1] : 0;
-      uint32_t diff = rows[row] - rows[prev_row] - sizes[i];
-      if (diff > 0) {
-        cost += diff * diff;
-      }
+      uint32_t rdelta = rows[row] - rows[prev_row];
+      uint32_t diff1 = (rdelta > sizes[i])? (rdelta - sizes[i]) : 0;
+      diff1 *= diff1;
+      uint32_t diff2 = (rdelta > ideal_split)? (rdelta - ideal_split)
+                                             : (ideal_split - rdelta);
+      diff2 *= diff2;
+      auto diff = std::make_tuple(diff1, diff2);
+      cost += diff;
     }
     if (cost < min_cost) {
       best = &v;
```
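Cost derives from std::tuple<uint32_t, uint32_t> precisely to reuse tuple's lexicographic operator<: the reallocation term is compared first and the imbalance term only breaks ties. A two-assertion standalone sketch of that ordering:

```cuda
#include <cassert>
#include <tuple>

int main() {
  // std::tuple compares element by element, left to right.
  auto a = std::make_tuple(0u, 1000000u);   // no reallocation, badly imbalanced
  auto b = std::make_tuple(1u, 0u);         // one reallocation, perfectly balanced
  assert(a < b);          // avoiding reallocation wins regardless of the second term
  assert(std::make_tuple(1u, 2u) < std::make_tuple(1u, 3u));   // tie broken by term 2
  return 0;
}
```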
```diff
@@ -392,6 +434,7 @@ static void binpack(
     const MinhashCudaGenerator *gen, const uint32_t *rows,
     const std::vector<uint32_t> &split, const std::vector<int> &sample_deltas,
     std::vector<std::vector<int32_t>> *plans, std::vector<uint32_t> *grid_sizes) {
+  // https://blog.sourced.tech/post/minhashcuda/
   const int32_t ideal_binavgcount = 20;
   auto &devs = gen->devs;
   int verbosity = gen->verbosity;
```
```diff
@@ -523,7 +566,7 @@ MHCUDAResult mhcuda_calc(
         rows, length, output);
   auto &devs = gen->devs;
   INFO("Preparing...\n");
-  std::vector<uint32_t> split = calc_best_split(rows, length, gen->devs, gen->sizes);
+  auto split = calc_best_split(rows, length, gen->devs, gen->sizes);
   if (verbosity > 1) {
     dump_vector(split, "split");
   }
```
