
Commit 955d249

Merge branch 'develop'
2 parents 4bae74e + 5d472f6, commit 955d249

9 files changed: +213 -93 lines

CMakeLists.txt
Lines changed: 4 additions & 1 deletion

```diff
@@ -14,14 +14,17 @@ if (NOT DEFINED CUDA_ARCH)
   set(CUDA_ARCH "61")
 endif()
 set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=native -Wall -Werror -DCUDA_ARCH=${CUDA_ARCH} -std=c++11 ${OpenMP_CXX_FLAGS}")
+if (DEBUGINFO)
+  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -g")
+endif()
 set(SOURCE_FILES minhashcuda.cc minhashcuda.h wrappers.h private.h kernel.cu)
 if (NOT DISABLE_PYTHON)
   list(APPEND SOURCE_FILES python.cc)
 endif()
 if (CMAKE_BUILD_TYPE STREQUAL "Debug")
   set(NVCC_FLAGS "-G -g")
 endif()
-set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -arch=compute_${CUDA_ARCH} -Xptxas=-v -D_MWAITXINTRIN_H_INCLUDED -D_FORCE_INLINES")
+set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -arch=sm_${CUDA_ARCH} -Xptxas=-v -D_MWAITXINTRIN_H_INCLUDED -D_FORCE_INLINES")
 if (CMAKE_MAJOR_VERSION LESS 4 AND CMAKE_MINOR_VERSION LESS 3)
   # workaround https://github.com/Kitware/CMake/commit/99abebdea01b9ef73e091db5594553f7b1694a1b
   message(STATUS "Applied CUDA C++11 workaround on CMake < 3.3")
```

README.md
Lines changed: 3 additions & 1 deletion

```diff
@@ -77,7 +77,7 @@ del data
 # Initialize the hasher aka "generator" with 128 hash samples for every row
 gen = libMHCUDA.minhash_cuda_init(m.shape[-1], 128, seed=1, verbosity=1)
 
-# Calculate thr hashes. Can be executed several times with different number of rows
+# Calculate the hashes. Can be executed several times with different number of rows
 hashes = libMHCUDA.minhash_cuda_calc(gen, m)
 
 # Free the resources
@@ -198,3 +198,5 @@ Generator pointer is invalidated.
 License
 -------
 MIT license.
+
+#### README {#ignore_this_doxygen_anchor}
```

doc/Doxyfile
Lines changed: 13 additions & 0 deletions

```diff
@@ -0,0 +1,13 @@
+INPUT = ../
+FILE_PATTERNS = *.h *.cc *.cu *.md
+EXTENSION_MAPPING = cu=C++
+EXTRACT_ALL = YES
+EXTRACT_ANON_NSPACES = YES
+EXCLUDE_PATTERNS = *.py
+DOXYFILE_ENCODING = UTF-8
+PROJECT_NAME = MinHashCUDA
+OUTPUT_LANGUAGE = English
+GENERATE_XML = NO
+GENERATE_LATEX = NO
+GENERATE_HTML = YES
+HTML_OUTPUT = doxyhtml/
```

kernel.cu
Lines changed: 44 additions & 10 deletions

```diff
@@ -2,16 +2,31 @@
 #include <cfloat>
 #include "private.h"
 
+#define MAX_BLOCK_SIZE 1024
+
+/// The number of dimensions. Constant on every device.
 __constant__ uint32_t d_dim;
 
-__global__ void gamma_cuda(uint32_t size, const float *__restrict__ v1, float *v2) {
+/// Calculates the gamma distribution of the specified size from two uniform
+/// distributions.
+/// @param size The number of samples to write.
+/// @param v1 in The first array with uniformly distributed values in [0, 1].
+/// @param v2 in,out The second array with uniformly distributed values in [0, 1].
+///           The output is written to it.
+/// @note v1 and v2 must be independent (e.g., not the same), otherwise you will
+/// get an invalid result.
+__global__ void gamma_cuda(uint32_t size, const float *__restrict__ v1,
+                           float *__restrict__ v2) {
   uint32_t index = blockIdx.x * blockDim.x + threadIdx.x;
   if (index >= size) {
     return;
   }
   v2[index] = -logf(v1[index] * v2[index]);
 }
 
+/// Calculates the natural logarithm of the array.
+/// @param size The length of the array.
+/// @param v in,out The array to read and write.
 __global__ void log_cuda(uint32_t size, float *v) {
   uint32_t index = blockIdx.x * blockDim.x + threadIdx.x;
   if (index >= size) {
```
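The new doc comment says gamma_cuda() turns two independent uniform arrays into gamma-distributed values via v2 = -log(v1 * v2). The reason this works: -log(u) of a uniform u is Exp(1), and the sum of two independent Exp(1) draws is Gamma(2, 1), so -log(u1 * u2) = -log(u1) - log(u2) is one Gamma(2, 1) sample. A minimal host-side sketch of the same transform (hypothetical standalone program, not part of the commit):

```cuda
#include <cmath>
#include <cstdio>
#include <random>

// CPU reference of the gamma_cuda() transform: two independent U(0, 1] samples
// are combined into one Gamma(2, 1) sample via -log(u1 * u2).
int main() {
  std::mt19937 rng(1);
  std::uniform_real_distribution<float> uniform(
      std::nextafter(0.f, 1.f), 1.f);    // keep away from log(0)
  double sum = 0;
  const int n = 1 << 20;
  for (int i = 0; i < n; i++) {
    float u1 = uniform(rng), u2 = uniform(rng);
    sum += -std::log(u1 * u2);           // == -log(u1) - log(u2): two Exp(1) draws
  }
  // Gamma(2, 1) has mean 2, so the empirical mean should land close to it.
  std::printf("empirical mean: %f (expected 2.0)\n", sum / n);
  return 0;
}
```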
```diff
@@ -20,11 +35,24 @@ __global__ void log_cuda(uint32_t size, float *v) {
   v[index] = logf(v[index]);
 }
 
-/*
-weights, cols, rows - CSR format
-plan - execution plan, consists of 2 parts: first is offset table and
-the second is the row indices
-*/
+/// Weighted MinHash kernel. The argument names follow the paper:
+/// http://static.googleusercontent.com/media/research.google.com/en//pubs/archive/36928.pdf
+/// @param rs Gamma(2,1)-random samples. The length must be the product of
+///           number of processed samples (vectors) by the number of dimensions.
+/// @param ln_cs Logarithm over the gamma(2,1) distribution. Same length as rs.
+/// @param betas Uniformly [0, 1] distributed samples. Same length as rs.
+/// @param weights CSR's data.
+/// @param cols CSR's indices.
+/// @param rows CSR's indptrs.
+/// @param plan Execution plan, consists of 2 parts: the first is the offset
+///             table and the second is the row indices
+/// @param sample_delta How many hashes to process in a single thread. Depends
+///                     on the shared memory size.
+/// @param device_row_offset Shard offset in rows. Specific to every device.
+/// @param device_wc_offset Shard offset in weights and cols. Specific to every
+///                         device.
+/// @param hashes The output of size number of vectors x number of hashes for
+///               each x 2.
 __global__ void weighted_minhash_cuda(
     const float *__restrict__ rs, const float *__restrict__ ln_cs,
     const float *__restrict__ betas, const float *__restrict__ weights,
```
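The new @param lines map weights, cols and rows onto the standard CSR triplet (data, indices, indptr). As a reminder of that layout, here is a tiny matrix spelled out in those three arrays; the values are purely illustrative:

```cuda
#include <cstdint>

// A 3x4 sparse matrix in CSR form, using the naming from the doc comment above:
//   row 0: (col 1, 0.5)               -> 1 entry
//   row 1: (col 0, 2.0), (col 3, 1.5) -> 2 entries
//   row 2: (col 2, 4.0)               -> 1 entry
static const float    weights[] = {0.5f, 2.0f, 1.5f, 4.0f};  // CSR data
static const uint32_t cols[]    = {1, 0, 3, 2};              // CSR indices
static const uint32_t rows[]    = {0, 1, 3, 4};              // CSR indptr, length = #rows + 1

// The entries of row r live in the half-open range [rows[r], rows[r + 1]) of
// weights/cols; rows[#rows] is the total number of non-zeros (4 here).
```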
```diff
@@ -66,9 +94,10 @@ __global__ void weighted_minhash_cuda(
     }
     const float w = logf(weights[index - device_wc_offset]);
     const uint32_t d = cols[index - device_wc_offset];
-    int64_t ci = static_cast<int64_t>(sample_offset) * d_dim + d;
+    volatile int64_t ci = static_cast<int64_t>(sample_offset) * d_dim + d;
     #pragma unroll 4
     for (int s = 0; s < sample_delta; s++, ci += d_dim) {
+      // We apply the logarithm trick here: log (a / z) = log a - log z
       float r = rs[ci];
       float beta = betas[ci];
       float t = floorf(w / r + beta);
```
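This loop body is Ioffe's Consistent Weighted Sampling from the paper linked in the doc comment, carried out entirely in log space (the "log (a / z) = log a - log z" trick in the new comment). Below is a hedged CPU sketch of one hash for one sparse row with the same r / ln c / beta naming; it illustrates the scheme, it is not the library's exact code:

```cuda
#include <cmath>
#include <cstdint>
#include <utility>
#include <vector>

// One weighted MinHash value for a single sparse row, following Ioffe (2010):
//   t = floor(ln w_d / r + beta),  ln y = r * (t - beta),  ln a = ln c - ln y - r.
// The dimension with the smallest ln a wins; the hash is the pair (d*, t*).
// rs, ln_cs, betas hold one random sample per dimension, as in the kernel above.
std::pair<uint32_t, int> cws_hash(const std::vector<uint32_t> &cols,
                                  const std::vector<float> &weights,
                                  const std::vector<float> &rs,
                                  const std::vector<float> &ln_cs,
                                  const std::vector<float> &betas) {
  float best_lna = INFINITY;
  std::pair<uint32_t, int> hash(0, 0);
  for (size_t i = 0; i < cols.size(); i++) {
    uint32_t d = cols[i];
    float w = std::log(weights[i]);
    float r = rs[d], beta = betas[d];
    float t = std::floor(w / r + beta);
    float ln_y = r * (t - beta);
    float ln_a = ln_cs[d] - ln_y - r;   // log of a = c / (y * e^r)
    if (ln_a < best_lna) {
      best_lna = ln_a;
      hash = std::make_pair(d, static_cast<int>(t));
    }
  }
  return hash;
}
```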
```diff
@@ -84,22 +113,26 @@
 
 extern "C" {
 
+/// Calls gamma_cuda() kernel.
 cudaError_t gamma_(uint32_t size, const float *v1, float *v2) {
-  dim3 block(1024, 1, 1);
+  dim3 block(MAX_BLOCK_SIZE, 1, 1);
   dim3 grid(size / block.x + 1, 1, 1);
   gamma_cuda<<<grid, block>>>(size, v1, v2);
   RETERR(cudaDeviceSynchronize());
   return cudaSuccess;
 }
 
+/// Calls log_cuda() kernel.
 cudaError_t log_(uint32_t size, float *v) {
-  dim3 block(1024, 1, 1);
+  dim3 block(MAX_BLOCK_SIZE, 1, 1);
   dim3 grid(size / block.x + 1, 1, 1);
   log_cuda<<<grid, block>>>(size, v);
   RETERR(cudaDeviceSynchronize());
   return cudaSuccess;
 }
 
+/// Copies the number of dimensions (size of each sample) to a symbol on each
+/// device.
 MHCUDAResult setup_weighted_minhash(
     uint32_t dim, const std::vector<int> &devs, int verbosity) {
   FOR_EACH_DEV(
```
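Both wrappers size the grid as size / block.x + 1 and let the index >= size guard inside the kernel swallow the overshoot. The explicit ceiling division below is the more common equivalent; the +1 form only differs when size is an exact multiple of the block size, where it launches one extra, fully idle block. This is a hedged standalone sketch, not code from the commit:

```cuda
#include <cstdint>

__global__ void log_inplace(uint32_t size, float *v) {
  uint32_t index = blockIdx.x * blockDim.x + threadIdx.x;
  if (index >= size) {
    return;                         // guard makes over-provisioned blocks harmless
  }
  v[index] = logf(v[index]);
}

// Classic ceil-div launch configuration: for size == k * block it launches
// exactly k blocks, whereas size / block + 1 would launch k + 1. size > 0 assumed.
cudaError_t launch_log_inplace(uint32_t size, float *v) {
  const uint32_t block = 1024;
  const uint32_t grid = (size + block - 1) / block;   // ceil(size / block)
  log_inplace<<<grid, block>>>(size, v);
  return cudaDeviceSynchronize();
}
```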
```diff
@@ -109,6 +142,7 @@ MHCUDAResult setup_weighted_minhash(
   return mhcudaSuccess;
 }
 
+/// Calls the corresponding kernel.
 MHCUDAResult weighted_minhash(
     const udevptrs<float> &rs, const udevptrs<float> &ln_cs,
     const udevptrs<float> &betas, const udevptrs<float> &weights,
```
```diff
@@ -123,7 +157,7 @@ MHCUDAResult weighted_minhash(
     assert(MINHASH_BLOCK_SIZE % spt == 0);
     dim3 block(spt, MINHASH_BLOCK_SIZE / spt, 1);
     dim3 grid(1, grid_sizes[devi], 1);
-    auto shmem = 3 * 4 * MINHASH_BLOCK_SIZE * sample_delta;
+    int shmem = 3 * sizeof(float) * MINHASH_BLOCK_SIZE * sample_delta;
    uint32_t row_offset = (devi > 0)? split[devi - 1] : 0;
    DEBUG("dev #%d: <<<%d, [%d, %d], %d>>>(%u, %u)\n",
          devs[devi], grid.x, block.x, block.y, shmem,
```
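The shmem change replaces the magic 4 with sizeof(float): the third <<<>>> launch parameter is a byte count for the dynamically sized shared memory block. A minimal sketch of that mechanism follows (hypothetical kernel, not from this commit):

```cuda
// The dynamic shared memory buffer is declared extern and sized at launch time,
// in bytes, via the third <<<grid, block, bytes>>> parameter.
__global__ void reverse_in_shared(float *data) {
  extern __shared__ float scratch[];          // backed by the launch-time byte count
  scratch[threadIdx.x] = data[threadIdx.x];
  __syncthreads();
  data[threadIdx.x] = scratch[blockDim.x - 1 - threadIdx.x];
}

int main() {
  const int n = 256;
  float *d_data;
  cudaMalloc(&d_data, n * sizeof(float));
  cudaMemset(d_data, 0, n * sizeof(float));
  size_t shmem = n * sizeof(float);           // bytes, hence sizeof(float), not "4"
  reverse_in_shared<<<1, n, shmem>>>(d_data);
  cudaDeviceSynchronize();
  cudaFree(d_data);
  return 0;
}
```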

minhashcuda.cc
Lines changed: 68 additions & 25 deletions

```diff
@@ -56,6 +56,21 @@ static std::vector<int> setup_devices(uint32_t devices, int verbosity) {
         INFO("failed to validate device %d", dev);
         devs.pop_back();
       }
+      cudaDeviceProp props;
+      auto err = cudaGetDeviceProperties(&props, dev);
+      if (err != cudaSuccess) {
+        INFO("failed to cudaGetDeviceProperties(%d): %s\n",
+             dev, cudaGetErrorString(err));
+        devs.pop_back();
+      }
+      if (props.major != (CUDA_ARCH / 10) || props.minor != (CUDA_ARCH % 10)) {
+        INFO("compute capability mismatch for device %d: wanted %d.%d, have "
+             "%d.%d\n>>>> you may want to build kmcuda with -DCUDA_ARCH=%d "
+             "(refer to \"Building\" in README.md)\n",
+             dev, CUDA_ARCH / 10, CUDA_ARCH % 10, props.major, props.minor,
+             props.major * 10 + props.minor);
+        devs.pop_back();
+      }
     }
     devices >>= 1;
   }
```
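The new check compares each device's compute capability against the CUDA_ARCH value baked in at build time (the -DCUDA_ARCH=${CUDA_ARCH} definition from CMakeLists.txt, now paired with -arch=sm_${CUDA_ARCH}). To find out what to pass as -DCUDA_ARCH for a given machine, a standalone query along these lines works; this is a hedged sketch, not part of the commit:

```cuda
#include <cstdio>

// Prints the compute capability of every visible device in the same
// "major * 10 + minor" encoding that CUDA_ARCH and -arch=sm_XX use
// (e.g. 61 for a Pascal GTX 1080).
int main() {
  int count = 0;
  if (cudaGetDeviceCount(&count) != cudaSuccess || count == 0) {
    std::fprintf(stderr, "no CUDA devices found\n");
    return 1;
  }
  for (int dev = 0; dev < count; dev++) {
    cudaDeviceProp props;
    if (cudaGetDeviceProperties(&props, dev) != cudaSuccess) {
      continue;
    }
    std::printf("device %d: %s -> -DCUDA_ARCH=%d\n",
                dev, props.name, props.major * 10 + props.minor);
  }
  return 0;
}
```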
```diff
@@ -203,27 +218,24 @@ MinhashCudaGenerator *mhcuda_init(
   }
   auto gen = std::unique_ptr<MinhashCudaGenerator>(
       new MinhashCudaGenerator(dim, samples, devs, verbosity));
-  auto res = mhcuda_init_internal(gen.get(), seed, devs);
-  if (res != mhcudaSuccess) {
-    if (status) *status = res;
-    return nullptr;
-  }
+  #define CHECK_SUCCESS(x) do { \
+    auto res = x; \
+    if (res != mhcudaSuccess) { \
+      if (status) *status = res; \
+      return nullptr; \
+    } \
+  } while(false)
+  CHECK_SUCCESS(mhcuda_init_internal(gen.get(), seed, devs));
   if (verbosity > 1) {
-    res = print_memory_stats(devs);
-    if (res != mhcudaSuccess) {
-      if (status) *status = res;
-      return nullptr;
-    }
-  }
-  res = setup_weighted_minhash(dim, devs, verbosity);
-  if (res != mhcudaSuccess) {
-    if (status) *status = res;
-    return nullptr;
+    CHECK_SUCCESS(print_memory_stats(devs));
   }
+  CHECK_SUCCESS(setup_weighted_minhash(dim, devs, verbosity));
   return gen.release();
+  #undef CHECK_SUCCESS
 }
 
-MinhashCudaGeneratorParameters mhcuda_get_parameters(const MinhashCudaGenerator *gen) {
+MinhashCudaGeneratorParameters mhcuda_get_parameters(
+    const MinhashCudaGenerator *gen) {
   if (gen == nullptr) {
     return {};
   }
```
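The repeated "store the result, set *status, return nullptr" blocks are folded into a local CHECK_SUCCESS macro. The do { ... } while(false) wrapper is the usual trick that makes a multi-statement macro behave like a single statement (safe after an unbraced if, and requiring the trailing semicolon). A small self-contained illustration of the same idiom, with hypothetical names rather than this library's code:

```cuda
#include <cstdio>

enum Result { kSuccess, kFailure };

Result step_one() { return kSuccess; }
Result step_two() { return kFailure; }

// Same shape as CHECK_SUCCESS in the diff: evaluate once, bail out of the
// enclosing function on error. do { } while(false) keeps it a single statement.
#define CHECK(x) do { \
    Result res = (x); \
    if (res != kSuccess) { \
      std::fprintf(stderr, "failed: %s\n", #x); \
      return res; \
    } \
  } while (false)

Result run() {
  CHECK(step_one());
  CHECK(step_two());   // fails here and returns, skipping the rest
  std::puts("never reached");
  return kSuccess;
}

#undef CHECK

int main() { return run() == kSuccess ? 0 : 1; }
```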
```diff
@@ -241,9 +253,9 @@ MHCUDAResult mhcuda_retrieve_random_vars(
   auto &devs = gen->devs;
   size_t const_size = gen->dim * gen->samples * sizeof(float);
   CUCH(cudaSetDevice(devs[0]), mhcudaNoSuchDevice);
-  CUCH(cudaMemcpy(rs, gen->rs[0].get(), const_size, cudaMemcpyDeviceToHost),
+  CUCH(cudaMemcpyAsync(rs, gen->rs[0].get(), const_size, cudaMemcpyDeviceToHost),
       mhcudaMemoryCopyError);
-  CUCH(cudaMemcpy(ln_cs, gen->ln_cs[0].get(), const_size, cudaMemcpyDeviceToHost),
+  CUCH(cudaMemcpyAsync(ln_cs, gen->ln_cs[0].get(), const_size, cudaMemcpyDeviceToHost),
       mhcudaMemoryCopyError);
   CUCH(cudaMemcpy(betas, gen->betas[0].get(), const_size, cudaMemcpyDeviceToHost),
       mhcudaMemoryCopyError);
```
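The first two device-to-host copies become cudaMemcpyAsync, so they are merely enqueued; the last copy stays a blocking cudaMemcpy, and because all three run on the same (default) stream it completes only after the queued ones have finished. Worth keeping in mind that async copies only truly overlap when the host buffers are pinned; with pageable memory they degrade to synchronous behavior. A hedged sketch of the pattern with hypothetical buffers:

```cuda
#include <cstdio>

// Enqueue two device-to-host copies, then block once at the end instead of
// serializing on every copy. cudaMallocHost gives pinned host memory, which is
// what lets the async variants overlap with other work.
int main() {
  const size_t n = 1 << 20;
  float *d_a, *d_b, *h_a, *h_b;
  cudaMalloc(&d_a, n * sizeof(float));
  cudaMalloc(&d_b, n * sizeof(float));
  cudaMallocHost(&h_a, n * sizeof(float));   // pinned
  cudaMallocHost(&h_b, n * sizeof(float));   // pinned
  cudaMemcpyAsync(h_a, d_a, n * sizeof(float), cudaMemcpyDeviceToHost);
  cudaMemcpyAsync(h_b, d_b, n * sizeof(float), cudaMemcpyDeviceToHost);
  cudaDeviceSynchronize();                   // both copies are done after this
  std::puts("copies finished");
  cudaFreeHost(h_a); cudaFreeHost(h_b);
  cudaFree(d_a); cudaFree(d_b);
  return 0;
}
```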
```diff
@@ -270,6 +282,20 @@ MHCUDAResult mhcuda_assign_random_vars(
 static std::vector<uint32_t> calc_best_split(
     const uint32_t *rows, uint32_t length, const std::vector<int> &devs,
     const std::vector<uint32_t> &sizes) {
+  // We need to distribute `length` rows into `devs.size()` devices.
+  // The number of items is different in every row.
+  // So we record each 2 possibilities <> the optimal boundary.
+  // 2 devices -> 2 variants
+  // 4 -> 8
+  // 8 -> 128
+  // 10 -> 512
+  // 16 -> 32768
+  // Then the things get tough. The complexity is O(2^(2(n - 1)))
+  // Hopefully, we will not see more GPUs in a single node soon.
+  // We evaluate each variant by the cumulative cost function.
+  // Every call to mhcuda_calc() can grow the buffers a little; the cost function
+  // optimizes for the number of reallocations first and the imbalance second.
+
   uint32_t ideal_split = rows[length] / devs.size();
   std::vector<std::vector<uint32_t>> variants;
   for (size_t devi = 0; devi < devs.size(); devi++) {
```
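The new comment describes the search: every boundary between adjacent devices gets two candidate row positions around the ideal even split, which is consistent with the listed counts of 2^(n-1) variants for n devices, and each variant is then scored by a two-part cost that punishes forced buffer reallocations first and load imbalance second. A toy, hedged sketch of scoring the candidate boundaries for a two-device split, with made-up numbers and simplified names rather than the library's code:

```cuda
#include <cstdint>
#include <cstdio>
#include <tuple>

// Toy version of the cost in calc_best_split() for two devices and one boundary.
// rows[] is the CSR indptr; sizes[] are the item counts each device's buffers can
// already hold; ideal is rows[length] / 2. The first component squares the items
// that would force a reallocation, the second the deviation from an even split.
std::tuple<uint32_t, uint32_t> variant_cost(const uint32_t *rows, uint32_t boundary,
                                            uint32_t length, const uint32_t *sizes,
                                            uint32_t ideal) {
  uint32_t cost_realloc = 0, cost_imbalance = 0;
  uint32_t prev = 0;
  uint32_t ends[2] = {boundary, length};
  for (int dev = 0; dev < 2; dev++) {
    uint32_t rdelta = rows[ends[dev]] - rows[prev];
    uint32_t over = rdelta > sizes[dev] ? rdelta - sizes[dev] : 0;
    uint32_t skew = rdelta > ideal ? rdelta - ideal : ideal - rdelta;
    cost_realloc += over * over;
    cost_imbalance += skew * skew;
    prev = ends[dev];
  }
  return std::make_tuple(cost_realloc, cost_imbalance);
}

int main() {
  const uint32_t rows[] = {0, 4, 9, 15, 20};   // 4 rows, 20 items in total
  const uint32_t sizes[] = {10, 10};           // current buffer capacities
  for (uint32_t boundary = 1; boundary < 4; boundary++) {
    std::tuple<uint32_t, uint32_t> c = variant_cost(rows, boundary, 4, sizes, 10);
    std::printf("boundary after row %u: cost = (%u, %u)\n",
                boundary, std::get<0>(c), std::get<1>(c));
  }
  return 0;
}
```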
```diff
@@ -299,15 +325,31 @@
   }
   assert(!variants.empty());
   std::vector<uint32_t> *best = nullptr;
-  uint32_t min_cost = 0xFFFFFFFFu;
+  struct Cost : public std::tuple<uint32_t, uint32_t> {
+    Cost() = default;
+
+    Cost(const std::tuple<uint32_t, uint32_t>& other)
+        : std::tuple<uint32_t, uint32_t>(other) {}
+
+    Cost& operator+=(const std::tuple<uint32_t, uint32_t>& other) {
+      std::get<0>(*this) += std::get<0>(other);
+      std::get<1>(*this) += std::get<1>(other);
+      return *this;
+    }
+  };
+  Cost min_cost = std::make_tuple(0xFFFFFFFFu, 0xFFFFFFFFu);
   for (auto &v : variants) {
-    uint32_t cost = 0;
+    Cost cost;
     for (size_t i = 0; i < devs.size(); i++) {
       uint32_t row = v[i], prev_row = (i > 0)? v[i - 1] : 0;
-      uint32_t diff = rows[row] - rows[prev_row] - sizes[i];
-      if (diff > 0) {
-        cost += diff * diff;
-      }
+      uint32_t rdelta = rows[row] - rows[prev_row];
+      uint32_t diff1 = (rdelta > sizes[i])? (rdelta - sizes[i]) : 0;
+      diff1 *= diff1;
+      uint32_t diff2 = (rdelta > ideal_split)? (rdelta - ideal_split)
+                                             : (ideal_split - rdelta);
+      diff2 *= diff2;
+      auto diff = std::make_tuple(diff1, diff2);
+      cost += diff;
     }
     if (cost < min_cost) {
       best = &v;
```
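Cost derives from std::tuple<uint32_t, uint32_t> precisely to reuse tuple's lexicographic operator<: the reallocation term is compared first and the imbalance term only breaks ties. A two-assertion standalone sketch of that ordering:

```cuda
#include <cassert>
#include <tuple>

int main() {
  // std::tuple compares element by element, left to right.
  auto a = std::make_tuple(0u, 1000000u);   // no reallocation, badly imbalanced
  auto b = std::make_tuple(1u, 0u);         // one reallocation, perfectly balanced
  assert(a < b);          // avoiding reallocation wins regardless of the second term
  assert(std::make_tuple(1u, 2u) < std::make_tuple(1u, 3u));   // tie broken by term 2
  return 0;
}
```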
```diff
@@ -392,6 +434,7 @@ static void binpack(
     const MinhashCudaGenerator *gen, const uint32_t *rows,
     const std::vector<uint32_t> &split, const std::vector<int> &sample_deltas,
     std::vector<std::vector<int32_t>> *plans, std::vector<uint32_t> *grid_sizes) {
+  // https://blog.sourced.tech/post/minhashcuda/
   const int32_t ideal_binavgcount = 20;
   auto &devs = gen->devs;
   int verbosity = gen->verbosity;
```
```diff
@@ -523,7 +566,7 @@ MHCUDAResult mhcuda_calc(
         rows, length, output);
   auto &devs = gen->devs;
   INFO("Preparing...\n");
-  std::vector<uint32_t> split = calc_best_split(rows, length, gen->devs, gen->sizes);
+  auto split = calc_best_split(rows, length, gen->devs, gen->sizes);
   if (verbosity > 1) {
     dump_vector(split, "split");
   }
```
