Reduce the number of registers used by weighted_minhash_cuda from 35 to 31 (mark `ci` volatile)

vmarkovtsev · vmarkovtsev · commit 5d472f6524db · 2017-11-05T17:58:33.000+01:00
diff --git a/kernel.cu b/kernel.cu
@@ -40,7 +40,7 @@ __global__ void log_cuda(uint32_t size, float *v) {
 /// @param rs Gamma(2,1)-random samples. The length must be the product of
 ///           number of processed samples (vectors) by the number of dimensions.
 /// @param ln_cs Logarithm over the gamma(2,1) distribution. Same length as rs.
-/// @param betas Uniformly [0,1] distributed samples. Same length as rs.
+/// @param betas Uniformly [0, 1] distributed samples. Same length as rs.
 /// @param weights CSR's data.
 /// @param cols CSR's indices.
 /// @param rows CSR's indptrs.
@@ -94,7 +94,7 @@ __global__ void weighted_minhash_cuda(
     }
     const float w = logf(weights[index - device_wc_offset]);
     const uint32_t d = cols[index - device_wc_offset];
-    int64_t ci = static_cast<int64_t>(sample_offset) * d_dim + d;
+    volatile int64_t ci = static_cast<int64_t>(sample_offset) * d_dim + d;
     #pragma unroll 4
     for (int s = 0; s < sample_delta; s++, ci += d_dim) {
       // We apply the logarithm trick here: log (a / z) = log a - log z
diff --git a/setup.py b/setup.py
@@ -46,7 +46,7 @@ def is_pure(self):
 setup(
     name="libMHCUDA",
     description="Accelerated Weighted MinHash-ing on GPU",
-    version="1.1.2",
+    version="1.1.3",
     license="MIT",
     author="Vadim Markovtsev",
     author_email="vadim@sourced.tech",