Skip to content

Commit 5d472f6

Browse files
committed
Reduce the number of registers 35->31
1 parent f52ed95 commit 5d472f6

File tree

2 files changed

+3
-3
lines changed

2 files changed

+3
-3
lines changed

kernel.cu

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -40,7 +40,7 @@ __global__ void log_cuda(uint32_t size, float *v) {
4040
/// @param rs Gamma(2,1)-random samples. The length must be the product of
4141
/// number of processed samples (vectors) by the number of dimensions.
4242
/// @param ln_cs Logarithms of Gamma(2,1)-random samples. Same length as rs.
43-
/// @param betas Uniformly [0,1] distributed samples. Same length as rs.
43+
/// @param betas Uniformly [0, 1] distributed samples. Same length as rs.
4444
/// @param weights CSR's data.
4545
/// @param cols CSR's indices.
4646
/// @param rows CSR's indptrs.
@@ -94,7 +94,7 @@ __global__ void weighted_minhash_cuda(
9494
}
9595
const float w = logf(weights[index - device_wc_offset]);
9696
const uint32_t d = cols[index - device_wc_offset];
97-
int64_t ci = static_cast<int64_t>(sample_offset) * d_dim + d;
97+
volatile int64_t ci = static_cast<int64_t>(sample_offset) * d_dim + d;
9898
#pragma unroll 4
9999
for (int s = 0; s < sample_delta; s++, ci += d_dim) {
100100
// We apply the logarithm trick here: log (a / z) = log a - log z

setup.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -46,7 +46,7 @@ def is_pure(self):
4646
setup(
4747
name="libMHCUDA",
4848
description="Accelerated Weighted MinHash-ing on GPU",
49-
version="1.1.2",
49+
version="1.1.3",
5050
license="MIT",
5151
author="Vadim Markovtsev",
5252
author_email="[email protected]",

0 commit comments

Comments
 (0)