@@ -56,6 +56,21 @@ static std::vector<int> setup_devices(uint32_t devices, int verbosity) {
         INFO("failed to validate device %d", dev);
         devs.pop_back();
       }
+      cudaDeviceProp props;
+      auto err = cudaGetDeviceProperties(&props, dev);
+      if (err != cudaSuccess) {
+        INFO("failed to cudaGetDeviceProperties(%d): %s\n",
+             dev, cudaGetErrorString(err));
+        devs.pop_back();
+      }
+      if (props.major != (CUDA_ARCH / 10) || props.minor != (CUDA_ARCH % 10)) {
+        INFO("compute capability mismatch for device %d: wanted %d.%d, have "
+             "%d.%d\n>>>> you may want to build kmcuda with -DCUDA_ARCH=%d "
+             "(refer to \"Building\" in README.md)\n",
+             dev, CUDA_ARCH / 10, CUDA_ARCH % 10, props.major, props.minor,
+             props.major * 10 + props.minor);
+        devs.pop_back();
+      }
     }
     devices >>= 1;
   }
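Note on the new check: CUDA_ARCH packs the major and minor compute capability into one integer (e.g. 61 for 6.1), so `/ 10` and `% 10` recover the two halves that are compared against `cudaDeviceProp::major` and `::minor`. Below is a minimal standalone sketch of the same probing pattern; the default CUDA_ARCH value and the `main()` wrapper are assumptions for illustration, not part of this patch.

// Hypothetical standalone probe mirroring the check above.
// Assumes CUDA_ARCH is supplied at build time, e.g. -DCUDA_ARCH=61 (6.1).
#include <cstdio>
#include <cuda_runtime.h>

#ifndef CUDA_ARCH
#define CUDA_ARCH 61  // assumption for this sketch only
#endif

int main() {
  int count = 0;
  if (cudaGetDeviceCount(&count) != cudaSuccess) {
    return 1;
  }
  for (int dev = 0; dev < count; dev++) {
    cudaDeviceProp props;
    cudaError_t err = cudaGetDeviceProperties(&props, dev);
    if (err != cudaSuccess) {
      printf("device %d: %s\n", dev, cudaGetErrorString(err));
      continue;
    }
    // CUDA_ARCH / 10 -> major, CUDA_ARCH % 10 -> minor
    bool match = props.major == CUDA_ARCH / 10 && props.minor == CUDA_ARCH % 10;
    printf("device %d: compute %d.%d (%s binary built for %d.%d)\n",
           dev, props.major, props.minor, match ? "matches" : "does NOT match",
           CUDA_ARCH / 10, CUDA_ARCH % 10);
  }
  return 0;
}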
@@ -203,27 +218,24 @@ MinhashCudaGenerator *mhcuda_init(
   }
   auto gen = std::unique_ptr<MinhashCudaGenerator>(
       new MinhashCudaGenerator(dim, samples, devs, verbosity));
-  auto res = mhcuda_init_internal(gen.get(), seed, devs);
-  if (res != mhcudaSuccess) {
-    if (status) *status = res;
-    return nullptr;
-  }
+  #define CHECK_SUCCESS(x) do { \
+    auto res = x; \
+    if (res != mhcudaSuccess) { \
+      if (status) *status = res; \
+      return nullptr; \
+    } \
+  } while (false)
+  CHECK_SUCCESS(mhcuda_init_internal(gen.get(), seed, devs));
   if (verbosity > 1) {
-    res = print_memory_stats(devs);
-    if (res != mhcudaSuccess) {
-      if (status) *status = res;
-      return nullptr;
-    }
-  }
-  res = setup_weighted_minhash(dim, devs, verbosity);
-  if (res != mhcudaSuccess) {
-    if (status) *status = res;
-    return nullptr;
+    CHECK_SUCCESS(print_memory_stats(devs));
   }
+  CHECK_SUCCESS(setup_weighted_minhash(dim, devs, verbosity));
   return gen.release();
+  #undef CHECK_SUCCESS
 }

-MinhashCudaGeneratorParameters mhcuda_get_parameters(const MinhashCudaGenerator *gen) {
+MinhashCudaGeneratorParameters mhcuda_get_parameters(
+    const MinhashCudaGenerator *gen) {
   if (gen == nullptr) {
     return {};
   }
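The CHECK_SUCCESS macro introduced here relies on the classic `do { ... } while (false)` idiom: the multi-statement body expands to a single statement, so the macro stays safe inside an unbraced `if`/`else` and the trailing semicolon binds to the `while`. A self-contained sketch of the same idiom, with hypothetical names (`Status`, `kOk`, `try_step`) that are not from this codebase:

#include <cstdio>

enum Status { kOk, kFail };

static Status try_step(int i) { return i == 2 ? kFail : kOk; }

// Wrapping the body in do { ... } while (false) makes the macro expand to a
// single statement, so `if (x) CHECK(...); else ...` still parses correctly.
#define CHECK(call)            \
  do {                         \
    Status s = (call);         \
    if (s != kOk) {            \
      printf("step failed\n"); \
      return s;                \
    }                          \
  } while (false)

static Status run() {
  for (int i = 0; i < 4; i++) {
    if (i % 2 == 0) CHECK(try_step(i));  // safe even without braces
    else printf("skipping %d\n", i);
  }
  return kOk;
}

int main() { return run() == kOk ? 0 : 1; }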
@@ -241,9 +253,9 @@ MHCUDAResult mhcuda_retrieve_random_vars(
   auto &devs = gen->devs;
   size_t const_size = gen->dim * gen->samples * sizeof(float);
   CUCH(cudaSetDevice(devs[0]), mhcudaNoSuchDevice);
-  CUCH(cudaMemcpy(rs, gen->rs[0].get(), const_size, cudaMemcpyDeviceToHost),
+  CUCH(cudaMemcpyAsync(rs, gen->rs[0].get(), const_size, cudaMemcpyDeviceToHost),
        mhcudaMemoryCopyError);
-  CUCH(cudaMemcpy(ln_cs, gen->ln_cs[0].get(), const_size, cudaMemcpyDeviceToHost),
+  CUCH(cudaMemcpyAsync(ln_cs, gen->ln_cs[0].get(), const_size, cudaMemcpyDeviceToHost),
        mhcudaMemoryCopyError);
   CUCH(cudaMemcpy(betas, gen->betas[0].get(), const_size, cudaMemcpyDeviceToHost),
        mhcudaMemoryCopyError);
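The first two transfers become asynchronous while the last one stays a blocking `cudaMemcpy`, presumably relying on the legacy default stream for ordering: the blocking copy is queued after the async ones and returns only when its own transfer is done, so all three host buffers are filled before the function returns. A standalone sketch of that ordering, with made-up buffer names unrelated to the generator:

#include <cstdio>
#include <cuda_runtime.h>

#define CHECK(call)                          \
  do {                                       \
    cudaError_t e = (call);                  \
    if (e != cudaSuccess) {                  \
      printf("%s\n", cudaGetErrorString(e)); \
      return 1;                              \
    }                                        \
  } while (false)

int main() {
  const size_t n = 1 << 20;
  float *d_a, *d_b, *d_c;
  CHECK(cudaMalloc(&d_a, n * sizeof(float)));
  CHECK(cudaMalloc(&d_b, n * sizeof(float)));
  CHECK(cudaMalloc(&d_c, n * sizeof(float)));
  static float h_a[1 << 20], h_b[1 << 20], h_c[1 << 20];

  // Two async copies queued on the legacy default stream...
  CHECK(cudaMemcpyAsync(h_a, d_a, n * sizeof(float), cudaMemcpyDeviceToHost));
  CHECK(cudaMemcpyAsync(h_b, d_b, n * sizeof(float), cudaMemcpyDeviceToHost));
  // ...followed by a blocking copy on the same stream: it starts only after
  // the queued copies have finished and returns only when its own transfer
  // completes, so all three host buffers are valid past this line.
  CHECK(cudaMemcpy(h_c, d_c, n * sizeof(float), cudaMemcpyDeviceToHost));

  CHECK(cudaFree(d_a));
  CHECK(cudaFree(d_b));
  CHECK(cudaFree(d_c));
  return 0;
}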
@@ -270,6 +282,20 @@ MHCUDAResult mhcuda_assign_random_vars(
 static std::vector<uint32_t> calc_best_split(
     const uint32_t *rows, uint32_t length, const std::vector<int> &devs,
     const std::vector<uint32_t> &sizes) {
+  // We need to distribute `length` rows into `devs.size()` devices.
+  // The number of items is different in every row.
+  // So we record both split candidates around each optimal boundary.
+  // 2 devices -> 2 variants
+  // 4 -> 8
+  // 8 -> 128
+  // 10 -> 512
+  // 16 -> 32768
+  // Then things get tough. The complexity is O(2^(2(n - 1))).
+  // Hopefully, we will not see more GPUs in a single node soon.
+  // We evaluate each variant by the cumulative cost function.
+  // Every call to mhcuda_calc() can grow the buffers a little; the cost function
+  // optimizes for the number of reallocations first and the imbalance second.
+
   uint32_t ideal_split = rows[length] / devs.size();
   std::vector<std::vector<uint32_t>> variants;
   for (size_t devi = 0; devi < devs.size(); devi++) {
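The variant counts listed in the comment follow 2^(n - 1): with n devices there are n - 1 internal boundaries, and each boundary contributes two candidate rows. A throwaway sketch (not library code) that reproduces the table:

#include <cstdint>
#include <cstdio>

// Reproduces the variant counts from the comment above: two candidate rows
// per each of the n - 1 internal boundaries gives 2^(n - 1) split variants.
int main() {
  const int device_counts[] = {2, 4, 8, 10, 16};
  for (int n : device_counts) {
    uint64_t variants = uint64_t(1) << (n - 1);
    printf("%2d devices -> %llu variants\n", n,
           static_cast<unsigned long long>(variants));
  }
  return 0;
}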
@@ -299,15 +325,31 @@ static std::vector<uint32_t> calc_best_split(
   }
   assert(!variants.empty());
   std::vector<uint32_t> *best = nullptr;
-  uint32_t min_cost = 0xFFFFFFFFu;
+  struct Cost : public std::tuple<uint32_t, uint32_t> {
+    Cost() = default;
+
+    Cost(const std::tuple<uint32_t, uint32_t>& other)
+        : std::tuple<uint32_t, uint32_t>(other) {}
+
+    Cost& operator+=(const std::tuple<uint32_t, uint32_t>& other) {
+      std::get<0>(*this) += std::get<0>(other);
+      std::get<1>(*this) += std::get<1>(other);
+      return *this;
+    }
+  };
+  Cost min_cost = std::make_tuple(0xFFFFFFFFu, 0xFFFFFFFFu);
   for (auto &v : variants) {
-    uint32_t cost = 0;
+    Cost cost;
     for (size_t i = 0; i < devs.size(); i++) {
       uint32_t row = v[i], prev_row = (i > 0)? v[i - 1] : 0;
-      uint32_t diff = rows[row] - rows[prev_row] - sizes[i];
-      if (diff > 0) {
-        cost += diff * diff;
-      }
+      uint32_t rdelta = rows[row] - rows[prev_row];
+      uint32_t diff1 = (rdelta > sizes[i])? (rdelta - sizes[i]) : 0;
+      diff1 *= diff1;
+      uint32_t diff2 = (rdelta > ideal_split)? (rdelta - ideal_split)
+                                             : (ideal_split - rdelta);
+      diff2 *= diff2;
+      auto diff = std::make_tuple(diff1, diff2);
+      cost += diff;
     }
     if (cost < min_cost) {
       best = &v;
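Deriving Cost from std::tuple<uint32_t, uint32_t> buys the inherited lexicographic operator<: the reallocation penalty (first element) always dominates, and the imbalance penalty (second element) only breaks ties, which matches the comment's "reallocations first, imbalance second". A small self-contained sketch of that comparison behaviour, independent of the generator code:

#include <cassert>
#include <cstdint>
#include <tuple>

int main() {
  using Cost = std::tuple<uint32_t, uint32_t>;  // {realloc penalty, imbalance}

  Cost no_realloc_but_skewed{0, 1000000};
  Cost small_realloc_balanced{1, 0};

  // Lexicographic comparison: a zero first component beats any nonzero one,
  // no matter how large the second component is.
  assert(no_realloc_but_skewed < small_realloc_balanced);

  // The second component only matters when the first components are equal.
  Cost a{5, 10}, b{5, 20};
  assert(a < b);
  return 0;
}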
@@ -392,6 +434,7 @@ static void binpack(
     const MinhashCudaGenerator *gen, const uint32_t *rows,
     const std::vector<uint32_t> &split, const std::vector<int> &sample_deltas,
     std::vector<std::vector<int32_t>> *plans, std::vector<uint32_t> *grid_sizes) {
+  // https://blog.sourced.tech/post/minhashcuda/
   const int32_t ideal_binavgcount = 20;
   auto &devs = gen->devs;
   int verbosity = gen->verbosity;
@@ -523,7 +566,7 @@ MHCUDAResult mhcuda_calc(
        rows, length, output);
   auto &devs = gen->devs;
   INFO("Preparing...\n");
-  std::vector<uint32_t> split = calc_best_split(rows, length, gen->devs, gen->sizes);
+  auto split = calc_best_split(rows, length, gen->devs, gen->sizes);
   if (verbosity > 1) {
     dump_vector(split, "split");
   }