@@ -111,49 +111,67 @@ struct Fused_multihead_attention_params_v2
111
111
// //////////////////////////////////////////////////////////////////////////////////////////////////
112
112
// Kernel image arrays for the fused multi-head attention v2 kernels.
// Each symbol is only declared here (extern); the definitions live elsewhere.
// The symbol name carries the data type (fp16 / int8), two numeric
// parameters (e.g. 128_64) and, where present, the SM variant (sm75 / sm80 /
// sm86). The int8 family also has entries without an SM suffix.
// The kernel meta-info table further down references each array together
// with its matching `<name>_len` symbol; the kSM_86 rows use the sm86 arrays.
// NOTE(review): "cubin" presumably means a compiled CUDA binary image, and the
// two numbers presumably match the first two numeric fields of the table rows
// (e.g. `DATA_TYPE_FP16, 128, 64`) -- confirm against the struct definition.
extern unsigned char fused_multihead_attention_v2_fp16_128_64_kernel_sm75_cubin[];
113
113
extern unsigned char fused_multihead_attention_v2_fp16_128_64_kernel_sm80_cubin[];
114
+ extern unsigned char fused_multihead_attention_v2_fp16_128_64_kernel_sm86_cubin[];
114
115
extern unsigned char fused_multihead_attention_v2_fp16_256_64_kernel_sm75_cubin[];
115
116
extern unsigned char fused_multihead_attention_v2_fp16_256_64_kernel_sm80_cubin[];
117
+ extern unsigned char fused_multihead_attention_v2_fp16_256_64_kernel_sm86_cubin[];
116
118
extern unsigned char fused_multihead_attention_v2_fp16_384_64_kernel_sm75_cubin[];
117
119
extern unsigned char fused_multihead_attention_v2_fp16_384_64_kernel_sm80_cubin[];
120
+ extern unsigned char fused_multihead_attention_v2_fp16_384_64_kernel_sm86_cubin[];
118
121
extern unsigned char fused_multihead_attention_v2_fp16_64_64_kernel_sm75_cubin[];
119
122
extern unsigned char fused_multihead_attention_v2_fp16_64_64_kernel_sm80_cubin[];
123
+ extern unsigned char fused_multihead_attention_v2_fp16_64_64_kernel_sm86_cubin[];
120
124
extern unsigned char fused_multihead_attention_v2_fp16_96_64_kernel_sm75_cubin[];
121
125
extern unsigned char fused_multihead_attention_v2_fp16_96_64_kernel_sm80_cubin[];
126
+ extern unsigned char fused_multihead_attention_v2_fp16_96_64_kernel_sm86_cubin[];
122
127
extern unsigned char fused_multihead_attention_v2_int8_128_64_kernel_cubin[];
123
128
extern unsigned char fused_multihead_attention_v2_int8_128_64_kernel_sm75_cubin[];
124
129
extern unsigned char fused_multihead_attention_v2_int8_128_64_kernel_sm80_cubin[];
130
+ extern unsigned char fused_multihead_attention_v2_int8_128_64_kernel_sm86_cubin[];
125
131
extern unsigned char fused_multihead_attention_v2_int8_192_64_kernel_cubin[];
126
132
extern unsigned char fused_multihead_attention_v2_int8_192_64_kernel_sm75_cubin[];
127
133
extern unsigned char fused_multihead_attention_v2_int8_192_64_kernel_sm80_cubin[];
134
+ extern unsigned char fused_multihead_attention_v2_int8_192_64_kernel_sm86_cubin[];
128
135
extern unsigned char fused_multihead_attention_v2_int8_256_64_kernel_cubin[];
129
136
extern unsigned char fused_multihead_attention_v2_int8_256_64_kernel_sm75_cubin[];
130
137
extern unsigned char fused_multihead_attention_v2_int8_256_64_kernel_sm80_cubin[];
138
+ extern unsigned char fused_multihead_attention_v2_int8_256_64_kernel_sm86_cubin[];
131
139
extern unsigned char fused_multihead_attention_v2_int8_384_64_kernel_cubin[];
132
140
extern unsigned char fused_multihead_attention_v2_int8_384_64_kernel_sm75_cubin[];
133
141
extern unsigned char fused_multihead_attention_v2_int8_384_64_kernel_sm80_cubin[];
142
+ extern unsigned char fused_multihead_attention_v2_int8_384_64_kernel_sm86_cubin[];
134
143
135
144
// Length symbols for the fused multi-head attention v2 kernel image arrays.
// Each `<name>_cubin_len` is only declared here (extern) and is paired with
// the `<name>_cubin` array of the same stem: the kernel meta-info table
// further down passes the array and its `_len` symbol as adjacent fields.
// NOTE(review): the unit is presumably bytes (the arrays are unsigned char)
// -- confirm against the definitions.
extern unsigned int fused_multihead_attention_v2_fp16_128_64_kernel_sm75_cubin_len;
136
145
extern unsigned int fused_multihead_attention_v2_fp16_128_64_kernel_sm80_cubin_len;
146
+ extern unsigned int fused_multihead_attention_v2_fp16_128_64_kernel_sm86_cubin_len;
137
147
extern unsigned int fused_multihead_attention_v2_fp16_256_64_kernel_sm75_cubin_len;
138
148
extern unsigned int fused_multihead_attention_v2_fp16_256_64_kernel_sm80_cubin_len;
149
+ extern unsigned int fused_multihead_attention_v2_fp16_256_64_kernel_sm86_cubin_len;
139
150
extern unsigned int fused_multihead_attention_v2_fp16_384_64_kernel_sm75_cubin_len;
140
151
extern unsigned int fused_multihead_attention_v2_fp16_384_64_kernel_sm80_cubin_len;
152
+ extern unsigned int fused_multihead_attention_v2_fp16_384_64_kernel_sm86_cubin_len;
141
153
extern unsigned int fused_multihead_attention_v2_fp16_64_64_kernel_sm75_cubin_len;
142
154
extern unsigned int fused_multihead_attention_v2_fp16_64_64_kernel_sm80_cubin_len;
155
+ extern unsigned int fused_multihead_attention_v2_fp16_64_64_kernel_sm86_cubin_len;
143
156
extern unsigned int fused_multihead_attention_v2_fp16_96_64_kernel_sm75_cubin_len;
144
157
extern unsigned int fused_multihead_attention_v2_fp16_96_64_kernel_sm80_cubin_len;
158
+ extern unsigned int fused_multihead_attention_v2_fp16_96_64_kernel_sm86_cubin_len;
145
159
extern unsigned int fused_multihead_attention_v2_int8_128_64_kernel_cubin_len;
146
160
extern unsigned int fused_multihead_attention_v2_int8_128_64_kernel_sm75_cubin_len;
147
161
extern unsigned int fused_multihead_attention_v2_int8_128_64_kernel_sm80_cubin_len;
162
+ extern unsigned int fused_multihead_attention_v2_int8_128_64_kernel_sm86_cubin_len;
148
163
extern unsigned int fused_multihead_attention_v2_int8_192_64_kernel_cubin_len;
149
164
extern unsigned int fused_multihead_attention_v2_int8_192_64_kernel_sm75_cubin_len;
150
165
extern unsigned int fused_multihead_attention_v2_int8_192_64_kernel_sm80_cubin_len;
166
+ extern unsigned int fused_multihead_attention_v2_int8_192_64_kernel_sm86_cubin_len;
151
167
extern unsigned int fused_multihead_attention_v2_int8_256_64_kernel_cubin_len;
152
168
extern unsigned int fused_multihead_attention_v2_int8_256_64_kernel_sm75_cubin_len;
153
169
extern unsigned int fused_multihead_attention_v2_int8_256_64_kernel_sm80_cubin_len;
170
+ extern unsigned int fused_multihead_attention_v2_int8_256_64_kernel_sm86_cubin_len;
154
171
extern unsigned int fused_multihead_attention_v2_int8_384_64_kernel_cubin_len;
155
172
extern unsigned int fused_multihead_attention_v2_int8_384_64_kernel_sm75_cubin_len;
156
173
extern unsigned int fused_multihead_attention_v2_int8_384_64_kernel_sm80_cubin_len;
174
+ extern unsigned int fused_multihead_attention_v2_int8_384_64_kernel_sm86_cubin_len;
157
175
158
176
static const struct FusedMultiHeadAttentionKernelMetaInfoV2
159
177
{
@@ -348,72 +366,78 @@ static const struct FusedMultiHeadAttentionKernelMetaInfoV2
348
366
349
367
// GA10x
350
368
// Note: For GA10x, keep only kernels whose sharedMemBytes < 100 KiB
351
- {DATA_TYPE_FP16, 64 , 64 , kSM_86 , fused_multihead_attention_v2_fp16_64_64_kernel_sm80_cubin ,
352
- fused_multihead_attention_v2_fp16_64_64_kernel_sm80_cubin_len ,
369
+ {DATA_TYPE_FP16, 64 , 64 , kSM_86 , fused_multihead_attention_v2_fp16_64_64_kernel_sm86_cubin ,
370
+ fused_multihead_attention_v2_fp16_64_64_kernel_sm86_cubin_len ,
353
371
" fused_multihead_attention_v2_fp16_64_64_kernel_sm80" , 32768 , 128 , 0 , false },
354
- {DATA_TYPE_FP16, 96 , 64 , kSM_86 , fused_multihead_attention_v2_fp16_96_64_kernel_sm80_cubin ,
355
- fused_multihead_attention_v2_fp16_96_64_kernel_sm80_cubin_len ,
372
+ {DATA_TYPE_FP16, 96 , 64 , kSM_86 , fused_multihead_attention_v2_fp16_96_64_kernel_sm86_cubin ,
373
+ fused_multihead_attention_v2_fp16_96_64_kernel_sm86_cubin_len ,
356
374
" fused_multihead_attention_v2_fp16_96_64_kernel_sm80" , 49152 , 128 , 0 , false },
357
- {DATA_TYPE_FP16, 128 , 64 , kSM_86 , fused_multihead_attention_v2_fp16_128_64_kernel_sm80_cubin ,
358
- fused_multihead_attention_v2_fp16_128_64_kernel_sm80_cubin_len ,
375
+ {DATA_TYPE_FP16, 128 , 64 , kSM_86 , fused_multihead_attention_v2_fp16_128_64_kernel_sm86_cubin ,
376
+ fused_multihead_attention_v2_fp16_128_64_kernel_sm86_cubin_len ,
359
377
" fused_multihead_attention_v2_fp16_128_64_kernel_sm80_noloop" , 40960 , 128 , 32 , false },
360
- {DATA_TYPE_FP16, 128 , 64 , kSM_86 , fused_multihead_attention_v2_fp16_128_64_kernel_sm80_cubin ,
361
- fused_multihead_attention_v2_fp16_128_64_kernel_sm80_cubin_len ,
378
+ {DATA_TYPE_FP16, 128 , 64 , kSM_86 , fused_multihead_attention_v2_fp16_128_64_kernel_sm86_cubin ,
379
+ fused_multihead_attention_v2_fp16_128_64_kernel_sm86_cubin_len ,
362
380
" fused_multihead_attention_v2_fp16_128_64_kernel_sm80" , 65536 , 128 , 0 , false },
363
- {DATA_TYPE_FP16, 256 , 64 , kSM_86 , fused_multihead_attention_v2_fp16_256_64_kernel_sm80_cubin ,
364
- fused_multihead_attention_v2_fp16_256_64_kernel_sm80_cubin_len ,
381
+ {DATA_TYPE_FP16, 256 , 64 , kSM_86 , fused_multihead_attention_v2_fp16_256_64_kernel_sm86_cubin ,
382
+ fused_multihead_attention_v2_fp16_256_64_kernel_sm86_cubin_len ,
365
383
" fused_multihead_attention_v2_fp16_256_64_kernel_sm80_noloop" , 73728 , 128 , 32 , false },
366
- {DATA_TYPE_FP16, 256 , 64 , kSM_86 , fused_multihead_attention_v2_fp16_256_64_kernel_sm80_cubin ,
367
- fused_multihead_attention_v2_fp16_256_64_kernel_sm80_cubin_len ,
384
+ {DATA_TYPE_FP16, 256 , 64 , kSM_86 , fused_multihead_attention_v2_fp16_256_64_kernel_sm86_cubin ,
385
+ fused_multihead_attention_v2_fp16_256_64_kernel_sm86_cubin_len ,
368
386
" fused_multihead_attention_v2_fp16_256_64_kernel_sm80" , 73728 , 128 , 0 , false },
369
-
370
- {DATA_TYPE_INT8, 128 , 64 , kSM_86 , fused_multihead_attention_v2_int8_128_64_kernel_sm80_cubin,
371
- fused_multihead_attention_v2_int8_128_64_kernel_sm80_cubin_len,
387
+ {DATA_TYPE_FP16, 384 , 64 , kSM_86 , fused_multihead_attention_v2_fp16_384_64_kernel_sm86_cubin,
388
+ fused_multihead_attention_v2_fp16_384_64_kernel_sm86_cubin_len,
389
+ " fused_multihead_attention_v2_fp16_384_64_kernel_sm80_noloop" , 65536 , 256 , 48 , false },
390
+ {DATA_TYPE_FP16, 384 , 64 , kSM_86 , fused_multihead_attention_v2_fp16_384_64_kernel_sm86_cubin,
391
+ fused_multihead_attention_v2_fp16_384_64_kernel_sm86_cubin_len,
392
+ " fused_multihead_attention_v2_fp16_384_64_kernel_sm80" , 65536 , 256 , 0 , false },
393
+
394
+ {DATA_TYPE_INT8, 128 , 64 , kSM_86 , fused_multihead_attention_v2_int8_128_64_kernel_sm86_cubin,
395
+ fused_multihead_attention_v2_int8_128_64_kernel_sm86_cubin_len,
372
396
" fused_multihead_attention_v2_int8_128_64_kernel_sm80_interleaved_noloop" , 20480 , 128 , 16 , true },
373
- {DATA_TYPE_INT8, 128 , 64 , kSM_86 , fused_multihead_attention_v2_int8_128_64_kernel_sm80_cubin ,
374
- fused_multihead_attention_v2_int8_128_64_kernel_sm80_cubin_len ,
397
+ {DATA_TYPE_INT8, 128 , 64 , kSM_86 , fused_multihead_attention_v2_int8_128_64_kernel_sm86_cubin ,
398
+ fused_multihead_attention_v2_int8_128_64_kernel_sm86_cubin_len ,
375
399
" fused_multihead_attention_v2_int8_128_64_kernel_sm80_noloop" , 20480 , 128 , 16 , false },
376
- {DATA_TYPE_INT8, 128 , 64 , kSM_86 , fused_multihead_attention_v2_int8_128_64_kernel_sm80_cubin ,
377
- fused_multihead_attention_v2_int8_128_64_kernel_sm80_cubin_len ,
400
+ {DATA_TYPE_INT8, 128 , 64 , kSM_86 , fused_multihead_attention_v2_int8_128_64_kernel_sm86_cubin ,
401
+ fused_multihead_attention_v2_int8_128_64_kernel_sm86_cubin_len ,
378
402
" fused_multihead_attention_v2_int8_128_64_kernel_sm80_interleaved" , 24576 , 128 , 0 , true },
379
- {DATA_TYPE_INT8, 128 , 64 , kSM_86 , fused_multihead_attention_v2_int8_128_64_kernel_sm80_cubin ,
380
- fused_multihead_attention_v2_int8_128_64_kernel_sm80_cubin_len ,
403
+ {DATA_TYPE_INT8, 128 , 64 , kSM_86 , fused_multihead_attention_v2_int8_128_64_kernel_sm86_cubin ,
404
+ fused_multihead_attention_v2_int8_128_64_kernel_sm86_cubin_len ,
381
405
" fused_multihead_attention_v2_int8_128_64_kernel_sm80" , 32768 , 128 , 0 , false },
382
- {DATA_TYPE_INT8, 192 , 64 , kSM_86 , fused_multihead_attention_v2_int8_192_64_kernel_sm80_cubin ,
383
- fused_multihead_attention_v2_int8_192_64_kernel_sm80_cubin_len ,
406
+ {DATA_TYPE_INT8, 192 , 64 , kSM_86 , fused_multihead_attention_v2_int8_192_64_kernel_sm86_cubin ,
407
+ fused_multihead_attention_v2_int8_192_64_kernel_sm86_cubin_len ,
384
408
" fused_multihead_attention_v2_int8_192_64_kernel_sm80_interleaved_noloop" , 28672 , 128 , 32 , true },
385
- {DATA_TYPE_INT8, 192 , 64 , kSM_86 , fused_multihead_attention_v2_int8_192_64_kernel_sm80_cubin ,
386
- fused_multihead_attention_v2_int8_192_64_kernel_sm80_cubin_len ,
409
+ {DATA_TYPE_INT8, 192 , 64 , kSM_86 , fused_multihead_attention_v2_int8_192_64_kernel_sm86_cubin ,
410
+ fused_multihead_attention_v2_int8_192_64_kernel_sm86_cubin_len ,
387
411
" fused_multihead_attention_v2_int8_192_64_kernel_sm80_noloop" , 28672 , 128 , 32 , false },
388
- {DATA_TYPE_INT8, 192 , 64 , kSM_86 , fused_multihead_attention_v2_int8_192_64_kernel_sm80_cubin ,
389
- fused_multihead_attention_v2_int8_192_64_kernel_sm80_cubin_len ,
412
+ {DATA_TYPE_INT8, 192 , 64 , kSM_86 , fused_multihead_attention_v2_int8_192_64_kernel_sm86_cubin ,
413
+ fused_multihead_attention_v2_int8_192_64_kernel_sm86_cubin_len ,
390
414
" fused_multihead_attention_v2_int8_192_64_kernel_sm80_interleaved" , 32768 , 128 , 0 , true },
391
- {DATA_TYPE_INT8, 192 , 64 , kSM_86 , fused_multihead_attention_v2_int8_192_64_kernel_sm80_cubin ,
392
- fused_multihead_attention_v2_int8_192_64_kernel_sm80_cubin_len ,
415
+ {DATA_TYPE_INT8, 192 , 64 , kSM_86 , fused_multihead_attention_v2_int8_192_64_kernel_sm86_cubin ,
416
+ fused_multihead_attention_v2_int8_192_64_kernel_sm86_cubin_len ,
393
417
" fused_multihead_attention_v2_int8_192_64_kernel_sm80" , 32768 , 128 , 0 , false },
394
- {DATA_TYPE_INT8, 256 , 64 , kSM_86 , fused_multihead_attention_v2_int8_256_64_kernel_sm80_cubin ,
395
- fused_multihead_attention_v2_int8_256_64_kernel_sm80_cubin_len ,
418
+ {DATA_TYPE_INT8, 256 , 64 , kSM_86 , fused_multihead_attention_v2_int8_256_64_kernel_sm86_cubin ,
419
+ fused_multihead_attention_v2_int8_256_64_kernel_sm86_cubin_len ,
396
420
" fused_multihead_attention_v2_int8_256_64_kernel_sm80_interleaved_noloop" , 36864 , 128 , 32 , true },
397
- {DATA_TYPE_INT8, 256 , 64 , kSM_86 , fused_multihead_attention_v2_int8_256_64_kernel_sm80_cubin ,
398
- fused_multihead_attention_v2_int8_256_64_kernel_sm80_cubin_len ,
421
+ {DATA_TYPE_INT8, 256 , 64 , kSM_86 , fused_multihead_attention_v2_int8_256_64_kernel_sm86_cubin ,
422
+ fused_multihead_attention_v2_int8_256_64_kernel_sm86_cubin_len ,
399
423
" fused_multihead_attention_v2_int8_256_64_kernel_sm80_noloop" , 36864 , 128 , 32 , false },
400
- {DATA_TYPE_INT8, 256 , 64 , kSM_86 , fused_multihead_attention_v2_int8_256_64_kernel_sm80_cubin ,
401
- fused_multihead_attention_v2_int8_256_64_kernel_sm80_cubin_len ,
424
+ {DATA_TYPE_INT8, 256 , 64 , kSM_86 , fused_multihead_attention_v2_int8_256_64_kernel_sm86_cubin ,
425
+ fused_multihead_attention_v2_int8_256_64_kernel_sm86_cubin_len ,
402
426
" fused_multihead_attention_v2_int8_256_64_kernel_sm80_interleaved" , 36864 , 128 , 0 , true },
403
- {DATA_TYPE_INT8, 256 , 64 , kSM_86 , fused_multihead_attention_v2_int8_256_64_kernel_sm80_cubin ,
404
- fused_multihead_attention_v2_int8_256_64_kernel_sm80_cubin_len ,
427
+ {DATA_TYPE_INT8, 256 , 64 , kSM_86 , fused_multihead_attention_v2_int8_256_64_kernel_sm86_cubin ,
428
+ fused_multihead_attention_v2_int8_256_64_kernel_sm86_cubin_len ,
405
429
" fused_multihead_attention_v2_int8_256_64_kernel_sm80" , 36864 , 128 , 0 , false },
406
- {DATA_TYPE_INT8, 384 , 64 , kSM_86 , fused_multihead_attention_v2_int8_384_64_kernel_sm80_cubin ,
407
- fused_multihead_attention_v2_int8_384_64_kernel_sm80_cubin_len ,
430
+ {DATA_TYPE_INT8, 384 , 64 , kSM_86 , fused_multihead_attention_v2_int8_384_64_kernel_sm86_cubin ,
431
+ fused_multihead_attention_v2_int8_384_64_kernel_sm86_cubin_len ,
408
432
" fused_multihead_attention_v2_int8_384_64_kernel_sm80_interleaved_noloop" , 53248 , 128 , 32 , true },
409
- {DATA_TYPE_INT8, 384 , 64 , kSM_86 , fused_multihead_attention_v2_int8_384_64_kernel_sm80_cubin ,
410
- fused_multihead_attention_v2_int8_384_64_kernel_sm80_cubin_len ,
433
+ {DATA_TYPE_INT8, 384 , 64 , kSM_86 , fused_multihead_attention_v2_int8_384_64_kernel_sm86_cubin ,
434
+ fused_multihead_attention_v2_int8_384_64_kernel_sm86_cubin_len ,
411
435
" fused_multihead_attention_v2_int8_384_64_kernel_sm80_noloop" , 53248 , 128 , 32 , false },
412
- {DATA_TYPE_INT8, 384 , 64 , kSM_86 , fused_multihead_attention_v2_int8_384_64_kernel_sm80_cubin ,
413
- fused_multihead_attention_v2_int8_384_64_kernel_sm80_cubin_len ,
436
+ {DATA_TYPE_INT8, 384 , 64 , kSM_86 , fused_multihead_attention_v2_int8_384_64_kernel_sm86_cubin ,
437
+ fused_multihead_attention_v2_int8_384_64_kernel_sm86_cubin_len ,
414
438
" fused_multihead_attention_v2_int8_384_64_kernel_sm80_interleaved" , 51200 , 128 , 0 , true },
415
- {DATA_TYPE_INT8, 384 , 64 , kSM_86 , fused_multihead_attention_v2_int8_384_64_kernel_sm80_cubin ,
416
- fused_multihead_attention_v2_int8_384_64_kernel_sm80_cubin_len ,
439
+ {DATA_TYPE_INT8, 384 , 64 , kSM_86 , fused_multihead_attention_v2_int8_384_64_kernel_sm86_cubin ,
440
+ fused_multihead_attention_v2_int8_384_64_kernel_sm86_cubin_len ,
417
441
" fused_multihead_attention_v2_int8_384_64_kernel_sm80" , 53248 , 128 , 0 , false },
418
442
#endif
419
443
};
0 commit comments