@@ -31,7 +31,7 @@ set(ignoreMe "${VLLM_PYTHON_PATH}")
31
31
# Supported Python versions.
set(PYTHON_SUPPORTED_VERSIONS "3.9" "3.10" "3.11" "3.12")

# Supported NVIDIA architectures.
# Each list element must be a bare "major.minor" string: any whitespace inside
# the quotes becomes part of the element and stops it from comparing equal.
set(CUDA_SUPPORTED_ARCHS "7.0;7.2;7.5;8.0;8.6;8.7;8.9;9.0;10.0;10.1;12.0")

# Supported AMD GPU architectures.
set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx942;gfx1030;gfx1100;gfx1101;gfx1200;gfx1201")
@@ -312,7 +312,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
312
312
# Only build Marlin kernels if we are building for at least some compatible archs.
# Keep building Marlin for 9.0 as there are some group sizes and shapes that
# are not supported by Machete yet.
# Both arguments are ';'-separated arch lists; they are passed without any
# padding so that no element carries stray whitespace.
cuda_archs_loose_intersection(MARLIN_ARCHS
  "8.0;8.6;8.7;8.9;9.0;10.0;10.1;12.0"
  "${CUDA_ARCHS}")
316
316
if (MARLIN_ARCHS)
317
317
set (MARLIN_SRCS
318
318
"csrc/quantization/fp8/fp8_marlin.cu"
@@ -334,7 +334,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
334
334
335
335
# Only build AllSpark kernels if we are building for at least some compatible archs.
# "${CUDA_ARCHS}" is passed without padding so its last element stays intact.
cuda_archs_loose_intersection(ALLSPARK_ARCHS
  "8.0;8.6;8.7;8.9"
  "${CUDA_ARCHS}")
337
- if (ALLSPARK_ARCHS)
337
+ if (${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.0 AND ALLSPARK_ARCHS)
338
338
set (ALLSPARK_SRCS
339
339
"csrc/quantization/gptq_allspark/allspark_repack.cu"
340
340
"csrc/quantization/gptq_allspark/allspark_qgemm_w8a16.cu" )
@@ -345,46 +345,74 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
345
345
message (STATUS "Building AllSpark kernels for archs: ${ALLSPARK_ARCHS} " )
346
346
else ()
347
347
message (STATUS "Not building AllSpark kernels as no compatible archs found"
348
- " in CUDA target architectures" )
348
+ " in CUDA target architectures, or CUDA not >= 12.0 " )
349
349
endif ()
350
350
351
+
352
# Accumulates every arch that gets a CUTLASS 3.x (c3x) scaled_mm build, so the
# scaled_mm_c2x section can subtract those archs from its own list.
set(SCALED_MM_3X_ARCHS)

# The cutlass_scaled_mm kernels for Hopper (c3x, i.e. CUTLASS 3.x) require
# CUDA 12.0 or later.
cuda_archs_loose_intersection(SCALED_MM_ARCHS "9.0a" "${CUDA_ARCHS}")

# The compiler version is quoted so an empty value yields "false" instead of a
# malformed if() expression, and the comparison is ">=" to agree with the
# "not >= 12.0" diagnostic below.
if(SCALED_MM_ARCHS AND
   "${CMAKE_CUDA_COMPILER_VERSION}" VERSION_GREATER_EQUAL "12.0")
  set(SRCS
    "csrc/quantization/cutlass_w8a8/scaled_mm_c3x_sm90.cu"
    "csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm90_fp8.cu"
    "csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm90_int8.cu"
    "csrc/quantization/cutlass_w8a8/c3x/scaled_mm_azp_sm90_int8.cu"
    "csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm90_fp8.cu")
  set_gencode_flags_for_srcs(
    SRCS "${SRCS}"
    CUDA_ARCHS "${SCALED_MM_ARCHS}")
  list(APPEND VLLM_EXT_SRC "${SRCS}")
  list(APPEND VLLM_GPU_FLAGS "-DENABLE_SCALED_MM_SM90=1")
  # Let scaled_mm_c2x know it doesn't need to build these arches
  list(APPEND SCALED_MM_3X_ARCHS "${SCALED_MM_ARCHS}")
  message(STATUS "Building scaled_mm_c3x_sm90 for archs: ${SCALED_MM_ARCHS}")
elseif(SCALED_MM_ARCHS)
  # Compatible archs were requested, so the only unmet condition is the
  # CUDA compiler version.
  message(STATUS "Not building scaled_mm_c3x_sm90 as CUDA Compiler version is "
                 "not >= 12.0, we recommend upgrading to CUDA 12.0 or "
                 "later if you intend on running FP8 quantized models on "
                 "Hopper.")
else()
  message(STATUS "Not building scaled_mm_c3x_sm90 as no compatible archs found "
                 "in CUDA target architectures")
endif()
377
382
378
- # clear SCALED_MM_3X_ARCHS so the scaled_mm_c2x kernels know we didn't
379
- # build any 3x kernels
380
- set (SCALED_MM_3X_ARCHS)
383
# The cutlass_scaled_mm kernels for Blackwell (c3x, i.e. CUTLASS 3.x) require
# CUDA 12.8 or later
cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0a;10.1a;12.0a" "${CUDA_ARCHS}")

# The compiler version is quoted so an empty value yields "false" instead of a
# malformed if() expression, and the comparison is ">=" to agree with the
# "not >= 12.8" diagnostic below.
if(SCALED_MM_ARCHS AND
   "${CMAKE_CUDA_COMPILER_VERSION}" VERSION_GREATER_EQUAL "12.8")
  set(SRCS
    "csrc/quantization/cutlass_w8a8/scaled_mm_c3x_sm100.cu"
    "csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm100_fp8.cu")
  set_gencode_flags_for_srcs(
    SRCS "${SRCS}"
    CUDA_ARCHS "${SCALED_MM_ARCHS}")
  list(APPEND VLLM_EXT_SRC "${SRCS}")
  list(APPEND VLLM_GPU_FLAGS "-DENABLE_SCALED_MM_SM100=1")
  # Let scaled_mm_c2x know it doesn't need to build these arches
  list(APPEND SCALED_MM_3X_ARCHS "${SCALED_MM_ARCHS}")
  message(STATUS "Building scaled_mm_c3x_sm100 for archs: ${SCALED_MM_ARCHS}")
elseif(SCALED_MM_ARCHS)
  # Compatible archs were requested, so the only unmet condition is the
  # CUDA compiler version.
  message(STATUS "Not building scaled_mm_c3x_sm100 as CUDA Compiler version is "
                 "not >= 12.8, we recommend upgrading to CUDA 12.8 or "
                 "later if you intend on running FP8 quantized models on "
                 "Blackwell.")
else()
  # Kernel name spelled "sm100" to match the other messages in this section.
  message(STATUS "Not building scaled_mm_c3x_sm100 as no compatible archs found "
                 "in CUDA target architectures")
endif()
382
410
383
411
#
384
412
# For the cutlass_scaled_mm kernels we want to build the c2x (CUTLASS 2.x)
# kernels for the remaining archs that are not already built for 3x.
# The arch lists are passed without padding so no element carries stray
# whitespace that would defeat the matching and the subtraction below.
cuda_archs_loose_intersection(SCALED_MM_2X_ARCHS
  "7.5;8.0;8.6;8.7;8.9;9.0;10.0;10.1;12.0"
  "${CUDA_ARCHS}")
# subtract out the archs that are already built for 3x
list(REMOVE_ITEM SCALED_MM_2X_ARCHS ${SCALED_MM_3X_ARCHS})
390
418
if (SCALED_MM_2X_ARCHS)
@@ -409,17 +437,17 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
409
437
# 2:4 Sparse Kernels
410
438
411
439
# The 2:4 sparse kernels cutlass_scaled_sparse_mm and cutlass_compressor
412
- # require CUDA 12.2 or later (and only work on Hopper, 9.0a for now ).
413
- if (${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.2 AND SCALED_MM_3X_ARCHS )
440
+ # require CUDA 12.2 or later (and only work on Hopper and Blackwell ).
441
+ if (${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.2 AND SCALED_MM_ARCHS )
414
442
set (SRCS "csrc/sparse/cutlass/sparse_scaled_mm_c3x.cu" )
415
443
set_gencode_flags_for_srcs(
416
444
SRCS "${SRCS} "
417
- CUDA_ARCHS "${SCALED_MM_3X_ARCHS } " )
445
+ CUDA_ARCHS "${SCALED_MM_ARCHS } " )
418
446
list (APPEND VLLM_EXT_SRC "${SRCS} " )
419
447
list (APPEND VLLM_GPU_FLAGS "-DENABLE_SPARSE_SCALED_MM_C3X=1" )
420
- message (STATUS "Building sparse_scaled_mm_c3x for archs: ${SCALED_MM_3X_ARCHS } " )
448
+ message (STATUS "Building sparse_scaled_mm_c3x for archs: ${SCALED_MM_ARCHS } " )
421
449
else ()
422
- if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.2 AND SCALED_MM_3X_ARCHS )
450
+ if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.2 AND SCALED_MM_ARCHS )
423
451
message (STATUS "Not building sparse_scaled_mm_c3x kernels as CUDA Compiler version is "
424
452
"not >= 12.2, we recommend upgrading to CUDA 12.2 or later "
425
453
"if you intend on running FP8 sparse quantized models on Hopper." )
@@ -434,8 +462,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
434
462
if (${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.8 AND FP4_ARCHS)
435
463
set (SRCS
436
464
"csrc/quantization/fp4/nvfp4_quant_kernels.cu"
437
- "csrc/quantization/fp4/nvfp4_scaled_mm_kernels.cu"
438
- )
465
+ "csrc/quantization/fp4/nvfp4_scaled_mm_kernels.cu" )
439
466
set_gencode_flags_for_srcs(
440
467
SRCS "${SRCS} "
441
468
CUDA_ARCHS "${FP4_ARCHS} " )
@@ -534,6 +561,7 @@ define_gpu_extension_target(
534
561
COMPILE_FLAGS ${VLLM_GPU_FLAGS}
535
562
ARCHITECTURES ${VLLM_GPU_ARCHES}
536
563
INCLUDE_DIRECTORIES ${CUTLASS_INCLUDE_DIR}
564
+ INCLUDE_DIRECTORIES ${CUTLASS_TOOLS_UTIL_INCLUDE_DIR}
537
565
USE_SABI 3
538
566
WITH_SOABI)
539
567
@@ -557,7 +585,7 @@ set_gencode_flags_for_srcs(
557
585
CUDA_ARCHS "${CUDA_ARCHS} " )
558
586
559
587
if (VLLM_GPU_LANG STREQUAL "CUDA" )
560
- cuda_archs_loose_intersection(MARLIN_MOE_ARCHS "8.0;8.6;8.7;8.9;9.0" "${CUDA_ARCHS} " )
588
+ cuda_archs_loose_intersection(MARLIN_MOE_ARCHS "8.0;8.6;8.7;8.9;9.0;10.0;10.1;12.0 " "${CUDA_ARCHS} " )
561
589
if (MARLIN_MOE_ARCHS)
562
590
set (MARLIN_MOE_SRC
563
591
"csrc/moe/marlin_kernels/marlin_moe_kernel.h"
0 commit comments