Commit 88e020d

Merge pull request #350 from ROCm/upstream_merge_25_1_6
Upstream merge 25 1 6
2 parents 2053351 + 97067c0

555 files changed: +31168 additions, -16735 deletions

.buildkite/generate_index.py

Lines changed: 24 additions & 0 deletions

@@ -0,0 +1,24 @@
+import argparse
+import os
+
+template = """<!DOCTYPE html>
+<html>
+  <body>
+    <h1>Links for vLLM</h1>
+    <a href="../{wheel_html_escaped}">{wheel}</a><br/>
+  </body>
+</html>
+"""
+
+parser = argparse.ArgumentParser()
+parser.add_argument("--wheel", help="The wheel path.", required=True)
+args = parser.parse_args()
+
+filename = os.path.basename(args.wheel)
+
+with open("index.html", "w") as f:
+    print(f"Generated index.html for {args.wheel}")
+    # cloudfront requires escaping the '+' character
+    f.write(
+        template.format(wheel=filename,
+                        wheel_html_escaped=filename.replace("+", "%2B")))
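The escaping rule is easy to exercise on its own. A minimal sketch, assuming a hypothetical dev-wheel filename (real filenames come from the build); CloudFront does not serve '+' in URLs reliably, hence the %2B substitution:

    # Illustrative filename; only the replace() call mirrors the script above.
    filename = "vllm-1.0.0.dev+cu118-cp38-abi3-manylinux1_x86_64.whl"
    escaped = filename.replace("+", "%2B")
    print(f'<a href="../{escaped}">{filename}</a><br/>')
    # -> <a href="../vllm-1.0.0.dev%2Bcu118-...whl">vllm-1.0.0.dev+cu118-...whl</a><br/>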

.buildkite/nightly-benchmarks/benchmark-pipeline.yaml

Lines changed: 7 additions & 6 deletions

@@ -1,5 +1,6 @@
 steps:
   - label: "Wait for container to be ready"
+    key: wait-for-container-image
     agents:
       queue: A100
     plugins:
@@ -10,12 +11,11 @@ steps:
     command:
       - sh .buildkite/nightly-benchmarks/scripts/wait-for-image.sh
 
-  - wait
-
   - label: "A100"
     # skip: "use this flag to conditionally skip the benchmark step, useful for PR testing"
     agents:
       queue: A100
+    depends_on: wait-for-container-image
     plugins:
       - kubernetes:
           podSpec:
@@ -49,6 +49,7 @@ steps:
     # skip: "use this flag to conditionally skip the benchmark step, useful for PR testing"
     agents:
       queue: H200
+    depends_on: wait-for-container-image
     plugins:
       - docker#v5.12.0:
           image: public.ecr.aws/q9t5s3a7/vllm-ci-postmerge-repo:$BUILDKITE_COMMIT
@@ -65,15 +66,15 @@ steps:
         - VLLM_USAGE_SOURCE
        - HF_TOKEN
 
-  - block: "Run H100 Benchmark"
-    key: block-h100
-    depends_on: ~
+  #- block: "Run H100 Benchmark"
+  #  key: block-h100
+  #  depends_on: ~
 
   - label: "H100"
     # skip: "use this flag to conditionally skip the benchmark step, useful for PR testing"
     agents:
       queue: H100
-    depends_on: block-h100
+    depends_on: wait-for-container-image
     plugins:
       - docker#v5.12.0:
           image: public.ecr.aws/q9t5s3a7/vllm-ci-postmerge-repo:$BUILDKITE_COMMIT
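The effect of this change: the benchmark steps now gate on the keyed image-wait step instead of a pipeline-wide `wait` barrier, and the manual H100 `block` gate is retired. A minimal Python sketch of the scheduling idea, with hypothetical step keys; this is not Buildkite's implementation, just an illustration of why keyed `depends_on` edges let independent steps start as soon as their own dependency finishes:

    # Each step lists the step keys it depends on; an empty list means
    # it can run immediately.
    steps = {
        "wait-for-container-image": [],
        "A100": ["wait-for-container-image"],
        "H200": ["wait-for-container-image"],
        "H100": ["wait-for-container-image"],
    }

    done: set[str] = set()
    while len(done) < len(steps):
        # All steps whose dependencies are satisfied run in parallel.
        runnable = [s for s, deps in steps.items()
                    if s not in done and all(d in done for d in deps)]
        print("can run in parallel:", runnable)
        done.update(runnable)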

.buildkite/release-pipeline.yaml

Lines changed: 15 additions & 0 deletions

@@ -55,3 +55,18 @@ steps:
       password-env: DOCKERHUB_TOKEN
     env:
       DOCKER_BUILDKIT: "1"
+
+  - block: "Build CPU release image"
+    key: block-cpu-release-image-build
+    depends_on: ~
+
+  - label: "Build and publish CPU release image"
+    depends_on: block-cpu-release-image-build
+    agents:
+      queue: cpu_queue_postmerge
+    commands:
+      - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
+      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$RELEASE_VERSION --progress plain -f Dockerfile.cpu ."
+      - "docker push public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$RELEASE_VERSION"
+    env:
+      DOCKER_BUILDKIT: "1"
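The new step runs only after the manual `block` gate is released, then builds and pushes the CPU image tagged with $RELEASE_VERSION. A rough Python equivalent of that command sequence, assuming an illustrative version string and omitting the ECR login:

    import os
    import subprocess

    release_version = "0.6.6"  # illustrative; the pipeline injects $RELEASE_VERSION
    image = f"public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:{release_version}"

    env = {**os.environ, "DOCKER_BUILDKIT": "1"}  # mirrors the step's env block
    for cmd in (
        ["docker", "build", "--build-arg", "max_jobs=16",
         "--build-arg", "GIT_REPO_CHECK=1", "--tag", image,
         "-f", "Dockerfile.cpu", "."],
        ["docker", "push", image],
    ):
        subprocess.run(cmd, check=True, env=env)  # fail fast, like the pipeline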

.buildkite/run-gh200-test.sh

Lines changed: 28 additions & 0 deletions

@@ -0,0 +1,28 @@
+#!/bin/bash
+
+# This script builds the GH200 docker image and runs offline inference inside the container.
+# It serves as a sanity check for compilation and basic model usage.
+set -ex
+
+# Skip the new torch installation during build since we are using the specified version for arm64 in the Dockerfile
+python3 use_existing_torch.py
+
+# Try building the docker image
+DOCKER_BUILDKIT=1 docker build . \
+  --target vllm-openai \
+  --platform "linux/arm64" \
+  -t gh200-test \
+  --build-arg max_jobs=66 \
+  --build-arg nvcc_threads=2 \
+  --build-arg torch_cuda_arch_list="9.0+PTX" \
+  --build-arg vllm_fa_cmake_gpu_arches="90-real"
+
+# Setup cleanup
+remove_docker_container() { docker rm -f gh200-test || true; }
+trap remove_docker_container EXIT
+remove_docker_container
+
+# Run the image and test offline inference
+docker run --name gh200-test --gpus=all --entrypoint="" gh200-test bash -c '
+    python3 examples/offline_inference.py
+'
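The `trap ... EXIT` pattern guarantees the container is removed whether the test passes or fails. A minimal Python analogue using atexit, assuming the same container name:

    import atexit
    import subprocess

    def remove_docker_container() -> None:
        # check=False mirrors the shell's `|| true`: cleanup must not
        # fail the run if the container never started.
        subprocess.run(["docker", "rm", "-f", "gh200-test"], check=False)

    atexit.register(remove_docker_container)  # runs on interpreter exit
    remove_docker_container()                 # also clear any stale container first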

.buildkite/test-pipeline.yaml

Lines changed: 29 additions & 10 deletions

@@ -106,14 +106,12 @@ steps:
   source_file_dependencies:
   - vllm/
   commands:
-  - pip install -e ./plugins/vllm_add_dummy_model
   - pytest -v -s entrypoints/llm --ignore=entrypoints/llm/test_lazy_outlines.py --ignore=entrypoints/llm/test_generate.py --ignore=entrypoints/llm/test_generate_multiple_loras.py --ignore=entrypoints/llm/test_guided_generate.py
   - pytest -v -s entrypoints/llm/test_lazy_outlines.py # it needs a clean process
   - pytest -v -s entrypoints/llm/test_generate.py # it needs a clean process
   - pytest -v -s entrypoints/llm/test_generate_multiple_loras.py # it needs a clean process
   - pytest -v -s entrypoints/llm/test_guided_generate.py # it needs a clean process
   - pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_oot_registration.py
-  - pytest -v -s entrypoints/openai/test_oot_registration.py # it needs a clean process
   - pytest -v -s entrypoints/test_chat_utils.py
   - pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests
 
@@ -201,7 +199,7 @@ steps:
   - python3 offline_inference_classification.py
   - python3 offline_inference_embedding.py
   - python3 offline_inference_scoring.py
-  - python3 offline_profile.py --model facebook/opt-125m
+  - python3 offline_profile.py --model facebook/opt-125m run_num_steps --num-steps 2
 
 - label: Prefix Caching Test # 9min
   mirror_hardwares: [amd]
@@ -224,8 +222,12 @@ steps:
   mirror_hardwares: [amd]
   source_file_dependencies:
   - vllm/model_executor/layers
+  - vllm/model_executor/guided_decoding
   - tests/test_logits_processor
-  command: pytest -v -s test_logits_processor.py
+  - tests/model_executor/test_guided_processors
+  commands:
+  - pytest -v -s test_logits_processor.py
+  - pytest -v -s model_executor/test_guided_processors.py
 
 - label: Speculative decoding tests # 30min
   source_file_dependencies:
@@ -329,8 +331,6 @@ steps:
   - vllm/
   - tests/models
   commands:
-  - pip install -e ./plugins/vllm_add_dummy_model
-  - pytest -v -s models/test_oot_registration.py # it needs a clean process
   - pytest -v -s models/test_registry.py
   - pytest -v -s models/test_initialization.py
 
@@ -356,23 +356,25 @@ steps:
   - pytest -v -s models/decoder_only/language -m 'not core_model and not quant_model'
   - pytest -v -s models/embedding/language -m 'not core_model'
 
-- label: Multi-Modal Models Test (Standard) # 28min
+- label: Multi-Modal Models Test (Standard) # 40min
   #mirror_hardwares: [amd]
   source_file_dependencies:
   - vllm/
   - tests/models/decoder_only/audio_language
   - tests/models/decoder_only/vision_language
   - tests/models/embedding/vision_language
+  - tests/models/encoder_decoder/audio_language
   - tests/models/encoder_decoder/vision_language
   commands:
   - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
   - pytest -v -s models/decoder_only/audio_language -m 'core_model or quant_model'
   - pytest -v -s --ignore models/decoder_only/vision_language/test_phi3v.py models/decoder_only/vision_language -m 'core_model or quant_model'
   - pytest -v -s models/embedding/vision_language -m core_model
+  - pytest -v -s models/encoder_decoder/audio_language -m core_model
   - pytest -v -s models/encoder_decoder/language -m core_model
   - pytest -v -s models/encoder_decoder/vision_language -m core_model
 
-- label: Multi-Modal Models Test (Extended) 1 # 1h16m
+- label: Multi-Modal Models Test (Extended) 1 # 48m
   optional: true
   source_file_dependencies:
   - vllm/
@@ -465,11 +467,28 @@ steps:
   - pytest models/encoder_decoder/vision_language/test_broadcast.py -v -s -m 'distributed(num_gpus=2)'
   - pytest models/decoder_only/vision_language/test_models.py -v -s -m 'distributed(num_gpus=2)'
   - pytest -v -s spec_decode/e2e/test_integration_dist_tp2.py
-  - pip install -e ./plugins/vllm_add_dummy_model
-  - pytest -v -s distributed/test_distributed_oot.py
   - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s test_sharded_state_loader.py
   - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s kv_transfer/disagg_test.py
 
+- label: Plugin Tests (2 GPUs) # 40min
+  working_dir: "/vllm-workspace/tests"
+  num_gpus: 2
+  fast_check: true
+  source_file_dependencies:
+  - vllm/plugins/
+  - tests/plugins/
+  commands:
+  # begin platform plugin tests, all the code in-between runs on dummy platform
+  - pip install -e ./plugins/vllm_add_dummy_platform
+  - pytest -v -s plugins_tests/test_platform_plugins.py
+  - pip uninstall vllm_add_dummy_platform -y
+  # end platform plugin tests
+  # other tests continue here:
+  - pip install -e ./plugins/vllm_add_dummy_model
+  - pytest -v -s distributed/test_distributed_oot.py
+  - pytest -v -s entrypoints/openai/test_oot_registration.py # it needs a clean process
+  - pytest -v -s models/test_oot_registration.py # it needs a clean process
+
 - label: Multi-step Tests (4 GPUs) # 36min
   working_dir: "/vllm-workspace/tests"
   num_gpus: 4
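The new Plugin Tests step consolidates the dummy-plugin installs that were previously scattered across other steps: it installs vllm_add_dummy_platform, runs the platform-plugin tests against it, uninstalls it, then continues with the dummy-model plugin tests. For context, a minimal sketch of how such an out-of-tree plugin could advertise itself via setuptools entry points; the entry-point group and the `register` symbol are assumptions for illustration, not taken from this diff:

    # Hypothetical setup.py for a package like vllm_add_dummy_platform.
    from setuptools import setup

    setup(
        name="vllm_add_dummy_platform",
        version="0.1",
        packages=["vllm_add_dummy_platform"],
        entry_points={
            # Assumed group name; vLLM discovers plugins by scanning
            # entry points and calling the registered function.
            "vllm.platform_plugins": [
                "dummy_platform = vllm_add_dummy_platform:register",
            ]
        },
    )

Installing and uninstalling the package around the platform tests keeps the dummy platform from leaking into the unrelated tests that follow.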

.buildkite/upload-wheels.sh

Lines changed: 29 additions & 1 deletion

@@ -23,6 +23,8 @@ wheel="$new_wheel"
 version=$(unzip -p "$wheel" '**/METADATA' | grep '^Version: ' | cut -d' ' -f2)
 echo "Version: $version"
 
+normal_wheel="$wheel" # Save the original wheel filename
+
 # If the version contains "dev", rename it to v1.0.0.dev for consistency
 if [[ $version == *dev* ]]; then
     suffix="${version##*.}"
@@ -32,12 +34,38 @@ if [[ $version == *dev* ]]; then
         new_version="1.0.0.dev"
     fi
     new_wheel="${wheel/$version/$new_version}"
-    mv -- "$wheel" "$new_wheel"
+    # use cp to keep both files in the artifacts directory
+    cp -- "$wheel" "$new_wheel"
     wheel="$new_wheel"
     version="$new_version"
 fi
 
 # Upload the wheel to S3
+python3 .buildkite/generate_index.py --wheel "$normal_wheel"
+
+# generate index for this commit
 aws s3 cp "$wheel" "s3://vllm-wheels/$BUILDKITE_COMMIT/"
+aws s3 cp "$normal_wheel" "s3://vllm-wheels/$BUILDKITE_COMMIT/"
+
+if [[ $normal_wheel == *"cu118"* ]]; then
+    # if $normal_wheel matches cu118, do not upload the index.html
+    echo "Skipping index files for cu118 wheels"
+else
+    # only upload index.html for cu12 wheels (default wheels)
+    aws s3 cp index.html "s3://vllm-wheels/$BUILDKITE_COMMIT/vllm/index.html"
+    aws s3 cp "s3://vllm-wheels/nightly/index.html" "s3://vllm-wheels/$BUILDKITE_COMMIT/index.html"
+fi
+
+# generate index for nightly
 aws s3 cp "$wheel" "s3://vllm-wheels/nightly/"
+aws s3 cp "$normal_wheel" "s3://vllm-wheels/nightly/"
+
+if [[ $normal_wheel == *"cu118"* ]]; then
+    # if $normal_wheel matches cu118, do not upload the index.html
+    echo "Skipping index files for cu118 wheels"
+else
+    # only upload index.html for cu12 wheels (default wheels)
+    aws s3 cp index.html "s3://vllm-wheels/nightly/vllm/index.html"
+fi
+
 aws s3 cp "$wheel" "s3://vllm-wheels/$version/"
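The key behavioral change: a dev wheel is now copied (not moved) to the 1.0.0.dev alias, so both filenames survive as artifacts and both get uploaded, while index.html files are published only for the default cu12 wheels. A simplified Python sketch of the renaming rule, with an illustrative filename:

    # Illustrative names; the script derives these from the wheel's METADATA.
    wheel = "vllm-0.6.6.dev5+g88e020d-cp38-abi3-manylinux1_x86_64.whl"
    version = "0.6.6.dev5+g88e020d"

    normal_wheel = wheel  # the original name is kept and uploaded too
    if "dev" in version:
        new_version = "1.0.0.dev"
        # mirrors the shell substitution ${wheel/$version/$new_version}
        wheel = wheel.replace(version, new_version)
    print(normal_wheel, "->", wheel)
    # -> vllm-0.6.6.dev5+g88e020d-...whl -> vllm-1.0.0.dev-...whl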

.github/ISSUE_TEMPLATE/600-new model.yml renamed to .github/ISSUE_TEMPLATE/600-new-model.yml

Lines changed: 1 addition & 1 deletion

@@ -9,7 +9,7 @@ body:
     value: >
       #### Before submitting an issue, please make sure the issue hasn't been already addressed by searching through [the existing and past issues](https://github.com/vllm-project/vllm/issues?q=is%3Aissue+sort%3Acreated-desc+).
 
-      #### We also highly recommend you read https://docs.vllm.ai/en/latest/models/adding_model.html first to understand how to add a new model.
+      #### We also highly recommend you read https://docs.vllm.ai/en/latest/contributing/model/adding_model.html first to understand how to add a new model.
   - type: textarea
     attributes:
       label: The model to consider.

.gitignore

Lines changed: 2 additions & 0 deletions

@@ -81,6 +81,8 @@ instance/
 docs/_build/
 docs/source/getting_started/examples/*.rst
 !**/*.template.rst
+docs/source/getting_started/examples/*.md
+!**/*.template.md
 
 # PyBuilder
 .pybuilder/

CMakeLists.txt

Lines changed: 33 additions & 6 deletions

@@ -240,7 +240,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
   SET(CUTLASS_ENABLE_HEADERS_ONLY ON CACHE BOOL "Enable only the header library")
 
   # Set CUTLASS_REVISION manually -- its revision detection doesn't work in this case.
-  set(CUTLASS_REVISION "v3.5.1" CACHE STRING "CUTLASS revision to use")
+  set(CUTLASS_REVISION "v3.6.0" CACHE STRING "CUTLASS revision to use")
 
   # Use the specified CUTLASS source directory for compilation if VLLM_CUTLASS_SRC_DIR is provided
   if (DEFINED ENV{VLLM_CUTLASS_SRC_DIR})
@@ -257,7 +257,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
   FetchContent_Declare(
     cutlass
     GIT_REPOSITORY https://github.com/nvidia/cutlass.git
-    GIT_TAG v3.5.1
+    GIT_TAG v3.6.0
     GIT_PROGRESS TRUE
 
     # Speed up CUTLASS download by retrieving only the specified GIT_TAG instead of the history.
@@ -275,7 +275,10 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
     "csrc/quantization/awq/gemm_kernels.cu"
     "csrc/custom_all_reduce.cu"
     "csrc/permute_cols.cu"
-    "csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu")
+    "csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu"
+    "csrc/sparse/cutlass/sparse_scaled_mm_entry.cu"
+    "csrc/sparse/cutlass/sparse_compressor_entry.cu"
+    "csrc/cutlass_extensions/common.cpp")
 
   set_gencode_flags_for_srcs(
     SRCS "${VLLM_EXT_SRC}"
@@ -304,7 +307,6 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
             " in CUDA target architectures")
   endif()
 
-  #
   # The cutlass_scaled_mm kernels for Hopper (c3x, i.e. CUTLASS 3.x) require
   # CUDA 12.0 or later (and only work on Hopper, 9.0/9.0a for now).
   cuda_archs_loose_intersection(SCALED_MM_3X_ARCHS "9.0;9.0a" "${CUDA_ARCHS}")
@@ -357,6 +359,31 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
     endif()
   endif()
 
+  #
+  # 2:4 Sparse Kernels
+
+  # The 2:4 sparse kernels cutlass_scaled_sparse_mm and cutlass_compressor
+  # require CUDA 12.2 or later (and only work on Hopper, 9.0/9.0a for now).
+  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.2 AND SCALED_MM_3X_ARCHS)
+    set(SRCS "csrc/sparse/cutlass/sparse_compressor_c3x.cu"
+             "csrc/sparse/cutlass/sparse_scaled_mm_c3x.cu")
+    set_gencode_flags_for_srcs(
+      SRCS "${SRCS}"
+      CUDA_ARCHS "${SCALED_MM_3X_ARCHS}")
+    list(APPEND VLLM_EXT_SRC "${SRCS}")
+    list(APPEND VLLM_GPU_FLAGS "-DENABLE_SPARSE_SCALED_MM_C3X=1")
+    message(STATUS "Building sparse_scaled_mm_c3x for archs: ${SCALED_MM_3X_ARCHS}")
+  else()
+    if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.2 AND SCALED_MM_3X_ARCHS)
+      message(STATUS "Not building sparse_scaled_mm_c3x kernels as CUDA Compiler version is "
+                     "not >= 12.2, we recommend upgrading to CUDA 12.2 or later "
+                     "if you intend on running FP8 sparse quantized models on Hopper.")
+    else()
+      message(STATUS "Not building sparse_scaled_mm_c3x as no compatible archs found "
+                     "in CUDA target architectures")
+    endif()
+  endif()
+
 
   #
   # Machete kernels
@@ -443,7 +470,7 @@ define_gpu_extension_target(
   SOURCES ${VLLM_EXT_SRC}
   COMPILE_FLAGS ${VLLM_GPU_FLAGS}
   ARCHITECTURES ${VLLM_GPU_ARCHES}
-  INCLUDE_DIRECTORIES ${CUTLASS_INCLUDE_DIR}
+  INCLUDE_DIRECTORIES ${CUTLASS_INCLUDE_DIR};${CUTLASS_TOOLS_UTIL_INCLUDE_DIR}
   USE_SABI 3
   WITH_SOABI)
 
@@ -583,7 +610,7 @@ else()
   FetchContent_Declare(
     vllm-flash-attn
     GIT_REPOSITORY https://github.com/vllm-project/flash-attention.git
-    GIT_TAG 04325b6798bcc326c86fb35af62d05a9c8c8eceb
+    GIT_TAG 96266b1111111f3d11aabefaf3bacbab6a89d03c
     GIT_PROGRESS TRUE
     # Don't share the vllm-flash-attn build between build types
     BINARY_DIR ${CMAKE_BINARY_DIR}/vllm-flash-attn
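Besides bumping CUTLASS to v3.6.0 and updating the vllm-flash-attn pin, the main addition here is the 2:4 sparse kernel block, which compiles only when the CUDA compiler is newer than 12.2 and a Hopper architecture survives the intersection with the requested CUDA archs. A minimal Python sketch of that gating decision (not the CMake implementation):

    # Mirrors: if(CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER 12.2
    #             AND SCALED_MM_3X_ARCHS)
    def enable_sparse_scaled_mm_c3x(cuda_version: tuple[int, int],
                                    cuda_archs: set[str]) -> bool:
        scaled_mm_3x_archs = cuda_archs & {"9.0", "9.0a"}  # Hopper only
        return cuda_version > (12, 2) and bool(scaled_mm_3x_archs)

    assert enable_sparse_scaled_mm_c3x((12, 4), {"9.0a"}) is True
    assert enable_sparse_scaled_mm_c3x((12, 1), {"9.0"}) is False   # CUDA too old
    assert enable_sparse_scaled_mm_c3x((12, 4), {"8.0"}) is False   # no Hopper arch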
