Skip to content

Commit 9151360

Browse files
committed
Merge remote-tracking branch 'origin' into kylesayrs/bert-SupportsQuant
2 parents 673cc9d + a79cc68 commit 9151360

File tree

358 files changed

+16078
-4958
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

358 files changed

+16078
-4958
lines changed

.buildkite/nightly-benchmarks/tests/serving-tests.json

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -63,10 +63,12 @@
6363
"model": "meta-llama/Meta-Llama-3.1-70B-Instruct",
6464
"disable_log_requests": "",
6565
"tensor_parallel_size": 4,
66-
"swap_space": 16,
67-
"speculative_model": "turboderp/Qwama-0.5B-Instruct",
68-
"num_speculative_tokens": 4,
69-
"speculative_draft_tensor_parallel_size": 1
66+
"swap_space": 16,
67+
"speculative_config": {
68+
"model": "turboderp/Qwama-0.5B-Instruct",
69+
"num_speculative_tokens": 4,
70+
"draft_tensor_parallel_size": 1
71+
}
7072
},
7173
"client_parameters": {
7274
"model": "meta-llama/Meta-Llama-3.1-70B-Instruct",

.buildkite/release-pipeline.yaml

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@ steps:
33
agents:
44
queue: cpu_queue_postmerge
55
commands:
6-
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.4.0 --tag vllm-ci:build-image --target build --progress plain ."
6+
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.4.0 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
77
- "mkdir artifacts"
88
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
99
- "bash .buildkite/upload-wheels.sh"
@@ -14,7 +14,7 @@ steps:
1414
agents:
1515
queue: cpu_queue_postmerge
1616
commands:
17-
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.1.0 --tag vllm-ci:build-image --target build --progress plain ."
17+
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.1.0 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
1818
- "mkdir artifacts"
1919
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
2020
- "bash .buildkite/upload-wheels.sh"
@@ -31,7 +31,7 @@ steps:
3131
agents:
3232
queue: cpu_queue_postmerge
3333
commands:
34-
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=11.8.0 --tag vllm-ci:build-image --target build --progress plain ."
34+
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=11.8.0 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
3535
- "mkdir artifacts"
3636
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
3737
- "bash .buildkite/upload-wheels.sh"
@@ -48,7 +48,7 @@ steps:
4848
queue: cpu_queue_postmerge
4949
commands:
5050
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
51-
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.4.0 --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT --target vllm-openai --progress plain ."
51+
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.4.0 --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT --target vllm-openai --progress plain -f docker/Dockerfile ."
5252
- "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT"
5353

5454
- label: "Build and publish TPU release image"
@@ -57,7 +57,7 @@ steps:
5757
agents:
5858
queue: tpu_queue_postmerge
5959
commands:
60-
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --tag vllm/vllm-tpu:nightly --tag vllm/vllm-tpu:$BUILDKITE_COMMIT --progress plain -f Dockerfile.tpu ."
60+
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --tag vllm/vllm-tpu:nightly --tag vllm/vllm-tpu:$BUILDKITE_COMMIT --progress plain -f docker/Dockerfile.tpu ."
6161
- "docker push vllm/vllm-tpu:nightly"
6262
- "docker push vllm/vllm-tpu:$BUILDKITE_COMMIT"
6363
plugins:
@@ -82,7 +82,7 @@ steps:
8282
queue: cpu_queue_postmerge
8383
commands:
8484
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
85-
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$(buildkite-agent meta-data get release-version) --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:latest --progress plain -f Dockerfile.cpu ."
85+
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$(buildkite-agent meta-data get release-version) --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:latest --progress plain --target vllm-openai -f docker/Dockerfile.cpu ."
8686
- "docker push public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$(buildkite-agent meta-data get release-version)"
8787
env:
8888
DOCKER_BUILDKIT: "1"

.buildkite/run-amd-test.sh

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -134,9 +134,10 @@ if [[ $commands == *"--shard-id="* ]]; then
134134
# assign shard-id for each shard
135135
commands_gpu=${commands//"--shard-id= "/"--shard-id=${GPU} "}
136136
echo "Shard ${GPU} commands:$commands_gpu"
137+
echo "Render devices: $BUILDKITE_AGENT_META_DATA_RENDER_DEVICES"
137138
docker run \
138-
--device /dev/kfd --device /dev/dri \
139-
--network host \
139+
--device /dev/kfd $BUILDKITE_AGENT_META_DATA_RENDER_DEVICES \
140+
--network=host \
140141
--shm-size=16gb \
141142
--rm \
142143
-e HIP_VISIBLE_DEVICES="${GPU}" \
@@ -163,9 +164,10 @@ if [[ $commands == *"--shard-id="* ]]; then
163164
fi
164165
done
165166
else
167+
echo "Render devices: $BUILDKITE_AGENT_META_DATA_RENDER_DEVICES"
166168
docker run \
167-
--device /dev/kfd --device /dev/dri \
168-
--network host \
169+
--device /dev/kfd $BUILDKITE_AGENT_META_DATA_RENDER_DEVICES \
170+
--network=host \
169171
--shm-size=16gb \
170172
--rm \
171173
-e HIP_VISIBLE_DEVICES=0 \

.buildkite/run-cpu-test-ppc64le.sh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,5 +10,5 @@ trap remove_docker_container EXIT
1010
remove_docker_container
1111

1212
# Try building the docker image
13-
docker build -t cpu-test -f Dockerfile.ppc64le .
13+
docker build -t cpu-test -f docker/Dockerfile.ppc64le .
1414

.buildkite/run-cpu-test.sh

Lines changed: 9 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -8,15 +8,19 @@ set -ex
88
CORE_RANGE=${CORE_RANGE:-48-95}
99
NUMA_NODE=${NUMA_NODE:-1}
1010

11-
# Try building the docker image
12-
numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build -t cpu-test-"$BUILDKITE_BUILD_NUMBER" -f Dockerfile.cpu .
13-
numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" -t cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2 -f Dockerfile.cpu .
14-
1511
# Setup cleanup
16-
remove_docker_container() { set -e; docker rm -f cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2-"$NUMA_NODE" || true; }
12+
remove_docker_container() {
13+
set -e;
14+
docker rm -f cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2-"$NUMA_NODE" || true;
15+
docker image rm cpu-test-"$BUILDKITE_BUILD_NUMBER" cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2 || true;
16+
}
1717
trap remove_docker_container EXIT
1818
remove_docker_container
1919

20+
# Try building the docker image
21+
numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --tag cpu-test-"$BUILDKITE_BUILD_NUMBER" --target vllm-test -f docker/Dockerfile.cpu .
22+
numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" --tag cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2 --target vllm-test -f docker/Dockerfile.cpu .
23+
2024
# Run the image, setting --shm-size=4g for tensor parallel.
2125
docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --cpuset-cpus="$CORE_RANGE" \
2226
--cpuset-mems="$NUMA_NODE" --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" cpu-test-"$BUILDKITE_BUILD_NUMBER"
@@ -36,8 +40,6 @@ function cpu_tests() {
3640
# Run basic model test
3741
docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" bash -c "
3842
set -e
39-
pip install -r vllm/requirements/test.txt
40-
pip install -r vllm/requirements/cpu.txt
4143
pytest -v -s tests/kernels/test_cache.py -m cpu_model
4244
pytest -v -s tests/kernels/test_mla_decode_cpu.py -m cpu_model
4345
pytest -v -s tests/models/decoder_only/language -m cpu_model

.buildkite/run-gh200-test.sh

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@ python3 use_existing_torch.py
99

1010
# Try building the docker image
1111
DOCKER_BUILDKIT=1 docker build . \
12+
--file docker/Dockerfile \
1213
--target vllm-openai \
1314
--platform "linux/arm64" \
1415
-t gh200-test \

.buildkite/run-hpu-test.sh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55
set -ex
66

77
# Try building the docker image
8-
docker build -t hpu-test-env -f Dockerfile.hpu .
8+
docker build -t hpu-test-env -f docker/Dockerfile.hpu .
99

1010
# Setup cleanup
1111
# certain versions of HPU software stack have a bug that can

.buildkite/run-neuron-test.sh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -35,7 +35,7 @@ else
3535
date "+%s" > /tmp/neuron-docker-build-timestamp
3636
fi
3737

38-
docker build -t "${image_name}" -f Dockerfile.neuron .
38+
docker build -t "${image_name}" -f docker/Dockerfile.neuron .
3939

4040
# Setup cleanup
4141
remove_docker_container() {

.buildkite/run-tpu-v1-test.sh

Lines changed: 7 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
set -e
44

55
# Build the docker image.
6-
docker build -f Dockerfile.tpu -t vllm-tpu .
6+
docker build -f docker/Dockerfile.tpu -t vllm-tpu .
77

88
# Set up cleanup.
99
remove_docker_container() { docker rm -f tpu-test || true; }
@@ -21,8 +21,10 @@ docker run --privileged --net host --shm-size=16G -it \
2121
&& python3 -m pip install lm_eval[api]==0.4.4 \
2222
&& export VLLM_USE_V1=1 \
2323
&& export VLLM_XLA_CHECK_RECOMPILATION=1 \
24+
&& echo TEST_0 \
25+
&& pytest -v -s /workspace/vllm/tests/v1/tpu/test_perf.py \
2426
&& echo TEST_1 \
25-
&& pytest /workspace/vllm/tests/tpu/test_compilation.py \
27+
&& pytest -v -s /workspace/vllm/tests/tpu/test_compilation.py \
2628
&& echo TEST_2 \
2729
&& pytest -v -s /workspace/vllm/tests/v1/tpu/test_basic.py \
2830
&& echo TEST_3 \
@@ -32,9 +34,10 @@ docker run --privileged --net host --shm-size=16G -it \
3234
&& echo TEST_5 \
3335
&& python3 /workspace/vllm/examples/offline_inference/tpu.py \
3436
&& echo TEST_6 \
35-
&& pytest -s -v /workspace/vllm/tests/tpu/worker/test_tpu_model_runner.py" \
37+
&& pytest -s -v /workspace/vllm/tests/v1/tpu/worker/test_tpu_model_runner.py \
38+
&& echo TEST_7 \
39+
&& pytest -s -v /workspace/vllm/tests/v1/tpu/test_sampler.py" \
3640

3741

3842
# TODO: This test fails because it uses RANDOM_SEED sampling
3943
# && VLLM_USE_V1=1 pytest -v -s /workspace/vllm/tests/tpu/test_custom_dispatcher.py \
40-

.buildkite/run-xpu-test.sh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@ image_name="xpu/vllm-ci:${BUILDKITE_COMMIT}"
88
container_name="xpu_${BUILDKITE_COMMIT}_$(tr -dc A-Za-z0-9 < /dev/urandom | head -c 10; echo)"
99

1010
# Try building the docker image
11-
docker build -t ${image_name} -f Dockerfile.xpu .
11+
docker build -t ${image_name} -f docker/Dockerfile.xpu .
1212

1313
# Setup cleanup
1414
remove_docker_container() {

.buildkite/test-pipeline.yaml

Lines changed: 9 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -135,21 +135,23 @@ steps:
135135
- examples/offline_inference/rlhf.py
136136
- examples/offline_inference/rlhf_colocate.py
137137
- tests/examples/offline_inference/data_parallel.py
138+
- tests/v1/test_async_llm_dp.py
138139
commands:
139140
# test with tp=2 and external_dp=2
140141
- VLLM_USE_V1=0 torchrun --nproc-per-node=4 distributed/test_torchrun_example.py
141142
- torchrun --nproc-per-node=4 distributed/test_torchrun_example.py
142143
# test with internal dp
143144
- python3 ../examples/offline_inference/data_parallel.py
145+
- TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/test_async_llm_dp.py
144146
- pytest -v -s distributed/test_utils.py
145147
- pytest -v -s compile/test_basic_correctness.py
146148
- pytest -v -s distributed/test_pynccl.py
147149
- pytest -v -s spec_decode/e2e/test_integration_dist_tp4.py
148150
# TODO: create a dedicated test section for multi-GPU example tests
149151
# when we have multiple distributed example tests
150152
- pushd ../examples/offline_inference
151-
- VLLM_ENABLE_V1_MULTIPROCESSING=0 python3 rlhf.py
152-
- VLLM_ENABLE_V1_MULTIPROCESSING=0 RAY_DEDUP_LOGS=0 python3 rlhf_colocate.py
153+
- python3 rlhf.py
154+
- RAY_DEDUP_LOGS=0 python3 rlhf_colocate.py
153155
- popd
154156

155157
- label: Metrics, Tracing Test # 10min
@@ -429,6 +431,7 @@ steps:
429431
- pytest -v -s models/encoder_decoder/audio_language -m core_model
430432
- pytest -v -s models/encoder_decoder/language -m core_model
431433
- pytest -v -s models/encoder_decoder/vision_language -m core_model
434+
- pytest -v -s models/decoder_only/vision_language/test_interleaved.py
432435

433436
- label: Multi-Modal Models Test (Extended) 1 # 48m
434437
optional: true
@@ -514,8 +517,11 @@ steps:
514517
- vllm/worker/worker.py
515518
- vllm/worker/model_runner.py
516519
- entrypoints/llm/test_collective_rpc.py
520+
- tests/v1/test_async_llm_dp.py
521+
- vllm/v1/engine/
517522
commands:
518-
- VLLM_ENABLE_V1_MULTIPROCESSING=0 pytest -v -s entrypoints/llm/test_collective_rpc.py
523+
- TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/test_async_llm_dp.py
524+
- pytest -v -s entrypoints/llm/test_collective_rpc.py
519525
- pytest -v -s ./compile/test_basic_correctness.py
520526
- pytest -v -s ./compile/test_wrapper.py
521527
- VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed'

.github/mergify.yml

Lines changed: 31 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@ pull_request_rules:
1919
- files~=\.buildkite/
2020
- files~=^cmake/
2121
- files=CMakeLists.txt
22-
- files~=^Dockerfile
22+
- files~=^docker/Dockerfile
2323
- files~=^requirements.*\.txt
2424
- files=setup.py
2525
actions:
@@ -88,6 +88,36 @@ pull_request_rules:
8888
add:
8989
- v1
9090

91+
- name: label-tpu
92+
description: Automatically apply tpu label
93+
# Keep this list in sync with `label-tpu-remove` conditions
94+
conditions:
95+
- or:
96+
- files~=tpu.py
97+
- files~=_tpu
98+
- files~=tpu_
99+
- files~=/tpu/
100+
- files~=pallas
101+
actions:
102+
label:
103+
add:
104+
- tpu
105+
106+
- name: label-tpu-remove
107+
description: Automatically remove tpu label
108+
# Keep this list in sync with `label-tpu` conditions
109+
conditions:
110+
- and:
111+
- -files~=tpu.py
112+
- -files~=_tpu
113+
- -files~=tpu_
114+
- -files~=/tpu/
115+
- -files~=pallas
116+
actions:
117+
label:
118+
remove:
119+
- tpu
120+
91121
- name: ping author on conflicts and add 'needs-rebase' label
92122
conditions:
93123
- conflict

.github/workflows/lint-and-deploy.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -50,7 +50,7 @@ jobs:
5050
uses: helm/kind-action@a1b0e391336a6ee6713a0583f8c6240d70863de3 # v1.12.0
5151

5252
- name: Build the Docker image vllm cpu
53-
run: docker buildx build -f Dockerfile.cpu -t vllm-cpu-env .
53+
run: docker buildx build -f docker/Dockerfile.cpu -t vllm-cpu-env .
5454

5555
- name: Configuration of docker images, network and namespace for the kind cluster
5656
run: |

.pre-commit-config.yaml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,6 @@
1+
default_install_hook_types:
2+
- pre-commit
3+
- commit-msg
14
default_stages:
25
- pre-commit # Run locally
36
- manual # Run in CI

CMakeLists.txt

Lines changed: 31 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -34,7 +34,7 @@ set(PYTHON_SUPPORTED_VERSIONS "3.9" "3.10" "3.11" "3.12")
3434
set(CUDA_SUPPORTED_ARCHS "7.0;7.2;7.5;8.0;8.6;8.7;8.9;9.0;10.0;10.1;12.0")
3535

3636
# Supported AMD GPU architectures.
37-
set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx942;gfx1030;gfx1100;gfx1101")
37+
set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx942;gfx950;gfx1030;gfx1100;gfx1101;gfx1200;gfx1201")
3838

3939
#
4040
# Supported/expected torch versions for CUDA/ROCm.
@@ -44,7 +44,7 @@ set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx942;gfx1030;gfx1100;gfx1101")
4444
#
4545
# Note: the CUDA torch version is derived from pyproject.toml and various
4646
# requirements.txt files and should be kept consistent. The ROCm torch
47-
# versions are derived from Dockerfile.rocm
47+
# versions are derived from docker/Dockerfile.rocm
4848
#
4949
set(TORCH_SUPPORTED_VERSION_CUDA "2.6.0")
5050
set(TORCH_SUPPORTED_VERSION_ROCM "2.6.0")
@@ -234,13 +234,15 @@ set(VLLM_EXT_SRC
234234
"csrc/activation_kernels.cu"
235235
"csrc/layernorm_kernels.cu"
236236
"csrc/layernorm_quant_kernels.cu"
237+
"csrc/cuda_view.cu"
237238
"csrc/quantization/gptq/q_gemm.cu"
238239
"csrc/quantization/compressed_tensors/int8_quant_kernels.cu"
239240
"csrc/quantization/fp8/common.cu"
240241
"csrc/quantization/fused_kernels/fused_layernorm_dynamic_per_token_quant.cu"
241242
"csrc/quantization/gguf/gguf_kernel.cu"
242243
"csrc/cuda_utils_kernels.cu"
243244
"csrc/prepare_inputs/advance_step.cu"
245+
"csrc/custom_all_reduce.cu"
244246
"csrc/torch_bindings.cpp")
245247

246248
if(VLLM_GPU_LANG STREQUAL "CUDA")
@@ -282,7 +284,6 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
282284
"csrc/mamba/causal_conv1d/causal_conv1d.cu"
283285
"csrc/quantization/aqlm/gemm_kernels.cu"
284286
"csrc/quantization/awq/gemm_kernels.cu"
285-
"csrc/custom_all_reduce.cu"
286287
"csrc/permute_cols.cu"
287288
"csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu"
288289
"csrc/quantization/fp4/nvfp4_quant_entry.cu"
@@ -461,6 +462,33 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
461462
set(FP4_ARCHS)
462463
endif()
463464

465+
#
466+
# CUTLASS MoE kernels
467+
468+
# The MoE kernel cutlass_moe_mm requires CUDA 12.3 or later (and only works
469+
# on Hopper). get_cutlass_moe_mm_data should only be compiled if it's possible
470+
# to compile MoE kernels that use its output.
471+
cuda_archs_loose_intersection(SCALED_MM_ARCHS "9.0a;" "${CUDA_ARCHS}")
472+
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.3 AND SCALED_MM_ARCHS)
473+
set(SRCS "csrc/quantization/cutlass_w8a8/moe/grouped_mm_c3x.cu"
474+
"csrc/quantization/cutlass_w8a8/moe/moe_data.cu")
475+
set_gencode_flags_for_srcs(
476+
SRCS "${SRCS}"
477+
CUDA_ARCHS "${SCALED_MM_ARCHS}")
478+
list(APPEND VLLM_EXT_SRC "${SRCS}")
479+
list(APPEND VLLM_GPU_FLAGS "-DENABLE_CUTLASS_MOE_SM90=1")
480+
message(STATUS "Building grouped_mm_c3x for archs: ${SCALED_MM_ARCHS}")
481+
else()
482+
if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.3 AND SCALED_MM_ARCHS)
483+
message(STATUS "Not building grouped_mm_c3x kernels as CUDA Compiler version is "
484+
"not >= 12.3, we recommend upgrading to CUDA 12.3 or later "
485+
"if you intend on running FP8 quantized MoE models on Hopper.")
486+
else()
487+
message(STATUS "Not building grouped_mm_c3x as no compatible archs found "
488+
"in CUDA target architectures")
489+
endif()
490+
endif()
491+
464492
#
465493
# Machete kernels
466494

0 commit comments

Comments
 (0)