Commit 7175a33

Merge branch 'main' into ray_dp_scale_out
2 parents: 064a94d + 980a172

428 files changed: +12741 additions, -5219 deletions

.buildkite/release-pipeline.yaml — 3 additions, 3 deletions

@@ -14,7 +14,7 @@ steps:
   agents:
     queue: cpu_queue_postmerge
   commands:
-    - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.6.3 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
+    - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.6.3 --build-arg torch_cuda_arch_list='7.0 7.5 8.0 8.9 9.0+PTX' --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
     - "mkdir artifacts"
     - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
     - "bash .buildkite/scripts/upload-wheels.sh"
@@ -31,7 +31,7 @@ steps:
   agents:
     queue: cpu_queue_postmerge
   commands:
-    - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=11.8.0 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
+    - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=11.8.0 --build-arg torch_cuda_arch_list='7.0 7.5 8.0 8.9 9.0+PTX' --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
     - "mkdir artifacts"
     - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
     - "bash .buildkite/scripts/upload-wheels.sh"
@@ -64,7 +64,7 @@ steps:
     - "docker push vllm/vllm-tpu:$BUILDKITE_COMMIT"
   plugins:
     - docker-login#v3.0.0:
-        username: vllm
+        username: vllmbot
         password-env: DOCKERHUB_TOKEN
   env:
     DOCKER_BUILDKIT: "1"
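
The new torch_cuda_arch_list build arg feeds TORCH_CUDA_ARCH_LIST, which controls the gencode flags nvcc receives when the wheel's CUDA kernels are compiled; a '+PTX' suffix requests forward-compatible PTX in addition to native SASS. A minimal Python model of that expansion (illustrative only, not the actual torch.utils.cpp_extension implementation):

def gencode_flags(arch_list: str) -> list[str]:
    # Model of how TORCH_CUDA_ARCH_LIST entries map to nvcc flags:
    # "8.0" -> native SASS only; "9.0+PTX" -> SASS plus forward-compatible PTX.
    flags = []
    for arch in arch_list.split():
        ptx = arch.endswith("+PTX")
        num = arch.removesuffix("+PTX").replace(".", "")
        flags.append(f"-gencode=arch=compute_{num},code=sm_{num}")
        if ptx:
            flags.append(f"-gencode=arch=compute_{num},code=compute_{num}")
    return flags

print(gencode_flags("7.0 7.5 8.0 8.9 9.0+PTX"))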

.buildkite/scripts/hardware_ci/run-amd-test.sh — 8 additions, 0 deletions

@@ -82,6 +82,14 @@
 if [[ $commands == *"pytest -v -s basic_correctness/test_basic_correctness.py"* ]]; then
   commands=${commands//"pytest -v -s basic_correctness/test_basic_correctness.py"/"VLLM_USE_TRITON_FLASH_ATTN=0 pytest -v -s basic_correctness/test_basic_correctness.py"}
 fi

+if [[ $commands == *"pytest -v -s models/test_registry.py"* ]]; then
+  commands=${commands//"pytest -v -s models/test_registry.py"/"pytest -v -s models/test_registry.py -k 'not BambaForCausalLM and not GritLM and not Mamba2ForCausalLM and not Zamba2ForCausalLM'"}
+fi
+
+if [[ $commands == *"VLLM_USE_V1=0 pytest -v -s models/test_initialization.py -k 'not llama4 and not plamo2'"* ]]; then
+  commands=${commands//"VLLM_USE_V1=0 pytest -v -s models/test_initialization.py -k 'not llama4 and not plamo2'"/"VLLM_USE_V1=0 pytest -v -s models/test_initialization.py -k 'not llama4 and not plamo2 and not BambaForCausalLM and not Gemma2ForCausalLM and not Grok1ModelForCausalLM and not Zamba2ForCausalLM and not Gemma2Model and not GritLM'"}
+fi
+
 if [[ $commands == *"pytest -v -s compile/test_basic_correctness.py"* ]]; then
   commands=${commands//"pytest -v -s compile/test_basic_correctness.py"/"VLLM_USE_TRITON_FLASH_ATTN=0 pytest -v -s compile/test_basic_correctness.py"}
 fi

.buildkite/scripts/hardware_ci/run-neuron-test.sh — 3 additions, 1 deletion

@@ -11,13 +11,14 @@ container_name="neuron_$(tr -dc A-Za-z0-9 < /dev/urandom | head -c 10; echo)"
 HF_CACHE="$(realpath ~)/huggingface"
 mkdir -p "${HF_CACHE}"
 HF_MOUNT="/root/.cache/huggingface"
+HF_TOKEN=$(aws secretsmanager get-secret-value --secret-id "ci/vllm-neuron/hf-token" --region us-west-2 --query 'SecretString' --output text | jq -r .VLLM_NEURON_CI_HF_TOKEN)

 NEURON_COMPILE_CACHE_URL="$(realpath ~)/neuron_compile_cache"
 mkdir -p "${NEURON_COMPILE_CACHE_URL}"
 NEURON_COMPILE_CACHE_MOUNT="/root/.cache/neuron_compile_cache"

 # Try building the docker image
-aws ecr get-login-password --region us-west-2 | docker login --username AWS --password-stdin 763104351884.dkr.ecr.us-west-2.amazonaws.com
+aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws

 # prune old image and containers to save disk space, and only once a day
 # by using a timestamp file in tmp.
@@ -47,6 +48,7 @@ trap remove_docker_container EXIT
 docker run --rm -it --device=/dev/neuron0 --network bridge \
   -v "${HF_CACHE}:${HF_MOUNT}" \
   -e "HF_HOME=${HF_MOUNT}" \
+  -e "HF_TOKEN=${HF_TOKEN}" \
   -v "${NEURON_COMPILE_CACHE_URL}:${NEURON_COMPILE_CACHE_MOUNT}" \
   -e "NEURON_COMPILE_CACHE_URL=${NEURON_COMPILE_CACHE_MOUNT}" \
   --name "${container_name}" \
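
For reference, the Secrets Manager fetch plus jq extraction in the HF_TOKEN line above is equivalent to this hedged boto3 sketch; it assumes boto3 is installed and AWS credentials are already present, as they would be on the CI runner:

import json

import boto3

# Fetch the secret and pull out the HF token field, mirroring
# `aws secretsmanager get-secret-value ... | jq -r .VLLM_NEURON_CI_HF_TOKEN`.
client = boto3.client("secretsmanager", region_name="us-west-2")
secret = client.get_secret_value(SecretId="ci/vllm-neuron/hf-token")
hf_token = json.loads(secret["SecretString"])["VLLM_NEURON_CI_HF_TOKEN"]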

.buildkite/scripts/upload-wheels.sh — 1 addition, 0 deletions

@@ -75,3 +75,4 @@ else
 fi

 aws s3 cp "$wheel" "s3://vllm-wheels/$version/"
+aws s3 cp index.html "s3://vllm-wheels/$version/vllm/index.html"

.buildkite/test-pipeline.yaml — 7 additions, 5 deletions

@@ -148,6 +148,8 @@ steps:
   # test with tp=2 and external_dp=2
   - VLLM_USE_V1=0 torchrun --nproc-per-node=4 distributed/test_torchrun_example.py
   - torchrun --nproc-per-node=4 distributed/test_torchrun_example.py
+  # test with tp=2 and pp=2
+  - PP_SIZE=2 torchrun --nproc-per-node=4 distributed/test_torchrun_example.py
   # test with internal dp
   - python3 ../examples/offline_inference/data_parallel.py
   - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/test_async_llm_dp.py
@@ -216,7 +218,6 @@ steps:
   - pytest -v -s v1/spec_decode
   - pytest -v -s v1/kv_connector/unit
   - pytest -v -s v1/test_serial_utils.py
-  - pytest -v -s v1/test_stats.py
   - pytest -v -s v1/test_utils.py
   - pytest -v -s v1/test_oracle.py
   # TODO: accuracy does not match, whether setting
@@ -309,6 +310,7 @@ steps:
   commands:
   - pytest -v -s compile/test_pass_manager.py
   - pytest -v -s compile/test_fusion.py
+  - pytest -v -s compile/test_silu_mul_quant_fusion.py
   - pytest -v -s compile/test_sequence_parallelism.py

 - label: PyTorch Fullgraph Smoke Test # 9min
@@ -379,7 +381,7 @@ steps:
   - pytest -v -s kernels/mamba

 - label: Tensorizer Test # 11min
-  mirror_hardwares: [amdexperimental]
+  mirror_hardwares: [amdexperimental, amdproduction]
   soft_fail: true
   source_file_dependencies:
   - vllm/model_executor/model_loader
@@ -455,7 +457,7 @@ steps:
 ##### models test #####

 - label: Basic Models Test # 24min
-  mirror_hardwares: [amdexperimental]
+  mirror_hardwares: [amdexperimental, amdproduction]
   torch_nightly: true
   source_file_dependencies:
   - vllm/
@@ -527,7 +529,7 @@ steps:
   - pytest -v -s models/multimodal/generation/test_common.py -m 'split(group=0) and not core_model'

 - label: Multi-Modal Models Test (Extended) 3
-  mirror_hardwares: [amdexperimental]
+  mirror_hardwares: [amdexperimental, amdproduction]
   optional: true
   source_file_dependencies:
   - vllm/
@@ -537,7 +539,7 @@ steps:
   - pytest -v -s models/multimodal/generation/test_common.py -m 'split(group=1) and not core_model'

 - label: Quantized Models Test
-  mirror_hardwares: [amdexperimental]
+  mirror_hardwares: [amdexperimental, amdproduction]
   source_file_dependencies:
   - vllm/model_executor/layers/quantization
   - tests/models/quantization

.github/CODEOWNERS — 2 additions, 0 deletions

@@ -13,6 +13,7 @@
 /vllm/model_executor/guided_decoding @mgoin @russellb
 /vllm/multimodal @DarkLight1337 @ywang96
 /vllm/vllm_flash_attn @LucasWilkinson
+/vllm/lora @jeejeelee
 CMakeLists.txt @tlrmchlsmth

 # vLLM V1
@@ -40,3 +41,4 @@ CMakeLists.txt @tlrmchlsmth
 /tests/v1/entrypoints/llm/test_struct_output_generate.py @mgoin @russellb
 /tests/v1/structured_output @mgoin @russellb
 /tests/weight_loading @mgoin @youkaichao
+/tests/lora @jeejeelee

.github/mergify.yml — 11 additions, 0 deletions

@@ -163,6 +163,17 @@ pull_request_rules:

       https://docs.github.com/en/pull-requests/collaborating-with-pull-requests/working-with-forks/syncing-a-fork

+  - name: assign reviewer for tensorizer changes
+    conditions:
+      - files~=^vllm/model_executor/model_loader/tensorizer.py
+      - files~=^vllm/model_executor/model_loader/tensorizer_loader.py
+      - files~=^tests/entrypoints/openai/test_tensorizer_entrypoint.py
+      - files~=^tests/tensorizer_loader/
+    actions:
+      assign:
+        users:
+          - "sangstar"
+
   - name: remove 'needs-rebase' label when conflict is resolved
     conditions:
       - -conflict

CMakeLists.txt — 6 additions, 3 deletions

@@ -301,7 +301,8 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
   # Only build Marlin kernels if we are building for at least some compatible archs.
   # Keep building Marlin for 9.0 as there are some group sizes and shapes that
   # are not supported by Machete yet.
-  cuda_archs_loose_intersection(MARLIN_ARCHS "8.0;8.6;8.7;8.9;9.0;10.0;10.1;12.0" "${CUDA_ARCHS}")
+  # 9.0 for latest bf16 atomicAdd PTX
+  cuda_archs_loose_intersection(MARLIN_ARCHS "8.0;9.0+PTX" "${CUDA_ARCHS}")
   if (MARLIN_ARCHS)

   #
@@ -445,8 +446,9 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
   #
   # For the cutlass_scaled_mm kernels we want to build the c2x (CUTLASS 2.x)
   # kernels for the remaining archs that are not already built for 3x.
+  # (Build 8.9 for FP8)
   cuda_archs_loose_intersection(SCALED_MM_2X_ARCHS
-    "7.5;8.0;8.6;8.7;8.9;9.0;10.0;10.1;12.0" "${CUDA_ARCHS}")
+    "7.5;8.0;8.9+PTX" "${CUDA_ARCHS}")
   # subtract out the archs that are already built for 3x
   list(REMOVE_ITEM SCALED_MM_2X_ARCHS ${SCALED_MM_3X_ARCHS})
   if (SCALED_MM_2X_ARCHS)
@@ -675,7 +677,8 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
     CUDA_ARCHS "${CUDA_ARCHS}")

   list(APPEND VLLM_MOE_EXT_SRC "${VLLM_MOE_WNA16_SRC}")
-  cuda_archs_loose_intersection(MARLIN_MOE_ARCHS "8.0;8.6;8.7;8.9;9.0;10.0;10.1;12.0" "${CUDA_ARCHS}")
+  # 9.0 for latest bf16 atomicAdd PTX
+  cuda_archs_loose_intersection(MARLIN_MOE_ARCHS "8.0;9.0+PTX" "${CUDA_ARCHS}")
   if (MARLIN_MOE_ARCHS)

   #

benchmarks/cutlass_benchmarks/w8a8_benchmarks.py — 10 additions, 2 deletions

@@ -115,8 +115,16 @@ def bench_fp8(
     a_cont = a.contiguous()
     scale_a = torch.tensor(1.0, device="cuda", dtype=torch.float32)
     scale_b = torch.tensor(1.0, device="cuda", dtype=torch.float32)
-    block_scale_a = torch.rand((m, k // 128), device="cuda", dtype=torch.float32)
-    block_scale_b = torch.rand((k // 128, n // 128), device="cuda", dtype=torch.float32)
+
+    def ceil_div(x: int, y: int) -> int:
+        return (x + y - 1) // y
+
+    block_scale_a = torch.rand(
+        (m, ceil_div(k, 128)), device="cuda", dtype=torch.float32
+    )
+    block_scale_b = torch.rand(
+        ceil_div(k, 128), ceil_div(n, 128), device="cuda", dtype=torch.float32
+    )
     block_scale_a_M_major = block_scale_a.t().contiguous().t()
     block_scale_b_K_major = block_scale_b.t().contiguous().t()
     bias = torch.zeros((n,), device="cuda", dtype=torch.bfloat16)
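
The switch to a ceiling division matters whenever a dimension is not a multiple of the 128-wide quantization block: floor division would allocate no scale entry for the final partial block. A quick sketch of the difference:

def ceil_div(x: int, y: int) -> int:
    # Round up so every (possibly partial) block gets a scale entry.
    return (x + y - 1) // y

k = 200  # not a multiple of the 128-wide block
assert k // 128 == 1          # floor: the trailing 72 elements get no scale
assert ceil_div(k, 128) == 2  # ceil: one scale per block, partial included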

cmake/utils.cmake — 69 additions, 20 deletions

@@ -228,11 +228,26 @@ macro(set_gencode_flags_for_srcs)
     "${multiValueArgs}" ${ARGN} )

   foreach(_ARCH ${arg_CUDA_ARCHS})
-    string(REPLACE "." "" _ARCH "${_ARCH}")
-    set_gencode_flag_for_srcs(
-      SRCS ${arg_SRCS}
-      ARCH "compute_${_ARCH}"
-      CODE "sm_${_ARCH}")
+    # handle +PTX suffix: generate both sm and ptx codes if requested
+    string(FIND "${_ARCH}" "+PTX" _HAS_PTX)
+    if(NOT _HAS_PTX EQUAL -1)
+      string(REPLACE "+PTX" "" _BASE_ARCH "${_ARCH}")
+      string(REPLACE "." "" _STRIPPED_ARCH "${_BASE_ARCH}")
+      set_gencode_flag_for_srcs(
+        SRCS ${arg_SRCS}
+        ARCH "compute_${_STRIPPED_ARCH}"
+        CODE "sm_${_STRIPPED_ARCH}")
+      set_gencode_flag_for_srcs(
+        SRCS ${arg_SRCS}
+        ARCH "compute_${_STRIPPED_ARCH}"
+        CODE "compute_${_STRIPPED_ARCH}")
+    else()
+      string(REPLACE "." "" _STRIPPED_ARCH "${_ARCH}")
+      set_gencode_flag_for_srcs(
+        SRCS ${arg_SRCS}
+        ARCH "compute_${_STRIPPED_ARCH}"
+        CODE "sm_${_STRIPPED_ARCH}")
+    endif()
   endforeach()

   if (${arg_BUILD_PTX_FOR_ARCH})
@@ -251,7 +266,10 @@ endmacro()
 #
 # For the given `SRC_CUDA_ARCHS` list of gencode versions in the form
 # `<major>.<minor>[letter]` compute the "loose intersection" with the
-# `TGT_CUDA_ARCHS` list of gencodes.
+# `TGT_CUDA_ARCHS` list of gencodes. We also support the `+PTX` suffix in
+# `SRC_CUDA_ARCHS` which indicates that the PTX code should be built when there
+# is a CUDA_ARCH in `TGT_CUDA_ARCHS` that is equal to or larger than the
+# architecture in `SRC_CUDA_ARCHS`.
 # The loose intersection is defined as:
 #   { max{ x \in tgt | x <= y } | y \in src, { x \in tgt | x <= y } != {} }
 # where `<=` is the version comparison operator.
@@ -268,44 +286,63 @@
 # cuda_archs_loose_intersection(OUT_CUDA_ARCHS SRC_CUDA_ARCHS TGT_CUDA_ARCHS)
 #   OUT_CUDA_ARCHS="8.0;8.6;9.0;9.0a"
 #
+# Example With PTX:
+#   SRC_CUDA_ARCHS="8.0+PTX"
+#   TGT_CUDA_ARCHS="9.0"
+#   cuda_archs_loose_intersection(OUT_CUDA_ARCHS SRC_CUDA_ARCHS TGT_CUDA_ARCHS)
+#   OUT_CUDA_ARCHS="8.0+PTX"
+#
 function(cuda_archs_loose_intersection OUT_CUDA_ARCHS SRC_CUDA_ARCHS TGT_CUDA_ARCHS)
-  list(REMOVE_DUPLICATES SRC_CUDA_ARCHS)
-  set(TGT_CUDA_ARCHS_ ${TGT_CUDA_ARCHS})
+  set(_SRC_CUDA_ARCHS "${SRC_CUDA_ARCHS}")
+  set(_TGT_CUDA_ARCHS ${TGT_CUDA_ARCHS})
+
+  # handle +PTX suffix: separate base arch for matching, record PTX requests
+  set(_PTX_ARCHS)
+  foreach(_arch ${_SRC_CUDA_ARCHS})
+    if(_arch MATCHES "\\+PTX$")
+      string(REPLACE "+PTX" "" _base "${_arch}")
+      list(APPEND _PTX_ARCHS "${_base}")
+      list(REMOVE_ITEM _SRC_CUDA_ARCHS "${_arch}")
+      list(APPEND _SRC_CUDA_ARCHS "${_base}")
+    endif()
+  endforeach()
+  list(REMOVE_DUPLICATES _PTX_ARCHS)
+  list(REMOVE_DUPLICATES _SRC_CUDA_ARCHS)

   # if x.0a is in SRC_CUDA_ARCHS and x.0 is in CUDA_ARCHS then we should
   # remove x.0a from SRC_CUDA_ARCHS and add x.0a to _CUDA_ARCHS
   set(_CUDA_ARCHS)
-  if ("9.0a" IN_LIST SRC_CUDA_ARCHS)
-    list(REMOVE_ITEM SRC_CUDA_ARCHS "9.0a")
-    if ("9.0" IN_LIST TGT_CUDA_ARCHS_)
-      list(REMOVE_ITEM TGT_CUDA_ARCHS_ "9.0")
+  if ("9.0a" IN_LIST _SRC_CUDA_ARCHS)
+    list(REMOVE_ITEM _SRC_CUDA_ARCHS "9.0a")
+    if ("9.0" IN_LIST TGT_CUDA_ARCHS)
+      list(REMOVE_ITEM _TGT_CUDA_ARCHS "9.0")
       set(_CUDA_ARCHS "9.0a")
     endif()
   endif()

-  if ("10.0a" IN_LIST SRC_CUDA_ARCHS)
-    list(REMOVE_ITEM SRC_CUDA_ARCHS "10.0a")
+  if ("10.0a" IN_LIST _SRC_CUDA_ARCHS)
+    list(REMOVE_ITEM _SRC_CUDA_ARCHS "10.0a")
     if ("10.0" IN_LIST TGT_CUDA_ARCHS)
-      list(REMOVE_ITEM TGT_CUDA_ARCHS_ "10.0")
+      list(REMOVE_ITEM _TGT_CUDA_ARCHS "10.0")
       set(_CUDA_ARCHS "10.0a")
     endif()
   endif()

-  list(SORT SRC_CUDA_ARCHS COMPARE NATURAL ORDER ASCENDING)
+  list(SORT _SRC_CUDA_ARCHS COMPARE NATURAL ORDER ASCENDING)

   # for each ARCH in TGT_CUDA_ARCHS find the highest arch in SRC_CUDA_ARCHS that
   # is less or equal to ARCH (but has the same major version since SASS binary
   # compatibility is only forward compatible within the same major version).
-  foreach(_ARCH ${TGT_CUDA_ARCHS_})
+  foreach(_ARCH ${_TGT_CUDA_ARCHS})
     set(_TMP_ARCH)
     # Extract the major version of the target arch
     string(REGEX REPLACE "^([0-9]+)\\..*$" "\\1" TGT_ARCH_MAJOR "${_ARCH}")
-    foreach(_SRC_ARCH ${SRC_CUDA_ARCHS})
+    foreach(_SRC_ARCH ${_SRC_CUDA_ARCHS})
       # Extract the major version of the source arch
       string(REGEX REPLACE "^([0-9]+)\\..*$" "\\1" SRC_ARCH_MAJOR "${_SRC_ARCH}")
-      # Check major-version match AND version-less-or-equal
+      # Check version-less-or-equal, and allow PTX arches to match across majors
       if (_SRC_ARCH VERSION_LESS_EQUAL _ARCH)
-        if (SRC_ARCH_MAJOR STREQUAL TGT_ARCH_MAJOR)
+        if (_SRC_ARCH IN_LIST _PTX_ARCHS OR SRC_ARCH_MAJOR STREQUAL TGT_ARCH_MAJOR)
           set(_TMP_ARCH "${_SRC_ARCH}")
         endif()
       else()
@@ -321,6 +358,18 @@ function(cuda_archs_loose_intersection OUT_CUDA_ARCHS SRC_CUDA_ARCHS TGT_CUDA_ARCHS)
   endforeach()

   list(REMOVE_DUPLICATES _CUDA_ARCHS)
+
+  # reapply +PTX suffix to architectures that requested PTX
+  set(_FINAL_ARCHS)
+  foreach(_arch ${_CUDA_ARCHS})
+    if(_arch IN_LIST _PTX_ARCHS)
+      list(APPEND _FINAL_ARCHS "${_arch}+PTX")
+    else()
+      list(APPEND _FINAL_ARCHS "${_arch}")
+    endif()
+  endforeach()
+  set(_CUDA_ARCHS ${_FINAL_ARCHS})
+
   set(${OUT_CUDA_ARCHS} ${_CUDA_ARCHS} PARENT_SCOPE)
 endfunction()
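
The matching rule is easier to see outside CMake. Below is a hedged Python model of the loose intersection with +PTX handling (it deliberately skips the 9.0a/10.0a special cases): a plain source arch must share the target's major version, while a +PTX source arch may match across majors because PTX is forward-compatible.

def loose_intersection(src: list[str], tgt: list[str]) -> list[str]:
    # Simplified model of cuda_archs_loose_intersection above. For each
    # target arch, keep the highest source arch <= it; '+PTX' sources may
    # match targets with a larger major version.
    def ver(v: str) -> list[int]:
        return [int(x) for x in v.split(".")]

    ptx = {a.removesuffix("+PTX") for a in src if a.endswith("+PTX")}
    bases = sorted({a.removesuffix("+PTX") for a in src}, key=ver)
    out = set()
    for t in tgt:
        best = None
        for s in bases:
            if ver(s) <= ver(t) and (s in ptx or ver(s)[0] == ver(t)[0]):
                best = s  # ascending order, so the last match is the highest
        if best is not None:
            out.add(best + "+PTX" if best in ptx else best)
    return sorted(out, key=lambda a: ver(a.removesuffix("+PTX")))

print(loose_intersection(["8.0+PTX"], ["9.0"]))            # ['8.0+PTX']
print(loose_intersection(["8.0", "9.0"], ["8.6", "9.0"]))  # ['8.0', '9.0']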

csrc/activation_kernels.cu — 3 additions, 0 deletions

@@ -70,6 +70,9 @@ __device__ __forceinline__ T gelu_tanh_kernel(const T& x) {
   int64_t num_tokens = input.numel() / input.size(-1); \
   dim3 grid(num_tokens); \
   dim3 block(std::min(d, 1024)); \
+  if (num_tokens == 0) { \
+    return; \
+  } \
   const at::cuda::OptionalCUDAGuard device_guard(device_of(input)); \
   const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); \
   VLLM_DISPATCH_FLOATING_TYPES( \
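
The num_tokens == 0 guard exists because a kernel launch with dim3 grid(0) is an invalid launch configuration in CUDA; an empty batch should simply be a no-op. A hedged reference sketch of the silu-and-mul semantics this file's activation macros dispatch, showing the zero-token edge case the guard covers:

import torch
import torch.nn.functional as F

def silu_and_mul_ref(x: torch.Tensor) -> torch.Tensor:
    # Reference semantics: split the last dim in half, apply SiLU to the
    # first half, multiply elementwise by the second half.
    d = x.shape[-1] // 2
    return F.silu(x[..., :d]) * x[..., d:]

# Zero tokens would mean a zero-sized grid; the CUDA macro now returns
# before launching, and the expected result is just an empty output.
empty = torch.empty((0, 2048))
assert silu_and_mul_ref(empty).shape == (0, 1024)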

csrc/attention/attention_kernels.cuh — 2 additions, 2 deletions

@@ -172,7 +172,7 @@ __device__ void paged_attention_kernel(
   // Load the query to registers.
   // Each thread in a thread group has a different part of the query.
-  // For example, if the the thread group size is 4, then the first thread in
+  // For example, if the thread group size is 4, then the first thread in
   // the group has 0, 4, 8, ... th vectors of the query, and the second thread
   // has 1, 5, 9, ... th vectors of the query, and so on. NOTE(woosuk): Because
   // q is split from a qkv tensor, it may not be contiguous.
@@ -259,7 +259,7 @@ __device__ void paged_attention_kernel(
   // Load a key to registers.
   // Each thread in a thread group has a different part of the key.
-  // For example, if the the thread group size is 4, then the first thread in
+  // For example, if the thread group size is 4, then the first thread in
   // the group has 0, 4, 8, ... th vectors of the key, and the second thread
   // has 1, 5, 9, ... th vectors of the key, and so on.
   for (int i = 0; i < NUM_TOKENS_PER_THREAD_GROUP; i++) {

csrc/cutlass_extensions/common.hpp — 0 additions, 9 deletions

@@ -15,15 +15,6 @@
       cutlassGetStatusString(error)); \
   }

-/**
- * Panic wrapper for unwinding CUDA runtime errors
- */
-#define CUDA_CHECK(status) \
-  { \
-    cudaError_t error = status; \
-    TORCH_CHECK(error == cudaSuccess, cudaGetErrorString(error)); \
-  }
-
 inline int get_cuda_max_shared_memory_per_block_opt_in(int const device) {
   int max_shared_mem_per_block_opt_in = 0;
   cudaDeviceGetAttribute(&max_shared_mem_per_block_opt_in,
