Skip to content

Commit cc463fe

Browse files
committed
Merge branch 'tag-upstream-v0.8.5' into upstream-v0.8.5
2 parents 1e358ff + ba41cc9 commit cc463fe

File tree

690 files changed

+39305
-13284
lines changed

Some content is hidden

Large commits have some content hidden by default. Use the search box below for content that may be hidden.

690 files changed

+39305
-13284
lines changed

.buildkite/lm-eval-harness/configs/DeepSeek-V2-Lite-Chat.yaml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
# For vllm script, with -t option (tensor parallel size).
12
# bash ./run-lm-eval-gsm-vllm-baseline.sh -m deepseek-ai/DeepSeek-V2-Lite-Chat -b "auto" -l 1000 -f 5 -t 2
23
model_name: "deepseek-ai/DeepSeek-V2-Lite-Chat"
34
tasks:

.buildkite/lm-eval-harness/configs/Meta-Llama-3-70B-Instruct-FBGEMM-nonuniform.yaml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
# For hf script, without -t option (tensor parallel size).
12
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh -m nm-testing/Meta-Llama-3-70B-Instruct-FBGEMM-nonuniform -b auto -l 1000 -f 5
23
model_name: "nm-testing/Meta-Llama-3-70B-Instruct-FBGEMM-nonuniform"
34
tasks:

.buildkite/lm-eval-harness/configs/Meta-Llama-3-70B-Instruct.yaml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
# For hf script, without -t option (tensor parallel size).
12
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh -m meta-llama/Meta-Llama-3-70B-Instruct -b 32 -l 250 -f 5
23
model_name: "meta-llama/Meta-Llama-3-70B-Instruct"
34
tasks:

.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-Channelwise-compressed-tensors.yaml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
# For vllm script, with -t option (tensor parallel size).
12
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Meta-Llama-3-8B-Instruct-W8A8-FP8-Channelwise-compressed-tensors -b auto -l 1000 -f 5 -t 1
23
model_name: "nm-testing/Meta-Llama-3-8B-Instruct-W8A8-FP8-Channelwise-compressed-tensors"
34
tasks:

.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-FBGEMM-nonuniform.yaml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
# For vllm script, with -t option (tensor parallel size).
12
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Meta-Llama-3-8B-Instruct-FBGEMM-nonuniform -b auto -l 1000 -f 5 -t 1
23
model_name: "nm-testing/Meta-Llama-3-8B-Instruct-FBGEMM-nonuniform"
34
tasks:

.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-FP8-compressed-tensors.yaml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
# For vllm script, with -t option (tensor parallel size).
12
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Meta-Llama-3-8B-FP8-compressed-tensors-test -b 32 -l 1000 -f 5 -t 1
23
model_name: "nm-testing/Meta-Llama-3-8B-FP8-compressed-tensors-test"
34
tasks:

.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-FP8.yaml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
# For vllm script, with -t option (tensor parallel size).
12
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m neuralmagic/Meta-Llama-3-8B-Instruct-FP8 -b 32 -l 250 -f 5 -t 1
23
model_name: "neuralmagic/Meta-Llama-3-8B-Instruct-FP8"
34
tasks:

.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-INT8-compressed-tensors-asym.yaml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
# For vllm script, with -t option (tensor parallel size).
12
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Meta-Llama-3-8B-Instruct-W8-Channel-A8-Dynamic-Asym-Per-Token-Test -b "auto" -l 250 -f 5 -t 1
23
model_name: "nm-testing/Meta-Llama-3-8B-Instruct-W8-Channel-A8-Dynamic-Asym-Per-Token-Test"
34
tasks:

.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-INT8-compressed-tensors.yaml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
# For vllm script, with -t option (tensor parallel size).
12
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Meta-Llama-3-8B-Instruct-W8-Channel-A8-Dynamic-Per-Token-Test -b "auto" -l 250 -f 5 -t 1
23
model_name: "nm-testing/Meta-Llama-3-8B-Instruct-W8-Channel-A8-Dynamic-Per-Token-Test"
34
tasks:

.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-nonuniform-compressed-tensors.yaml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
# For vllm script, with -t option (tensor parallel size).
12
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Meta-Llama-3-8B-Instruct-nonuniform-test -b auto -l 1000 -f 5 -t 1
23
model_name: "nm-testing/Meta-Llama-3-8B-Instruct-nonuniform-test"
34
tasks:

.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct.yaml

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
1-
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh -m meta-llama/Meta-Llama-3-8B-Instruct -b 32 -l 250 -f 5 -t 1
1+
# For hf script, without -t option (tensor parallel size).
2+
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh -m meta-llama/Meta-Llama-3-8B-Instruct -b 32 -l 250 -f 5
23
model_name: "meta-llama/Meta-Llama-3-8B-Instruct"
34
tasks:
45
- name: "gsm8k"

.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-QQQ.yaml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
# For vllm script, with -t option (tensor parallel size).
12
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m HandH1998/QQQ-Llama-3-8b-g128 -b 32 -l 1000 -f 5 -t 1
23
model_name: "HandH1998/QQQ-Llama-3-8b-g128"
34
tasks:

.buildkite/lm-eval-harness/configs/Meta-Llama-3.2-1B-Instruct-INT8-compressed-tensors.yaml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
# For vllm script, with -t option (tensor parallel size).
12
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m neuralmagic/Llama-3.2-1B-Instruct-quantized.w8a8 -b "auto" -l 1000 -f 5 -t 1
23
model_name: "neuralmagic/Llama-3.2-1B-Instruct-quantized.w8a8"
34
tasks:

.buildkite/lm-eval-harness/configs/Minitron-4B-Base-FP8.yaml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
# For vllm script, with -t option (tensor parallel size).
12
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m mgoin/Minitron-4B-Base-FP8 -b auto -l 1000 -f 5 -t 1
23
model_name: "mgoin/Minitron-4B-Base-FP8"
34
tasks:

.buildkite/lm-eval-harness/configs/Mixtral-8x22B-Instruct-v0.1-FP8-Dynamic.yaml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
# For vllm script, with -t option (tensor parallel size).
12
# bash ./run-lm-eval-gsm-vllm-baseline.sh -m neuralmagic/Mixtral-8x22B-Instruct-v0.1-FP8-dynamic -b "auto" -l 250 -f 5 -t 8
23
model_name: "neuralmagic/Mixtral-8x22B-Instruct-v0.1-FP8-dynamic"
34
tasks:

.buildkite/lm-eval-harness/configs/Mixtral-8x7B-Instruct-v0.1-FP8.yaml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
# For vllm script, with -t option (tensor parallel size).
12
# bash ./run-lm-eval-gsm-vllm-baseline.sh -m neuralmagic/Mixtral-8x7B-Instruct-v0.1-FP8 -b "auto" -l 250 -f 5 -t 4
23
model_name: "neuralmagic/Mixtral-8x7B-Instruct-v0.1-FP8"
34
tasks:

.buildkite/lm-eval-harness/configs/Mixtral-8x7B-Instruct-v0.1.yaml

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
1-
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh -m neuralmagic/Mixtral-8x7B-Instruct-v0.1 -b 32 -l 250 -f 5 -t 4
1+
# For hf script, without -t option (tensor parallel size).
2+
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh -m neuralmagic/Mixtral-8x7B-Instruct-v0.1 -b 32 -l 250 -f 5
23
model_name: "mistralai/Mixtral-8x7B-Instruct-v0.1"
34
tasks:
45
- name: "gsm8k"
Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,12 @@
1+
# For vllm script, with -t option (tensor parallel size).
12
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Qwen1.5-MoE-A2.7B-Chat-quantized.w4a16 -b auto -l 1319 -f 5 -t 1
23
model_name: "nm-testing/Qwen1.5-MoE-A2.7B-Chat-quantized.w4a16"
34
tasks:
45
- name: "gsm8k"
56
metrics:
67
- name: "exact_match,strict-match"
7-
value: 0.31
8+
value: 0.30
89
- name: "exact_match,flexible-extract"
9-
value: 0.47
10+
value: 0.465
1011
limit: 1319
1112
num_fewshot: 5

.buildkite/lm-eval-harness/configs/Qwen2-1.5B-Instruct-FP8W8.yaml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
# For vllm script, with -t option (tensor parallel size).
12
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Qwen2-1.5B-Instruct-FP8W8 -b auto -l 1000 -f 5 -t 1
23
model_name: "nm-testing/Qwen2-1.5B-Instruct-FP8W8"
34
tasks:

.buildkite/lm-eval-harness/configs/Qwen2-1.5B-Instruct-INT8-compressed-tensors.yaml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
# For vllm script, with -t option (tensor parallel size).
12
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m neuralmagic/Qwen2-1.5B-Instruct-quantized.w8a8 -b "auto" -l 1000 -f 5 -t 1
23
model_name: "neuralmagic/Qwen2-1.5B-Instruct-quantized.w8a8"
34
tasks:

.buildkite/lm-eval-harness/configs/Qwen2-1.5B-Instruct-W8A16-compressed-tensors.yaml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
# For vllm script, with -t option (tensor parallel size).
12
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Qwen2-1.5B-Instruct-W8A16-Channelwise -b "auto" -l 1000 -f 5 -t 1
23
model_name: "nm-testing/Qwen2-1.5B-Instruct-W8A16-Channelwise"
34
tasks:

.buildkite/lm-eval-harness/configs/Qwen2-57B-A14-Instruct.yaml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
# For vllm script, with -t option (tensor parallel size).
12
# bash ./run-lm-eval-gsm-vllm-baseline.sh -m Qwen/Qwen2-57B-A14B-Instruct -b "auto" -l 250 -f 5 -t 4
23
model_name: "Qwen/Qwen2-57B-A14B-Instruct"
34
tasks:

.buildkite/lm-eval-harness/configs/SparseLlama3.1_2of4_fp8_compressed.yaml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
# For vllm script, with -t option (tensor parallel size).
12
# bash ./run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/SparseLlama-3.1-8B-gsm8k-pruned.2of4-chnl_wts_per_tok_dyn_act_fp8-BitM -b "auto" -t 2
23
model_name: "nm-testing/SparseLlama-3.1-8B-gsm8k-pruned.2of4-chnl_wts_per_tok_dyn_act_fp8-BitM"
34
tasks:

.buildkite/lm-eval-harness/test_lm_eval_correctness.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@
1616
import pytest
1717
import yaml
1818

19-
RTOL = 0.05
19+
RTOL = 0.08
2020
TEST_DATA_FILE = os.environ.get(
2121
"LM_EVAL_TEST_DATA_FILE",
2222
".buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct.yaml")

.buildkite/release-pipeline.yaml

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -86,3 +86,18 @@ steps:
8686
- "docker push public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$(buildkite-agent meta-data get release-version)"
8787
env:
8888
DOCKER_BUILDKIT: "1"
89+
90+
- block: "Build Neuron release image"
91+
key: block-neuron-release-image-build
92+
depends_on: ~
93+
94+
- label: "Build and publish Neuron release image"
95+
depends_on: block-neuron-release-image-build
96+
agents:
97+
queue: neuron-postmerge
98+
commands:
99+
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
100+
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --tag public.ecr.aws/q9t5s3a7/vllm-neuron-release-repo:$(buildkite-agent meta-data get release-version) --tag public.ecr.aws/q9t5s3a7/vllm-neuron-release-repo:latest --progress plain -f docker/Dockerfile.neuron ."
101+
- "docker push public.ecr.aws/q9t5s3a7/vllm-neuron-release-repo:$(buildkite-agent meta-data get release-version)"
102+
env:
103+
DOCKER_BUILDKIT: "1"

.buildkite/scripts/hardware_ci/run-amd-test.sh

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -98,6 +98,13 @@ if [[ $commands == *" kernels "* ]]; then
9898
--ignore=kernels/test_machete_mm.py \
9999
--ignore=kernels/test_mha_attn.py \
100100
--ignore=kernels/test_block_fp8.py \
101+
--ignore=kernels/test_cutlass_moe.py \
102+
--ignore=kernels/test_mamba_ssm_ssd.py \
103+
--ignore=kernels/test_attention.py \
104+
--ignore=kernels/test_block_int8.py \
105+
--ignore=kernels/test_fused_quant_layernorm.py \
106+
--ignore=kernels/test_int8_kernel.py \
107+
--ignore=kernels/test_triton_moe_ptpc_fp8.py \
101108
--ignore=kernels/test_permute_cols.py"
102109
fi
103110

.buildkite/scripts/hardware_ci/run-cpu-test-ppc64le.sh

Lines changed: 33 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -5,10 +5,41 @@
55
set -ex
66

77
# Setup cleanup
8-
remove_docker_container() { docker rm -f cpu-test || true; docker system prune -f; }
8+
remove_docker_container() {
9+
if [[ -n "$container_id" ]]; then
10+
podman rm -f "$container_id" || true
11+
fi
12+
podman system prune -f
13+
}
914
trap remove_docker_container EXIT
1015
remove_docker_container
1116

1217
# Try building the docker image
13-
docker build -t cpu-test -f docker/Dockerfile.ppc64le .
18+
podman build -t cpu-test-ubi9-ppc -f docker/Dockerfile.ppc64le .
19+
20+
# Run the image
21+
container_id=$(podman run -itd --entrypoint /bin/bash -v /tmp/:/root/.cache/huggingface --privileged=true --network host -e HF_TOKEN cpu-test-ubi9-ppc)
22+
23+
function cpu_tests() {
24+
25+
# offline inference
26+
podman exec -it "$container_id" bash -c "
27+
set -e
28+
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m"
29+
30+
# Run basic model test
31+
podman exec -it "$container_id" bash -c "
32+
set -e
33+
pip install pytest pytest-asyncio einops peft Pillow soundfile transformers_stream_generator matplotlib
34+
pip install sentence-transformers datamodel_code_generator
35+
pytest -v -s tests/models/embedding/language/test_cls_models.py::test_classification_models[float-jason9693/Qwen2.5-1.5B-apeach]
36+
pytest -v -s tests/models/embedding/language/test_embedding.py::test_models[half-BAAI/bge-base-en-v1.5]
37+
pytest -v -s tests/models/encoder_decoder/language -m cpu_model"
38+
}
39+
40+
# All of CPU tests are expected to be finished less than 40 mins.
41+
42+
export container_id
43+
export -f cpu_tests
44+
timeout 40m bash -c cpu_tests
1445

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
#!/bin/bash
2+
3+
# This script builds the CPU docker image and runs the offline inference inside the container.
4+
# It serves as a sanity check for compilation and basic model usage.
5+
set -ex
6+
7+
# Setup cleanup
8+
remove_docker_container() { docker rm -f cpu-test || true; docker system prune -f; }
9+
trap remove_docker_container EXIT
10+
remove_docker_container
11+
12+
# Try building the docker image
13+
docker build -t cpu-test -f docker/Dockerfile.s390x .

.buildkite/scripts/hardware_ci/run-tpu-v1-test.sh

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -17,10 +17,13 @@ source /etc/environment
1717
docker run --privileged --net host --shm-size=16G -it \
1818
-e "HF_TOKEN=$HF_TOKEN" --name tpu-test \
1919
vllm-tpu /bin/bash -c "python3 -m pip install git+https://github.com/thuml/depyf.git \
20-
&& python3 -m pip install pytest \
20+
&& python3 -m pip install pytest pytest-asyncio tpu-info \
2121
&& python3 -m pip install lm_eval[api]==0.4.4 \
22+
&& export VLLM_XLA_CACHE_PATH= \
2223
&& export VLLM_USE_V1=1 \
2324
&& export VLLM_XLA_CHECK_RECOMPILATION=1 \
25+
&& echo HARDWARE \
26+
&& tpu-info \
2427
&& echo TEST_0 \
2528
&& pytest -v -s /workspace/vllm/tests/v1/tpu/test_perf.py \
2629
&& echo TEST_1 \
@@ -40,7 +43,11 @@ docker run --privileged --net host --shm-size=16G -it \
4043
&& echo TEST_8 \
4144
&& pytest -s -v /workspace/vllm/tests/v1/tpu/test_topk_topp_sampler.py \
4245
&& echo TEST_9 \
43-
&& pytest -s -v /workspace/vllm/tests/v1/tpu/test_pallas.py" \
46+
&& pytest -s -v /workspace/vllm/tests/v1/tpu/test_multimodal.py \
47+
&& echo TEST_10 \
48+
&& pytest -s -v /workspace/vllm/tests/v1/tpu/test_pallas.py \
49+
&& echo TEST_11 \
50+
&& pytest -s -v /workspace/vllm/tests/v1/entrypoints/llm/test_struct_output_generate.py" \
4451

4552

4653
# TODO: This test fails because it uses RANDOM_SEED sampling

0 commit comments

Comments
 (0)