
Commit cb262f3

mgoin authored and liuzijing2014 committed
Categorize tests/kernels/ based on kernel type (vllm-project#16799)
Signed-off-by: mgoin <[email protected]>
1 parent a1ae5f5 · commit cb262f3


55 files changed: +79 −48 lines

.buildkite/lm-eval-harness/test_lm_eval_correctness.py

Lines changed: 1 addition & 1 deletion
@@ -16,7 +16,7 @@
 import pytest
 import yaml

-RTOL = 0.05
+RTOL = 0.08
 TEST_DATA_FILE = os.environ.get(
     "LM_EVAL_TEST_DATA_FILE",
     ".buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct.yaml")

.buildkite/test-pipeline.yaml

Lines changed: 36 additions & 5 deletions
@@ -317,15 +317,46 @@ steps:
   commands:
   - pytest -v -s compile/test_full_graph.py

-- label: Kernels Test %N # 1h each
-  mirror_hardwares: [amd]
+- label: Kernels Core Operation Test
   source_file_dependencies:
   - csrc/
+  - tests/kernels/core
+  commands:
+  - pytest -v -s kernels/core
+
+- label: Kernels Attention Test %N
+  source_file_dependencies:
+  - csrc/attention/
   - vllm/attention
-  - tests/kernels
+  - vllm/v1/attention
+  - tests/kernels/attention
   commands:
-  - pytest -v -s kernels --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
-  parallelism: 4
+  - pytest -v -s kernels/attention --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
+  parallelism: 2
+
+- label: Kernels Quantization Test %N
+  source_file_dependencies:
+  - csrc/quantization/
+  - vllm/model_executor/layers/quantization
+  - tests/kernels/quantization
+  commands:
+  - pytest -v -s kernels/quantization --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
+  parallelism: 2
+
+- label: Kernels MoE Test
+  source_file_dependencies:
+  - csrc/moe/
+  - tests/kernels/moe
+  - vllm/model_executor/layers/fused_moe/
+  commands:
+  - pytest -v -s kernels/moe
+
+- label: Kernels Mamba Test
+  source_file_dependencies:
+  - csrc/mamba/
+  - tests/kernels/mamba
+  commands:
+  - pytest -v -s kernels/mamba

 - label: Tensorizer Test # 11min
   # mirror_hardwares: [amd]
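The split jobs keep pytest sharding for the larger suites, driven by Buildkite's parallel-job environment variables ($$BUILDKITE_PARALLEL_JOB / $$BUILDKITE_PARALLEL_JOB_COUNT, escaped with the double dollar sign for the pipeline). As a rough illustration only, not the actual plugin behind --shard-id/--num-shards, the splitting amounts to a round-robin partition of the collected tests:

    # Illustrative sketch of round-robin sharding; the real CI relies on a
    # pytest plugin that consumes --shard-id / --num-shards.
    def select_shard(items: list, shard_id: int, num_shards: int) -> list:
        assert 0 <= shard_id < num_shards
        return [it for i, it in enumerate(items) if i % num_shards == shard_id]

    tests = ["test_a", "test_b", "test_c", "test_d"]
    assert select_shard(tests, 0, 2) == ["test_a", "test_c"]  # shard 0 of 2
    assert select_shard(tests, 1, 2) == ["test_b", "test_d"]  # shard 1 of 2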
File renamed without changes.

tests/kernels/test_attention.py renamed to tests/kernels/attention/test_attention.py

Lines changed: 1 addition & 2 deletions
@@ -6,13 +6,12 @@
 import pytest
 import torch

+from tests.kernels.allclose_default import get_default_atol, get_default_rtol
 from tests.kernels.utils import opcheck
 from vllm import _custom_ops as ops
 from vllm.platforms import current_platform
 from vllm.utils import get_max_shared_memory_bytes

-from .allclose_default import get_default_atol, get_default_rtol
-
 if not current_platform.is_rocm():
     from xformers import ops as xops
     from xformers.ops.fmha.attn_bias import BlockDiagonalCausalMask
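This change (repeated in several files below) replaces the relative import of the tolerance helpers with an absolute tests.kernels.* import, which keeps resolving after the file moves into tests/kernels/attention/. A hedged sketch of how such dtype-aware helpers are typically used, assuming they take the output tensor and return float tolerances (the diff itself does not spell this out):

    import torch

    from tests.kernels.allclose_default import get_default_atol, get_default_rtol

    def assert_outputs_close(output: torch.Tensor, ref: torch.Tensor) -> None:
        # Assumed behavior: tolerances derived from the output tensor's dtype.
        atol = get_default_atol(output)
        rtol = get_default_rtol(output)
        torch.testing.assert_close(output, ref, atol=atol, rtol=rtol)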

tests/kernels/test_attention_selector.py renamed to tests/kernels/attention/test_attention_selector.py

Lines changed: 9 additions & 0 deletions
@@ -156,6 +156,15 @@ def test_env(
         expected = ("TRITON_MLA_VLLM_V1"
                     if use_v1 else "TRITON_MLA")
         assert backend.get_name() == expected
+    elif name == "FLASHINFER":
+        backend = get_attn_backend(16,
+                                   torch.float16,
+                                   torch.float16,
+                                   block_size,
+                                   False,
+                                   use_mla=use_mla)
+        expected = "FLASHINFER_VLLM_V1" if use_v1 else name
+        assert backend.get_name() == expected
     else:
         backend = get_attn_backend(16,
                                    torch.float16,
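The new branch mirrors the surrounding ones: request a backend by name and assert that get_attn_backend resolves it, with the _VLLM_V1 suffix under the V1 engine. A hedged sketch of forcing the backend outside the test harness, assuming vLLM's VLLM_ATTENTION_BACKEND environment variable is the selection mechanism (the test itself drives the same selection through its parametrized name):

    import os

    import torch

    from vllm.attention.selector import get_attn_backend

    # Assumption: setting VLLM_ATTENTION_BACKEND before selection forces the backend.
    os.environ["VLLM_ATTENTION_BACKEND"] = "FLASHINFER"

    # Arguments mirror the call in the diff above (head size, dtypes, block size, ...).
    backend = get_attn_backend(16, torch.float16, torch.float16, 16, False,
                               use_mla=False)
    print(backend.get_name())  # "FLASHINFER", or "FLASHINFER_VLLM_V1" on the V1 engine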

tests/kernels/test_blocksparse_attention.py renamed to tests/kernels/attention/test_blocksparse_attention.py

Lines changed: 1 addition & 2 deletions
@@ -6,14 +6,13 @@
 import pytest
 import torch

+from tests.kernels.allclose_default import get_default_atol, get_default_rtol
 from vllm import _custom_ops as ops
 from vllm.attention.ops.blocksparse_attention.interface import (
     LocalStridedBlockSparseAttn)
 from vllm.platforms import current_platform
 from vllm.utils import get_max_shared_memory_bytes

-from .allclose_default import get_default_atol, get_default_rtol
-
 FLOAT32_BYTES = torch.finfo(torch.float).bits // 8
 # This will change depending on the compute capability.
 # - 512 as a buffer
File renamed without changes.

tests/kernels/test_activation.py renamed to tests/kernels/core/test_activation.py

Lines changed: 1 addition & 2 deletions
@@ -5,15 +5,14 @@
 import pytest
 import torch

+from tests.kernels.allclose_default import get_default_atol, get_default_rtol
 from tests.kernels.utils import opcheck
 from vllm.model_executor.layers.activation import (FastGELU, FatreluAndMul,
                                                     GeluAndMul, MulAndSilu,
                                                     NewGELU, QuickGELU,
                                                     SiluAndMul)
 from vllm.platforms import current_platform

-from .allclose_default import get_default_atol, get_default_rtol
-
 DTYPES = [torch.half, torch.bfloat16, torch.float]
 NUM_TOKENS = [7, 83, 2048]  # Arbitrary values for testing
 D = [512, 13824]  # Arbitrary values for testing

tests/kernels/core/test_opcheck.py

Lines changed: 25 additions & 0 deletions
@@ -0,0 +1,25 @@
+# SPDX-License-Identifier: Apache-2.0
+"""
+Tests for miscellaneous utilities
+"""
+
+import torch
+
+from tests.kernels.utils import opcheck
+
+
+def test_convert_fp8_opcheck():
+    data = torch.randn((256, 256), dtype=torch.float32, device="cuda")
+    result = torch.empty_like(data, dtype=torch.float8_e4m3fn)
+    opcheck(torch.ops._C_cache_ops.convert_fp8, (result, data, 1.0, "fp8"))
+
+
+# TODO: Add this back, currently fails with
+# csrc/cuda_utils_kernels.cu:15 'invalid argument'
+# @pytest.mark.skipif(not current_platform.is_cuda(),
+#                     reason="Only supported for CUDA")
+# def test_cuda_utils_opcheck():
+#     opcheck(torch.ops._C_cuda_utils.get_device_attribute, (0, 0))
+#     opcheck(
+#         torch.ops._C_cuda_utils.
+#         get_max_shared_memory_per_block_device_attribute, (0, ))
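For reference, the opcheck helper imported from tests.kernels.utils is, to my understanding, a thin wrapper around torch.library.opcheck, which exercises a custom op's schema, fake-tensor, and dispatch behavior. A self-contained sketch against a toy op; the mylib::add_one op below is purely hypothetical and not part of vLLM:

    import torch

    # Hypothetical toy op, registered only to demonstrate what opcheck validates.
    @torch.library.custom_op("mylib::add_one", mutates_args=())
    def add_one(x: torch.Tensor) -> torch.Tensor:
        return x + 1

    @add_one.register_fake
    def _(x):
        # Fake (meta) implementation so the op can be traced without running it.
        return torch.empty_like(x)

    # Runs schema / fake-tensor / dispatch checks and raises on failure.
    torch.library.opcheck(add_one, (torch.randn(4),))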

tests/kernels/test_pos_encoding.py renamed to tests/kernels/core/test_pos_encoding.py

Lines changed: 1 addition & 2 deletions
@@ -6,11 +6,10 @@
 import pytest
 import torch

+from tests.kernels.allclose_default import get_default_atol, get_default_rtol
 from vllm.model_executor.layers.rotary_embedding import get_rope
 from vllm.platforms import current_platform

-from .allclose_default import get_default_atol, get_default_rtol
-
 IS_NEOX_STYLE = [True, False]
 DTYPES = [torch.half, torch.bfloat16, torch.float]
 HEAD_SIZES = [64, 80, 112, 120, 256]
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.

tests/kernels/test_block_fp8.py renamed to tests/kernels/quantization/test_block_fp8.py

Lines changed: 1 addition & 2 deletions
@@ -6,6 +6,7 @@
 import pytest
 import torch

+from tests.kernels.utils_block import native_w8a8_block_matmul
 from vllm.config import VllmConfig, set_current_vllm_config
 from vllm.model_executor.layers.activation import SiluAndMul
 from vllm.model_executor.layers.fused_moe import fused_moe
@@ -18,8 +19,6 @@
     per_token_group_quant_fp8, w8a8_block_fp8_matmul)
 from vllm.platforms import current_platform

-from .utils_block import native_w8a8_block_matmul
-
 dg_available = False
 try:
     import deep_gemm
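native_w8a8_block_matmul, now imported absolutely from tests.kernels.utils_block, is the pure-PyTorch reference that the block-quantized kernels are checked against. The sketch below is a hypothetical reference of that flavor — expand the per-block scales, dequantize to float32, then run a plain matmul — and does not claim to match the real helper's signature:

    import torch

    def blockwise_dequant_matmul(A_q, A_s, B_q, B_s, block_n, block_k,
                                 out_dtype=torch.float32):
        # Hypothetical shapes: A_q (M, K) with per-token/per-k-group scales A_s
        # (M, ceil(K/block_k)); B_q (N, K) with per-tile scales B_s
        # (ceil(N/block_n), ceil(K/block_k)).
        M, K = A_q.shape
        N, _ = B_q.shape
        # Expand the scale grids back to full (M, K) / (N, K) resolution.
        a_scales = A_s.repeat_interleave(block_k, dim=1)[:, :K]
        b_scales = B_s.repeat_interleave(block_n, dim=0)[:N, :]
        b_scales = b_scales.repeat_interleave(block_k, dim=1)[:, :K]
        A = A_q.to(torch.float32) * a_scales
        B = B_q.to(torch.float32) * b_scales
        return (A @ B.t()).to(out_dtype)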

tests/kernels/test_block_int8.py renamed to tests/kernels/quantization/test_block_int8.py

Lines changed: 1 addition & 2 deletions
@@ -6,15 +6,14 @@
 import pytest
 import torch

+from tests.kernels.utils_block import native_w8a8_block_matmul
 from vllm.config import VllmConfig, set_current_vllm_config
 from vllm.model_executor.layers.activation import SiluAndMul
 from vllm.model_executor.layers.fused_moe import fused_moe
 from vllm.model_executor.layers.quantization.utils.int8_utils import (
     w8a8_block_int8_matmul)
 from vllm.platforms import current_platform

-from .utils_block import native_w8a8_block_matmul
-
 if current_platform.get_device_capability() < (7, 0):
     pytest.skip("INT8 Triton requires CUDA 7.0 or higher",
                 allow_module_level=True)

tests/kernels/test_cutlass_2of4_sparse.py renamed to tests/kernels/quantization/test_cutlass_2of4_sparse.py

Lines changed: 1 addition & 2 deletions
@@ -7,13 +7,12 @@
 import pytest
 import torch

+from tests.kernels.utils import baseline_scaled_mm, to_fp8, to_int8
 from vllm import _custom_ops as ops
 from vllm.model_executor.layers.quantization.utils.w8a8_utils import (
     sparse_cutlass_supported)
 from vllm.platforms import current_platform

-from .utils import baseline_scaled_mm, to_fp8, to_int8
-
 CUDA_DEVICES = [
     f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2)
 ]

tests/kernels/test_cutlass.py renamed to tests/kernels/quantization/test_cutlass_scaled_mm.py

Lines changed: 1 addition & 3 deletions
@@ -8,13 +8,11 @@
 import pytest
 import torch

-from tests.kernels.utils import opcheck
+from tests.kernels.utils import baseline_scaled_mm, opcheck, to_fp8, to_int8
 from vllm import _custom_ops as ops
 from vllm.platforms import current_platform
 from vllm.utils import cdiv

-from .utils import baseline_scaled_mm, to_fp8, to_int8
-
 MNK_FACTORS = [
     (1, 256, 128),
     (1, 16384, 1024),
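baseline_scaled_mm, to_fp8, and to_int8 likewise move to absolute imports from tests.kernels.utils. As a rough sketch only (not the helper's actual signature), a baseline for a scaled matmul is a dequantize-then-matmul in float32:

    import torch

    def scaled_mm_reference(a, b, scale_a, scale_b, out_dtype, bias=None):
        # Hypothetical reference: scale_a broadcasts over the rows of a and
        # scale_b over the columns of b (per-tensor scales broadcast too).
        out = (scale_a * a.to(torch.float32)) @ (scale_b * b.to(torch.float32))
        if bias is not None:
            out = out + bias
        return out.to(out_dtype)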
File renamed without changes.
File renamed without changes.
File renamed without changes.

tests/kernels/test_utils.py

Lines changed: 0 additions & 25 deletions
This file was deleted.
