Categorize tests/kernels/ based on kernel type #16799

Merged: 10 commits, Apr 23, 2025

2 changes: 1 addition & 1 deletion .buildkite/lm-eval-harness/test_lm_eval_correctness.py

@@ -16,7 +16,7 @@
 import pytest
 import yaml
 
-RTOL = 0.05
+RTOL = 0.08
 TEST_DATA_FILE = os.environ.get(
     "LM_EVAL_TEST_DATA_FILE",
     ".buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct.yaml")
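
RTOL is the relative tolerance the harness applies when comparing measured lm-eval scores against the expected values in the YAML config, so raising it from 0.05 to 0.08 widens the accepted deviation to 8%. A minimal sketch of how a relative-tolerance check of this kind behaves (illustration only; the actual comparison in test_lm_eval_correctness.py may be written differently):

import math

def within_rtol(measured: float, expected: float, rtol: float = 0.08) -> bool:
    # Accept the measurement if it deviates from the reference by no more
    # than rtol times the larger of the two magnitudes (math.isclose semantics).
    return math.isclose(measured, expected, rel_tol=rtol)

assert within_rtol(0.74, 0.80)      # ~7.5% deviation -> passes at rtol=0.08
assert not within_rtol(0.72, 0.80)  # 10% deviation   -> still fails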

41 changes: 36 additions & 5 deletions .buildkite/test-pipeline.yaml

@@ -313,15 +313,46 @@ steps:
   commands:
   - pytest -v -s compile/test_full_graph.py
 
-- label: Kernels Test %N # 1h each
-  mirror_hardwares: [amd]
+- label: Kernels Core Operation Test
   source_file_dependencies:
   - csrc/
+  - tests/kernels/core
+  commands:
+  - pytest -v -s kernels/core
+
+- label: Kernels Attention Test %N
+  source_file_dependencies:
+  - csrc/attention/
   - vllm/attention
-  - tests/kernels
+  - vllm/v1/attention
+  - tests/kernels/attention
   commands:
-  - pytest -v -s kernels --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
-  parallelism: 4
+  - pytest -v -s kernels/attention --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
+  parallelism: 2
 
+- label: Kernels Quantization Test %N
+  source_file_dependencies:
+  - csrc/quantization/
+  - vllm/model_executor/layers/quantization
+  - tests/kernels/quantization
+  commands:
+  - pytest -v -s kernels/quantization --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
+  parallelism: 2
+
+- label: Kernels MoE Test
+  source_file_dependencies:
+  - csrc/moe/
+  - tests/kernels/moe
+  - vllm/model_executor/layers/fused_moe/
+  commands:
+  - pytest -v -s kernels/moe
+
+- label: Kernels Mamba Test
+  source_file_dependencies:
+  - csrc/mamba/
+  - tests/kernels/mamba
+  commands:
+  - pytest -v -s kernels/mamba
+
 - label: Tensorizer Test # 11min
   # mirror_hardwares: [amd]
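
The %N labels use Buildkite's parallel-job support: each parallel job receives BUILDKITE_PARALLEL_JOB and BUILDKITE_PARALLEL_JOB_COUNT, which the commands forward to pytest's --shard-id/--num-shards options so that every job runs a disjoint slice of the selected directory. A rough illustration of the sharding idea (hypothetical helper and test ids; not the actual plugin's algorithm):

import zlib

def tests_for_shard(test_ids: list[str], shard_id: int, num_shards: int) -> list[str]:
    # Hash each test id and keep the ones that land in this shard, so each
    # parallel job gets a stable, disjoint subset of the suite.
    return [t for t in test_ids
            if zlib.crc32(t.encode()) % num_shards == shard_id]

suite = [
    "tests/kernels/attention/test_a.py::test_case[0]",
    "tests/kernels/attention/test_a.py::test_case[1]",
    "tests/kernels/attention/test_b.py::test_other",
]
for shard in range(2):
    print(shard, tests_for_shard(suite, shard, num_shards=2))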
File renamed without changes.

@@ -6,13 +6,12 @@
 import pytest
 import torch
 
+from tests.kernels.allclose_default import get_default_atol, get_default_rtol
 from tests.kernels.utils import opcheck
 from vllm import _custom_ops as ops
 from vllm.platforms import current_platform
 from vllm.utils import get_max_shared_memory_bytes
 
-from .allclose_default import get_default_atol, get_default_rtol
-
 if not current_platform.is_rocm():
     from xformers import ops as xops
     from xformers.ops.fmha.attn_bias import BlockDiagonalCausalMask
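
The import change above repeats in most of the moved files below: once a test module sits one level deeper (for example under a new tests/kernels/attention/ directory), a relative import such as from .allclose_default import ... would resolve inside that subdirectory, while the shared helpers remain at the tests/kernels/ root. Switching to absolute imports keeps them resolvable from any depth. Schematically (the exact destination paths of the renamed files are not shown in this view):

# Shared helpers stay at the package root after the reorganization:
#   tests/kernels/allclose_default.py
#   tests/kernels/utils.py
#   tests/kernels/<category>/test_*.py   <- moved test modules
# so moved modules import them absolutely:
from tests.kernels.allclose_default import get_default_atol, get_default_rtol
# rather than relatively, which would now point inside the category directory:
# from .allclose_default import get_default_atol, get_default_rtol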

@@ -156,6 +156,15 @@ def test_env(
         expected = ("TRITON_MLA_VLLM_V1"
                     if use_v1 else "TRITON_MLA")
         assert backend.get_name() == expected
+    elif name == "FLASHINFER":
+        backend = get_attn_backend(16,
+                                   torch.float16,
+                                   torch.float16,
+                                   block_size,
+                                   False,
+                                   use_mla=use_mla)
+        expected = "FLASHINFER_VLLM_V1" if use_v1 else name
+        assert backend.get_name() == expected
     else:
         backend = get_attn_backend(16,
                                    torch.float16,
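
The new branch encodes the V1-engine naming convention for this backend: the selector is expected to report a _VLLM_V1-suffixed name when use_v1 is set and the plain requested name otherwise. A tiny restatement of that expectation (drawn from the test's expected values, not from the selector's implementation):

def expected_backend_name(requested: str, use_v1: bool) -> str:
    # Mirrors the assertion in the FLASHINFER branch above.
    return f"{requested}_VLLM_V1" if use_v1 else requested

assert expected_backend_name("FLASHINFER", use_v1=True) == "FLASHINFER_VLLM_V1"
assert expected_backend_name("FLASHINFER", use_v1=False) == "FLASHINFER"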

@@ -6,14 +6,13 @@
 import pytest
 import torch
 
+from tests.kernels.allclose_default import get_default_atol, get_default_rtol
 from vllm import _custom_ops as ops
 from vllm.attention.ops.blocksparse_attention.interface import (
     LocalStridedBlockSparseAttn)
 from vllm.platforms import current_platform
 from vllm.utils import get_max_shared_memory_bytes
 
-from .allclose_default import get_default_atol, get_default_rtol
-
 FLOAT32_BYTES = torch.finfo(torch.float).bits // 8
 # This will change depending on the compute capability.
 # - 512 as a buffer
File renamed without changes.

@@ -5,15 +5,14 @@
 import pytest
 import torch
 
+from tests.kernels.allclose_default import get_default_atol, get_default_rtol
 from tests.kernels.utils import opcheck
 from vllm.model_executor.layers.activation import (FastGELU, FatreluAndMul,
                                                     GeluAndMul, MulAndSilu,
                                                     NewGELU, QuickGELU,
                                                     SiluAndMul)
 from vllm.platforms import current_platform
 
-from .allclose_default import get_default_atol, get_default_rtol
-
 DTYPES = [torch.half, torch.bfloat16, torch.float]
 NUM_TOKENS = [7, 83, 2048]  # Arbitrary values for testing
 D = [512, 13824]  # Arbitrary values for testing

25 changes: 25 additions & 0 deletions tests/kernels/core/test_opcheck.py

@@ -0,0 +1,25 @@
+# SPDX-License-Identifier: Apache-2.0
+"""
+Tests for miscellaneous utilities
+"""
+
+import torch
+
+from tests.kernels.utils import opcheck
+
+
+def test_convert_fp8_opcheck():
+    data = torch.randn((256, 256), dtype=torch.float32, device="cuda")
+    result = torch.empty_like(data, dtype=torch.float8_e4m3fn)
+    opcheck(torch.ops._C_cache_ops.convert_fp8, (result, data, 1.0, "fp8"))
+
+
+# TODO: Add this back, currently fails with
+# csrc/cuda_utils_kernels.cu:15 'invalid argument'
+# @pytest.mark.skipif(not current_platform.is_cuda(),
+#                     reason="Only supported for CUDA")
+# def test_cuda_utils_opcheck():
+#     opcheck(torch.ops._C_cuda_utils.get_device_attribute, (0, 0))
+#     opcheck(
+#         torch.ops._C_cuda_utils.
+#         get_max_shared_memory_per_block_device_attribute, (0, ))
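
For readers unfamiliar with opcheck: the tests.kernels.utils.opcheck helper wraps PyTorch's operator-checking utility, which exercises a registered custom op through its schema, fake-tensor, and autograd-registration checks. A standalone sketch of the same idea using torch.library directly (toy op defined here purely for illustration; assumes a recent PyTorch with torch.library.custom_op, and is not a vLLM kernel):

import torch
from torch.library import custom_op, opcheck

# A throwaway custom op registered only for this illustration.
@custom_op("demo::scaled_copy", mutates_args=())
def scaled_copy(x: torch.Tensor, scale: float) -> torch.Tensor:
    return x * scale

# Fake (meta) implementation so opcheck can verify FakeTensor propagation.
@scaled_copy.register_fake
def _(x: torch.Tensor, scale: float) -> torch.Tensor:
    return torch.empty_like(x)

def test_scaled_copy_opcheck():
    args = (torch.randn(8, 8), 0.5)
    opcheck(torch.ops.demo.scaled_copy.default, args)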

@@ -6,11 +6,10 @@
 import pytest
 import torch
 
+from tests.kernels.allclose_default import get_default_atol, get_default_rtol
 from vllm.model_executor.layers.rotary_embedding import get_rope
 from vllm.platforms import current_platform
 
-from .allclose_default import get_default_atol, get_default_rtol
-
 IS_NEOX_STYLE = [True, False]
 DTYPES = [torch.half, torch.bfloat16, torch.float]
 HEAD_SIZES = [64, 80, 112, 120, 256]
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.

@@ -6,6 +6,7 @@
 import pytest
 import torch
 
+from tests.kernels.utils_block import native_w8a8_block_matmul
 from vllm.config import VllmConfig, set_current_vllm_config
 from vllm.model_executor.layers.activation import SiluAndMul
 from vllm.model_executor.layers.fused_moe import fused_moe
@@ -18,8 +19,6 @@
     per_token_group_quant_fp8, w8a8_block_fp8_matmul)
 from vllm.platforms import current_platform
 
-from .utils_block import native_w8a8_block_matmul
-
 dg_available = False
 try:
     import deep_gemm
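
native_w8a8_block_matmul, now imported from tests/kernels/utils_block.py, is the plain-PyTorch reference that the fused FP8/INT8 block kernels are checked against in these tests. As a conceptual sketch of what such a blockwise W8A8 reference does (dequantize with per-block scales, then multiply in fp32), not the actual helper's code:

import torch

def blockwise_ref_matmul(A, B, As, Bs, block_size, out_dtype=torch.float32):
    # A:  (M, K) quantized activations, As: (M, ceil(K/block_k)) scales
    # B:  (N, K) quantized weights,     Bs: (ceil(N/block_n), ceil(K/block_k)) scales
    block_n, block_k = block_size
    M, K = A.shape
    N = B.shape[0]
    # Expand the per-block scales to element granularity, dequantize, then
    # fall back to an ordinary fp32 matmul as the ground truth.
    A_dq = A.to(torch.float32) * As.repeat_interleave(block_k, dim=1)[:, :K]
    B_scales = Bs.repeat_interleave(block_n, dim=0)[:N]
    B_dq = B.to(torch.float32) * B_scales.repeat_interleave(block_k, dim=1)[:, :K]
    return (A_dq @ B_dq.t()).to(out_dtype)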

@@ -6,15 +6,14 @@
 import pytest
 import torch
 
+from tests.kernels.utils_block import native_w8a8_block_matmul
 from vllm.config import VllmConfig, set_current_vllm_config
 from vllm.model_executor.layers.activation import SiluAndMul
 from vllm.model_executor.layers.fused_moe import fused_moe
 from vllm.model_executor.layers.quantization.utils.int8_utils import (
     w8a8_block_int8_matmul)
 from vllm.platforms import current_platform
 
-from .utils_block import native_w8a8_block_matmul
-
 if current_platform.get_device_capability() < (7, 0):
     pytest.skip("INT8 Triton requires CUDA 7.0 or higher",
                 allow_module_level=True)

@@ -7,13 +7,12 @@
 import pytest
 import torch
 
+from tests.kernels.utils import baseline_scaled_mm, to_fp8, to_int8
 from vllm import _custom_ops as ops
 from vllm.model_executor.layers.quantization.utils.w8a8_utils import (
     sparse_cutlass_supported)
 from vllm.platforms import current_platform
 
-from .utils import baseline_scaled_mm, to_fp8, to_int8
-
 CUDA_DEVICES = [
     f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2)
 ]

@@ -8,13 +8,11 @@
 import pytest
 import torch
 
-from tests.kernels.utils import opcheck
+from tests.kernels.utils import baseline_scaled_mm, opcheck, to_fp8, to_int8
 from vllm import _custom_ops as ops
 from vllm.platforms import current_platform
 from vllm.utils import cdiv
 
-from .utils import baseline_scaled_mm, to_fp8, to_int8
-
 MNK_FACTORS = [
     (1, 256, 128),
     (1, 16384, 1024),
File renamed without changes.
File renamed without changes.
File renamed without changes.
25 changes: 0 additions & 25 deletions tests/kernels/test_utils.py

This file was deleted.