Sync from v0.13
This commit is contained in:
144
tests/kernels/core/test_activation.py
Normal file
144
tests/kernels/core/test_activation.py
Normal file
@@ -0,0 +1,144 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import random
|
||||
|
||||
import pytest
|
||||
import torch
|
||||
|
||||
from tests.kernels.allclose_default import get_default_atol, get_default_rtol
|
||||
from tests.kernels.utils import opcheck
|
||||
from vllm.model_executor.layers.activation import (
|
||||
FastGELU,
|
||||
FatreluAndMul,
|
||||
GeluAndMul,
|
||||
MulAndSilu,
|
||||
NewGELU,
|
||||
QuickGELU,
|
||||
SiluAndMul,
|
||||
SwigluOAIAndMul,
|
||||
)
|
||||
from vllm.platforms import current_platform
|
||||
|
||||
# Parameter grids shared by all activation tests below.
DTYPES = [torch.half, torch.bfloat16, torch.float]
NUM_TOKENS = [7, 83, 2048]  # Arbitrary values for testing
D = [512, 13824]  # Arbitrary values for testing
SEEDS = [0]
# Exercise two GPUs when available so cross-device placement is covered.
CUDA_DEVICES = [f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2)]
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "activation",
    [
        "silu_and_mul",
        "mul_and_silu",
        "gelu",
        "gelu_tanh",
        "fatrelu",
        "swigluoai_and_mul",
    ],
)
@pytest.mark.parametrize("num_tokens", NUM_TOKENS)
@pytest.mark.parametrize("d", D)
@pytest.mark.parametrize("dtype", DTYPES)
@pytest.mark.parametrize("seed", SEEDS)
@pytest.mark.parametrize("device", CUDA_DEVICES)
@torch.inference_mode()
def test_act_and_mul(
    activation: str,
    num_tokens: int,
    d: int,
    dtype: torch.dtype,
    seed: int,
    device: str,
) -> None:
    """Compare each fused act-and-mul layer against its native reference.

    The layer's fused forward is checked against ``forward_native`` and the
    underlying ``torch.ops._C`` kernel is validated with ``opcheck``.
    """
    current_platform.seed_everything(seed)
    torch.set_default_device(device)
    # act-and-mul ops consume 2*d columns and produce d columns.
    x = torch.randn(num_tokens, 2 * d, dtype=dtype)
    if activation == "silu_and_mul":
        layer = SiluAndMul()
        fn = torch.ops._C.silu_and_mul
    # FIX: this branch was a plain `if`, breaking the uniform elif chain
    # (it only worked because the branches are mutually exclusive).
    elif activation == "mul_and_silu":
        layer = MulAndSilu()
        fn = torch.ops._C.mul_and_silu
    elif activation == "gelu":
        layer = GeluAndMul(approximate="none")
        fn = torch.ops._C.gelu_and_mul
    elif activation == "gelu_tanh":
        layer = GeluAndMul(approximate="tanh")
        fn = torch.ops._C.gelu_tanh_and_mul
    elif activation == "fatrelu":
        threshold = random.uniform(0, 1)
        layer = FatreluAndMul(threshold)
        fn = torch.ops._C.fatrelu_and_mul
    elif activation == "swigluoai_and_mul":
        layer = SwigluOAIAndMul()
        fn = torch.ops._C.swigluoai_and_mul
    out = layer(x)
    ref_out = layer.forward_native(x)
    if activation == "swigluoai_and_mul":
        # SwigluOAIAndMul is not bit-exact vs. the native path, so use
        # dtype-dependent relative tolerances.
        rtol = {
            # For fp16, change the relative tolerance from 1e-3 to 2e-3
            torch.float16: 2e-3,
            torch.bfloat16: 2e-2,
            torch.float: 1.3e-6,
        }

        def _get_rtol(output: torch.Tensor) -> float:
            return rtol[output.dtype]

        torch.testing.assert_close(
            out, ref_out, atol=get_default_atol(out), rtol=_get_rtol(out)
        )
    else:
        # The SiluAndMul, MulAndSilu, GELU and FatReLU implementations are
        # equivalent to the native PyTorch implementations, so we can do exact
        # comparison.
        torch.testing.assert_close(out, ref_out, atol=0.0, rtol=0.0)

    d = x.shape[-1] // 2
    output_shape = x.shape[:-1] + (d,)
    out = torch.empty(output_shape, dtype=x.dtype, device=x.device)
    if activation == "fatrelu":
        opcheck(fn, (out, x, threshold))
    elif activation == "swigluoai_and_mul":
        opcheck(fn, (out, x, layer.alpha, layer.limit))
    else:
        opcheck(fn, (out, x))
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "activation",
    [
        (FastGELU, torch.ops._C.gelu_fast),
        (NewGELU, torch.ops._C.gelu_new),
        (QuickGELU, torch.ops._C.gelu_quick),
    ],
)
@pytest.mark.parametrize("num_tokens", NUM_TOKENS)
@pytest.mark.parametrize("d", D)
@pytest.mark.parametrize("dtype", DTYPES)
@pytest.mark.parametrize("seed", SEEDS)
@pytest.mark.parametrize("device", CUDA_DEVICES)
@torch.inference_mode()
def test_activation(
    activation: type[torch.nn.Module],
    num_tokens: int,
    d: int,
    dtype: torch.dtype,
    seed: int,
    device: str,
) -> None:
    """Check element-wise GELU variants against their native reference
    and validate the corresponding custom kernel with ``opcheck``."""
    current_platform.seed_everything(seed)
    torch.set_default_device(device)
    x = torch.randn(num_tokens, d, dtype=dtype)

    # Each parametrized case is a (layer class, custom kernel) pair.
    activation_cls, kernel = activation
    act_layer = activation_cls()
    out = act_layer(x)
    ref_out = act_layer.forward_native(x)
    torch.testing.assert_close(
        out, ref_out, atol=get_default_atol(out), rtol=get_default_rtol(out)
    )

    result_buf = torch.empty_like(x)
    opcheck(kernel, (result_buf, x))
|
||||
203
tests/kernels/core/test_apply_rotary_emb.py
Normal file
203
tests/kernels/core/test_apply_rotary_emb.py
Normal file
@@ -0,0 +1,203 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
"""
|
||||
Tests for ApplyRotaryEmb CustomOp dispatch behavior.
|
||||
|
||||
This test ensures that RotaryEmbedding classes correctly call the appropriate
|
||||
ApplyRotaryEmb methods based on the calling context:
|
||||
|
||||
1. RotaryEmbedding.forward_native() -> ApplyRotaryEmb.forward_native()
|
||||
2. RotaryEmbedding.forward_cuda() -> ApplyRotaryEmb.forward() (auto-dispatch)
|
||||
3. RotaryEmbedding.forward_hip() -> ApplyRotaryEmb.forward() (auto-dispatch)
|
||||
"""
|
||||
|
||||
from dataclasses import dataclass
|
||||
|
||||
import pytest
|
||||
import torch
|
||||
|
||||
from vllm.config import (
|
||||
CompilationConfig,
|
||||
VllmConfig,
|
||||
get_cached_compilation_config,
|
||||
set_current_vllm_config,
|
||||
)
|
||||
from vllm.platforms import current_platform
|
||||
|
||||
CUDA_DEVICES = ["cuda:0"]
|
||||
|
||||
|
||||
@dataclass
class RotaryEmbeddingTestCase:
    """Test case configuration for RotaryEmbedding dispatch tests."""

    # Human-readable id used for pytest parametrization.
    name: str
    # RotaryEmbedding subclass under test.
    rope_class: type
    # Keyword arguments forwarded to ``rope_class``.
    rope_kwargs: dict
    method_name: str  # forward_native, forward_cuda, forward
    positions_shape: tuple  # (num_tokens,) or (3, num_tokens) or (4, num_tokens)
    expect_forward_native: bool  # Should call ApplyRotaryEmb.forward_native()
    expect_forward: bool  # Should call ApplyRotaryEmb.forward()
||||
|
||||
|
||||
def get_test_cases() -> list[RotaryEmbeddingTestCase]:
    """Generate test cases for all RotaryEmbedding classes."""
    # NOTE(review): these imports are function-local — presumably to defer
    # heavy vllm module imports until the test runs; confirm before hoisting.
    from vllm.model_executor.layers.rotary_embedding.ernie45_vl_rope import (
        Ernie4_5_VLRotaryEmbedding,
    )
    from vllm.model_executor.layers.rotary_embedding.mrope import MRotaryEmbedding
    from vllm.model_executor.layers.rotary_embedding.xdrope import XDRotaryEmbedding

    # Constructor arguments shared by every rope class under test.
    common_kwargs = {
        "head_size": 128,
        "rotary_dim": 128,
        "max_position_embeddings": 4096,
        "base": 10000,
        "is_neox_style": True,
        "dtype": torch.bfloat16,
    }

    return [
        # MRotaryEmbedding tests
        RotaryEmbeddingTestCase(
            name="MRotaryEmbedding.forward_native",
            rope_class=MRotaryEmbedding,
            rope_kwargs={**common_kwargs, "mrope_section": [16, 24, 24]},
            method_name="forward_native",
            positions_shape=(3, 32),  # 2D for multimodal
            expect_forward_native=True,
            expect_forward=False,
        ),
        RotaryEmbeddingTestCase(
            name="MRotaryEmbedding.forward_cuda_1d",
            rope_class=MRotaryEmbedding,
            rope_kwargs={**common_kwargs, "mrope_section": [16, 24, 24]},
            method_name="forward_cuda",
            positions_shape=(32,),  # 1D triggers apply_rotary_emb path
            expect_forward_native=False,
            expect_forward=True,
        ),
        # XDRotaryEmbedding tests
        RotaryEmbeddingTestCase(
            name="XDRotaryEmbedding.forward",
            rope_class=XDRotaryEmbedding,
            rope_kwargs={
                **common_kwargs,
                "scaling_alpha": 1.0,
                "xdrope_section": [16, 16, 16, 16],
            },
            method_name="forward",
            positions_shape=(4, 32),  # 4D for P/W/H/T
            expect_forward_native=False,
            expect_forward=True,
        ),
        # Ernie4_5_VLRotaryEmbedding tests
        RotaryEmbeddingTestCase(
            name="Ernie4_5_VLRotaryEmbedding.forward_native",
            rope_class=Ernie4_5_VLRotaryEmbedding,
            rope_kwargs={**common_kwargs, "mrope_section": [22, 22, 20]},
            method_name="forward_native",
            positions_shape=(3, 32),  # 2D for multimodal
            expect_forward_native=True,
            expect_forward=False,
        ),
    ]
|
||||
|
||||
|
||||
def run_dispatch_test(
    test_case: RotaryEmbeddingTestCase,
    device: str,
):
    """Run a dispatch test for a RotaryEmbedding class.

    Instruments the rope layer's ``apply_rotary_emb`` sub-op with tracking
    wrappers, invokes the method named by ``test_case.method_name``, and
    asserts the expected ApplyRotaryEmb entry points were (not) hit.
    """
    # Enable the apply_rotary_emb custom op explicitly; clear the cached
    # compilation config so the new setting actually takes effect.
    vllm_config = VllmConfig(
        compilation_config=CompilationConfig(custom_ops=["all", "+apply_rotary_emb"])
    )
    get_cached_compilation_config.cache_clear()

    with set_current_vllm_config(vllm_config):
        rope = test_case.rope_class(**test_case.rope_kwargs).to(device=device)

        apply_rotary_emb = rope.apply_rotary_emb

        # Verify custom op is enabled
        if test_case.expect_forward_native:
            assert (
                apply_rotary_emb._forward_method != apply_rotary_emb.forward_native
            ), "Test setup error: ApplyRotaryEmb custom op should be enabled"

        # Setup call tracking: wrap both entry points so we can observe
        # which one the rope method dispatched to.
        call_tracker = {"forward_native_called": False, "forward_called": False}
        original_forward_native = apply_rotary_emb.forward_native
        original_forward = apply_rotary_emb.forward

        def tracked_forward_native(*args, **kwargs):
            call_tracker["forward_native_called"] = True
            return original_forward_native(*args, **kwargs)

        def tracked_forward(*args, **kwargs):
            call_tracker["forward_called"] = True
            return original_forward(*args, **kwargs)

        apply_rotary_emb.forward_native = tracked_forward_native
        apply_rotary_emb.forward = tracked_forward

        try:
            num_tokens = test_case.positions_shape[-1]
            num_q_heads = 8
            num_kv_heads = 2
            head_size = test_case.rope_kwargs["head_size"]
            max_position = test_case.rope_kwargs["max_position_embeddings"]

            positions = torch.randint(
                0, max_position // 4, test_case.positions_shape, device=device
            )
            query = torch.randn(
                num_tokens, num_q_heads * head_size, dtype=torch.bfloat16, device=device
            )
            key = torch.randn(
                num_tokens,
                num_kv_heads * head_size,
                dtype=torch.bfloat16,
                device=device,
            )

            # Call the method under test
            # (clone q/k since rope implementations may mutate them in place)
            method = getattr(rope, test_case.method_name)
            method(positions, query.clone(), key.clone())

            # Verify expectations
            if test_case.expect_forward_native:
                assert call_tracker["forward_native_called"], (
                    f"{test_case.name} should call ApplyRotaryEmb.forward_native()"
                )
                if not test_case.expect_forward:
                    assert not call_tracker["forward_called"], (
                        f"{test_case.name} should NOT call ApplyRotaryEmb.forward(). "
                        "Bug: when +apply_rotary_emb is enabled, forward_native() "
                        "incorrectly dispatches to CUDA/HIP kernels."
                    )
            if test_case.expect_forward:
                assert call_tracker["forward_called"], (
                    f"{test_case.name} should call ApplyRotaryEmb.forward()"
                )
        finally:
            # Always restore the original methods so later tests see the
            # un-instrumented op.
            apply_rotary_emb.forward_native = original_forward_native
            apply_rotary_emb.forward = original_forward
|
||||
|
||||
|
||||
@pytest.mark.skipif(
    not current_platform.is_cuda_alike(), reason="Skipping CUDA/ROCm only tests."
)
@pytest.mark.parametrize("test_case", get_test_cases(), ids=lambda tc: tc.name)
@pytest.mark.parametrize("device", CUDA_DEVICES)
def test_rotary_embedding_dispatch(
    test_case: RotaryEmbeddingTestCase,
    device: str,
):
    """Verify each RotaryEmbedding entry point reaches the right sub-op.

    ``forward_native`` must land in ``ApplyRotaryEmb.forward_native()``,
    while ``forward_cuda``/``forward`` must land in the auto-dispatching
    ``ApplyRotaryEmb.forward()``.
    """
    run_dispatch_test(test_case, device)
|
||||
141
tests/kernels/core/test_fused_qk_norm_rope.py
Normal file
141
tests/kernels/core/test_fused_qk_norm_rope.py
Normal file
@@ -0,0 +1,141 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import pytest
|
||||
import torch
|
||||
|
||||
from tests.kernels.utils import opcheck
|
||||
from vllm.model_executor.layers.layernorm import RMSNorm
|
||||
from vllm.model_executor.layers.rotary_embedding import RotaryEmbedding
|
||||
from vllm.platforms import current_platform
|
||||
|
||||
# Parameter grids for the fused qk-norm + rope test.
DTYPES = [torch.bfloat16, torch.float16]
IS_NEOX = [True, False]  # both rotary layouts (NeoX-style and GPT-J-style)
EPS_VALUES = [1e-5, 1e-6]  # RMSNorm epsilons
SEEDS = [13]
CUDA_DEVICES = ["cuda:0"]
||||
|
||||
|
||||
def _apply_qk_norm_rope(
    qkv: torch.Tensor,
    positions: torch.Tensor,
    q_norm: RMSNorm,
    k_norm: RMSNorm,
    rope: RotaryEmbedding,
    num_heads_q: int,
    num_heads_kv: int,
    head_dim: int,
) -> torch.Tensor:
    """Unfused reference: per-head RMSNorm on Q and K, then RoPE.

    Splits the packed QKV tensor, normalizes each Q/K head independently,
    applies rotary embeddings, and re-packs [q | k | v] on the last dim.
    """
    q_width = num_heads_q * head_dim
    kv_width = num_heads_kv * head_dim
    q, k, v = qkv.split([q_width, kv_width, kv_width], dim=-1)

    def _norm_per_head(t: torch.Tensor, norm: RMSNorm) -> torch.Tensor:
        # View as (..., num_heads, head_dim), normalize each head, restore.
        per_head = t.view(*t.shape[:-1], t.shape[-1] // head_dim, head_dim)
        return norm.forward_native(per_head).view(t.shape)

    q = _norm_per_head(q, q_norm)
    k = _norm_per_head(k, k_norm)

    q, k = rope.forward_native(positions, q, k)
    return torch.cat([q, k, v], dim=-1)
|
||||
|
||||
|
||||
@pytest.mark.skipif(
    not current_platform.is_cuda_alike(),
    reason="fused_qk_norm_rope custom op requires cuda and rocm platform",
)
@pytest.mark.parametrize("device", CUDA_DEVICES)
@pytest.mark.parametrize("dtype", DTYPES)
@pytest.mark.parametrize("is_neox", IS_NEOX)
@pytest.mark.parametrize("eps", EPS_VALUES)
@pytest.mark.parametrize("seed", SEEDS)
@torch.inference_mode()
def test_fused_qk_norm_rope_matches_reference(
    device: str,
    dtype: torch.dtype,
    is_neox: bool,
    eps: float,
    seed: int,
):
    """Check the fused_qk_norm_rope kernel against the unfused reference.

    The fused op normalizes Q/K heads with RMSNorm and applies RoPE in a
    single in-place pass over the packed QKV tensor; the reference applies
    the same steps with native (unfused) implementations.
    """
    torch.set_default_device(device)
    current_platform.seed_everything(seed)
    num_heads, num_kv_heads, head_dim = 16, 4, 128
    num_tokens = 4

    # Packed QKV layout: [q | k | v] along the last dimension.
    total_dim = (num_heads + 2 * num_kv_heads) * head_dim
    qkv_base = torch.randn(num_tokens, total_dim, dtype=dtype, device=device)
    # The fused kernel mutates its input, so keep a separate clone for it.
    qkv_fused = qkv_base.clone()
    positions = torch.arange(num_tokens, dtype=torch.long, device=device)

    q_norm = RMSNorm(head_dim, eps=eps).to(device=device, dtype=dtype)
    k_norm = RMSNorm(head_dim, eps=eps).to(device=device, dtype=dtype)
    # Non-trivial weights so the norm actually affects the result.
    q_norm.weight.data.normal_(mean=1.0, std=0.1)
    k_norm.weight.data.normal_(mean=1.0, std=0.1)
    q_weight = q_norm.weight.data
    k_weight = k_norm.weight.data

    rope = RotaryEmbedding(
        head_size=head_dim,
        rotary_dim=head_dim,
        max_position_embeddings=4096,
        base=10000.0,
        is_neox_style=is_neox,
        dtype=dtype,
    ).to(device)

    ref_result = _apply_qk_norm_rope(
        qkv=qkv_base,
        positions=positions,
        q_norm=q_norm,
        k_norm=k_norm,
        rope=rope,
        num_heads_q=num_heads,
        num_heads_kv=num_kv_heads,
        head_dim=head_dim,
    )

    # opcheck gets its own clone because the op writes into the qkv argument.
    opcheck(
        torch.ops._C.fused_qk_norm_rope,
        (
            qkv_fused.clone(),
            num_heads,
            num_kv_heads,
            num_kv_heads,
            head_dim,
            eps,
            q_weight,
            k_weight,
            rope.cos_sin_cache,
            is_neox,
            positions.view(-1),
        ),
    )

    # Fused kernel: applies norm + rope to qkv_fused in place.
    torch.ops._C.fused_qk_norm_rope(
        qkv_fused,
        num_heads,
        num_kv_heads,
        num_kv_heads,
        head_dim,
        eps,
        q_weight,
        k_weight,
        rope.cos_sin_cache,
        is_neox,
        positions.view(-1),
    )

    # bf16 needs looser tolerances than fp16.
    if dtype == torch.float16:
        ATOL, RTOL = (2e-3, 2e-3)
    else:
        ATOL, RTOL = (1e-2, 1e-2)

    torch.testing.assert_close(
        qkv_fused,
        ref_result,
        atol=ATOL,
        rtol=RTOL,
    )
|
||||
237
tests/kernels/core/test_fused_quant_layernorm.py
Normal file
237
tests/kernels/core/test_fused_quant_layernorm.py
Normal file
@@ -0,0 +1,237 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
|
||||
import pytest
|
||||
import torch
|
||||
|
||||
import vllm._custom_ops as ops
|
||||
from tests.kernels.utils import opcheck
|
||||
from vllm.model_executor.layers.layernorm import RMSNorm
|
||||
from vllm.model_executor.layers.quantization.utils.fp8_utils import (
|
||||
per_token_group_quant_fp8,
|
||||
)
|
||||
from vllm.model_executor.layers.quantization.utils.int8_utils import (
|
||||
per_token_group_quant_int8,
|
||||
)
|
||||
|
||||
# Parameter grids for the fused RMSNorm + quantization tests.
DTYPES = [torch.bfloat16, torch.float]
QUANT_DTYPES = [torch.int8, torch.float8_e4m3fn]
# Includes non-power-of-two sizes to exercise unvectorized tail handling.
VEC_HIDDEN_SIZES = [1024, 1025, 1027, 1029]
# Avoid combinatorial explosion with full Cartesian product
NUM_TOKENS_HIDDEN_SIZES = [
    *[(1, i) for i in [1, 64, *VEC_HIDDEN_SIZES, 5120, 5137]],
    *[(2048, i) for i in [1, 64, *VEC_HIDDEN_SIZES, 5137]],
    *[(4096, i) for i in [1, 64, 5137]],
]

ADD_RESIDUAL = [False, True]
SCALE_UBS = [True, False]
# None => per-token quantization; [1, N] => blockwise with group width N.
GROUP_SIZES = [None, [1, 64], [1, 128]]
SEEDS = [0]
CUDA_DEVICES = [f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2)]

# RMSNorm epsilon used for every layer in this module.
EPS = 1e-6
||||
|
||||
## Helpers
|
||||
|
||||
|
||||
def as_float32_tensor(x: float | torch.Tensor) -> torch.Tensor:
    """Convert *x* to a float32 tensor on the CUDA device (no copy when
    *x* is already a matching tensor)."""
    return torch.as_tensor(x, dtype=torch.float32, device="cuda")
|
||||
|
||||
|
||||
def ref_rms_norm(
    rms_norm_layer: RMSNorm, x: torch.Tensor, residual: torch.Tensor | None
) -> tuple[torch.Tensor, torch.Tensor | None]:
    """Run the native RMSNorm path.

    The residual is cloned before the call so the caller's tensor is not
    mutated; returns (normed output, updated residual or None).
    """
    if residual is None:
        return rms_norm_layer.forward_native(x), None

    residual = residual.clone()
    out, residual = rms_norm_layer.forward_native(x, residual)
    return out, residual
||||
|
||||
|
||||
def ref_dynamic_per_token_or_block_quant(
    rms_norm_layer: RMSNorm,
    x: torch.Tensor,
    quant_dtype: torch.dtype,
    residual: torch.Tensor | None,
    scale_ub: torch.Tensor | None,
    group_size: list[int] | None,
) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor | None]:
    """Reference path: native RMSNorm followed by dynamic quantization.

    ``group_size`` selects blockwise (per-group) quantization; otherwise
    per-token quantization is used.  Returns (quantized output, scales,
    possibly-updated residual).
    """
    if scale_ub is not None:
        # scale_ub is only supported by the fp8 quantization path.
        assert quant_dtype == torch.float8_e4m3fn

    # Norm
    torch_out, residual = ref_rms_norm(rms_norm_layer, x, residual)

    # Quant
    if group_size is not None:
        if quant_dtype == torch.float8_e4m3fn:
            torch_out, scales = per_token_group_quant_fp8(
                torch_out, group_size=group_size[1], use_ue8m0=False
            )
        else:
            assert quant_dtype == torch.int8
            torch_out, scales = per_token_group_quant_int8(
                torch_out, group_size=group_size[1]
            )
    else:
        if quant_dtype == torch.float8_e4m3fn:
            torch_out, scales = ops.scaled_fp8_quant(
                torch_out, scale_ub=scale_ub, use_per_token_if_dynamic=True
            )
        else:
            assert quant_dtype == torch.int8
            torch_out, scales, _ = ops.scaled_int8_quant(torch_out)

    return torch_out, scales, residual
|
||||
|
||||
|
||||
def ref_impl(
    rms_norm_layer: RMSNorm,
    x: torch.Tensor,
    quant_dtype: torch.dtype,
    residual: torch.Tensor | None,
    scale_ub: torch.Tensor | None,
    group_size: list[int] | None,
) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor | None]:
    """Thin alias for the reference norm+quant path, kept for symmetry
    with ``ops_impl``."""
    return ref_dynamic_per_token_or_block_quant(
        rms_norm_layer,
        x,
        quant_dtype,
        residual,
        scale_ub,
        group_size,
    )
|
||||
|
||||
|
||||
def ops_dynamic_per_token_or_block_quant(
    weight: torch.Tensor,
    x: torch.Tensor,
    quant_dtype: torch.dtype,
    residual: torch.Tensor | None,
    scale_ub: torch.Tensor | None,
    group_size: list[int] | None,
) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor | None]:
    """Fused-kernel path mirroring ``ref_dynamic_per_token_or_block_quant``.

    Returns (quantized output, scales, possibly-updated residual).
    """
    if residual is not None:
        # The fused kernels update residual in place; clone so the caller's
        # tensor is preserved.
        residual = residual.clone()
    if group_size is not None:
        out, scales = ops.rms_norm_per_block_quant(
            x, weight, EPS, quant_dtype, group_size, scale_ub, residual, True
        )
        # NOTE(review): scales are made contiguous here — presumably the
        # blockwise kernel returns a strided view; confirm against the op.
        scales = scales.contiguous()
    else:
        out, scales = ops.rms_norm_dynamic_per_token_quant(
            x, weight, EPS, quant_dtype, scale_ub, residual
        )
    return out, scales, residual
|
||||
|
||||
|
||||
def ops_impl(
    weight: torch.Tensor,
    x: torch.Tensor,
    quant_dtype: torch.dtype,
    residual: torch.Tensor | None,
    scale_ub: torch.Tensor | None,
    group_size: list[int] | None,
) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor | None]:
    """Thin alias for the fused-kernel norm+quant path, kept for symmetry
    with ``ref_impl``."""
    return ops_dynamic_per_token_or_block_quant(
        weight,
        x,
        quant_dtype,
        residual,
        scale_ub,
        group_size,
    )
|
||||
|
||||
|
||||
@pytest.mark.parametrize("num_tokens, hidden_size", NUM_TOKENS_HIDDEN_SIZES)
@pytest.mark.parametrize("add_residual", ADD_RESIDUAL)
@pytest.mark.parametrize("has_scale_ub", SCALE_UBS)
@pytest.mark.parametrize("dtype", DTYPES)
@pytest.mark.parametrize("quant_dtype", QUANT_DTYPES)
@pytest.mark.parametrize("group_size", GROUP_SIZES)
@pytest.mark.parametrize("seed", SEEDS)
@pytest.mark.parametrize("device", CUDA_DEVICES)
@torch.inference_mode()
def test_rms_norm(
    num_tokens: int,
    hidden_size: int,
    add_residual: bool,
    has_scale_ub: bool,
    dtype: torch.dtype,
    quant_dtype: torch.dtype,
    group_size: list[int] | None,
    seed: int,
    device: str,
) -> None:
    """Compare fused RMSNorm+quant kernels against the unfused reference.

    Covers per-token and blockwise (``group_size``) quantization for int8
    and fp8, with and without residual add and fp8 scale upper bound.
    """
    torch.random.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed)
    torch.set_default_device(device)

    # FIX: use pytest.skip instead of a silent `return` so unsupported
    # parameter combinations are reported as skipped rather than passed.
    if group_size is not None and hidden_size % group_size[1] != 0:
        pytest.skip("hidden_size must be divisible by the quant group size")

    if group_size is not None and has_scale_ub:
        pytest.skip("blockwise baseline doesn't support scale_ub")

    if has_scale_ub and quant_dtype != torch.float8_e4m3fn:
        pytest.skip("scale_ub only applies to fp8 quantization")

    layer = RMSNorm(hidden_size, EPS).to(dtype=dtype)

    # Make weights
    layer.weight.data.normal_(mean=1.0, std=0.1)

    # Make inputs; scale down so quantization ranges stay reasonable.
    scale = 1 / (hidden_size)
    x = torch.randn(num_tokens, hidden_size, dtype=dtype) * scale
    residual = torch.randn_like(x) * scale if add_residual else None
    if has_scale_ub:
        rms_x, _ = ref_rms_norm(layer, x, residual)
        scale_ub = torch.mean(rms_x).to(dtype=torch.float32, device="cuda")
    else:
        scale_ub = None

    ref_out, ref_scales, ref_residual = ref_impl(
        layer, x, quant_dtype, residual, scale_ub, group_size
    )
    ops_out, ops_scales, ops_residual = ops_impl(
        layer.weight, x, quant_dtype, residual, scale_ub, group_size
    )

    assert ref_out.dtype == quant_dtype
    assert ops_out.dtype == quant_dtype
    if quant_dtype == torch.int8:
        assert torch.allclose(ref_scales, ops_scales, atol=1e-6)
        # big atol to account for round-off errors.
        assert torch.allclose(ref_out, ops_out, atol=1)
    else:
        assert torch.allclose(ref_scales, ops_scales)
        a = ref_out.to(dtype=torch.float32)
        b = ops_out.to(dtype=torch.float32)
        ok = torch.allclose(a, b, atol=1e-6)
        if not ok:
            # fallback: compare dequantized values with relaxed tolerance
            if group_size is None:
                a_deq = a * ref_scales.view(-1, 1)
                b_deq = b * ops_scales.view(-1, 1)
            else:
                a_deq = a * ref_scales.repeat_interleave(group_size[1], dim=1)
                b_deq = b * ops_scales.repeat_interleave(group_size[1], dim=1)
            # NOTE: It is possible that some future test cases trigger this
            # max diff due to precision issues. If such an error is
            # encountered, it's recommended to inspect the differences between
            # all corresponding elements from each tensor (e.g. by looping over
            # them) and checking how many the max diff error shows up on (just
            # a few bad elements should still be considered acceptable).
            ok = torch.allclose(a_deq, b_deq, rtol=5e-2, atol=5e-2)
        assert ok
    if add_residual:
        assert torch.allclose(ref_residual, ops_residual)

    output = torch.empty_like(x, dtype=quant_dtype)
    scales = torch.empty(
        (x.numel() // x.shape[-1], 1), device=x.device, dtype=torch.float32
    )

    # FIX: use the module-wide EPS (1e-6) instead of the previously
    # hard-coded 1e-5, keeping the opcheck call consistent with the layer
    # and the fused kernels exercised above.
    opcheck(
        torch.ops._C.rms_norm_dynamic_per_token_quant,
        (output, x, layer.weight, scales, EPS, scale_ub, residual),
    )
|
||||
153
tests/kernels/core/test_layernorm.py
Normal file
153
tests/kernels/core/test_layernorm.py
Normal file
@@ -0,0 +1,153 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import pytest
|
||||
import torch
|
||||
|
||||
from tests.kernels.quant_utils import FP8_DTYPE
|
||||
from tests.kernels.utils import opcheck
|
||||
from vllm.model_executor.layers.layernorm import RMSNorm
|
||||
from vllm.platforms import current_platform
|
||||
|
||||
# Parameter grids for the layernorm kernel tests.
DTYPES = [torch.half, torch.bfloat16, torch.float]
NUM_TOKENS = [7, 83, 4096]  # Arbitrary values for testing
HIDDEN_SIZES = [8, 768, 769, 5120, 5125, 8192]  # Arbitrary values for testing
ADD_RESIDUAL = [False, True]
SEEDS = [0]
# Exercise two GPUs when available so cross-device placement is covered.
CUDA_DEVICES = [f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2)]
||||
|
||||
|
||||
@pytest.mark.parametrize("num_tokens", NUM_TOKENS)
@pytest.mark.parametrize("hidden_size", HIDDEN_SIZES)
@pytest.mark.parametrize("add_residual", ADD_RESIDUAL)
@pytest.mark.parametrize("dtype", DTYPES)
@pytest.mark.parametrize("seed", SEEDS)
@pytest.mark.parametrize("device", CUDA_DEVICES)
@pytest.mark.parametrize("strided_input", [False, True])
@torch.inference_mode()
def test_rms_norm(
    num_tokens: int,
    hidden_size: int,
    add_residual: bool,
    dtype: torch.dtype,
    seed: int,
    device: str,
    strided_input: bool,
) -> None:
    """Compare the RMSNorm custom kernel against ``forward_native``, with
    and without a residual add, on contiguous and strided inputs."""
    current_platform.seed_everything(seed)
    torch.set_default_device(device)
    layer = RMSNorm(hidden_size).to(dtype=dtype)
    layer.weight.data.normal_(mean=1.0, std=0.1)
    scale = 1 / (2 * hidden_size)
    # Allocate a wider tensor and slice it to produce a non-contiguous
    # view when strided_input is True.
    last_dim = 2 * hidden_size if strided_input else hidden_size
    x = torch.randn(num_tokens, last_dim, dtype=dtype)
    x = x[..., :hidden_size]
    assert x.is_contiguous() != strided_input
    x *= scale
    residual = torch.randn_like(x) * scale if add_residual else None

    # NOTE(woosuk): The reference implementation should be executed first
    # because the custom kernel is in-place.
    ref_out = layer.forward_native(x, residual)
    out = layer(x, residual)
    # NOTE(woosuk): LayerNorm operators (including RMS) typically have larger
    # numerical errors than other operators because they involve reductions.
    # Therefore, we use a larger tolerance.
    if add_residual:
        # With a residual the layer returns (output, updated residual).
        torch.testing.assert_close(out[0], ref_out[0], atol=1e-2, rtol=1e-2)
        torch.testing.assert_close(out[1], ref_out[1], atol=1e-2, rtol=1e-2)
    else:
        torch.testing.assert_close(out, ref_out, atol=1e-2, rtol=1e-2)

    if residual is not None:
        opcheck(
            torch.ops._C.fused_add_rms_norm,
            (x, residual, layer.weight.data, layer.variance_epsilon),
        )
    else:
        opcheck(
            torch.ops._C.rms_norm, (out, x, layer.weight.data, layer.variance_epsilon)
        )
|
||||
|
||||
|
||||
@pytest.mark.parametrize("num_tokens", NUM_TOKENS)
@pytest.mark.parametrize("hidden_size", HIDDEN_SIZES)
@pytest.mark.parametrize("add_residual", ADD_RESIDUAL)
@pytest.mark.parametrize("dtype", DTYPES)
@pytest.mark.parametrize("quant_scale", [0.01, 1.0, 10.0])
@pytest.mark.parametrize("seed", SEEDS)
@pytest.mark.parametrize("device", CUDA_DEVICES)
@pytest.mark.parametrize("strided_input", [False, True])
def test_fused_rms_norm_quant(
    num_tokens: int,
    hidden_size: int,
    add_residual: bool,
    dtype: torch.dtype,
    quant_scale: float,
    seed: int,
    device: str,
    strided_input: bool,
) -> None:
    """Check fused RMSNorm + static-fp8-quant kernels against the unfused
    two-step equivalent (norm kernel, then static_scaled_fp8_quant)."""
    current_platform.seed_everything(seed)
    torch.set_default_device(device)

    weight = torch.empty(hidden_size, dtype=dtype).normal_(mean=1.0, std=0.1)
    scale = 1 / (2 * hidden_size)
    # Allocate a wider tensor and slice it to produce a non-contiguous
    # view when strided_input is True.
    last_dim = 2 * hidden_size if strided_input else hidden_size
    x_base = torch.randn(num_tokens, last_dim, dtype=dtype)
    x = x_base[..., :hidden_size]
    assert x.is_contiguous() != strided_input

    x *= scale
    if add_residual:
        residual = torch.randn_like(x) * scale
        # The fused kernel updates the residual in place; keep a separate
        # copy for it so the unfused path sees the original values.
        residual_fused = residual.clone()
    else:
        residual = residual_fused = None

    out_norm = torch.empty_like(x)
    out_quant = torch.empty_like(x, dtype=FP8_DTYPE)
    out_quant_fused = torch.empty_like(out_quant)

    quant_scale_t = torch.tensor(quant_scale, dtype=torch.float32)

    if add_residual:
        torch.ops._C.fused_add_rms_norm_static_fp8_quant(
            out_quant_fused, x, residual_fused, weight, quant_scale_t, 1e-6
        )

        # Unfused kernel is in-place so it goes second
        # Also use a separate clone of x to avoid modifying the input
        x_unfused_base = x_base.clone()
        x_unfused = x_unfused_base[..., :hidden_size]
        assert x_unfused.is_contiguous() != strided_input
        torch.ops._C.fused_add_rms_norm(x_unfused, residual, weight, 1e-6)
        torch.ops._C.static_scaled_fp8_quant(
            out_quant, x_unfused.contiguous(), quant_scale_t
        )

        torch.cuda.synchronize()
        # Both paths must have produced the same updated residual.
        torch.testing.assert_close(residual_fused, residual, atol=1e-2, rtol=1e-2)
        opcheck(
            torch.ops._C.fused_add_rms_norm_static_fp8_quant,
            (out_quant_fused, x, residual_fused, weight, quant_scale_t, 1e-6),
        )
    else:
        torch.ops._C.rms_norm_static_fp8_quant(
            out_quant_fused, x, weight, quant_scale_t, 1e-6
        )

        # Unfused two-step baseline: norm, then static fp8 quant.
        torch.ops._C.rms_norm(out_norm, x, weight, 1e-6)
        torch.ops._C.static_scaled_fp8_quant(out_quant, out_norm, quant_scale_t)

        opcheck(
            torch.ops._C.rms_norm_static_fp8_quant,
            (out_quant_fused, x, weight, quant_scale_t, 1e-6),
        )

    # Compare in float32 since fp8 tensors don't support assert_close math.
    torch.testing.assert_close(
        out_quant.to(dtype=torch.float32),
        out_quant_fused.to(dtype=torch.float32),
        atol=1e-3,
        rtol=1e-3,
    )
|
||||
253
tests/kernels/core/test_mrope.py
Normal file
253
tests/kernels/core/test_mrope.py
Normal file
@@ -0,0 +1,253 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
from typing import NamedTuple
|
||||
|
||||
import pytest
|
||||
import torch
|
||||
from packaging.version import Version
|
||||
from transformers import __version__ as TRANSFORMERS_VERSION
|
||||
|
||||
from vllm.model_executor.layers.rotary_embedding import get_rope
|
||||
from vllm.platforms import current_platform
|
||||
from vllm.transformers_utils.config import get_config
|
||||
|
||||
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
|
||||
|
||||
|
||||
def generate_test_data(
    num_tokens: int,
    num_q_heads: int,
    num_kv_heads: int,
    head_size: int,
    max_position_embeddings: int,
    dtype: torch.dtype,
    device: torch.device,
):
    """Build (positions, query, key) inputs for one MRoPE test case.

    Positions form a (3, num_tokens) grid (one row per MRoPE section);
    query/key are flat (num_tokens, heads * head_size) activations.
    """
    # Fixed seed so every parametrized case sees identical data.
    current_platform.seed_everything(42)

    # Create 2D positions (3, num_tokens) for multimodal case
    position_upper_bound = max_position_embeddings // 4
    positions = torch.randint(
        0, position_upper_bound, (3, num_tokens), device=device
    )

    # Create query and key tensors
    q_shape = (num_tokens, num_q_heads * head_size)
    k_shape = (num_tokens, num_kv_heads * head_size)
    query = torch.randn(q_shape, dtype=dtype, device=device)
    key = torch.randn(k_shape, dtype=dtype, device=device)

    return positions, query, key
|
||||
|
||||
|
||||
class MRoPETestInfo(NamedTuple):
    """One model configuration to exercise in the MRoPE tests."""

    # HF model id whose text config drives head counts / rope parameters.
    model_name: str
    # Tolerances passed to torch.testing.assert_close; defaults follow
    # https://github.com/pytorch/pytorch/blob/main/torch/testing/_comparison.py#L1317
    atol: float = 1e-2
    rtol: float = 1.6e-2
    # Extra pytest marks (e.g. version skips) applied to this model's params.
    # NOTE(review): the default [] is one list object shared by every instance
    # that omits marks — harmless as long as it is never mutated.
    marks: list[pytest.MarkDecorator] = []
|
||||
|
||||
|
||||
# Strip any pre/post-release suffix so version comparisons are stable.
TRANSFORMERS_BASE_VERSION = Version(TRANSFORMERS_VERSION).base_version

# Qwen3-VL configs only exist in Transformers >= 4.57; previously this skip
# mark was duplicated verbatim for each Qwen3-VL entry — define it once.
_QWEN3_VL_SKIP = pytest.mark.skipif(
    Version(TRANSFORMERS_BASE_VERSION) < Version("4.57.0"),
    reason="Qwen3-VL only available after Transformers v4.57",
)

# Models whose MRoPE implementation is compared native-vs-CUDA below.
MODELS_TO_TEST = [
    MRoPETestInfo(model_name="zai-org/GLM-4.1V-9B-Thinking"),
    MRoPETestInfo(model_name="Qwen/Qwen2-VL-7B-Instruct"),
    MRoPETestInfo(model_name="Qwen/Qwen2-VL-72B-Instruct"),
    MRoPETestInfo(model_name="Qwen/Qwen2.5-VL-72B-Instruct"),
    MRoPETestInfo(
        model_name="Qwen/Qwen3-VL-4B-Instruct",
        marks=[_QWEN3_VL_SKIP],
    ),
    MRoPETestInfo(
        model_name="Qwen/Qwen3-VL-30B-A3B-Instruct",
        marks=[_QWEN3_VL_SKIP],
    ),
]

# Small and large token counts to cover both kernel launch regimes.
num_tokens_list = [11, 8192]
|
||||
|
||||
|
||||
@pytest.mark.skipif(
    not current_platform.is_cuda_alike(), reason="Skipping CUDA/ROCm only tests."
)
@pytest.mark.parametrize(
    "model_info, model_name",
    [
        pytest.param(test_config, test_config.model_name, marks=test_config.marks)
        for test_config in MODELS_TO_TEST
    ],
)
@pytest.mark.parametrize("tp_size", [1, 2])
@pytest.mark.parametrize("dtype", [torch.bfloat16])
@pytest.mark.parametrize("num_tokens", num_tokens_list)
def test_mrope(
    model_name: str,
    model_info: MRoPETestInfo,
    tp_size: int,
    dtype: torch.dtype,
    num_tokens: int,
) -> None:
    """Check the CUDA MRoPE kernel against the native (PyTorch) reference.

    Builds head/dim sizes from the model's HF text config (scaled down by
    tp_size to mimic tensor parallelism) and compares forward_native vs
    forward_cuda outputs within the model's tolerances.
    """
    atol = model_info.atol
    rtol = model_info.rtol

    config = get_config(model_name, False).get_text_config()

    # get the model config
    total_num_kv_heads = config.num_key_value_heads
    total_num_heads = config.num_attention_heads
    num_heads = total_num_heads // tp_size
    num_kv_heads = max(1, total_num_kv_heads // tp_size)
    # NOTE(review): hasattr(config, "head_dim") can be True with head_dim=None
    # on some HF configs — presumably not for the models listed here; confirm.
    head_dim = (
        config.head_dim
        if hasattr(config, "head_dim")
        else config.hidden_size // total_num_heads
    )
    is_neox_style = True

    max_position = config.max_position_embeddings

    mrope_helper_class = get_rope(
        head_size=head_dim,
        max_position=max_position,
        is_neox_style=is_neox_style,
        rope_parameters=config.rope_parameters,
        dtype=dtype,
    ).to(device=device)

    # create q k v input tensors
    # create rotary pos emb input tensors
    positions, query, key = generate_test_data(
        num_tokens, num_heads, num_kv_heads, head_dim, max_position, dtype, device
    )

    # clone() both times: the forward paths may update query/key in place,
    # so each implementation must start from the same pristine inputs.
    query_native, key_native = mrope_helper_class.forward_native(
        positions,
        query.clone(),
        key.clone(),
    )

    query_cuda, key_cuda = mrope_helper_class.forward_cuda(
        positions,
        query.clone(),
        key.clone(),
    )

    torch.testing.assert_close(query_native, query_cuda, atol=atol, rtol=rtol)
    torch.testing.assert_close(key_native, key_cuda, atol=atol, rtol=rtol)
|
||||
|
||||
|
||||
@pytest.mark.skipif(
    not current_platform.is_cuda_alike(), reason="Skipping CUDA/ROCm only tests."
)
@pytest.mark.parametrize(
    "model_info, model_name",
    [
        pytest.param(test_config, test_config.model_name, marks=test_config.marks)
        for test_config in MODELS_TO_TEST
    ],
)
@pytest.mark.parametrize("tp_size", [1, 2])
@pytest.mark.parametrize("dtype", [torch.bfloat16])
@pytest.mark.parametrize("num_tokens", num_tokens_list)
def test_mrope_torch_compile_tracing(
    model_name: str,
    model_info: MRoPETestInfo,
    tp_size: int,
    dtype: torch.dtype,
    num_tokens: int,
) -> None:
    """Verify MRoPE forward_cuda can be traced by torch.compile (inductor).

    The in-place kernel is wrapped into a functional form, compiled with
    fullgraph=True, and its outputs are compared against both the eager
    forward_cuda and the native reference.
    """
    atol = model_info.atol
    rtol = model_info.rtol

    config = get_config(model_name, False).get_text_config()

    # get the model config
    total_num_kv_heads = config.num_key_value_heads
    total_num_heads = config.num_attention_heads
    num_heads = total_num_heads // tp_size
    num_kv_heads = max(1, total_num_kv_heads // tp_size)
    # NOTE(review): hasattr(config, "head_dim") can be True with head_dim=None
    # on some HF configs — presumably not for the models listed here; confirm.
    head_dim = (
        config.head_dim
        if hasattr(config, "head_dim")
        else config.hidden_size // total_num_heads
    )
    is_neox_style = True
    max_position = config.max_position_embeddings

    mrope_helper_class = get_rope(
        head_size=head_dim,
        max_position=max_position,
        is_neox_style=is_neox_style,
        rope_parameters=config.rope_parameters,
        dtype=dtype,
    ).to(device=device)

    # Generate test data
    positions, query, key = generate_test_data(
        num_tokens, num_heads, num_kv_heads, head_dim, max_position, dtype, device
    )

    # Create a wrapper that makes the in-place function appear functional
    def functional_forward_cuda(pos, q, k):
        """Wrapper that converts in-place operation to functional style

        CUDA Graph does not support in-place operations.
        This wrapper creates working copies of the
        input tensors and modifies them.
        """
        q_work = q.clone()  # Create working copies
        k_work = k.clone()
        # Your in-place function modifies q_work and k_work
        mrope_helper_class.forward_cuda(pos, q_work, k_work)
        return q_work, k_work  # Return the modified tensors

    # Get reference results
    query_native, key_native = mrope_helper_class.forward_native(
        positions,
        query.clone(),
        key.clone(),
    )

    # NOTE(review): catching Exception and converting to pytest.fail loses the
    # original traceback; pytest would already report the raw failure.
    try:
        compiled_forward_cuda = torch.compile(
            functional_forward_cuda,
            fullgraph=True,  # any graph break is an error
            backend="inductor",
            mode="reduce-overhead",
            dynamic=False,
        )

        # Run compiled version
        query_compiled_cuda, key_compiled_cuda = compiled_forward_cuda(
            positions,
            query,
            key,
        )

        # Run original version for comparison
        query_cuda = query.clone()
        key_cuda = key.clone()
        mrope_helper_class.forward_cuda(positions, query_cuda, key_cuda)

        # Verify results: compiled vs eager CUDA, then compiled vs native.
        torch.testing.assert_close(
            query_compiled_cuda, query_cuda, atol=atol, rtol=rtol
        )
        torch.testing.assert_close(key_compiled_cuda, key_cuda, atol=atol, rtol=rtol)
        torch.testing.assert_close(
            query_compiled_cuda, query_native, atol=atol, rtol=rtol
        )
        torch.testing.assert_close(key_compiled_cuda, key_native, atol=atol, rtol=rtol)

        print("✓ forward_cuda successfully traced with torch.compile inductor")

    except Exception as e:
        pytest.fail(f"forward_cuda failed to trace with torch.compile inductor: {e}")
|
||||
26
tests/kernels/core/test_opcheck.py
Normal file
26
tests/kernels/core/test_opcheck.py
Normal file
@@ -0,0 +1,26 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
"""
|
||||
Tests for miscellaneous utilities
|
||||
"""
|
||||
|
||||
import torch
|
||||
|
||||
from tests.kernels.utils import opcheck
|
||||
|
||||
|
||||
def test_convert_fp8_opcheck():
    """Schema/registration check for the convert_fp8 cache custom op."""
    src = torch.randn((256, 256), dtype=torch.float32, device="cuda")
    dst = torch.empty_like(src, dtype=torch.float8_e4m3fn)
    # opcheck validates the op's fake/meta registration against eager.
    opcheck(torch.ops._C_cache_ops.convert_fp8, (dst, src, 1.0, "fp8"))
|
||||
|
||||
|
||||
# TODO: Add this back, currently fails with
|
||||
# csrc/cuda_utils_kernels.cu:15 'invalid argument'
|
||||
# @pytest.mark.skipif(not current_platform.is_cuda(),
|
||||
# reason="Only supported for CUDA")
|
||||
# def test_cuda_utils_opcheck():
|
||||
# opcheck(torch.ops._C_cuda_utils.get_device_attribute, (0, 0))
|
||||
# opcheck(
|
||||
# torch.ops._C_cuda_utils.
|
||||
# get_max_shared_memory_per_block_device_attribute, (0, ))
|
||||
18
tests/kernels/core/test_permute_cols.py
Normal file
18
tests/kernels/core/test_permute_cols.py
Normal file
@@ -0,0 +1,18 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import pytest
|
||||
import torch
|
||||
|
||||
from tests.kernels.utils import opcheck
|
||||
from vllm._custom_ops import permute_cols
|
||||
|
||||
|
||||
@pytest.mark.parametrize("shape", [(1, 512), (544, 4096), (67, 8192)])
@pytest.mark.parametrize("dtype", [torch.bfloat16])
def test_permute_cols(shape, dtype):
    """permute_cols must reorder columns exactly like fancy indexing."""
    matrix = torch.randn(shape, dtype=dtype).cuda()
    col_order = torch.randperm(matrix.shape[1]).to(torch.int).cuda()
    # Validate the custom op's registration, then its numerics.
    opcheck(torch.ops._C.permute_cols, (matrix, col_order))
    permuted = permute_cols(matrix, col_order)
    torch.testing.assert_close(permuted, matrix[:, col_order])
|
||||
193
tests/kernels/core/test_pos_encoding.py
Normal file
193
tests/kernels/core/test_pos_encoding.py
Normal file
@@ -0,0 +1,193 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
from collections.abc import Callable
|
||||
from itertools import product
|
||||
|
||||
import pytest
|
||||
import torch
|
||||
|
||||
from tests.kernels.allclose_default import get_default_atol, get_default_rtol
|
||||
from vllm.model_executor.layers.rotary_embedding import get_rope
|
||||
from vllm.platforms import current_platform
|
||||
|
||||
# Parameter grids for the rotary-embedding tests below.
IS_NEOX_STYLE = [True, False]
DTYPES = [torch.bfloat16, torch.float]
HEAD_SIZES = [64, 80, 120, 256]
ROTARY_DIMS = [None, 32]  # None means rotary dim == head size
NUM_HEADS = [17]  # Arbitrary values for testing
BATCH_SIZES = [5]  # Arbitrary values for testing
SEQ_LENS = [11, 8192]  # Arbitrary values for testing
SEEDS = [0]
# Exercise two devices when more than one GPU is visible, else just cuda:0.
CUDA_DEVICES = [f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2)]
USE_KEY = [True, False]
|
||||
|
||||
|
||||
def _get_flat_tensor_shape(
|
||||
batch_size: int, seq_len: int, num_heads: int, head_size: int
|
||||
) -> tuple[int, ...]:
|
||||
return (batch_size, seq_len, num_heads * head_size)
|
||||
|
||||
|
||||
# For testing sliced tensors
|
||||
def _get_padded_tensor_shape(
|
||||
batch_size: int, seq_len: int, num_heads: int, head_size: int
|
||||
) -> tuple[int, ...]:
|
||||
return (batch_size, seq_len, num_heads, head_size + 64)
|
||||
|
||||
|
||||
def _get_batch_tensor_shape(
|
||||
batch_size: int, seq_len: int, num_heads: int, head_size: int
|
||||
) -> tuple[int, ...]:
|
||||
return (batch_size, seq_len, num_heads, head_size)
|
||||
|
||||
|
||||
# Query/key layouts to exercise: batched per-head, flattened heads, and a
# padded layout whose slice yields a non-contiguous input.
TENSORS_SHAPES_FN = [
    _get_batch_tensor_shape,
    _get_flat_tensor_shape,
    _get_padded_tensor_shape,
]
|
||||
|
||||
|
||||
@pytest.mark.parametrize("is_neox_style", IS_NEOX_STYLE)
@pytest.mark.parametrize("tensor_shape_fn", TENSORS_SHAPES_FN)
@pytest.mark.parametrize("batch_size", BATCH_SIZES)
@pytest.mark.parametrize("seq_len", SEQ_LENS)
@pytest.mark.parametrize("num_heads", NUM_HEADS)
@pytest.mark.parametrize("head_size", HEAD_SIZES)
@pytest.mark.parametrize("rotary_dim", ROTARY_DIMS)
@pytest.mark.parametrize("dtype", DTYPES)
@pytest.mark.parametrize("seed", SEEDS)
@pytest.mark.parametrize("device", CUDA_DEVICES)
@pytest.mark.parametrize("use_key", USE_KEY)
@torch.inference_mode()
def test_rotary_embedding(
    is_neox_style: bool,
    tensor_shape_fn: Callable[[int, int, int, int], tuple[int, ...]],
    batch_size: int,
    seq_len: int,
    num_heads: int,
    head_size: int,
    rotary_dim: int | None,
    dtype: torch.dtype,
    seed: int,
    device: str,
    use_key: bool,
    max_position: int = 8192,
    rope_theta: float = 10000,
) -> None:
    """Compare the custom rotary-embedding kernel against forward_native.

    Covers contiguous, flattened, and padded/sliced query layouts, partial
    rotary dims, and the key-less path.
    """
    # None means "rotate the full head dimension".
    # (A second identical check used to follow seed_everything below; it was
    # dead code and has been removed.)
    if rotary_dim is None:
        rotary_dim = head_size

    current_platform.seed_everything(seed)
    torch.set_default_device(device)
    rope_parameters = {
        "rope_type": "default",
        "rope_theta": rope_theta,
        "partial_rotary_factor": rotary_dim / head_size,
    }
    rope = get_rope(head_size, max_position, is_neox_style, rope_parameters)
    rope = rope.to(dtype=dtype, device=torch.get_default_device())

    positions = torch.randint(0, max_position, (batch_size, seq_len))
    query_shape = tensor_shape_fn(batch_size, seq_len, num_heads, head_size)
    query = torch.randn(query_shape, dtype=dtype)
    key = torch.randn_like(query) if use_key else None

    # slice tensor if required, noop otherwise
    query = query[..., :head_size]
    key = key[..., :head_size] if use_key else None

    # NOTE(woosuk): The reference implementation should be executed first
    # because the custom kernel is in-place.
    ref_query, ref_key = rope.forward_native(positions, query, key)
    out_query, out_key = rope.forward(positions, query, key)
    # Compare the results.
    torch.testing.assert_close(
        out_query,
        ref_query,
        atol=get_default_atol(out_query),
        rtol=get_default_rtol(out_query),
    )
    if use_key:
        torch.testing.assert_close(
            out_key,
            ref_key,
            atol=get_default_atol(out_key),
            rtol=get_default_rtol(out_key),
        )
    else:
        assert ref_key is None and out_key is None, "expected returned key to be None"
|
||||
|
||||
|
||||
@torch.inference_mode()
def test_rope_module_cache():
    """get_rope() must cache modules: the same setting returns the same
    object, and distinct settings never share one.

    The per-setting construction logic was previously duplicated verbatim in
    both loops; it is factored into a local helper with identical behavior.
    """
    MAX_POSITIONS = [123, 1234]
    ROPE_THETAS = [10000, 1000000]
    ROPE_PARAMETERS = (
        {"rope_type": "default"},
        {"rope_type": "linear", "factor": (1,)},
        {"rope_type": "dynamic", "factor": 1},
    )
    settings = (
        HEAD_SIZES,
        ROTARY_DIMS,
        MAX_POSITIONS,
        ROPE_THETAS,
        IS_NEOX_STYLE,
        ROPE_PARAMETERS,
        DTYPES,
    )

    def _rope_for(setting):
        # Build (rope, dtype) from one cartesian-product setting tuple.
        (
            head_size,
            rotary_dim,
            max_position,
            rope_theta,
            is_neox_style,
            rope_parameters,
            dtype,
        ) = setting
        if rotary_dim is None:
            rotary_dim = head_size
        # NOTE: mutates the shared rope_parameters dict in place, exactly as
        # the original code did for both loops.
        rope_parameters["rope_theta"] = rope_theta
        rope_parameters["partial_rotary_factor"] = rotary_dim / head_size
        rope = get_rope(
            head_size,
            max_position,
            is_neox_style,
            rope_parameters,
            dtype,
        )
        return rope, dtype

    rope_setting_id_map: dict[str, int] = {}
    for setting in product(*settings):
        rope, dtype = _rope_for(setting)
        # different settings cannot share the same rope module
        assert id(rope) not in rope_setting_id_map.values()
        assert all(x.dtype == dtype for x in rope.buffers())
        assert all(x.dtype == dtype for x in rope.parameters())
        rope_setting_id_map[str(setting)] = id(rope)

    for setting in product(*settings):
        rope, _ = _rope_for(setting)
        # check if cache take effect
        assert id(rope) == rope_setting_id_map[str(setting)]
|
||||
76
tests/kernels/core/test_rotary_embedding.py
Normal file
76
tests/kernels/core/test_rotary_embedding.py
Normal file
@@ -0,0 +1,76 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
"""
|
||||
Tests for miscellaneous utilities
|
||||
"""
|
||||
|
||||
import pytest
|
||||
import torch
|
||||
|
||||
from tests.kernels.utils import opcheck
|
||||
from vllm.model_executor.layers.rotary_embedding import RotaryEmbedding
|
||||
|
||||
|
||||
def rotary_embedding_opcheck(
    rot,
    positions: torch.Tensor,
    query: torch.Tensor,
    key: torch.Tensor | None = None,
):
    """Run opcheck against the in-place rotary_embedding custom op."""
    cache = rot.cos_sin_cache.to(query.device, dtype=query.dtype)

    # ops.rotary_embedding() is a in-place operation
    # that updates the query and key tensors.
    op_args = (positions, query, key, rot.head_size, cache, rot.is_neox_style)
    opcheck(torch.ops._C.rotary_embedding, op_args)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("device", ["cuda"])
@pytest.mark.parametrize("max_position", [11, 4096, 32768])
@pytest.mark.parametrize("is_neox_style", [True, False])
@pytest.mark.parametrize("rotary_dim", [32])
@pytest.mark.parametrize("head_size", [32, 108])
@pytest.mark.parametrize("seq_len", [11, 1024])
@pytest.mark.parametrize("use_key", [True, False])
@pytest.mark.parametrize("head_stride_is_contiguous", [True, False])
def test_rotary_embedding_opcheck(
    dist_init,
    device,
    max_position,
    is_neox_style,
    rotary_dim,
    head_size,
    seq_len,
    use_key,
    head_stride_is_contiguous,
):
    # Validate the rotary_embedding custom op over padded and unpadded
    # query/key layouts, with and without a key tensor.
    batch_size = 1
    base = 10000
    num_heads = 7
    rot = RotaryEmbedding(
        head_size, rotary_dim, max_position, base, is_neox_style, torch.float32
    )

    positions = torch.randint(0, max_position, (batch_size, seq_len), device=device)
    # NOTE(review): when head_stride_is_contiguous is True the stride is padded
    # by 64, so the slice below is NOT contiguous — the flag name reads
    # inverted relative to that effect; confirm intent upstream.
    head_stride = head_size + (64 if head_stride_is_contiguous else 0)

    query = torch.randn(
        batch_size, seq_len, num_heads, head_stride, dtype=torch.float32, device=device
    )
    key = torch.randn_like(query) if use_key else None
    # Slice off the padding so the op sees head_size elements per head.
    query = query[..., :head_size]
    key = key[..., :head_size] if use_key else None

    rotary_embedding_opcheck(rot, positions, query, key)

    # if we have a contiguous head stride, test the alternate
    # [..., num_heads * head_dim] shape/layout
    if head_stride_is_contiguous:
        rotary_embedding_opcheck(
            rot,
            positions,
            query.flatten(start_dim=-2),
            key.flatten(start_dim=-2) if use_key else None,
        )
|
||||
53
tests/kernels/core/test_uva.py
Normal file
53
tests/kernels/core/test_uva.py
Normal file
@@ -0,0 +1,53 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
import pytest
|
||||
import torch
|
||||
|
||||
from vllm.utils.platform_utils import is_uva_available
|
||||
from vllm.utils.torch_utils import get_cuda_view_from_cpu_tensor
|
||||
|
||||
CUDA_DEVICES = [f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2)]
|
||||
|
||||
|
||||
@pytest.mark.skipif(not is_uva_available(), reason="UVA is not available.")
@pytest.mark.parametrize("device", CUDA_DEVICES)
def test_cpu_write(device):
    """Host-side writes to pinned memory must be visible through the UVA
    CUDA view, and device-side arithmetic on that view must see them."""
    torch.set_default_device(device)
    host = torch.zeros(10, 10, device="cpu", pin_memory=True, dtype=torch.int32)
    view = get_cuda_view_from_cpu_tensor(host)
    assert view.device.type == "cuda"

    # The view starts out zeroed, same as the host tensor.
    for row, col in ((0, 0), (2, 3), (4, 5)):
        assert view[row, col] == 0

    # Write from the CPU side...
    host[0, 0] = 1
    host[2, 3] = 2
    host[4, 5] = -1

    # ...then mutate through the GPU view and check the combined result.
    view.mul_(2)
    assert view[0, 0] == 2
    assert view[2, 3] == 4
    assert view[4, 5] == -2
|
||||
|
||||
|
||||
@pytest.mark.skipif(not is_uva_available(), reason="UVA is not available.")
@pytest.mark.parametrize("device", CUDA_DEVICES)
def test_gpu_write(device):
    """Writes made through the UVA CUDA view must land in the pinned host
    tensor that backs it."""
    torch.set_default_device(device)
    host = torch.zeros(10, 10, device="cpu", pin_memory=True, dtype=torch.int32)
    view = get_cuda_view_from_cpu_tensor(host)
    assert view.device.type == "cuda"

    # The view starts out zeroed, same as the host tensor.
    for row, col in ((0, 0), (2, 3), (4, 5)):
        assert view[row, col] == 0

    # Write and scale entirely through the GPU view...
    view[0, 0] = 1
    view[2, 3] = 2
    view[4, 5] = -1
    view.mul_(2)

    # ...and verify the host tensor observed every change.
    assert host[0, 0] == 2
    assert host[2, 3] == 4
    assert host[4, 5] == -2
|
||||
Reference in New Issue
Block a user