Sync from v0.13
tests/kernels/test_fla_layernorm_guard.py  (new file, 388 lines)
@@ -0,0 +1,388 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

import pytest
import torch
import torch.nn.functional as F

from vllm.model_executor.layers.fla.ops.layernorm_guard import (
    layer_norm_fwd,
    layernorm_fn,
    rms_norm_ref,
)
from vllm.platforms import current_platform

def layer_norm_ref(
    x,
    weight,
    bias,
    z=None,
    eps=1e-6,
    group_size=None,
    norm_before_gate=True,
    is_rms_norm=False,
):
    """Reference implementation for both layer norm and RMS norm."""
    if is_rms_norm:
        # Use the imported rms_norm_ref for RMS norm cases
        return rms_norm_ref(
            x,
            weight,
            bias,
            z=z,
            eps=eps,
            group_size=group_size,
            norm_before_gate=norm_before_gate,
            upcast=True,
        )

    # Layer norm implementation
    dtype = x.dtype
    x = x.float()
    weight = weight.float()
    bias = bias.float() if bias is not None else None
    z = z.float() if z is not None else None

    if z is not None and not norm_before_gate:
        x = x * F.silu(z)

    if group_size is None:
        # Layer norm: subtract mean
        mean = x.mean(dim=-1, keepdim=True)
        var = ((x - mean).square()).mean(dim=-1, keepdim=True)
        rstd = 1 / torch.sqrt(var + eps)
        out = (x - mean) * rstd * weight
        if bias is not None:
            out = out + bias
    else:
        # Group norm
        from einops import rearrange

        x_group = rearrange(x, "... (g d) -> ... g d", d=group_size)
        mean = x_group.mean(dim=-1, keepdim=True)
        var = ((x_group - mean).square()).mean(dim=-1, keepdim=True)
        rstd = 1 / torch.sqrt(var + eps)
        x_group = (x_group - mean) * rstd
        out = rearrange(x_group, "... g d -> ... (g d)") * weight
        if bias is not None:
            out = out + bias

    if z is not None and norm_before_gate:
        out *= F.silu(z)

    return out.to(dtype)

DTYPES = [torch.bfloat16, torch.float32]
# Test various M sizes to ensure rows_per_block logic works correctly
NUM_TOKENS = [
    1,
    7,
    16,
    63,
    128,
    256,
    512,
    1024,
    2048,
    4096,
    5789,
    8189,
    8191,
    16383,
    32767,
]
HIDDEN_SIZES = [64, 128, 256, 1024]
GROUP_SIZES = [None, 64, 128]  # None means full hidden size
NORM_BEFORE_GATE = [True, False]
IS_RMS_NORM = [True, False]
SEEDS = [0, 42]

@pytest.mark.parametrize("num_tokens", NUM_TOKENS)
@pytest.mark.parametrize("hidden_size", HIDDEN_SIZES)
@pytest.mark.parametrize("dtype", DTYPES)
@pytest.mark.parametrize("seed", SEEDS)
@pytest.mark.parametrize("is_rms_norm", IS_RMS_NORM)
@torch.inference_mode()
def test_layer_norm_fwd_basic(
    num_tokens: int,
    hidden_size: int,
    dtype: torch.dtype,
    seed: int,
    is_rms_norm: bool,
) -> None:
    """Test basic layer norm forward pass without z (gate) tensor."""
    current_platform.seed_everything(seed)
    device = torch.device("cuda:0")

    # Create inputs
    x = torch.randn(num_tokens, hidden_size, dtype=dtype, device=device)
    weight = torch.randn(hidden_size, dtype=dtype, device=device)
    bias = None if is_rms_norm else torch.randn(hidden_size, dtype=dtype, device=device)
    eps = 1e-6

    # Run the triton kernel
    out, mean, rstd = layer_norm_fwd(
        x, weight, bias, eps, z=None, is_rms_norm=is_rms_norm
    )

    # Run reference implementation
    ref_out = layer_norm_ref(x, weight, bias, z=None, eps=eps, is_rms_norm=is_rms_norm)

    # Check outputs
    assert out.shape == x.shape
    assert out.dtype == x.dtype
    torch.testing.assert_close(out, ref_out, atol=1e-2, rtol=1e-2)

    # Check mean and rstd shapes
    if not is_rms_norm:
        assert mean.shape == (num_tokens,)
        assert rstd.shape == (num_tokens,)

@pytest.mark.parametrize("num_tokens", NUM_TOKENS)
@pytest.mark.parametrize("hidden_size", [128, 256, 1024])
@pytest.mark.parametrize("dtype", DTYPES)
@pytest.mark.parametrize("norm_before_gate", NORM_BEFORE_GATE)
@pytest.mark.parametrize("is_rms_norm", IS_RMS_NORM)
@torch.inference_mode()
def test_layer_norm_fwd_with_gate(
    num_tokens: int,
    hidden_size: int,
    dtype: torch.dtype,
    norm_before_gate: bool,
    is_rms_norm: bool,
) -> None:
    """Test layer norm forward pass with z (gate) tensor."""
    current_platform.seed_everything(42)
    device = torch.device("cuda:0")

    # Create inputs
    x = torch.randn(num_tokens, hidden_size, dtype=dtype, device=device)
    z = torch.randn(num_tokens, hidden_size, dtype=dtype, device=device)
    weight = torch.randn(hidden_size, dtype=dtype, device=device)
    bias = None if is_rms_norm else torch.randn(hidden_size, dtype=dtype, device=device)
    eps = 1e-6

    # Run the triton kernel
    out, mean, rstd = layer_norm_fwd(
        x,
        weight,
        bias,
        eps,
        z=z,
        norm_before_gate=norm_before_gate,
        is_rms_norm=is_rms_norm,
    )

    # Run reference implementation
    ref_out = layer_norm_ref(
        x,
        weight,
        bias,
        z=z,
        eps=eps,
        norm_before_gate=norm_before_gate,
        is_rms_norm=is_rms_norm,
    )

    # Check outputs
    assert out.shape == x.shape
    assert out.dtype == x.dtype
    torch.testing.assert_close(out, ref_out, atol=1e-2, rtol=1e-2)

@pytest.mark.parametrize("num_tokens", [128, 512])
@pytest.mark.parametrize("hidden_size", [512, 1024])
@pytest.mark.parametrize("group_size", [64, 128, 256])
@pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16])
@pytest.mark.parametrize("is_rms_norm", IS_RMS_NORM)
@torch.inference_mode()
def test_layer_norm_fwd_with_groups(
    num_tokens: int,
    hidden_size: int,
    group_size: int,
    dtype: torch.dtype,
    is_rms_norm: bool,
) -> None:
    """Test layer norm forward pass with group normalization."""
    if hidden_size % group_size != 0:
        pytest.skip(
            f"hidden_size {hidden_size} not divisible by group_size {group_size}"
        )

    current_platform.seed_everything(42)
    device = torch.device("cuda:0")

    # Create inputs
    x = torch.randn(num_tokens, hidden_size, dtype=dtype, device=device)
    weight = torch.randn(hidden_size, dtype=dtype, device=device)
    bias = None if is_rms_norm else torch.randn(hidden_size, dtype=dtype, device=device)
    eps = 1e-6

    ngroups = hidden_size // group_size

    # Run the triton kernel
    out, mean, rstd = layer_norm_fwd(
        x, weight, bias, eps, z=None, group_size=group_size, is_rms_norm=is_rms_norm
    )

    # Run reference implementation
    ref_out = layer_norm_ref(
        x, weight, bias, z=None, eps=eps, group_size=group_size, is_rms_norm=is_rms_norm
    )

    # Check outputs
    assert out.shape == x.shape
    assert out.dtype == x.dtype
    torch.testing.assert_close(out, ref_out, atol=1e-2, rtol=1e-2)

    # Check mean and rstd shapes for groups
    if not is_rms_norm:
        assert mean.shape == (ngroups * num_tokens,)
        assert rstd.shape == (ngroups * num_tokens,)

@pytest.mark.parametrize("num_tokens", [7, 63, 128, 513, 1024, 2049])
@pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16])
@torch.inference_mode()
def test_layer_norm_rows_per_block(
    num_tokens: int,
    dtype: torch.dtype,
) -> None:
    """Test that rows_per_block logic works correctly for various M sizes."""
    current_platform.seed_everything(42)
    device = torch.device("cuda:0")
    hidden_size = 1024

    # Create inputs
    x = torch.randn(num_tokens, hidden_size, dtype=dtype, device=device)
    weight = torch.randn(hidden_size, dtype=dtype, device=device)
    bias = torch.randn(hidden_size, dtype=dtype, device=device)
    eps = 1e-6

    # Run the triton kernel
    out, mean, rstd = layer_norm_fwd(x, weight, bias, eps, z=None, is_rms_norm=False)

    # Run reference implementation
    ref_out = layer_norm_ref(x, weight, bias, z=None, eps=eps, is_rms_norm=False)

    # Check outputs
    torch.testing.assert_close(out, ref_out, atol=1e-2, rtol=1e-2)

@pytest.mark.parametrize("dtype", [torch.bfloat16])
@torch.inference_mode()
def test_strided_input(dtype: torch.dtype) -> None:
    """Test that the kernel handles non-contiguous (strided)
    inputs correctly."""
    current_platform.seed_everything(42)
    device = torch.device("cuda:0")
    num_tokens = 128
    hidden_size = 1024

    # Create a larger tensor and take a strided slice
    x_large = torch.randn(num_tokens, hidden_size * 2, dtype=dtype, device=device)
    x = x_large[:, :hidden_size]

    # Make it contiguous for the kernel
    x_contiguous = x.contiguous()

    weight = torch.randn(hidden_size, dtype=dtype, device=device)
    bias = torch.randn(hidden_size, dtype=dtype, device=device)
    eps = 1e-6

    # Run the triton kernel with contiguous input
    out, mean, rstd = layer_norm_fwd(
        x_contiguous, weight, bias, eps, z=None, is_rms_norm=False
    )

    # Run reference implementation
    ref_out = layer_norm_ref(
        x_contiguous, weight, bias, z=None, eps=eps, is_rms_norm=False
    )

    # Check outputs
    torch.testing.assert_close(out, ref_out, atol=1e-2, rtol=1e-2)

@pytest.mark.parametrize("num_tokens", [1, 128, 2048])
@pytest.mark.parametrize("hidden_size", [768, 4096])
@pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16])
@torch.inference_mode()
def test_output_buffer_provided(
    num_tokens: int,
    hidden_size: int,
    dtype: torch.dtype,
) -> None:
    """Test that the kernel works when an output buffer is provided."""
    current_platform.seed_everything(42)
    device = torch.device("cuda:0")

    # Create inputs
    x = torch.randn(num_tokens, hidden_size, dtype=dtype, device=device)
    weight = torch.randn(hidden_size, dtype=dtype, device=device)
    bias = torch.randn(hidden_size, dtype=dtype, device=device)
    eps = 1e-6

    # Pre-allocate output buffer
    out_buffer = torch.empty_like(x)

    # Run the triton kernel with provided output
    out, mean, rstd = layer_norm_fwd(
        x, weight, bias, eps, z=None, out=out_buffer, is_rms_norm=False
    )

    # Check that the provided buffer was used
    assert out.data_ptr() == out_buffer.data_ptr()

    # Run reference implementation
    ref_out = layer_norm_ref(x, weight, bias, z=None, eps=eps, is_rms_norm=False)

    # Check outputs
    torch.testing.assert_close(out, ref_out, atol=1e-2, rtol=1e-2)

@pytest.mark.parametrize(
    "shape",
    [
        (4, 16, 1024),  # 3D tensor
        (2, 8, 512, 256),  # 4D tensor
    ],
)
@pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16])
@torch.inference_mode()
def test_multidimensional_input(
    shape: tuple,
    dtype: torch.dtype,
) -> None:
    """Test that the autograd function handles multidimensional inputs."""
    current_platform.seed_everything(42)
    device = torch.device("cuda:0")
    hidden_size = shape[-1]

    # Create inputs
    x = torch.randn(*shape, dtype=dtype, device=device)
    weight = torch.randn(hidden_size, dtype=dtype, device=device)
    bias = torch.randn(hidden_size, dtype=dtype, device=device)
    eps = 1e-6

    # Run through autograd function
    out = layernorm_fn(x, weight, bias, z=None, eps=eps)

    # Run reference implementation
    ref_out = layer_norm_ref(x, weight, bias, z=None, eps=eps, is_rms_norm=False)

    # Check outputs
    assert out.shape == x.shape
    torch.testing.assert_close(out, ref_out, atol=1e-2, rtol=1e-2)

if __name__ == "__main__":
    # Run a quick smoke test
    test_layer_norm_fwd_basic(128, 1024, torch.float16, 42, False)
    test_layer_norm_fwd_with_gate(128, 1024, torch.float16, True, False)
    test_layer_norm_rows_per_block(513, torch.float16)
    print("All smoke tests passed!")