[310p] Add a PyTorch implementation of the GDN gating operator on 310P (#7430)
### What this PR does / why we need it?
RFC #7394
Add a PyTorch implementation of the GDN gating operator on 310P.
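For context, the operator implemented here computes, per token and head, `g = -exp(A_log) * softplus(a + dt_bias, beta, threshold)` in fp32 and `beta = sigmoid(b)` cast back to the input dtype, with a leading sequence dimension of 1 added to both outputs (this summarizes the PyTorch fallback added below).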
### Does this PR introduce _any_ user-facing change?
NO
### How was this patch tested?
Unit test: a parity test that checks the 310P PyTorch fallback against the Triton kernel (added in this PR).
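The test allocates its tensors on `npu`, so it needs Ascend hardware where both the Triton kernel and the fallback can run; it can be selected by name, for example with `pytest -k test_fused_gdn_gating_310p_parity_precision` (the exact test file path depends on where it lands in the repo).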
- vLLM version: v0.17.0
- vLLM main: 4497431df6
Signed-off-by: Tflowers-0129 <2906339855@qq.com>
@@ -0,0 +1,51 @@
import torch

from vllm_ascend._310p.ops.fla.fused_gdn_gating import fused_gdn_gating_pytorch
from vllm_ascend.ops.triton.fused_gdn_gating import fused_gdn_gating_patch
from vllm_ascend.ops.triton.triton_utils import init_device_properties_triton


def test_fused_gdn_gating_310p_parity_precision():
    init_device_properties_triton()
    torch.manual_seed(0)
    device = "npu"

    num_tokens = 37
    num_heads = 8

    A_log = torch.randn(num_heads, dtype=torch.float16, device=device)
    dt_bias = torch.randn(num_heads, dtype=torch.float16, device=device)
    a = torch.randn(num_tokens, num_heads, dtype=torch.float16, device=device)
    b = torch.randn(num_tokens, num_heads, dtype=torch.float16, device=device)

    triton_g, triton_beta = fused_gdn_gating_patch(
        A_log=A_log,
        a=a,
        b=b,
        dt_bias=dt_bias,
        beta=1.0,
        threshold=20.0,
    )
    ref_g, ref_beta = fused_gdn_gating_pytorch(
        A_log=A_log,
        a=a,
        b=b,
        dt_bias=dt_bias,
        beta=1.0,
        threshold=20.0,
    )

    torch.testing.assert_close(
        triton_g.to(torch.float32).cpu(),
        ref_g.to(torch.float32).cpu(),
        rtol=1e-2,
        atol=1e-2,
        equal_nan=True,
    )
    torch.testing.assert_close(
        triton_beta.to(torch.float32).cpu(),
        ref_beta.to(torch.float32).cpu(),
        rtol=1e-2,
        atol=1e-2,
        equal_nan=True,
    )
vllm_ascend/_310p/ops/fla/__init__.py (new file, 3 lines)
@@ -0,0 +1,3 @@
from .fused_gdn_gating import fused_gdn_gating_pytorch

__all__ = ["fused_gdn_gating_pytorch"]
vllm_ascend/_310p/ops/fla/fused_gdn_gating.py (new file, 62 lines)
@@ -0,0 +1,62 @@
import torch


def fused_gdn_gating_pytorch(
    A_log: torch.Tensor,
    a: torch.Tensor,
    b: torch.Tensor,
    dt_bias: torch.Tensor,
    beta: float = 1.0,
    threshold: float = 20.0,
) -> tuple[torch.Tensor, torch.Tensor]:
    """
    PyTorch implementation of fused_gdn_gating.
    This is a fallback implementation for 310P without Triton support.

    Args:
        A_log: Log of A parameter, shape [num_heads]
        a: a parameter, shape [batch, num_heads]
        b: b parameter, shape [batch, num_heads]
        dt_bias: dt bias, shape [num_heads]
        beta: softplus beta parameter
        threshold: softplus threshold parameter

    Returns:
        g: gating parameter, shape [1, batch, num_heads]
        beta_output: sigmoid(b), shape [1, batch, num_heads]
    """
    batch, num_heads = a.shape
    del num_heads
    # Keep nonlinear gating math in fp32 for stability.
    compute_dtype = torch.float32
    A_log_f = A_log.to(compute_dtype)
    a_f = a.to(compute_dtype)
    b_f = b.to(compute_dtype)
    dt_bias_f = dt_bias.to(compute_dtype)

    # Expand A_log and dt_bias to match a shape.
    A_log_expanded = A_log_f.unsqueeze(0).expand(batch, -1)
    dt_bias_expanded = dt_bias_f.unsqueeze(0).expand(batch, -1)

    # Compute x = a + dt_bias.
    x = a_f + dt_bias_expanded

    # Compute softplus(x).
    beta_x = beta * x
    softplus_x = torch.where(
        beta_x <= threshold,
        (1.0 / beta) * torch.log1p(torch.exp(beta_x)),
        x,
    )

    # Compute g = -exp(A_log) * softplus(x).
    g = -torch.exp(A_log_expanded) * softplus_x

    # Add sequence dimension.
    g = g.unsqueeze(0)

    # Match Triton kernel: sigmoid in fp32, then cast to input b dtype.
    beta_output = torch.sigmoid(b_f).to(b.dtype)
    beta_output = beta_output.unsqueeze(0)

    return g, beta_output
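For illustration, a minimal usage sketch of the new fallback (shapes and values here are made up for the example; on 310P the tensors would live on `npu`, but the op also runs on CPU since it is pure PyTorch):

```python
import torch

from vllm_ascend._310p.ops.fla.fused_gdn_gating import fused_gdn_gating_pytorch

# Illustrative shapes: 4 tokens, 2 heads.
A_log = torch.randn(2, dtype=torch.float16)
dt_bias = torch.randn(2, dtype=torch.float16)
a = torch.randn(4, 2, dtype=torch.float16)
b = torch.randn(4, 2, dtype=torch.float16)

g, beta_out = fused_gdn_gating_pytorch(
    A_log=A_log, a=a, b=b, dt_bias=dt_bias, beta=1.0, threshold=20.0
)
print(g.shape, g.dtype)                # torch.Size([1, 4, 2]) torch.float32
print(beta_out.shape, beta_out.dtype)  # torch.Size([1, 4, 2]) torch.float16
```

Note that `g` stays in fp32 while `beta_out` is cast back to the dtype of `b`, matching the comments in the implementation.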