[Feature] Merge branch 'Qwen3-Next' into main && Support Qwen-next (#222)

Signed-off-by: xyDong0223 <dongxinyu03@baidu.com>
Co-authored-by: xyDong0223 <dongxinyu03@baidu.com>
2026-02-28 11:15:50 +08:00
parent 153093d3b3
commit 82544aa0cc
17 changed files with 2668 additions and 1532 deletions
--- a/vllm_kunlun/ops/fla/chunk.py
+++ b/vllm_kunlun/ops/fla/chunk.py
@@ -9,60 +9,196 @@
 # ruff: noqa: E501
 import warnings
 from typing import Optional
-import torch.nn.functional as F

+import cocopod  # noqa
 import torch
-import torch.distributed as dist
+import torch.nn.functional as F
 from einops import rearrange

-from .chunk_delta_h import chunk_gated_delta_rule_fwd_h
-from .chunk_o import chunk_fwd_o
-from .chunk_scaled_dot_kkt import chunk_scaled_dot_kkt_fwd
-from .cumsum import chunk_local_cumsum
+from .index import prepare_chunk_indices, prepare_chunk_offsets
 from .l2norm import l2norm_fwd
-from .solve_tril import solve_tril
 from .utils import SUPPRESS_LEVEL, input_guard
-from .wy_fast import recompute_w_u_fwd
-from .index import prepare_chunk_indices
-import xspeedgate_ops
-import cocopod


-def torch_solve_tril(A: torch.Tensor, cu_seqlens: Optional[torch.LongTensor] = None, output_dtype: torch.dtype = torch.float,):
-    chunk_size=64
-    A = -A.transpose(1,2)
+def torch_solve_tril(
+    A: torch.Tensor,
+    cu_seqlens: Optional[torch.LongTensor] = None,
+    output_dtype: torch.dtype = torch.float,
+):
+    chunk_size = 64
+    A = -A.transpose(1, 2)
    sequence_length = A.shape[-2]
    pad_size = (chunk_size - sequence_length % chunk_size) % chunk_size
    A = F.pad(A, (0, 0, 0, pad_size))
    A = A.reshape(A.shape[0], A.shape[1], -1, chunk_size, A.shape[-1])
+    # mask = torch.triu(torch.ones(chunk_size, chunk_size, dtype=torch.bool, device=A.device), diagonal=0)

+    # A = A.masked_fill(mask, 0)
    for i in range(1, chunk_size):
        row = A[..., i, :i].clone()
        sub = A[..., :i, :i].clone()
        A[..., i, :i] = row + (row.unsqueeze(-1) * sub).sum(-2)
    A = A + torch.eye(chunk_size, dtype=A.dtype, device=A.device)
-    return A.reshape(A.shape[0], A.shape[1], -1, A.shape[-1])[:,:,:sequence_length,:].transpose(1,2)
+    return A.reshape(A.shape[0], A.shape[1], -1, A.shape[-1])[
+        :, :, :sequence_length, :
+    ].transpose(1, 2)

-def chunk_gated_delta_rule_fwd(q: torch.Tensor,
-                               k: torch.Tensor,
-                               v: torch.Tensor,
-                               g: torch.Tensor,
-                               beta: torch.Tensor,
-                               scale: float,
-                               initial_state: torch.Tensor,
-                               output_final_state: bool,
-                               cu_seqlens: Optional[torch.LongTensor] = None):
-    g = chunk_local_cumsum(g, chunk_size=64, cu_seqlens=cu_seqlens)
-    A = chunk_scaled_dot_kkt_fwd(k=k,
-                                 beta=beta,
-                                 g_cumsum=g,
-                                 cu_seqlens=cu_seqlens,
-                                 output_dtype=q.dtype)

-    #kernel版
-    torch.ops.xspeedgate_ops.solve_tril_fwd(A, cu_seqlens)
-    chunk_indices = prepare_chunk_indices(
-        cu_seqlens, 64) if cu_seqlens is not None else None
+def recompute_w_u_fwd_torch(
+    k: torch.Tensor,  # [B, T, H, K]
+    v: torch.Tensor,  # [B, T, H, V]
+    beta: torch.Tensor,  # [B, T, H]
+    g: torch.Tensor,  # [B, T, H]
+    A: torch.Tensor,  # [B, H, T, T] (original note). NOTE(review): A is transposed on dims 1/2 like the other inputs and its last dim must equal chunk_size for the matmuls below - confirm the real layout
+):
+    """
+    Pure-PyTorch computation of (w, u) = (A @ (k * beta * exp(g)), A @ (v * beta)) per 64-token chunk, returned in float32. Original note: simplest version, assumes equal-length sequences and the same number of key and value heads.
+    """
+    chunk_size = 64  # chunk length is hard-coded here
+    num_v_heads, num_k_heads = v.shape[2], k.shape[2]
+    k = k.repeat_interleave(num_v_heads // num_k_heads, dim=2)  # repeat each key head so k has as many heads as v
+    k, v, beta, g, A = [
+        x.transpose(1, 2).contiguous().to(torch.float32) for x in (k, v, beta, g, A)  # swap dims 1 and 2, cast to float32
+    ]
+
+    batch_size, num_heads, sequence_length, k_head_dim = k.shape
+    pad_size = (chunk_size - sequence_length % chunk_size) % chunk_size  # rows needed to reach a multiple of chunk_size
+    k = F.pad(k, (0, 0, 0, pad_size))
+    v = F.pad(v, (0, 0, 0, pad_size))
+    beta = F.pad(beta, (0, pad_size))
+    g = F.pad(g, (0, pad_size))
+    A = F.pad(A, (0, 0, 0, pad_size))
+    A = A.reshape(A.shape[0], A.shape[1], -1, chunk_size, A.shape[-1])  # split dim 2 into (num_chunks, chunk_size)
+
+    v_beta = v * beta.unsqueeze(-1)
+    k_beta = k * beta.unsqueeze(-1)
+
+    k, v, k_beta, v_beta = [
+        x.reshape(x.shape[0], x.shape[1], -1, chunk_size, x.shape[-1])
+        for x in (k, v, k_beta, v_beta)
+    ]
+    g = g.reshape(g.shape[0], g.shape[1], -1, chunk_size)
+
+    u = A @ v_beta
+    w = A @ (k_beta * g.exp().unsqueeze(-1))  # keys are scaled by exp(g) before A is applied
+    w = (
+        w.reshape(w.shape[0], w.shape[1], -1, w.shape[-1])[:, :, :sequence_length, :]  # merge chunks back and drop the padded rows
+        .transpose(1, 2)
+        .contiguous()
+    )
+    u = (
+        u.reshape(u.shape[0], u.shape[1], -1, u.shape[-1])[:, :, :sequence_length, :]  # same un-chunking as for w
+        .transpose(1, 2)
+        .contiguous()
+    )
+
+    return w, u
+
+
+def split_by_value(tensor, chunk_size=64):  # returns the sorted unique values of `tensor` plus a boundary every chunk_size after each value, up to the next value; presumably `tensor` is 1-D (e.g. cu_seqlens) - confirm against callers
+    indices = tensor.tolist()
+    result = set(indices)  # use a set to avoid duplicates
+
+    for i in range(len(indices) - 1):
+        start = indices[i]
+        end = indices[i + 1]
+
+        # Compute the first aligned boundary.
+        # We want start + n*chunk_size, where n is the smallest integer that makes the result greater than start.
+        first_boundary = start + chunk_size
+
+        # Insert every aligned boundary that falls inside (start, end).
+        boundary = first_boundary
+        while boundary < end:
+            result.add(boundary)
+            boundary += chunk_size
+
+    return torch.tensor(sorted(result), dtype=tensor.dtype, device=tensor.device)
+
+
+def chunk_gated_delta_rule_fwd(
+    q: torch.Tensor,
+    k: torch.Tensor,
+    v: torch.Tensor,
+    g: torch.Tensor,
+    beta: torch.Tensor,
+    scale: float,
+    initial_state: torch.Tensor,
+    output_final_state: bool,
+    cu_seqlens: Optional[torch.LongTensor] = None,
+):
+    chunk_size = 64
+    chunk_indices = (
+        prepare_chunk_indices(cu_seqlens, 64) if cu_seqlens is not None else None
+    )
+    chunk_offsets = (
+        prepare_chunk_offsets(cu_seqlens, chunk_size)
+        if cu_seqlens is not None
+        else None
+    )
+
+    # !
+    # g = chunk_local_cumsum(g, chunk_size=64, cu_seqlens=cu_seqlens)
+    g = torch.ops.xspeedgate_ops.chunk_local_cumsum(
+        g,
+        chunk_size=64,
+        reverse=False,
+        cu_seqlens=cu_seqlens,
+        chunk_indices=chunk_indices,
+        head_first=False,
+    )
+
+    # !
+    # A = chunk_scaled_dot_kkt_fwd(k=k,
+    #                              beta=beta,
+    #                              g_cumsum=g,
+    #                              cu_seqlens=cu_seqlens,
+    #                              output_dtype=q.dtype)
+    A = torch.ops.xspeedgate_ops.chunk_scaled_dot_kkt_fwd(
+        k, beta, g, cu_seqlens, chunk_indices, chunk_size
+    )
+
+    # torch版
+    # if get_tensor_model_parallel_rank() == 0:
+    #     torch.save(A, "A_in")
+    #     torch.save(cu_seqlens, "cu_seqlens")
+    # A2 = A.clone()
+    torch.ops.xspeedgate_ops.solve_tril_ns(A, cu_seqlens, chunk_indices, chunk_size)
+
+    # !
+    # torch.ops.xspeedgate_ops.solve_tril_fwd(A, cu_seqlens)
+    # if get_tensor_model_parallel_rank() == 0:
+    #     err = torch.max(torch.abs(A - A2))
+    #     print("err", err)
+    #     if err > 1e-3:
+    #         raise
+    # A = solve_tril(A=A, cu_seqlens=cu_seqlens, output_dtype=k.dtype)
+    # for i in range(len(cu_seqlens)-1):
+    #     A_i = A[:, cu_seqlens[i]:cu_seqlens[i+1], :, :]
+    #     A[:, cu_seqlens[i]:cu_seqlens[i+1], :, :] = torch_solve_tril(A=A_i, cu_seqlens=torch.tensor([0, cu_seqlens[i+1]-cu_seqlens[i]], device=q.device), output_dtype=k.dtype)
+
+    """
+    B, T, Hg, K, V = *k.shape, v.shape[-1]
+    H = v.shape[-2]
+    u = torch.empty_like(v)
+    w = k.new_empty(B, T, H, K)
+    for i in range(len(cu_seqlens)-1):
+        k_i = k[:, cu_seqlens[i]:cu_seqlens[i+1], :, :]
+        v_i = v[:, cu_seqlens[i]:cu_seqlens[i+1], :, :]
+        beta_i = beta[:, cu_seqlens[i]:cu_seqlens[i+1], :]
+        A_i = A[:, cu_seqlens[i]:cu_seqlens[i+1], :, :]
+        g_i = g[:, cu_seqlens[i]:cu_seqlens[i+1], :]
+
+        w_i, u_i = recompute_w_u_fwd_torch(
+            k=k_i,
+            v=v_i,
+            beta=beta_i,
+            A=A_i,
+            g=g_i,
+        )
+        w[:, cu_seqlens[i]:cu_seqlens[i+1], :, :] = w_i
+        u[:, cu_seqlens[i]:cu_seqlens[i+1], :, :] = u_i
+    """
    w, u = torch.ops.xspeedgate_ops.recompute_w_u_fwd(
        k=k,
        v=v,
@@ -71,17 +207,63 @@ def chunk_gated_delta_rule_fwd(q: torch.Tensor,
        g_cumsum=g,
        cu_seqlens=cu_seqlens,
        chunk_indices=chunk_indices,
-        chunk_size=64
+        chunk_size=64,
    )
-    h, v_new, final_state = chunk_gated_delta_rule_fwd_h(
+    """
+    w, u = recompute_w_u_fwd(
        k=k,
-        w=w,
-        u=u,
-        g=g,
-        initial_state=initial_state,
-        output_final_state=output_final_state,
+        v=v,
+        beta=beta,
+        A=A,
+        g_cumsum=g,
        cu_seqlens=cu_seqlens,
    )
+    """
+
+    # i
+    # import os
+    # if not os.path.exists("/qwen-next/in"):
+    #     os.makedirs("/qwen-next/in")
+    #     torch.save(k, "/qwen-next/in/k.pt")
+    #     torch.save(u, "/qwen-next/in/u.pt")
+    #     torch.save(w, "/qwen-next/in/w.pt")
+    #     torch.save(g, "/qwen-next/in/g.pt")
+    #     torch.save(initial_state, "/qwen-next/in/initial_state.pt")
+    #     torch.save(cu_seqlens, "/qwen-next/in/cu_seqlens.pt")
+    #     torch.save(chunk_indices, "/qwen-next/in/chunk_indices.pt")
+    #     torch.save(chunk_offsets.to(torch.int32), "/qwen-next/in/chunk_offsets.pt")
+    #     torch.save(chunk_size, "/qwen-next/in/chunk_size.pt")
+    #     torch.save(output_final_state, "/qwen-next/in/output_final_state.pt")
+
+    h, v_new, final_state = torch.ops.xspeedgate_ops.chunk_gated_delta_rule_fwd_h(
+        k,
+        u,
+        w,
+        g,
+        initial_state,
+        cu_seqlens,
+        chunk_indices,
+        chunk_offsets.to(torch.int32),
+        chunk_size,
+        output_final_state,
+        True,
+    )
+
+    # h, v_new, final_state = chunk_gated_delta_rule_fwd_h(
+    #     k=k,
+    #     w=w,
+    #     u=u,
+    #     g=g,
+    #     initial_state=initial_state,
+    #     output_final_state=output_final_state,
+    #     cu_seqlens=cu_seqlens,
+    # )
+    # if not os.path.exists("/qwen-next/out"):
+    #     os.makedirs("/qwen-next/out")
+    #     torch.save(h, "/qwen-next/out/h.pt")
+    #     torch.save(v_new, "/qwen-next/out/v_new.pt")
+    #     torch.save(final_state, "/qwen-next/out/final_state.pt")
+
    o = torch.ops.xspeedgate_ops.chunk_fwd_o(
        q=q,
        k=k,
@@ -91,8 +273,19 @@ def chunk_gated_delta_rule_fwd(q: torch.Tensor,
        scale=scale,
        cu_seqlens=cu_seqlens,
        chunk_indices=chunk_indices,
-        chunk_size=64
+        chunk_size=64,
    )
+    """
+    o = chunk_fwd_o(
+        q=q,
+        k=k,
+        v=v_new,
+        h=h,
+        g=g,
+        scale=scale,
+        cu_seqlens=cu_seqlens,
+    )
+    """
    if SUPPRESS_LEVEL < 3:
        return g, o, A, final_state, None, None, None
    elif SUPPRESS_LEVEL >= 3:
@@ -103,18 +296,20 @@ class ChunkGatedDeltaRuleFunction(torch.autograd.Function):

    @staticmethod
    @input_guard
-    @torch.amp.custom_fwd(device_type='cuda')
-    def forward(ctx,
-                q: torch.Tensor,
-                k: torch.Tensor,
-                v: torch.Tensor,
-                g: torch.Tensor,
-                beta: torch.Tensor,
-                scale: float,
-                initial_state: torch.Tensor,
-                output_final_state: bool,
-                cu_seqlens: Optional[torch.LongTensor] = None,
-                use_qk_l2norm_in_kernel: bool = False):
+    @torch.amp.custom_fwd(device_type="cuda")
+    def forward(
+        ctx,
+        q: torch.Tensor,
+        k: torch.Tensor,
+        v: torch.Tensor,
+        g: torch.Tensor,
+        beta: torch.Tensor,
+        scale: float,
+        initial_state: torch.Tensor,
+        output_final_state: bool,
+        cu_seqlens: Optional[torch.LongTensor] = None,
+        use_qk_l2norm_in_kernel: bool = False,
+    ):
        if use_qk_l2norm_in_kernel:
            q = l2norm_fwd(q)
            k = l2norm_fwd(k)
@@ -136,17 +331,19 @@ class ChunkGatedDeltaRuleFunction(torch.autograd.Function):


@torch.compiler.disable
-def chunk_gated_delta_rule(q: torch.Tensor,
-                           k: torch.Tensor,
-                           v: torch.Tensor,
-                           g: torch.Tensor,
-                           beta: torch.Tensor,
-                           scale: float = None,
-                           initial_state: torch.Tensor = None,
-                           output_final_state: bool = False,
-                           cu_seqlens: Optional[torch.LongTensor] = None,
-                           head_first: bool = False,
-                           use_qk_l2norm_in_kernel: bool = False):
+def chunk_gated_delta_rule(
+    q: torch.Tensor,
+    k: torch.Tensor,
+    v: torch.Tensor,
+    g: torch.Tensor,
+    beta: torch.Tensor,
+    scale: float = None,
+    initial_state: torch.Tensor = None,
+    output_final_state: bool = False,
+    cu_seqlens: Optional[torch.LongTensor] = None,
+    head_first: bool = False,
+    use_qk_l2norm_in_kernel: bool = False,
+):
    r"""
    Args:
        q (torch.Tensor):
@@ -211,42 +408,85 @@ def chunk_gated_delta_rule(q: torch.Tensor,
        )
    """
    assert q.dtype == k.dtype == v.dtype
-    assert q.dtype != torch.float32, "ChunkGatedDeltaRuleFunction does not support float32. Please use bfloat16."
-    assert len(
-        beta.shape
-    ) == 3, "beta must be of shape [B, T, H] if head_first=False, or [B, H, T] otherwise."
+    assert (
+        q.dtype != torch.float32
+    ), "ChunkGatedDeltaRuleFunction does not support float32. Please use bfloat16."
+    assert (
+        len(beta.shape) == 3
+    ), "beta must be of shape [B, T, H] if head_first=False, or [B, H, T] otherwise."

    if head_first:
        raise DeprecationWarning(
            "head_first is deprecated and will be removed in a future version. "
            "Please use head_first=False for now instead.",
-            stacklevel=2)
+            stacklevel=2,
+        )
        q, k, v, beta, g = map(
-            lambda x: rearrange(x, 'b h t ... -> b t h ...'),
-            (q, k, v, beta, g))
+            lambda x: rearrange(x, "b h t ... -> b t h ..."), (q, k, v, beta, g)
+        )
    if not head_first and q.shape[1] < q.shape[2]:
        warnings.warn(
            f"Input tensor shape suggests potential format mismatch: seq_len ({q.shape[1]}) < num_heads ({q.shape[2]}). "
            "This may indicate the inputs were passed in head-first format [B, H, T, ...] "
            "when head_first=False was specified. "
            "Please verify your input tensor format matches the expected shape [B, T, H, ...].",
-            stacklevel=2)
+            stacklevel=2,
+        )
    if cu_seqlens is not None:
        if q.shape[0] != 1:
            raise ValueError(
                f"The batch size is expected to be 1 rather than {q.shape[0]} when using `cu_seqlens`."
-                f"Please flatten variable-length inputs before processing.")
-        if initial_state is not None and initial_state.shape[0] != len(
-                cu_seqlens) - 1:
+                f"Please flatten variable-length inputs before processing."
+            )
+        if initial_state is not None and initial_state.shape[0] != len(cu_seqlens) - 1:
            raise ValueError(
                f"The number of initial states is expected to be equal to the number of input sequences, "
                f"i.e., {len(cu_seqlens) - 1} rather than {initial_state.shape[0]}."
            )
    if scale is None:
-        scale = k.shape[-1]**-0.5
-    o, final_state = ChunkGatedDeltaRuleFunction.apply(
-        q, k, v, g, beta, scale, initial_state, output_final_state, cu_seqlens,
-        use_qk_l2norm_in_kernel)
+        scale = k.shape[-1] ** -0.5
+
+    if False:
+        q = q.contiguous()
+        k = k.contiguous()
+        v = v.contiguous()
+        g = g.contiguous()
+        beta = beta.contiguous()
+        initial_state = initial_state.contiguous()
+
+        o = torch.empty_like(v)
+        final_state = torch.empty_like(initial_state)
+        import kunlun_ops
+
+        kunlun_ops.gated_delta_rule(
+            q,
+            k,
+            v,
+            initial_state,
+            g,
+            beta,
+            final_state,
+            o,
+            scale,
+            cu_seqlens.cpu(),
+            cu_seqlens,
+            cu_seqlens.cpu(),
+            cu_seqlens,
+            use_qk_l2norm_in_kernel=True,
+        )
+    else:
+        o, final_state = ChunkGatedDeltaRuleFunction.apply(
+            q,
+            k,
+            v,
+            g,
+            beta,
+            scale,
+            initial_state,
+            output_final_state,
+            cu_seqlens,
+            use_qk_l2norm_in_kernel,
+        )
    if head_first:
-        o = rearrange(o, 'b t h ... -> b h t ...')
+        o = rearrange(o, "b t h ... -> b h t ...")
    return o, final_state
--- a/vllm_kunlun/ops/fla/chunk_o.py
+++ b/vllm_kunlun/ops/fla/chunk_o.py
@@ -12,21 +12,21 @@
 from typing import Optional

 import torch
-
 from vllm.triton_utils import tl, triton

 from .index import prepare_chunk_indices
-from .op import exp
 from .utils import FLA_GDN_FIX_BT, check_shared_mem, is_nvidia_hopper

 BKV_LIST = [64, 128] if check_shared_mem() else [32, 64]
 NUM_WARPS = [2, 4] if is_nvidia_hopper else [2, 4, 8]


-@triton.heuristics({
-    'USE_G': lambda args: args['g'] is not None,
-    'IS_VARLEN': lambda args: args['cu_seqlens'] is not None
-})
+@triton.heuristics(
+    {
+        "USE_G": lambda args: args["g"] is not None,
+        "IS_VARLEN": lambda args: args["cu_seqlens"] is not None,
+    }
+)
 # @triton.autotune(
 #     configs=[
 #         triton.Config({
@@ -40,7 +40,7 @@ NUM_WARPS = [2, 4] if is_nvidia_hopper else [2, 4, 8]
 #     ],
 #     key=['H', 'K', 'V', 'BT'],
 # )
-@triton.jit(do_not_specialize=['T'])
+@triton.jit(do_not_specialize=["T"])
 def chunk_fwd_kernel_o(
    q,
    k,
@@ -67,10 +67,12 @@ def chunk_fwd_kernel_o(

    if IS_VARLEN:
        i_tg = i_t
-        i_n, i_t = tl.load(chunk_indices + i_t * 2).to(
-            tl.int32), tl.load(chunk_indices + i_t * 2 + 1).to(tl.int32)
-        bos, eos = tl.load(cu_seqlens + i_n).to(
-            tl.int32), tl.load(cu_seqlens + i_n + 1).to(tl.int32)
+        i_n, i_t = tl.load(chunk_indices + i_t * 2).to(tl.int32), tl.load(
+            chunk_indices + i_t * 2 + 1
+        ).to(tl.int32)
+        bos, eos = tl.load(cu_seqlens + i_n).to(tl.int32), tl.load(
+            cu_seqlens + i_n + 1
+        ).to(tl.int32)
        T = eos - bos
        NT = tl.cdiv(T, BT)
    else:
@@ -89,12 +91,15 @@ def chunk_fwd_kernel_o(
    b_A = tl.zeros([BT, BT], dtype=tl.float32)

    for i_k in range(tl.cdiv(K, BK)):
-        p_q = tl.make_block_ptr(q, (T, K), (Hg * K, 1), (i_t * BT, i_k * BK),
-                                (BT, BK), (1, 0))
-        p_k = tl.make_block_ptr(k, (K, T), (1, Hg * K), (i_k * BK, i_t * BT),
-                                (BK, BT), (0, 1))
-        p_h = tl.make_block_ptr(h, (K, V), (V, 1), (i_k * BK, i_v * BV),
-                                (BK, BV), (1, 0))
+        p_q = tl.make_block_ptr(
+            q, (T, K), (Hg * K, 1), (i_t * BT, i_k * BK), (BT, BK), (1, 0)
+        )
+        p_k = tl.make_block_ptr(
+            k, (K, T), (1, Hg * K), (i_k * BK, i_t * BT), (BK, BT), (0, 1)
+        )
+        p_h = tl.make_block_ptr(
+            h, (K, V), (V, 1), (i_k * BK, i_v * BV), (BK, BV), (1, 0)
+        )
        # [BT, BK]
        b_q = tl.load(p_q, boundary_check=(0, 1))
        # [BK, BT]
@@ -109,8 +114,8 @@ def chunk_fwd_kernel_o(

    if USE_G:
        g += bos * H + i_h
-        p_g = tl.make_block_ptr(g, (T, ), (H, ), (i_t * BT, ), (BT, ), (0, ))
-        b_g = tl.load(p_g, boundary_check=(0, ))
+        p_g = tl.make_block_ptr(g, (T,), (H,), (i_t * BT,), (BT,), (0,))
+        b_g = tl.load(p_g, boundary_check=(0,))
        b_o = b_o * tl.exp(b_g)[:, None]
        b_A = b_A * tl.exp(b_g[:, None] - b_g[None, :])

@@ -120,10 +125,12 @@ def chunk_fwd_kernel_o(
    # b_A = tl.where(m_A, b_A, 0)
    b_A = tl.where(o_t[:, None] >= o_t[None, :], b_A, 0)

-    p_v = tl.make_block_ptr(v, (T, V), (H * V, 1), (i_t * BT, i_v * BV),
-                            (BT, BV), (1, 0))
-    p_o = tl.make_block_ptr(o, (T, V), (H * V, 1), (i_t * BT, i_v * BV),
-                            (BT, BV), (1, 0))
+    p_v = tl.make_block_ptr(
+        v, (T, V), (H * V, 1), (i_t * BT, i_v * BV), (BT, BV), (1, 0)
+    )
+    p_o = tl.make_block_ptr(
+        o, (T, V), (H * V, 1), (i_t * BT, i_v * BV), (BT, BV), (1, 0)
+    )
    b_v = tl.load(p_v, boundary_check=(0, 1))

    # to fix mma -> mma layout conversion
@@ -133,48 +140,29 @@ def chunk_fwd_kernel_o(


 def chunk_fwd_o(
-        q: torch.Tensor,
-        k: torch.Tensor,
-        v: torch.Tensor,
-        h: torch.Tensor,
-        g: Optional[torch.Tensor] = None,  # cumsum of log decay
-        scale: Optional[float] = None,
-        cu_seqlens: Optional[torch.LongTensor] = None,
-        chunk_size: int = 64) -> torch.Tensor:
-    B, T, Hg, K, V = *q.shape, v.shape[-1]
-    H = v.shape[-2]
+    q: torch.Tensor,
+    k: torch.Tensor,
+    v: torch.Tensor,
+    h: torch.Tensor,
+    g: Optional[torch.Tensor] = None,  # cumsum of log decay
+    scale: Optional[float] = None,
+    cu_seqlens: Optional[torch.LongTensor] = None,
+    chunk_size: int = 64,
+) -> torch.Tensor:
+    _, T, _, _, _ = *q.shape, v.shape[-1]
    if FLA_GDN_FIX_BT:
        BT = 64
    else:
        BT = min(chunk_size, max(16, triton.next_power_of_2(T)))
-    chunk_indices = prepare_chunk_indices(
-        cu_seqlens, BT) if cu_seqlens is not None else None
-    NT = triton.cdiv(T, BT) if cu_seqlens is None else len(chunk_indices)
+    chunk_indices = (
+        prepare_chunk_indices(cu_seqlens, BT) if cu_seqlens is not None else None
+    )
    if scale is None:
-        scale = k.shape[-1]**-0.5
+        scale = k.shape[-1] ** -0.5

    o = torch.empty_like(v)

-    def grid(meta):
-        return (triton.cdiv(V, meta['BV']), NT, B * H)
-
-    chunk_fwd_kernel_o[grid](
-        q,
-        k,
-        v,
-        h,
-        g,
-        o,
-        cu_seqlens,
-        chunk_indices,
-        scale,
-        T=T,
-        H=H,
-        Hg=Hg,
-        K=K,
-        V=V,
-        BT=BT,
-        BK=64,
-        BV=32
+    o = torch.ops.xspeedgate_ops.chunk_fwd_o(
+        q, k, v, h, g, scale, cu_seqlens, chunk_indices, chunk_size
    )
    return o
--- a/vllm_kunlun/ops/fla/fused_recurrent.py
+++ b/vllm_kunlun/ops/fla/fused_recurrent.py
@@ -9,28 +9,28 @@
 # ruff: noqa: E501
 from typing import Optional

-import torch
-
 import kunlun_ops
+import torch


 class FusedRecurrentFunction(torch.autograd.Function):

    @staticmethod
-    def forward(ctx,
-                q: torch.Tensor,
-                k: torch.Tensor,
-                v: torch.Tensor,
-                g: torch.Tensor,
-                beta: torch.Tensor,
-                scale: float,
-                initial_state: torch.Tensor,
-                inplace_final_state: bool = True,
-                cu_seqlens: Optional[torch.LongTensor] = None,
-                ssm_state_indices: Optional[torch.Tensor] = None,
-                num_accepted_tokens: Optional[torch.Tensor] = None,
-                use_qk_l2norm_in_kernel: bool = False):
-        
+    def forward(
+        ctx,
+        q: torch.Tensor,
+        k: torch.Tensor,
+        v: torch.Tensor,
+        g: torch.Tensor,
+        beta: torch.Tensor,
+        scale: float,
+        initial_state: torch.Tensor,
+        inplace_final_state: bool = True,
+        cu_seqlens: Optional[torch.LongTensor] = None,
+        ssm_state_indices: Optional[torch.Tensor] = None,
+        num_accepted_tokens: Optional[torch.Tensor] = None,
+        use_qk_l2norm_in_kernel: bool = False,
+    ):
        o, final_state = kunlun_ops.fused_recurrent_gated_delta_rule_fwdv2(
            q.contiguous(),
            k.contiguous(),
@@ -44,7 +44,7 @@ class FusedRecurrentFunction(torch.autograd.Function):
            h0_indices=ssm_state_indices,
            num_accepted_tokens=num_accepted_tokens,
            use_qk_l2norm_in_kernel=use_qk_l2norm_in_kernel,
-            is_h0_transposed=True
+            is_h0_transposed=True,
        )
        return o, final_state

@@ -130,9 +130,10 @@ def fused_recurrent_gated_delta_rule(
    if cu_seqlens is not None and q.shape[0] != 1:
        raise ValueError(
            f"The batch size is expected to be 1 rather than {q.shape[0]} when using `cu_seqlens`."
-            f"Please flatten variable-length inputs before processing.")
+            f"Please flatten variable-length inputs before processing."
+        )
    if scale is None:
-        scale = k.shape[-1]**-0.5
+        scale = k.shape[-1] ** -0.5
    else:
        assert scale > 0, "scale must be positive"
    if beta is None:
--- a/vllm_kunlun/ops/fla/l2norm.py
+++ b/vllm_kunlun/ops/fla/l2norm.py
@@ -10,22 +10,21 @@
 import os
 from typing import Optional

+import kunlun_ops
 import torch
 from vllm.triton_utils import tl, triton

-import kunlun_ops
-
-
 BT_LIST = [8, 16, 32, 64, 128]

 USE_DEFAULT_FLA_NORM = int(os.getenv("USE_DEFAULT_FLA_NORM", "0"))


-@triton.autotune(configs=[
-    triton.Config({}, num_warps=num_warps)
-    for num_warps in [1, 2, 4, 8, 16, 32]
-],
-                 key=['D'])
+@triton.autotune(
+    configs=[
+        triton.Config({}, num_warps=num_warps) for num_warps in [1, 2, 4, 8, 16, 32]
+    ],
+    key=["D"],
+)
@triton.jit
 def l2norm_fwd_kernel1(
    x,
@@ -49,11 +48,14 @@ def l2norm_fwd_kernel1(
    tl.store(y + cols, b_y, mask=mask)


-@triton.autotune(configs=[
-    triton.Config({'BT': BT}, num_warps=num_warps)
-    for num_warps in [1, 2, 4, 8, 16] for BT in BT_LIST
-],
-                 key=['D'])
+@triton.autotune(
+    configs=[
+        triton.Config({"BT": BT}, num_warps=num_warps)
+        for num_warps in [1, 2, 4, 8, 16]
+        for BT in BT_LIST
+    ],
+    key=["D"],
+)
@triton.jit(do_not_specialize=["NB"])
 def l2norm_fwd_kernel(
    x,
@@ -87,67 +89,9 @@ def l2norm_fwd_kernel2(X, Y, eps, M, N: tl.constexpr, MBLOCK: tl.constexpr):
    tl.store(Y + (rindex + N * row_idx), xs * rsqrt, xmask)


-def l2norm_fwd_triton(x: torch.Tensor,
-               eps: float = 1e-6,
-               output_dtype: Optional[torch.dtype] = None):
-    x_shape_og = x.shape
-    x = x.view(-1, x.shape[-1])
-    # allocate output
-    if output_dtype is None:
-        y = torch.empty_like(x)
-    else:
-        y = torch.empty_like(x, dtype=output_dtype)
-    assert y.stride(-1) == 1
-    T, D = x.shape[0], x.shape[-1]
-    # rstd = torch.empty((T,), dtype=torch.float32, device=x.device)
-    # Less than 64KB per feature: enqueue fused kernel
-    MAX_FUSED_SIZE = 65536 // x.element_size()
-    BD = min(MAX_FUSED_SIZE, triton.next_power_of_2(D))
-    if D > BD:
-        raise RuntimeError("This layer doesn't support feature dim >= 64KB.")
-
-    if not USE_DEFAULT_FLA_NORM:
-        MBLOCK = 32
-        # M, N = x.shape
-        l2norm_fwd_kernel2[(triton.cdiv(T, MBLOCK), )](
-            x,
-            y,
-            eps,
-            T,
-            D,
-            MBLOCK,
-        )
-    else:
-        if D <= 512:
-            NB = triton.cdiv(T, 2048)
-
-            def grid(meta):
-                return (triton.cdiv(T, meta['BT']), )
-
-            l2norm_fwd_kernel[grid](
-                x,
-                y,
-                eps,
-                NB=NB,
-                T=T,
-                D=D,
-                BD=BD,
-            )
-        else:
-            l2norm_fwd_kernel1[(T, )](
-                x,
-                y,
-                eps=eps,
-                D=D,
-                BD=BD,
-            )
-
-    return y.view(x_shape_og)
-
-
-def l2norm_fwd(x: torch.Tensor,
-               eps: float = 1e-6,
-               output_dtype: Optional[torch.dtype] = None):
+def l2norm_fwd(
+    x: torch.Tensor, eps: float = 1e-6, output_dtype: Optional[torch.dtype] = None
+):
    out = torch.empty_like(x)
-    kunlun_ops.l2norm(x, out, eps)                                                                                                                                                                                                                              
+    kunlun_ops.l2norm(x, out, eps)
    return out
--- a/vllm_kunlun/ops/fla/layernorm_guard.py
+++ b/vllm_kunlun/ops/fla/layernorm_guard.py
@@ -19,20 +19,21 @@ import torch
 import torch.nn as nn
 import torch.nn.functional as F
 from einops import rearrange
-
 from vllm.triton_utils import tl, triton

 from .utils import input_guard


-def rms_norm_ref(x,
-                 weight,
-                 bias,
-                 z=None,
-                 eps=1e-6,
-                 group_size=None,
-                 norm_before_gate=True,
-                 upcast=True):
+def rms_norm_ref(
+    x,
+    weight,
+    bias,
+    z=None,
+    eps=1e-6,
+    group_size=None,
+    norm_before_gate=True,
+    upcast=True,
+):
    dtype = x.dtype
    weight = weight.float()
    bias = bias.float() if bias is not None else None
@@ -43,12 +44,10 @@ def rms_norm_ref(x,
        x = x * F.silu(z)
    if group_size is None:
        rstd = 1 / torch.sqrt((x.square()).mean(dim=-1, keepdim=True) + eps)
-        out = (x * rstd * weight) + bias if bias is not None else (x * rstd *
-                                                                   weight)
+        out = (x * rstd * weight) + bias if bias is not None else (x * rstd * weight)
    else:
        x_group = rearrange(x, "... (g d) -> ... g d", d=group_size)
-        rstd = 1 / torch.sqrt((x_group.square()).mean(dim=-1, keepdim=True) +
-                              eps)
+        rstd = 1 / torch.sqrt((x_group.square()).mean(dim=-1, keepdim=True) + eps)
        out = rearrange(x_group * rstd, "... g d -> ... (g d)") * weight
        if bias is not None:
            out = out + bias
@@ -57,10 +56,12 @@ def rms_norm_ref(x,
    return out.to(dtype)


-@triton.heuristics({
-    "HAS_BIAS": lambda args: args["B"] is not None,
-    "HAS_Z": lambda args: args["Z"] is not None,
-})
+@triton.heuristics(
+    {
+        "HAS_BIAS": lambda args: args["B"] is not None,
+        "HAS_Z": lambda args: args["Z"] is not None,
+    }
+)
@triton.jit
 def layer_norm_fwd_kernel(
    X,  # pointer to the input
@@ -97,17 +98,17 @@ def layer_norm_fwd_kernel(
        B += group * N
    # Compute mean and variance
    cols = tl.arange(0, BLOCK_N)
-    x = tl.load(X + cols, mask=cols < N, other=0.).to(tl.float32)
+    x = tl.load(X + cols, mask=cols < N, other=0.0).to(tl.float32)
    if HAS_Z and not NORM_BEFORE_GATE:
        z = tl.load(Z + cols, mask=cols < N).to(tl.float32)
        x *= z * tl.sigmoid(z)
    if not IS_RMS_NORM:
        mean = tl.sum(x, axis=0) / N
        tl.store(Mean + row, mean)
-        xbar = tl.where(cols < N, x - mean, 0.)
+        xbar = tl.where(cols < N, x - mean, 0.0)
        var = tl.sum(xbar * xbar, axis=0) / N
    else:
-        xbar = tl.where(cols < N, x, 0.)
+        xbar = tl.where(cols < N, x, 0.0)
        var = tl.sum(xbar * xbar, axis=0) / N
    rstd = 1 / tl.sqrt(var + eps)
    tl.store(Rstd + row, rstd)
@@ -149,46 +150,50 @@ def layer_norm_fwd(
    #     weight = weight.reshape(N)
    # print("weight",weight.shape)
    # print("x",x.shape)
-    assert weight.shape == (N, )
+    assert weight.shape == (N,)
    assert weight.stride(-1) == 1
    if bias is not None:
        assert bias.stride(-1) == 1
-        assert bias.shape == (N, )
+        assert bias.shape == (N,)
    # allocate output
    if out is not None:
        assert out.shape == x.shape
    else:
        out = torch.empty_like(x)
    assert out.stride(-1) == 1
-    mean = torch.empty((ngroups * M, ), dtype=torch.float32,
-                       device=x.device) if not is_rms_norm else None
-    rstd = torch.empty((ngroups * M, ), dtype=torch.float32, device=x.device)
+    mean = (
+        torch.empty((ngroups * M,), dtype=torch.float32, device=x.device)
+        if not is_rms_norm
+        else None
+    )
+    rstd = torch.empty((ngroups * M,), dtype=torch.float32, device=x.device)
    # Less than 64KB per feature: enqueue fused kernel
    MAX_FUSED_SIZE = 65536 // x.element_size()
    BLOCK_N = min(MAX_FUSED_SIZE, triton.next_power_of_2(group_size))
    if group_size > BLOCK_N:
-        raise RuntimeError(
-            "This layer norm doesn't support feature dim >= 64KB.")
+        raise RuntimeError("This layer norm doesn't support feature dim >= 64KB.")
    # heuristics for number of warps
    num_warps = min(max(BLOCK_N // 256, 1), 8)
    grid = (M, ngroups)
-    layer_norm_fwd_kernel[grid](x,
-                                out,
-                                weight,
-                                bias,
-                                z,
-                                mean,
-                                rstd,
-                                x.stride(0),
-                                out.stride(0),
-                                z.stride(0) if z is not None else 0,
-                                M,
-                                group_size,
-                                eps,
-                                BLOCK_N=BLOCK_N,
-                                NORM_BEFORE_GATE=norm_before_gate,
-                                IS_RMS_NORM=is_rms_norm,
-                                num_warps=num_warps)
+    layer_norm_fwd_kernel[grid](
+        x,
+        out,
+        weight,
+        bias,
+        z,
+        mean,
+        rstd,
+        x.stride(0),
+        out.stride(0),
+        z.stride(0) if z is not None else 0,
+        M,
+        group_size,
+        eps,
+        BLOCK_N=BLOCK_N,
+        NORM_BEFORE_GATE=norm_before_gate,
+        IS_RMS_NORM=is_rms_norm,
+        num_warps=num_warps,
+    )
    return out, mean, rstd


@@ -196,17 +201,18 @@ class LayerNormFn(torch.autograd.Function):

    @input_guard
    @staticmethod
-    def forward(ctx,
-                x,
-                weight,
-                bias,
-                z=None,
-                eps=1e-6,
-                group_size=None,
-                norm_before_gate=True,
-                is_rms_norm=False):
-        """If z is not None, we do norm(x) * silu(z) if norm_before_gate, else norm(x * silu(z))
-        """
+    def forward(
+        ctx,
+        x,
+        weight,
+        bias,
+        z=None,
+        eps=1e-6,
+        group_size=None,
+        norm_before_gate=True,
+        is_rms_norm=False,
+    ):
+        """If z is not None, we do norm(x) * silu(z) if norm_before_gate, else norm(x * silu(z))"""

        x_shape_og = x.shape
        # reshape input data into 2D tensor
@@ -223,16 +229,15 @@ class LayerNormFn(torch.autograd.Function):
        weight = weight.contiguous()
        if bias is not None:
            bias = bias.contiguous()
-        y, mean, rstd = layer_norm_fwd(
-            x,
-            weight,
-            bias,
-            eps,
-            z=z,
-            group_size=group_size,
-            norm_before_gate=norm_before_gate,
-            is_rms_norm=is_rms_norm,
+        # y, mean, rstd = torch.ops.xspeedgate_ops.rms_norm_gated_fwd(x, weight, bias, eps, z, group_size, norm_before_gate, is_rms_norm)
+        y = torch.empty_like(x)
+        mean, rstd = None, None
+        import kunlun_ops
+
+        kunlun_ops.rms_norm_gated(
+            x, y, z, weight, eps, group_size, norm_before_gate, is_rms_norm
        )
+
        ctx.save_for_backward(x, weight, bias, mean, rstd, z)
        ctx.x_shape_og = x_shape_og
        ctx.eps = eps
@@ -242,27 +247,27 @@ class LayerNormFn(torch.autograd.Function):
        return y.reshape(x_shape_og)


-def layernorm_fn(x,
-                 weight,
-                 bias,
-                 z=None,
-                 eps=1e-6,
-                 group_size=None,
-                 norm_before_gate=True,
-                 is_rms_norm=False):
-    return LayerNormFn.apply(x, weight, bias, z, eps, group_size,
-                             norm_before_gate, is_rms_norm)
+def layernorm_fn(
+    x,
+    weight,
+    bias,
+    z=None,
+    eps=1e-6,
+    group_size=None,
+    norm_before_gate=True,
+    is_rms_norm=False,
+):
+    return LayerNormFn.apply(
+        x, weight, bias, z, eps, group_size, norm_before_gate, is_rms_norm
+    )


-def rmsnorm_fn(x,
-               weight,
-               bias,
-               z=None,
-               eps=1e-6,
-               group_size=None,
-               norm_before_gate=True):
-    return LayerNormFn.apply(x, weight, bias, z, eps, group_size,
-                             norm_before_gate, True)
+def rmsnorm_fn(
+    x, weight, bias, z=None, eps=1e-6, group_size=None, norm_before_gate=True
+):
+    return LayerNormFn.apply(
+        x, weight, bias, z, eps, group_size, norm_before_gate, True
+    )


 class LayerNormGated(nn.Module):
@@ -294,15 +299,16 @@ class LayerNormGated(nn.Module):
        torch.nn.init.zeros_(self.bias)

    def forward(self, x, z=None):
-        """If z is not None, we do norm(x) * silu(z) if norm_before_gate, else norm(x * silu(z))
-        """
-        return layernorm_fn(x,
-                            self.weight,
-                            self.bias,
-                            z=z,
-                            group_size=self.group_size,
-                            eps=self.eps,
-                            norm_before_gate=self.norm_before_gate)
+        """If z is not None, we do norm(x) * silu(z) if norm_before_gate, else norm(x * silu(z))"""
+        return layernorm_fn(
+            x,
+            self.weight,
+            self.bias,
+            z=z,
+            group_size=self.group_size,
+            eps=self.eps,
+            norm_before_gate=self.norm_before_gate,
+        )


 class RMSNormGated(nn.Module):
@@ -332,12 +338,13 @@ class RMSNormGated(nn.Module):
        torch.nn.init.ones_(self.weight)

    def forward(self, x, z=None):
-        """If z is not None, we do norm(x) * silu(z) if norm_before_gate, else norm(x * silu(z))
-        """
-        return rmsnorm_fn(x,
-                          self.weight,
-                          self.bias,
-                          z=z,
-                          eps=self.eps,
-                          group_size=self.group_size,
-                          norm_before_gate=self.norm_before_gate)
+        """If z is not None, we do norm(x) * silu(z) if norm_before_gate, else norm(x * silu(z))"""
+        return rmsnorm_fn(
+            x,
+            self.weight,
+            self.bias,
+            z=z,
+            eps=self.eps,
+            group_size=self.group_size,
+            norm_before_gate=self.norm_before_gate,
+        )
--- a/vllm_kunlun/ops/fla/wy_fast.py
+++ b/vllm_kunlun/ops/fla/wy_fast.py
@@ -11,7 +11,6 @@
 from typing import Optional

 import torch
-
 from vllm.triton_utils import tl, triton

 from .index import prepare_chunk_indices
@@ -28,6 +27,7 @@ RESOLUTION = {
    torch.complex64: 1.3e-6,
 }

+
 def assert_close(res, ref, dtype, equal_nan=False, reduce_dim=1):
    assert res.dtype == dtype
    ref = ref.to(dtype)
@@ -35,6 +35,7 @@ def assert_close(res, ref, dtype, equal_nan=False, reduce_dim=1):
    rtol = RESOLUTION[dtype]
    torch.testing.assert_close(res, ref, atol=atol, rtol=rtol, equal_nan=equal_nan)

+
@triton.heuristics({"IS_VARLEN": lambda args: args["cu_seqlens"] is not None})
 # @triton.autotune(
 #     configs=[
@@ -80,7 +81,6 @@ def recompute_u_fwd_kernel(
    p_beta = tl.make_block_ptr(
        beta + bos * H + i_h, (T,), (H,), (i_t * BT,), (BT,), (0,)
    )
-    p_g = tl.make_block_ptr(g + (bos * H + i_h), (T,), (H,), (i_t * BT,), (BT,), (0,))
    p_A = tl.make_block_ptr(
        A + (bos * H + i_h) * BT, (T, BT), (H * BT, 1), (i_t * BT, 0), (BT, BT), (1, 0)
    )
@@ -110,7 +110,6 @@ def recompute_u_fwd_kernel(
        tl.store(p_u, b_u.to(p_u.dtype.element_ty), boundary_check=(0, 1))


-
@triton.heuristics({"IS_VARLEN": lambda args: args["cu_seqlens"] is not None})
 # @triton.autotune(
 #     configs=[
@@ -195,53 +194,12 @@ def recompute_w_u_fwd(
    A: torch.Tensor,
    cu_seqlens: Optional[torch.LongTensor],
 ) -> tuple[torch.Tensor, torch.Tensor]:
-    B, T, Hg, K, V = *k.shape, v.shape[-1]
-    H = v.shape[-2]
    BT = A.shape[-1]

-    chunk_indices = prepare_chunk_indices(
-        cu_seqlens, BT) if cu_seqlens is not None else None
-    NT = triton.cdiv(T, BT) if cu_seqlens is None else len(chunk_indices)
-    BK = 64
-    BV = 64
-    u = torch.empty_like(v)
-    w = k.new_empty(B, T, H, K)
-    recompute_u_fwd_kernel[(NT, B * H)](
-        k=k,
-        v=v,
-        beta=beta,
-        w=w,
-        u=u,
-        A=A,
-        g=g_cumsum,
-        cu_seqlens=cu_seqlens,
-        chunk_indices=chunk_indices,
-        T=T,
-        H=H,
-        Hg=Hg,
-        K=K,
-        V=V,
-        BT=BT,
-        BK=BK,
-        BV=BV,
+    chunk_indices = (
+        prepare_chunk_indices(cu_seqlens, BT) if cu_seqlens is not None else None
    )
-    recompute_w_fwd_kernel[(NT, B * H)](
-        k=k,
-        v=v,
-        beta=beta,
-        w=w,
-        u=u,
-        A=A,
-        g=g_cumsum,
-        cu_seqlens=cu_seqlens,
-        chunk_indices=chunk_indices,
-        T=T,
-        H=H,
-        Hg=Hg,
-        K=K,
-        V=V,
-        BT=BT,
-        BK=BK,
-        BV=BV,
+    w, u = torch.ops.xspeedgate_ops.recompute_w_u_fwd(
+        k, v, beta, g_cumsum, A, cu_seqlens, chunk_indices, chunk_size=BT
    )
-    return w, u
+    return w, u