[Feature] Merge branch 'Qwen3-Next' into main && Support Qwen-next (#222)

Signed-off-by: xyDong0223 <dongxinyu03@baidu.com> Co-authored-by: xyDong0223 <dongxinyu03@baidu.com>
2026-02-28 11:15:50 +08:00
parent 153093d3b3
commit 82544aa0cc
17 changed files with 2668 additions and 1532 deletions
--- a/vllm_kunlun/ops/fla/fused_recurrent.py
+++ b/vllm_kunlun/ops/fla/fused_recurrent.py
@@ -9,28 +9,28 @@
 # ruff: noqa: E501
 from typing import Optional

-import torch
-
 import kunlun_ops
+import torch


 class FusedRecurrentFunction(torch.autograd.Function):

    @staticmethod
-    def forward(ctx,
-                q: torch.Tensor,
-                k: torch.Tensor,
-                v: torch.Tensor,
-                g: torch.Tensor,
-                beta: torch.Tensor,
-                scale: float,
-                initial_state: torch.Tensor,
-                inplace_final_state: bool = True,
-                cu_seqlens: Optional[torch.LongTensor] = None,
-                ssm_state_indices: Optional[torch.Tensor] = None,
-                num_accepted_tokens: Optional[torch.Tensor] = None,
-                use_qk_l2norm_in_kernel: bool = False):
-        
+    def forward(
+        ctx,
+        q: torch.Tensor,
+        k: torch.Tensor,
+        v: torch.Tensor,
+        g: torch.Tensor,
+        beta: torch.Tensor,
+        scale: float,
+        initial_state: torch.Tensor,
+        inplace_final_state: bool = True,
+        cu_seqlens: Optional[torch.LongTensor] = None,
+        ssm_state_indices: Optional[torch.Tensor] = None,
+        num_accepted_tokens: Optional[torch.Tensor] = None,
+        use_qk_l2norm_in_kernel: bool = False,
+    ):
        o, final_state = kunlun_ops.fused_recurrent_gated_delta_rule_fwdv2(
            q.contiguous(),
            k.contiguous(),
@@ -44,7 +44,7 @@ class FusedRecurrentFunction(torch.autograd.Function):
            h0_indices=ssm_state_indices,
            num_accepted_tokens=num_accepted_tokens,
            use_qk_l2norm_in_kernel=use_qk_l2norm_in_kernel,
-            is_h0_transposed=True
+            is_h0_transposed=True,
        )
        return o, final_state

@@ -130,9 +130,10 @@ def fused_recurrent_gated_delta_rule(
    if cu_seqlens is not None and q.shape[0] != 1:
        raise ValueError(
            f"The batch size is expected to be 1 rather than {q.shape[0]} when using `cu_seqlens`."
-            f"Please flatten variable-length inputs before processing.")
+            f"Please flatten variable-length inputs before processing."
+        )
    if scale is None:
-        scale = k.shape[-1]**-0.5
+        scale = k.shape[-1] ** -0.5
    else:
        assert scale > 0, "scale must be positive"
    if beta is None: