[Feature] Merge branch 'Qwen3-Next' into main && Support Qwen-next (#222)

Signed-off-by: xyDong0223 <dongxinyu03@baidu.com>
Co-authored-by: xyDong0223 <dongxinyu03@baidu.com>
Author: chanzhennan
Committed: 2026-02-28 11:15:50 +08:00 (by GitHub)
Parent: 153093d3b3
Commit: 82544aa0cc
17 changed files with 2668 additions and 1532 deletions


@@ -15,51 +15,52 @@
 # This file is a part of the vllm-ascend project.
 #
-import torch
-from vllm.model_executor.layers.layernorm import RMSNorm
-from vllm.model_executor.layers.layernorm import GemmaRMSNorm as OriGemmaRMSNorm
-from vllm.model_executor.layers import layernorm
 from typing import Optional, Union
+import kunlun_ops
+import torch
+from vllm.model_executor.layers import layernorm
+from vllm.model_executor.layers.layernorm import GemmaRMSNorm as OriGemmaRMSNorm
+from vllm.model_executor.layers.layernorm import RMSNorm
 
 
 def vllm_kunlun_forward_cuda(
     self,
     x: torch.Tensor,
     residual: Optional[torch.Tensor] = None,
 ) -> Union[torch.Tensor, tuple[torch.Tensor, torch.Tensor]]:
     """forward_cuda"""
-    if x.is_contiguous() == False:
+    if not x.is_contiguous():
         # kunlun does not support uncontiguous input and they do not think it is a bug
         # so we must make it contiguous() manually
         x = x.contiguous()
     if self.variance_size_override is not None:
         return self.forward_native(x, residual)
     if residual is not None:
         # residual_output = torch.empty_like(residual)
         torch.ops._C.add_rmsnorm(
             x,
             residual,
             residual_output=residual,
             weight=self.weight.data,
             eps=self.variance_epsilon,
-            output=x
+            output=x,
         )
         return x, residual
     out = torch.empty_like(x)
     torch.ops._C.rmsnorm(
         x,
         self.weight.data,
         out,
         self.variance_epsilon,
     )
     return out
 
 
 RMSNorm.forward_cuda = vllm_kunlun_forward_cuda
 RMSNorm.forward = vllm_kunlun_forward_cuda
 
 
 class KunlunGemmaRMSNorm(OriGemmaRMSNorm):
 
     @staticmethod
     def forward_xpu(
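
The patched forward above delegates to two custom kunlun kernels. As a reference for what they are assumed to compute, here is a pure-PyTorch sketch; the op names come from the diff, but the bodies below only assume the kernels follow vLLM's standard RMSNorm contract and are not the actual kernel source.

import torch


def rmsnorm_ref(x: torch.Tensor, weight: torch.Tensor, eps: float) -> torch.Tensor:
    # y = x * rsqrt(mean(x**2, last dim) + eps) * weight, accumulated in fp32,
    # mirroring vLLM's RMSNorm.forward_native
    orig_dtype = x.dtype
    x = x.float()
    x = x * torch.rsqrt(x.pow(2).mean(dim=-1, keepdim=True) + eps)
    return x.to(orig_dtype) * weight


def add_rmsnorm_ref(
    x: torch.Tensor, residual: torch.Tensor, weight: torch.Tensor, eps: float
) -> tuple[torch.Tensor, torch.Tensor]:
    # fused add + norm: the residual stream becomes x + residual and the norm
    # is taken over that sum; the kunlun op above does this in place, writing
    # the sum back into residual (residual_output=residual) and the normalized
    # result back into x (output=x), so the patch returns (x, residual) with
    # no extra allocation
    residual = x + residual
    return rmsnorm_ref(residual, weight, eps), residual
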
@@ -68,30 +69,42 @@ class KunlunGemmaRMSNorm(OriGemmaRMSNorm):
         x: torch.Tensor,
         residual: Optional[torch.Tensor],
     ) -> Union[torch.Tensor, tuple[torch.Tensor, torch.Tensor]]:
-        if x.is_contiguous() == False:
+        if not x.is_contiguous():
             # kunlun does not support uncontiguous input and they do not think it is a bug
             # so we must make it contiguous() manually
             x = x.contiguous()
+        if x.dim() == 3:
+            x_shape = x.shape
+            x = x.view(-1, x.size(-1))
         if residual is not None:
-            torch.ops._C.add_rmsnorm(
+            out = torch.empty_like(x)
+            out_residual = torch.empty_like(residual)
+            torch.ops._C.gemma_add_rmsnorm(
                 x,
                 residual,
-                residual_output=residual,
-                weight=weight+1,
+                residual_output=out_residual,
+                weight=weight,
                 eps=variance_epsilon,
-                output=x
+                output=out,
             )
-            return x, residual
-        out = torch.empty_like(x)
-        torch.ops._C.rmsnorm(
-            x,
-            weight+1,
-            out,
-            variance_epsilon,
-        )
-        return out
+        else:
+            out = torch.empty_like(x)
+            torch.ops._C.gemma_rmsnorm(
+                x,
+                weight,
+                out,
+                variance_epsilon,
+            )
+        if x.dim() == 3:
+            x = x.view(x_shape)
+            if out is not None:
+                out = out.view(x_shape)
+        if residual is not None:
+            return out, out_residual
+        else:
+            return out
 
     def forward_cuda(
         self,
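
The substantive change in forward_xpu: the old code emulated Gemma's (1 + weight) scaling by passing weight+1 into the generic rmsnorm kernels, whereas the new code calls dedicated gemma_rmsnorm / gemma_add_rmsnorm ops, writes into fresh out / out_residual buffers instead of mutating its inputs, and flattens 3-D inputs for the kernel. A minimal sketch of the assumed semantics, under the assumption that the gemma_* kernels fold the +1 offset in themselves:

import torch


def gemma_rmsnorm_ref(x: torch.Tensor, weight: torch.Tensor, eps: float) -> torch.Tensor:
    # Gemma zero-initializes weight and scales by (1 + weight); passing plain
    # weight to the new gemma_* ops is only correct if they add the 1 internally
    orig_dtype = x.dtype
    x = x.float()
    x = x * torch.rsqrt(x.pow(2).mean(dim=-1, keepdim=True) + eps)
    return (x * (1.0 + weight.float())).to(orig_dtype)


# the x.dim() == 3 branches flatten (batch, seq, hidden) to (tokens, hidden)
# for the kernel and restore the original shape afterwards:
x = torch.randn(2, 8, 64, dtype=torch.float16)
x_shape = x.shape
out = gemma_rmsnorm_ref(x.view(-1, x.size(-1)), torch.zeros(64), 1e-6).view(x_shape)
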
@@ -99,16 +112,17 @@ class KunlunGemmaRMSNorm(OriGemmaRMSNorm):
         residual: Optional[torch.Tensor] = None,
     ) -> Union[torch.Tensor, tuple[torch.Tensor, torch.Tensor]]:
         if torch.compiler.is_compiling():
-            self.forward_static = self.forward_xpu # only use in cudagraph
+            self.forward_static = self.forward_xpu  # only use in cudagraph
             return self.forward_native(x, residual)
         if not getattr(self, "_is_compiled", False):
             self.forward_static = torch.compile(  # type: ignore
-                self.forward_static, backend="aot_eager")
+                self.forward_static, backend="aot_eager"
+            )
             self._is_compiled = True
         return self.forward_native(x, residual)
 
 
 RMSNorm.forward_cuda = vllm_kunlun_forward_cuda
 RMSNorm.forward = vllm_kunlun_forward_cuda
 layernorm.GemmaRMSNorm = KunlunGemmaRMSNorm
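
forward_cuda keeps vLLM's lazy torch.compile dispatch: compile forward_static once on the first eager call, and skip compiling when already inside a trace. A standalone sketch of that compile-once pattern; the class name and the stand-in body are illustrative, not from the diff:

import torch


class LazyCompiledNorm:
    def forward_static(self, x: torch.Tensor) -> torch.Tensor:
        return x * 2.0  # stand-in for the real normalization body

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        if torch.compiler.is_compiling():
            # already being traced (e.g. cudagraph capture): do not start a
            # nested torch.compile, just run the current forward_static
            return self.forward_static(x)
        if not getattr(self, "_is_compiled", False):
            # compile once on the first eager call and remember it via a flag
            self.forward_static = torch.compile(  # type: ignore
                self.forward_static, backend="aot_eager"
            )
            self._is_compiled = True
        return self.forward_static(x)


norm = LazyCompiledNorm()
y = norm.forward(torch.randn(4, 8))  # first call compiles; later calls reuse it

The is_compiling() guard is what lets the patch point forward_static at forward_xpu during cudagraph capture without re-entering the compiler.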