[Ops] Add layernorm for qwen3Next (#5765)

### What this PR does / why we need it?
Add a `layernormFn` Triton op for the Qwen3Next model to improve performance.

<img width="248" height="526" alt="image"
src="https://github.com/user-attachments/assets/27b47157-5df5-4db1-aa88-1dae799b2bf6"
/>
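
For context, the new op registers a gated RMSNorm (`RMSNormGated` / `AscendRMSNormGated` in the diff below). A minimal eager-mode sketch of what a gated RMSNorm computes is shown here; the function name, gating order, and dtype handling are illustrative assumptions, not the actual fused Ascend/Triton kernel.

```python
import torch
import torch.nn.functional as F

def rms_norm_gated_ref(x: torch.Tensor,
                       gate: torch.Tensor,
                       weight: torch.Tensor,
                       eps: float = 1e-6) -> torch.Tensor:
    # Modulate the input with a SiLU-activated gate, then RMS-normalize over
    # the hidden dimension and apply the learned scale.
    # NOTE: the gating order (gate before vs. after the norm) is an assumption
    # here; the fused kernel in this PR may order these steps differently.
    x = x * F.silu(gate)
    variance = x.float().pow(2).mean(dim=-1, keepdim=True)
    x = x.float() * torch.rsqrt(variance + eps)
    return (x * weight.float()).to(gate.dtype)

# Example usage (shapes are illustrative):
# out = rms_norm_gated_ref(torch.randn(2, 8, 64), torch.randn(2, 8, 64), torch.ones(64))
```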

### Does this PR introduce _any_ user-facing change?

No

### How was this patch tested?

- vLLM version: v0.13.0
- vLLM main: 2f4e6548ef

---------

Signed-off-by: SunnyLee219 <3294305115@qq.com>
Author: LeeWenquan (committed 2026-01-20 14:43:14 +08:00 via GitHub)
Commit: 55b20ac63b (parent 0664c6e67a)
4 changed files with 254 additions and 4 deletions


@@ -669,7 +669,7 @@ def register_ascend_customop(vllm_config: VllmConfig | None = None):
     from vllm_ascend.ops.activation import AscendQuickGELU, AscendSiluAndMul
     from vllm_ascend.ops.fused_moe.fused_moe import AscendFusedMoE, AscendSharedFusedMoE
-    from vllm_ascend.ops.layernorm import AscendGemmaRMSNorm, AscendRMSNorm
+    from vllm_ascend.ops.layernorm import AscendGemmaRMSNorm, AscendRMSNorm, AscendRMSNormGated
     from vllm_ascend.ops.linear import (
         AscendColumnParallelLinear,
         AscendMergedColumnParallelLinear,
@@ -715,6 +715,7 @@ def register_ascend_customop(vllm_config: VllmConfig | None = None):
"MultiHeadLatentAttentionWrapper": AscendMultiHeadLatentAttention,
"MMEncoderAttention": AscendMMEncoderAttention,
"ApplyRotaryEmb": AscendApplyRotaryEmb,
"RMSNormGated": AscendRMSNormGated,
}
# 310P: override selected ops with 310P implementations (keep minimal changes outside _310p)
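
For context on the hunk above: `register_ascend_customop` maps stock vLLM op class names to Ascend implementations, so adding the `"RMSNormGated"` entry makes vLLM pick up `AscendRMSNormGated` wherever a model defines an `RMSNormGated` layer. A minimal sketch of this name-keyed override pattern, with hypothetical helper names rather than the actual vllm-ascend code:

```python
from typing import Dict, Type

# Hypothetical illustration of a name-keyed op override table; the real
# register_ascend_customop wires these entries into vLLM's custom-op machinery.
_OVERRIDES: Dict[str, Type] = {}

def register_override(op_name: str, impl: Type) -> None:
    # Record the Ascend implementation under the stock op's class name.
    _OVERRIDES[op_name] = impl

def resolve(op_name: str, default: Type) -> Type:
    # Use the Ascend override when one was registered, otherwise fall back.
    return _OVERRIDES.get(op_name, default)
```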