[Model] Optimizing gemma3 model's GemmaRMSNorm function (#3151)

### What this PR does / why we need it? Before the optimization, the RMSNorm time in one decoding step is 531.5 us. After the optimization, the RMSNorm time in one decoding step is 105 us. I closed the previous PR (https://github.com/vllm-project/vllm-ascend/pull/2456) by mistake and am resubmitting it now. ### Does this PR introduce _any_ user-facing change? No ### How was this patch tested? - vLLM version: v0.10.2 - vLLM main: b1068903fd --------- Signed-off-by: socrahow <suzihao4@h-partners.com>
2025-09-28 21:19:10 +08:00
parent dd56e9306b
commit c3fee66806
2 changed files with 31 additions and 2 deletions
--- a/vllm_ascend/utils.py
+++ b/vllm_ascend/utils.py
@@ -505,7 +505,8 @@ def register_ascend_customop(vllm_config: Optional[VllmConfig] = None):
    from vllm_ascend.ops.activation import AscendQuickGELU, AscendSiluAndMul
    from vllm_ascend.ops.common_fused_moe import (AscendFusedMoE,
                                                  AscendSharedFusedMoE)
-    from vllm_ascend.ops.layernorm import AscendQuantRMSNorm, AscendRMSNorm
+    from vllm_ascend.ops.layernorm import (AscendGemmaRMSNorm,
+                                           AscendQuantRMSNorm, AscendRMSNorm)
    from vllm_ascend.ops.linear import (AscendColumnParallelLinear,
                                        AscendMergedColumnParallelLinear,
                                        AscendQKVParallelLinear,
@@ -530,6 +531,7 @@ def register_ascend_customop(vllm_config: Optional[VllmConfig] = None):
        "ParallelLMHead": AscendParallelLMHead,
        "LogitsProcessor": AscendLogitsProcessor,
        "RMSNorm": AscendRMSNorm,
+        "GemmaRMSNorm": AscendGemmaRMSNorm,
        "FusedMoE": AscendFusedMoE,
        "SharedFusedMoE": AscendSharedFusedMoE,
        "MultiHeadLatentAttention": AscendMultiHeadLatentAttention,