[Quantization] register AscendQuantRMSNorm for quantization (#2856)

### What this PR does / why we need it? modelslim generates a self.bias for RMSNorm during quantization. Since RMSNorm in vLLM has no such parameter, it is necessary to create an AscendQuantRMSNorm. ### Does this PR introduce _any_ user-facing change? No ### How was this patch tested? Tested with deepseek-v3.1-w8a8 <img width="2496" height="592" alt="image" src="https://github.com/user-attachments/assets/004c6e76-3d7a-4a1f-b59f-a14304012663" /> - vLLM version: main - vLLM main: d6249d0699 Signed-off-by: 22dimensions <waitingwind@foxmail.com>
2025-09-11 23:14:02 +08:00
parent eab3635850
commit f5a97e8fa5
4 changed files with 35 additions and 7 deletions
--- a/vllm_ascend/utils.py
+++ b/vllm_ascend/utils.py
@@ -24,7 +24,7 @@ import os
 from contextlib import contextmanager
 from enum import Enum
 from threading import Lock
-from typing import TYPE_CHECKING, List, Tuple
+from typing import TYPE_CHECKING, List, Optional, Tuple

 import torch
 import torch_npu  # noqa: F401  # noqa: F401
@@ -483,7 +483,7 @@ def get_all_reduce_merge_state(ep_size: int, is_deepseek_v3_r1: bool):
    return False


-def register_ascend_customop():
+def register_ascend_customop(vllm_config: Optional[VllmConfig] = None):
    """Register Ascend CustomOP

    NOTE: if the register branch requires model type, please use `vllm.config.get_current_vllm_config`, 
@@ -497,7 +497,7 @@ def register_ascend_customop():
    from vllm_ascend.models.layers.mla import AscendMultiHeadLatentAttention
    from vllm_ascend.ops.activation import AscendQuickGELU, AscendSiluAndMul
    from vllm_ascend.ops.common_fused_moe import AscendFusedMoE
-    from vllm_ascend.ops.layernorm import AscendRMSNorm
+    from vllm_ascend.ops.layernorm import AscendQuantRMSNorm, AscendRMSNorm
    from vllm_ascend.ops.linear import (AscendColumnParallelLinear,
                                        AscendMergedColumnParallelLinear,
                                        AscendQKVParallelLinear,
@@ -526,6 +526,11 @@ def register_ascend_customop():
        "MultiHeadLatentAttention": AscendMultiHeadLatentAttention,
    }

+    if vllm_config is not None and \
+        vllm_config.quant_config is not None and \
+        any("norm.bias" in name for name in vllm_config.quant_config.quant_description.keys()):
+        REGISTERED_ASCEND_OPS["RMSNorm"] = AscendQuantRMSNorm
+
    for name, op_cls in REGISTERED_ASCEND_OPS.items():
        CustomOp.register_oot(_decorated_op_cls=op_cls, name=name)