[Quantization] register AscendQuantRMSNorm for quantization (#2856)

### What this PR does / why we need it?

modelslim generates a `self.bias` parameter for RMS norm during quantization; since
RMSNorm in vLLM lacks this parameter, it is necessary
to create an AscendQuantRMSNorm.
### Does this PR introduce _any_ user-facing change?
No

### How was this patch tested?

tested by deepseek-v3.1-w8a8

<img width="2496" height="592" alt="image"
src="https://github.com/user-attachments/assets/004c6e76-3d7a-4a1f-b59f-a14304012663"
/>


- vLLM version: main
- vLLM main:
d6249d0699

Signed-off-by: 22dimensions <waitingwind@foxmail.com>
This commit is contained in:
22dimensions
2025-09-11 23:14:02 +08:00
committed by GitHub
parent eab3635850
commit f5a97e8fa5
4 changed files with 35 additions and 7 deletions

View File

@@ -15,7 +15,7 @@
# This file is a part of the vllm-ascend project.
#
from typing import Optional, Tuple, Union
from typing import Optional, Tuple, Union, cast
import torch
from vllm.model_executor.layers.layernorm import RMSNorm
@@ -89,3 +89,28 @@ class AscendRMSNorm(RMSNorm):
x, residual = torch_npu.npu_rms_norm(x, self.weight,
self.variance_epsilon)
return x
class AscendQuantRMSNorm(AscendRMSNorm):
    """RMSNorm variant for quantized checkpoints that carries a bias term.

    modelslim emits a ``bias`` tensor for RMS norm layers when quantizing a
    model; vLLM's ``RMSNorm`` has no such parameter, so this subclass adds a
    non-trainable bias and applies it after the NPU RMS norm kernel runs.
    """

    def __init__(
        self,
        hidden_size: int,
        eps: float = 1e-6,
        var_hidden_size: Optional[int] = None,
        has_weight: bool = True,
        dtype: Optional[torch.dtype] = None,
    ) -> None:
        """Build the base RMSNorm, then attach the extra bias parameter.

        Args:
            hidden_size: size of the normalized dimension (and of the bias).
            eps: numerical-stability epsilon for the variance term.
            var_hidden_size: optional alternate size for variance computation,
                forwarded to the base class.
            has_weight: whether the layer owns a learnable scale weight.
            dtype: optional parameter dtype, forwarded to the base class.
        """
        super().__init__(hidden_size, eps, var_hidden_size, has_weight, dtype)
        # Zero-initialized, frozen bias; real values come from the
        # quantized checkpoint at weight-loading time.
        self.bias = torch.nn.Parameter(torch.zeros(hidden_size),
                                       requires_grad=False)

    def forward_oot(
        self,
        x: torch.Tensor,
        residual: Optional[torch.Tensor] = None,
    ) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]:
        """Run the base RMS norm, then add the quantization bias in place.

        Returns the biased output tensor, plus the updated residual when one
        was supplied (mirroring the base class's two-arg contract).
        """
        if residual is None:
            normed = super().forward_oot(x)
            return cast(torch.Tensor, normed).add_(self.bias)
        x, residual = super().forward_oot(x, residual)
        return x.add_(self.bias), residual