[main] Use AddRmsNormQuant ops in the custom model to optimize Qwen3's performance (#1806)

### What this PR does / why we need it?
Optimizes the performance of the quantized Qwen3 model by registering
a custom model and adding the AddRmsNormQuant operation. Subsequent PRs
will focus on performance optimizations built on this custom model.

### Does this PR introduce _any_ user-facing change?
No.

### How was this patch tested?
CI passed with the existing tests.

- vLLM version: v0.9.2
- vLLM main: 8d0a01a5f2

Signed-off-by: rjg-lyh <1318825571@qq.com>
This commit is contained in:
rjg-lyh
2025-07-22 19:03:13 +08:00
committed by GitHub
parent ce4970eee0
commit 9a3bdf2162
5 changed files with 227 additions and 8 deletions

View File

@@ -23,6 +23,43 @@ from vllm.model_executor.layers.layernorm import RMSNorm
from vllm_ascend.utils import is_310p
class AddRMSNormW8A8Quant(RMSNorm):
    """RMSNorm variant that fuses AddRmsNorm and W8A8 quantization ops.

    The instance keeps a reference to another module (``layer``) and reads
    that module's ``aclnn_input_scale`` and ``aclnn_input_offset`` attributes
    when the fused kernel is invoked.
    """

    def __init__(
        self,
        hidden_size: int,
        layer: torch.nn.Module,
        eps: float = 1e-6,
        var_hidden_size: Optional[int] = None,
        has_weight: bool = True,
        dtype: Optional[torch.dtype] = None,
    ) -> None:
        """Initialise the base RMSNorm and remember the associated layer.

        Args:
            hidden_size: Forwarded to ``RMSNorm.__init__``.
            layer: Module whose ``aclnn_input_scale`` and
                ``aclnn_input_offset`` are read in ``forward``.
                NOTE(review): presumably the quantized linear layer that
                consumes this norm's output; confirm against the caller.
            eps: Forwarded to ``RMSNorm.__init__``.
            var_hidden_size: Forwarded to ``RMSNorm.__init__``.
            has_weight: Forwarded to ``RMSNorm.__init__``.
            dtype: Forwarded to ``RMSNorm.__init__``.
        """
        super().__init__(hidden_size, eps, var_hidden_size, has_weight, dtype)
        self.layer = layer

    def forward(
        self,
        x: torch.Tensor,
        residual: Optional[torch.Tensor] = None,
    ) -> Union[torch.Tensor, tuple[torch.Tensor, torch.Tensor]]:
        """Run the norm, using the fused add+norm+quant op when possible.

        Args:
            x: Input tensor.
            residual: Optional residual tensor. When given, the fused
                ``npu_add_rms_norm_quant`` op is used.

        Returns:
            ``(out, residual_out)`` taken from the first and third outputs of
            ``npu_add_rms_norm_quant`` when ``residual`` is provided;
            otherwise only the first output of ``npu_rms_norm``.
        """
        # Imported lazily, as in the original, so the module can be loaded
        # without touching torch_npu until the layer actually runs.
        import torch_npu

        if residual is None:
            # Plain RMSNorm path: the second output is not needed.
            normed, _ = torch_npu.npu_rms_norm(
                x,
                self.weight,
                self.variance_epsilon,
            )
            return normed

        # Fused path: the middle output is discarded.
        # NOTE(review): outputs are presumably (quantized x, <unused>,
        # updated residual); confirm against the torch_npu documentation.
        fused_out, _, residual_out = torch_npu.npu_add_rms_norm_quant(
            x,
            residual,
            self.weight,
            self.layer.aclnn_input_scale,
            self.layer.aclnn_input_offset,
            epsilon=self.variance_epsilon,
        )
        return fused_out, residual_out
def forward_oot(
self,
x: torch.Tensor,