Update torch-npu version to 2.7.1 (#3896)

### What this PR does / why we need it?
Upgrade torch-npu to the official release version 2.7.1


- vLLM version: v0.11.0
- vLLM main:
83f478bb19

---------

Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
This commit is contained in:
wangxiyuan
2025-10-31 17:16:31 +08:00
committed by GitHub
parent 5f6d1b3323
commit fcc9a0eaeb
15 changed files with 83 additions and 168 deletions

View File

@@ -22,8 +22,6 @@ from vllm.config import get_current_vllm_config
from vllm.forward_context import get_forward_context
from vllm.model_executor.layers.layernorm import GemmaRMSNorm, RMSNorm
from vllm_ascend.utils import version_check
def _addrmsnorm_forward_oot(
self,
@@ -36,7 +34,6 @@ def _addrmsnorm_forward_oot(
from vllm_ascend.utils import is_310p
torch_npu_check = version_check()
if layer is not None and not is_310p():
layer_cls_name = layer.__class__.__name__
try:
@@ -53,23 +50,15 @@ def _addrmsnorm_forward_oot(
start_flag=x,
)
# add_rms_norm_quant
if torch_npu_check:
x, _, residual = torch_npu.npu_add_rms_norm_quant(
x,
residual,
self.weight,
layer.aclnn_input_scale,
layer.aclnn_input_offset,
beta=bias,
epsilon=self.variance_epsilon)
else:
x, _, residual = torch_npu.npu_add_rms_norm_quant(
x,
residual,
self.weight,
layer.aclnn_input_scale,
layer.aclnn_input_offset,
epsilon=self.variance_epsilon)
x, _, residual = torch_npu.npu_add_rms_norm_quant(
x,
residual,
self.weight,
layer.aclnn_input_scale,
layer.aclnn_input_offset,
beta=bias,
epsilon=self.variance_epsilon)
# prefetch qkvo_proj.weight postprocess
if weight_prefetch_method:
weight_prefetch_method.maybe_prefetch_attn_weight_postprocess(
@@ -87,7 +76,7 @@ def _addrmsnorm_forward_oot(
else:
x, _, residual = torch_npu.npu_add_rms_norm(
x, residual, self.weight, self.variance_epsilon)
if torch_npu_check and bias is not None:
if bias is not None:
x.add_(bias)
torch.ops.vllm.maybe_wait_prefetch_done(x)
return x, residual
@@ -106,9 +95,8 @@ class AscendRMSNorm(RMSNorm):
super().__init__(hidden_size, eps, var_hidden_size, has_weight, dtype)
vllm_config = get_current_vllm_config()
self.bias = None
self.torch_npu_check = version_check()
# quantization with anti_method m4 will generate none-zero norm bias
if self.torch_npu_check and vllm_config.quant_config is not None and \
if vllm_config.quant_config is not None and \
any("norm.bias" in name for name in vllm_config.quant_config.quant_description.keys()):
self.bias = torch.nn.Parameter(torch.zeros(hidden_size),
requires_grad=False)
@@ -128,7 +116,7 @@ class AscendRMSNorm(RMSNorm):
return x, residual
x, residual = torch_npu.npu_rms_norm(x, self.weight,
self.variance_epsilon)
if self.torch_npu_check and self.bias is not None:
if self.bias is not None:
x.add_(self.bias)
return x