From 71e9b379c84d4d68764fcda4ef833e21884f8341 Mon Sep 17 00:00:00 2001
From: zhangxinyuehfad <59153331+zhangxinyuehfad@users.noreply.github.com>
Date: Tue, 2 Dec 2025 09:20:05 +0800
Subject: [PATCH] [Bugfix] Fix Qwen2.5-Omni-7B accuracy test (#4556)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

### What this PR does / why we need it?
Fix the Qwen2.5-Omni-7B accuracy test.

Issue: https://github.com/vllm-project/vllm-ascend/issues/4480

Both hunks replace direct attribute access (`self.next_need_quant_fusion_linear`,
`forward_context.prefetch_mlp_enabled`) with `getattr(..., default)`, so code
paths that never set these attributes no longer fail.

Depends on: https://github.com/vllm-project/vllm-ascend/pull/4534

- vLLM version: v0.11.2
- vLLM main: https://github.com/vllm-project/vllm/commit/v0.11.2

Signed-off-by: hfadzxy
---
 vllm_ascend/ops/layernorm.py           | 6 +++---
 vllm_ascend/ops/register_custom_ops.py | 8 ++++++--
 2 files changed, 9 insertions(+), 5 deletions(-)

diff --git a/vllm_ascend/ops/layernorm.py b/vllm_ascend/ops/layernorm.py
index da5051c0..8dad11c2 100644
--- a/vllm_ascend/ops/layernorm.py
+++ b/vllm_ascend/ops/layernorm.py
@@ -108,13 +108,13 @@ class AscendRMSNorm(RMSNorm):
         residual: Optional[torch.Tensor] = None,
     ) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]:
         import torch_npu
-
         if residual is not None:
             residual = torch.ops.vllm.maybe_chunk_residual(x, residual)
             assert x.size(0) == residual.size(0)
+            next_need_quant_fusion_linear = getattr(
+                self, 'next_need_quant_fusion_linear', None)
             x, residual = _addrmsnorm_forward_oot(
-                self, x, residual, self.next_need_quant_fusion_linear,
-                self.bias)
+                self, x, residual, next_need_quant_fusion_linear, self.bias)
             return x, residual
         x, residual = torch_npu.npu_rms_norm(x, self.weight,
                                              self.variance_epsilon)
diff --git a/vllm_ascend/ops/register_custom_ops.py b/vllm_ascend/ops/register_custom_ops.py
index 03bea554..7c7fd6f0 100644
--- a/vllm_ascend/ops/register_custom_ops.py
+++ b/vllm_ascend/ops/register_custom_ops.py
@@ -173,7 +173,9 @@ def _maybe_prefetch_mlp_down_proj_impl(x_dependency: torch.Tensor) -> None:
     except AssertionError:
         return
 
-    if not forward_context.prefetch_mlp_enabled:
+    prefetch_mlp_enabled = getattr(forward_context, 'prefetch_mlp_enabled',
+                                   False)
+    if not prefetch_mlp_enabled:
         return
     forward_context.prefetch_mlp_down_proj = True
     model_instance = forward_context.model_instance
@@ -202,7 +204,9 @@ def _maybe_wait_prefetch_done_impl(x: torch.Tensor) -> None:
     except AssertionError:
         return
 
-    if not forward_context.prefetch_mlp_enabled:
+    prefetch_mlp_enabled = getattr(forward_context, 'prefetch_mlp_enabled',
+                                   False)
+    if not prefetch_mlp_enabled:
         return
     if forward_context.prefetch_mlp_gate_up_proj or \
        forward_context.prefetch_mlp_down_proj:
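
### Background: the `getattr` fallback pattern

Both files switch from direct attribute access to `getattr(obj, name, default)`.
The sketch below (plain Python, separate from the patch) shows why: direct access
raises `AttributeError` when a code path never sets the attribute, while the
fallback degrades to an early return. `FakeForwardContext` and `maybe_prefetch`
are hypothetical stand-ins for illustration, not vLLM APIs.

```python
# Hypothetical stand-in for vLLM's forward context; some code paths
# construct contexts without the prefetch fields ever being set.
class FakeForwardContext:
    pass


def maybe_prefetch(ctx: FakeForwardContext) -> str:
    # Before this PR: `ctx.prefetch_mlp_enabled` raises AttributeError
    # if the attribute was never set. After: getattr with a False default
    # turns the missing attribute into an early return instead of a crash.
    if not getattr(ctx, "prefetch_mlp_enabled", False):
        return "skip"
    return "prefetch"


ctx = FakeForwardContext()
assert maybe_prefetch(ctx) == "skip"      # attribute absent: graceful no-op

ctx.prefetch_mlp_enabled = True
assert maybe_prefetch(ctx) == "prefetch"  # attribute set: normal path
```

The same pattern covers `self.next_need_quant_fusion_linear` in
`AscendRMSNorm.forward_oot`, where `None` is the safe default passed on to
`_addrmsnorm_forward_oot`.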