[Bugfix] Fix Qwen2.5-Omni-7B accuracy test (#4556)

### What this PR does / why we need it?
Fix the Qwen2.5-Omni-7B accuracy test. The model's forward path does not set `next_need_quant_fusion_linear` on `AscendRMSNorm` or `prefetch_mlp_enabled` on the forward context, so direct attribute access fails; this PR switches those reads to `getattr` with a safe default.
Issue: https://github.com/vllm-project/vllm-ascend/issues/4480
Depends on: https://github.com/vllm-project/vllm-ascend/pull/4534

- vLLM version: v0.11.2
- vLLM main: https://github.com/vllm-project/vllm/commit/v0.11.2

Signed-off-by: hfadzxy <starmoon_zhang@163.com>
Author: zhangxinyuehfad
Committed: 2025-12-02 09:20:05 +08:00 (by GitHub)
Parent: b4bf01ead1
Commit: 71e9b379c8
2 changed files with 9 additions and 5 deletions

```diff
@@ -108,13 +108,13 @@ class AscendRMSNorm(RMSNorm):
         residual: Optional[torch.Tensor] = None,
     ) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]:
         import torch_npu
         if residual is not None:
             residual = torch.ops.vllm.maybe_chunk_residual(x, residual)
             assert x.size(0) == residual.size(0)
+            next_need_quant_fusion_linear = getattr(
+                self, 'next_need_quant_fusion_linear', None)
             x, residual = _addrmsnorm_forward_oot(
-                self, x, residual, self.next_need_quant_fusion_linear,
-                self.bias)
+                self, x, residual, next_need_quant_fusion_linear, self.bias)
             return x, residual
         x, residual = torch_npu.npu_rms_norm(x, self.weight,
                                              self.variance_epsilon)
```
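
The core of the fix is the defensive-attribute pattern above. A minimal, runnable sketch (using a toy `Layer` class, not the real vllm-ascend `AscendRMSNorm`) of why `getattr` with a default is needed when an attribute is only attached on some code paths:

```python
class Layer:
    """Toy stand-in: quant fusion setup would normally attach
    next_need_quant_fusion_linear, but some model paths skip that setup."""
    pass


layer = Layer()

# Direct access raises AttributeError when the attribute was never set:
try:
    _ = layer.next_need_quant_fusion_linear
except AttributeError:
    print("direct access raises AttributeError")

# getattr with a None default degrades gracefully to the non-fused path:
next_linear = getattr(layer, 'next_need_quant_fusion_linear', None)
if next_linear is None:
    print("no fused quant linear follows; take the plain add+norm path")
```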

```diff
@@ -173,7 +173,9 @@ def _maybe_prefetch_mlp_down_proj_impl(x_dependency: torch.Tensor) -> None:
     except AssertionError:
         return
-    if not forward_context.prefetch_mlp_enabled:
+    prefetch_mlp_enabled = getattr(forward_context, 'prefetch_mlp_enabled',
+                                   False)
+    if not prefetch_mlp_enabled:
         return
     forward_context.prefetch_mlp_down_proj = True
     model_instance = forward_context.model_instance
@@ -202,7 +204,9 @@ def _maybe_wait_prefetch_done_impl(x: torch.Tensor) -> None:
     except AssertionError:
         return
-    if not forward_context.prefetch_mlp_enabled:
+    prefetch_mlp_enabled = getattr(forward_context, 'prefetch_mlp_enabled',
+                                   False)
+    if not prefetch_mlp_enabled:
         return
     if forward_context.prefetch_mlp_gate_up_proj or \
             forward_context.prefetch_mlp_down_proj:
```
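
The two prefetch hunks apply the same guard to the forward context. A minimal sketch (with `types.SimpleNamespace` as a stand-in for the real forward context object) of how `getattr(..., False)` makes a missing flag behave like a disabled one:

```python
from types import SimpleNamespace


def maybe_prefetch(forward_context) -> None:
    # Treat "attribute missing" the same as "prefetch disabled".
    prefetch_mlp_enabled = getattr(forward_context, 'prefetch_mlp_enabled',
                                   False)
    if not prefetch_mlp_enabled:
        return
    forward_context.prefetch_mlp_down_proj = True


ctx_without_flag = SimpleNamespace()  # e.g. a path that never set the flag
maybe_prefetch(ctx_without_flag)      # no AttributeError; silently a no-op

ctx_with_flag = SimpleNamespace(prefetch_mlp_enabled=True)
maybe_prefetch(ctx_with_flag)
print(ctx_with_flag.prefetch_mlp_down_proj)  # True
```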