[cherry-pick]fix qwen3vl mrope op (#4484) (#4811)

### What this PR does / why we need it?
Qwen2.5-VL mrope precision problem would been solved once this pr is
merged
### Does this PR introduce _any_ user-facing change? No
### How was this patch tested?
Test on G8600 with textVQA dataset

- vLLM version: v0.11.2
- vLLM main: https://github.com/vllm-project/vllm/commit/v0.11.2

---------

Signed-off-by: hfadzxy <starmoon_zhang@163.com>
Co-authored-by: shaopeng-666 <lishaopeng21@huawei.com>
Co-authored-by: wangxiyuan <wangxiyuan1007@gmail.com>
This commit is contained in:
zhangxinyuehfad
2025-12-09 11:07:32 +08:00
committed by GitHub
parent 9862a23985
commit 033e3557cc

View File

@@ -24,7 +24,6 @@ from vllm.forward_context import get_forward_context
from vllm.model_executor.layers.rotary_embedding import (
DeepseekScalingRotaryEmbedding, MRotaryEmbedding, RotaryEmbedding,
YaRNScalingRotaryEmbedding)
from vllm.platforms import CpuArchEnum
from vllm_ascend.platform import NPUPlatform
from vllm_ascend.utils import enable_custom_op, is_310p
@@ -406,10 +405,7 @@ class AscendMRotaryEmbedding(MRotaryEmbedding):
query: torch.Tensor,
key: torch.Tensor,
):
# TODO: This judgment will be removed once the mrope precision issue is fixed
if self.mrope_section != [
16, 24, 24
] or NPUPlatform.get_cpu_architecture() == CpuArchEnum.X86:
if self.mrope_section != [16, 24, 24]:
return super().forward_oot(positions, query, key)
import torch_npu
@@ -424,7 +420,7 @@ class AscendMRotaryEmbedding(MRotaryEmbedding):
self.cos_sin_cache = self.cos_sin_cache.to( # type: ignore
query.dtype) # type: ignore
query, key = torch_npu.npu_mrope(positions,
query, key = torch_npu.npu_mrope(positions.contiguous(),
query.contiguous(),
key.contiguous(),
self.cos_sin_cache.contiguous(),