From f9535cc9e2076973b696684d71675bf0a237c1f9 Mon Sep 17 00:00:00 2001
From: elilzhu <2435754260@qq.com>
Date: Thu, 16 Oct 2025 17:08:00 +0800
Subject: [PATCH] [BugFix] fix qwenVL quant assertion error (#3466)

### What this PR does / why we need it?
This PR fixes the following issues:
1. Fixes the problem where multimodal scenarios cannot perform weight
   prefetching and instead raise an assertion error.
2. Standardizes the grid_thw data type of Qwen2VL to torch.int32.

### Does this PR introduce _any_ user-facing change?
None.

### How was this patch tested?
- ci & e2e

- vLLM version: v0.11.0rc3
- vLLM main: https://github.com/vllm-project/vllm/commit/v0.11.0

---------

Signed-off-by: elilzhu <2435754260@qq.com>
Co-authored-by: zhulei (AK)
---
 vllm_ascend/models/qwen2_vl.py   | 1 +
 vllm_ascend/quantization/w8a8.py | 7 +++++--
 2 files changed, 6 insertions(+), 2 deletions(-)

diff --git a/vllm_ascend/models/qwen2_vl.py b/vllm_ascend/models/qwen2_vl.py
index d9b3e03..b601b16 100644
--- a/vllm_ascend/models/qwen2_vl.py
+++ b/vllm_ascend/models/qwen2_vl.py
@@ -314,6 +314,7 @@ class AscendQwen2VisionTransformer(Qwen2VisionTransformer):
         x: torch.Tensor,
         grid_thw: torch.Tensor,
     ) -> torch.Tensor:
+        grid_thw = torch.tensor(grid_thw, dtype=torch.int32)
         # compute cu_seqlens and avoid cumsum to fit operator unpadFA
         cu_seqlens = torch.repeat_interleave(grid_thw[:, 1] * grid_thw[:, 2],
                                              grid_thw[:,
diff --git a/vllm_ascend/quantization/w8a8.py b/vllm_ascend/quantization/w8a8.py
index 5c7d986..fec542c 100644
--- a/vllm_ascend/quantization/w8a8.py
+++ b/vllm_ascend/quantization/w8a8.py
@@ -99,8 +99,11 @@ class AscendW8A8LinearMethod:
     ) -> torch.Tensor:
         if x.dtype != torch.int8:
             layer_cls_name = layer.__class__.__name__
-            weight_prefetch_method = get_forward_context(
-            ).weight_prefetch_method
+            try:
+                weight_prefetch_method = get_forward_context(
+                ).weight_prefetch_method
+            except AssertionError:
+                weight_prefetch_method = None
             # prefetch qkvo_proj.weight preprocess
             if weight_prefetch_method: