From f9535cc9e2076973b696684d71675bf0a237c1f9 Mon Sep 17 00:00:00 2001
From: elilzhu <2435754260@qq.com>
Date: Thu, 16 Oct 2025 17:08:00 +0800
Subject: [PATCH] [BugFix] fix qwenVL quant assertion error (#3466)

### What this PR does / why we need it?
This PR fixes the following issues:
1. Fixes the problem where multimodal scenarios cannot perform weight
   prefetching and instead raise an assertion error.
2. Standardizes the grid_thw data type of Qwen2VL to torch.int32.

### Does this PR introduce _any_ user-facing change?
None.

### How was this patch tested?
- ci & e2e

- vLLM version: v0.11.0rc3
- vLLM main: https://github.com/vllm-project/vllm/commit/v0.11.0

---------

Signed-off-by: elilzhu <2435754260@qq.com>
Co-authored-by: zhulei (AK)
---
 vllm_ascend/models/qwen2_vl.py   | 1 +
 vllm_ascend/quantization/w8a8.py | 7 +++++--
 2 files changed, 6 insertions(+), 2 deletions(-)

diff --git a/vllm_ascend/models/qwen2_vl.py b/vllm_ascend/models/qwen2_vl.py
index d9b3e03..b601b16 100644
--- a/vllm_ascend/models/qwen2_vl.py
+++ b/vllm_ascend/models/qwen2_vl.py
@@ -314,6 +314,7 @@ class AscendQwen2VisionTransformer(Qwen2VisionTransformer):
         x: torch.Tensor,
         grid_thw: torch.Tensor,
     ) -> torch.Tensor:
+        grid_thw = torch.tensor(grid_thw, dtype=torch.int32)
         # compute cu_seqlens and avoid cumsum to fit operator unpadFA
         cu_seqlens = torch.repeat_interleave(grid_thw[:, 1] * grid_thw[:, 2],
                                              grid_thw[:,
diff --git a/vllm_ascend/quantization/w8a8.py b/vllm_ascend/quantization/w8a8.py
index 5c7d986..fec542c 100644
--- a/vllm_ascend/quantization/w8a8.py
+++ b/vllm_ascend/quantization/w8a8.py
@@ -99,8 +99,11 @@ class AscendW8A8LinearMethod:
     ) -> torch.Tensor:
         if x.dtype != torch.int8:
             layer_cls_name = layer.__class__.__name__
-            weight_prefetch_method = get_forward_context(
-            ).weight_prefetch_method
+            try:
+                weight_prefetch_method = get_forward_context(
+                ).weight_prefetch_method
+            except AssertionError:
+                weight_prefetch_method = None
             # prefetch qkvo_proj.weight preprocess
             if weight_prefetch_method: