From 305820f1a982ed9597932778891b5da64ecccae9 Mon Sep 17 00:00:00 2001 From: jiangmengyu18 <56633611+jiangmengyu18@users.noreply.github.com> Date: Wed, 18 Mar 2026 20:30:03 +0800 Subject: [PATCH] [Bugfix] fix bug about model type of qwen3_vl_8b_instruct_w8a8 (#7383) ### What this PR does / why we need it? Adapt to the model type of Qwen3-VL-8B-Instruct-W8A8 - vLLM version: v0.17.0 - vLLM main: https://github.com/vllm-project/vllm/commit/4034c3d32e30d01639459edd3ab486f56993876d --------- Signed-off-by: betta18 Co-authored-by: betta18 --- .github/workflows/misc/model_list.json | 1 + .github/workflows/schedule_nightly_test_a2.yaml | 1 + .../models/configs/Qwen3-VL-8B-Instruct-W8A8.yaml | 12 ++++++++++++ vllm_ascend/quantization/modelslim_config.py | 2 +- 4 files changed, 15 insertions(+), 1 deletion(-) create mode 100644 tests/e2e/models/configs/Qwen3-VL-8B-Instruct-W8A8.yaml diff --git a/.github/workflows/misc/model_list.json b/.github/workflows/misc/model_list.json index 1bfd8fce..562c8601 100644 --- a/.github/workflows/misc/model_list.json +++ b/.github/workflows/misc/model_list.json @@ -222,6 +222,7 @@ "vllm-ascend/Qwen3-Next-80B-A3B-Instruct-W8A8-Pruning", "vllm-ascend/Qwen3-Omni-30B-A3B-Thinking", "vllm-ascend/Qwen3-VL-8B-Instruct", + "vllm-ascend/Qwen3-VL-8B-Instruct-W8A8", "vllm-ascend/TinyLlama-1.1B-Chat-v0.3", "vllm-ascend/benchmark", "vllm-ascend/ilama-3.2-1B", diff --git a/.github/workflows/schedule_nightly_test_a2.yaml b/.github/workflows/schedule_nightly_test_a2.yaml index d15b3f23..f79d9496 100644 --- a/.github/workflows/schedule_nightly_test_a2.yaml +++ b/.github/workflows/schedule_nightly_test_a2.yaml @@ -250,6 +250,7 @@ jobs: - name: accuracy-group-1 os: linux-aarch64-a2b3-1 model_list: + - Qwen3-VL-8B-Instruct-W8A8 - Qwen3-8B - Qwen2-Audio-7B-Instruct - Qwen3-8B-W8A8 diff --git a/tests/e2e/models/configs/Qwen3-VL-8B-Instruct-W8A8.yaml b/tests/e2e/models/configs/Qwen3-VL-8B-Instruct-W8A8.yaml new file mode 100644 index 00000000..9535bd33 --- /dev/null +++ b/tests/e2e/models/configs/Qwen3-VL-8B-Instruct-W8A8.yaml @@ -0,0 +1,12 @@ +model_name: "vllm-ascend/Qwen3-VL-8B-Instruct-W8A8" +hardware: "Atlas A2 Series" +model: "vllm-vlm" +tasks: +- name: "mmmu_val" + metrics: + - name: "acc,none" + value: 0.52 +max_model_len: 8192 +batch_size: 32 +gpu_memory_utilization: 0.8 +quantization: ascend \ No newline at end of file diff --git a/vllm_ascend/quantization/modelslim_config.py b/vllm_ascend/quantization/modelslim_config.py index 82b3d279..e5361dd1 100644 --- a/vllm_ascend/quantization/modelslim_config.py +++ b/vllm_ascend/quantization/modelslim_config.py @@ -57,7 +57,7 @@ QUANT_MODEL_PREFIX_MAPPINGS: dict[str, dict[str, str]] = { "language_model.lm_head.": "lm_head.", "language_model.model.": "model.language_model.", }, - "qwen3_vl_text": { + "qwen3_vl": { "visual.": "model.visual.", "language_model.lm_head.": "lm_head.", "language_model.model.": "model.language_model.",