From 9216e1b0505c7e290d8c02cc64cb8817bfdd49f5 Mon Sep 17 00:00:00 2001 From: xmpp777 Date: Tue, 10 Mar 2026 09:09:31 +0800 Subject: [PATCH] [fix] Add support for Qwen3.5 Dense and MoE on Ascend (#6933) ### What this PR does / why we need it? This pull request introduces support for the Qwen3.5 Dense and MoE models on Ascend devices. The key changes are: * **Quantization Configuration for Qwen3.5 Dense and MoE**: Adds necessary prefix mappings and packed module definitions for `qwen3_5` and `qwen3_5_moe` in `vllm_ascend/quantization/modelslim_config.py` to enable ModelSlim quantization. * **Triton Kernel Fix**: Corrects a bug in the `fused_gdn_gating` Triton kernel. The calculation for `BLK_BATCHES` had an operator precedence issue, which is now resolved. The calculation has also been made more robust with added clamping to prevent potential out-of-bounds memory access in the unified buffer. These changes enable the correct and efficient execution of Qwen3.5 Dense and MoE models on Ascend hardware. ### Does this PR introduce _any_ user-facing change? No. ### How was this patch tested? CI should be used to verify the correctness of these changes. It is recommended to run tests with the Qwen3.5 Dense and MoE models to ensure the new configurations and the kernel fix work as expected. 
Signed-off-by: xmpp777 --- vllm_ascend/ops/triton/fused_gdn_gating.py | 9 +++++--- vllm_ascend/quantization/modelslim_config.py | 23 ++++++++++++++++++++ 2 files changed, 29 insertions(+), 3 deletions(-) diff --git a/vllm_ascend/ops/triton/fused_gdn_gating.py b/vllm_ascend/ops/triton/fused_gdn_gating.py index b3b05706..9c3ea9d1 100644 --- a/vllm_ascend/ops/triton/fused_gdn_gating.py +++ b/vllm_ascend/ops/triton/fused_gdn_gating.py @@ -71,6 +71,8 @@ def fused_gdn_gating_patch( BLK_HEADS = 8 COL_ITER = triton.cdiv(num_heads, BLK_HEADS) + elem_size = a.element_size() + max_ub_batches = int((UNIFIED_BUFFER_SIZE * 0.95) / (BLK_HEADS * elem_size)) if batch <= num_cores: progs = batch BLK_BATCHES = 1 @@ -78,10 +80,11 @@ def fused_gdn_gating_patch( else: progs = num_cores FACTOR = 8 * num_heads - row_per_core = triton.cdiv(batch, num_cores) - BLK_BATCHES = ( - triton.next_power_of_2(triton.cdiv(UNIFIED_BUFFER_SIZE, FACTOR * BLK_HEADS) // a.element_size()) // 2 + calc_blk_batches = ( + triton.next_power_of_2(triton.cdiv(int(UNIFIED_BUFFER_SIZE * 0.95), FACTOR * BLK_HEADS * elem_size)) // 2 ) + BLK_BATCHES = max(1, min(calc_blk_batches, max_ub_batches, 64)) + row_per_core = triton.cdiv(batch, progs) ROW_ITER = triton.cdiv(row_per_core, BLK_BATCHES) g = torch.empty(1, batch, num_heads, dtype=torch.float32, device=a.device) diff --git a/vllm_ascend/quantization/modelslim_config.py b/vllm_ascend/quantization/modelslim_config.py index 3e3b308a..c769bfa0 100644 --- a/vllm_ascend/quantization/modelslim_config.py +++ b/vllm_ascend/quantization/modelslim_config.py @@ -87,6 +87,16 @@ QUANT_MODEL_PREFIX_MAPPINGS: dict[str, dict[str, str]] = { "language_model.lm_head.": "lm_head.", "language_model.model.": "model.language_model.", }, + "qwen3_5": { + "visual.": "model.visual.", + "language_model.lm_head.": "lm_head.", + "language_model.model.": "model.language_model.", + }, + "qwen3_5_moe": { + "visual.": "model.visual.", + "language_model.lm_head.": "lm_head.", + 
"language_model.model.": "model.language_model.", + }, } # key: model_type @@ -104,6 +114,19 @@ packed_modules_model_mapping: dict[str, dict[str, list[str]]] = { ], "experts": ["experts.0.gate_proj", "experts.0.up_proj", "experts.0.down_proj"], }, + "qwen3_5": { + "qkv_proj": ["q_proj", "k_proj", "v_proj"], + "gate_up_proj": ["gate_proj", "up_proj"], + "in_proj_qkvz": ["in_proj_qkv", "in_proj_z"], + "in_proj_ba": ["in_proj_b", "in_proj_a"], + }, + "qwen3_5_moe": { + "qkv_proj": ["q_proj", "k_proj", "v_proj"], + "gate_up_proj": ["gate_proj", "up_proj"], + "in_proj_qkvz": ["in_proj_qkv", "in_proj_z"], + "in_proj_ba": ["in_proj_b", "in_proj_a"], + "experts": ["experts.0.gate_proj", "experts.0.up_proj", "experts.0.down_proj"], + }, "deepseek_v2": { "gate_up_proj": ["gate_proj", "up_proj"], "experts": ["experts.0.gate_proj", "experts.0.up_proj", "experts.0.down_proj"],