From dec04ec8d884a45f1946b72dea129bc686cc2f44 Mon Sep 17 00:00:00 2001
From: wanghuanjun2113
Date: Mon, 9 Mar 2026 16:14:51 +0800
Subject: [PATCH] [Bugfix] Fix incorrect layer count for MTP models in update_aclgraph_sizes (#7064)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

## Summary

- Fix incorrect layer count calculation for MTP (Multi-Token Prediction) models in `update_aclgraph_sizes()` function
- For MTP models, the draft model's layer count is stored in `num_nextn_predict_layers` or `mtp_num_hidden_layers` (for Qwen3.5), not in the standard `num_hidden_layers` field
- Directly accessing `draft.hf_config.num_hidden_layers` returns the main model's layer count instead of the MTP draft model's layer count

## Bug Description

In `vllm_ascend/utils.py`, the `update_aclgraph_sizes()` function calculates `resources_per_graph` for speculative decoding scenarios. When calculating the resources needed for the draft model, the original code directly accessed:

```python
resources_per_graph += draft.hf_config.num_hidden_layers + 1
```

This works correctly for standard draft models, but **fails for MTP models** (like DeepSeek-V3's MTP or Qwen3.5's MTP) because:

1. MTP models store their layer count in model-specific fields:
   - `num_nextn_predict_layers` (DeepSeek-V3 MTP)
   - `mtp_num_hidden_layers` (Qwen3.5 MTP)
2. The `num_hidden_layers` field in these models contains the **main model's** layer count, not the MTP layer count
3. This leads to **grossly overestimating** the `resources_per_graph`, which in turn causes the calculated `max_batch_sizes` to be unnecessarily small

## Fix

Use `draft.get_total_num_hidden_layers()` instead of directly accessing `draft.hf_config.num_hidden_layers`.
This method correctly handles different model types through the `model_arch_config_convertor` infrastructure, returning the appropriate layer count for: - Standard draft models → `num_hidden_layers` - DeepSeek-V3 MTP → `num_nextn_predict_layers` - Qwen3.5 MTP → `mtp_num_hidden_layers` 🤖 Generated with [Claude Code](https://claude.com/claude-code) - vLLM version: v0.16.0 - vLLM main: https://github.com/vllm-project/vllm/commit/4034c3d32e30d01639459edd3ab486f56993876d Signed-off-by: wanghuanjun2113 Co-authored-by: Claude Opus 4.6 --- vllm_ascend/utils.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/vllm_ascend/utils.py b/vllm_ascend/utils.py index aec8decf..b7d1bae1 100644 --- a/vllm_ascend/utils.py +++ b/vllm_ascend/utils.py @@ -485,7 +485,10 @@ def update_aclgraph_sizes(vllm_config: VllmConfig) -> None: resources_per_graph = num_hidden_layers + 1 # For suffix decoding, use the suffix path when no draft_model_config is provided. if (spec := vllm_config.speculative_config) and (draft := spec.draft_model_config): - resources_per_graph += draft.hf_config.num_hidden_layers + 1 + # Use get_total_num_hidden_layers() to correctly handle MTP models, + # which store layer count in num_nextn_predict_layers or + # mtp_num_hidden_layers (for Qwen3.5) instead of num_hidden_layers. + resources_per_graph += draft.get_total_num_hidden_layers() + 1 # TODO: Find out whether we need to take into account the pp_size num_comm_groups = sum(