From 06ec136f087abf3f4d9e114ba16611f7d8753ee7 Mon Sep 17 00:00:00 2001 From: Zetong Li <48438720+slippersss@users.noreply.github.com> Date: Mon, 9 Mar 2026 11:05:01 +0800 Subject: [PATCH] [Bugfix] Obtain kernel block size for computing slot mapping correctly (#7019) ### What this PR does / why we need it? This PR aims to fix incorrect slot mapping in qwen35 due to mismatched block size. In qwen35, we should use `kernel_block_size` so that we can compute it in a correct way, and it is obtained in `load_model` when we have a chance to grab `draft_attn_layers`. - vLLM version: v0.16.0 - vLLM main: https://github.com/vllm-project/vllm/commit/15d76f74e2fdb12a95ea00f0ca283acf6219a2b7 Signed-off-by: Zetong Li --- vllm_ascend/spec_decode/eagle_proposer.py | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/vllm_ascend/spec_decode/eagle_proposer.py b/vllm_ascend/spec_decode/eagle_proposer.py index 3086ae30..9a498a28 100644 --- a/vllm_ascend/spec_decode/eagle_proposer.py +++ b/vllm_ascend/spec_decode/eagle_proposer.py @@ -161,13 +161,19 @@ class AscendEagleProposer(EagleProposer): ) indexer_layers = get_layers_from_vllm_config(self.vllm_config, DeepseekV32IndexerCache).keys() - draft_attn_layer = get_layers_from_vllm_config(self.vllm_config, AttentionLayerBase).keys() + draft_attn_layers_dict = get_layers_from_vllm_config(self.vllm_config, AttentionLayerBase) + draft_attn_layers = draft_attn_layers_dict.keys() - draft_attn_layer_names = draft_attn_layer - target_attn_layer_names + draft_attn_layer_names = draft_attn_layers - target_attn_layer_names draft_indexer_layer_names = indexer_layers - target_indexer_layer_names draft_attn_layer_names = draft_attn_layer_names - draft_indexer_layer_names assert len(draft_attn_layer_names) == 1 self.attn_layer_names = list(sorted(draft_attn_layer_names)) + + self.kernel_block_size = ( + draft_attn_layers_dict[self.attn_layer_names[0]].get_attn_backend().get_supported_kernel_block_sizes()[0] + ) + 
self.piece_all_attn_layer_name = [] for _ in range(self.num_speculative_tokens): self.piece_all_attn_layer_name.append([name for name in self.attn_layer_names]) @@ -978,7 +984,10 @@ class AscendEagleProposer(EagleProposer): slot_mapping = mtp_slot_mapping[slot_indices] common_attn_metadata.slot_mapping[: batch_size * self.pcp_size] = slot_mapping else: - block_size = attn_metadata_builder.kv_cache_spec.block_size + # NOTE: In vllm, `block_size = attn_metadata_builder.kv_cache_spec.block_size`. + # However, in vllm-ascend, the above value can be a multiple of `kernel_block_size`, + # which is not correct for computing `slot_mapping` below. + block_size = self.kernel_block_size # Compute the slot mapping. if self.uses_mrope: