From f2dd5f8d087a293c2c99293d303a8376739c38f4 Mon Sep 17 00:00:00 2001
From: NeverRaR <44917563+NeverRaR@users.noreply.github.com>
Date: Wed, 22 Oct 2025 11:52:27 +0800
Subject: [PATCH] fix : support chunked_prefill with deepseek_mtp (#2711)

### What this PR does / why we need it?
Support chunked_prefill together with deepseek_mtp. With MTP speculative
decoding, each decode request schedules more than one token, so
`num_decode_tokens` no longer equals `num_decodes`. The prefill metadata
builder must therefore slice the per-request tensors (`query_lens`,
`seq_lens`) with the request offset `reqs_start` rather than the token
offset `tokens_start`.

### Does this PR introduce _any_ user-facing change?

### How was this patch tested?
```
vllm serve $MODEL_PATH --quantization ascend \
    --served-model-name auto \
    --trust-remote-code \
    --distributed-executor-backend=mp \
    --port 8006 \
    -tp=8 \
    -dp=2 \
    --no-enforce-eager \
    --max-num-seqs 24 \
    --max-model-len 32768 \
    --max-num-batched-tokens 16384 \
    --block-size 128 \
    --no-enable-prefix-caching \
    --disable-log-requests \
    --speculative-config '{"num_speculative_tokens":1, "method": "deepseek_mtp"}' \
    --additional-config '{"torchair_graph_config":{"enabled":true,"use_cached_graph":true,"graph_batch_sizes":[24],"enable_multistream_mla": true},"ascend_scheduler_config":{"enabled":false},"expert_tensor_parallel_size":16, "chunked_prefill_for_mla":true}' \
    --gpu-memory-utilization 0.95
```

- vLLM version: v0.11.0rc3
- vLLM main: https://github.com/vllm-project/vllm/commit/v0.11.0

Signed-off-by: boying <897013703@qq.com>
---
 vllm_ascend/torchair/torchair_mla.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/vllm_ascend/torchair/torchair_mla.py b/vllm_ascend/torchair/torchair_mla.py
index 57179c9f..98409cb9 100644
--- a/vllm_ascend/torchair/torchair_mla.py
+++ b/vllm_ascend/torchair/torchair_mla.py
@@ -426,8 +426,8 @@ class AscendMLATorchairMetadataBuilder:
         if num_prefills > 0:
             reqs_start = num_decodes  # prefill_start
             tokens_start = num_decode_tokens
-            max_query_len = query_lens[tokens_start:].max().item()
-            max_seq_lens = seq_lens[tokens_start:].max().item()
+            max_query_len = query_lens[reqs_start:].max().item()
+            max_seq_lens = seq_lens[reqs_start:].max().item()
             prefill_query_start_loc = query_start_loc[
                 reqs_start:] - query_start_loc[reqs_start]
@@ -473,9 +473,9 @@ class AscendMLATorchairMetadataBuilder:
                 1).unsqueeze(2)
             prefill_metadata = AscendMLATorchairPrefillMetadata(
                 attn_mask=common_attn_metadata.attn_mask,
-                query_lens=query_lens[tokens_start:].to(torch.int32),
+                query_lens=query_lens[reqs_start:].to(torch.int32),
                 seq_lens=seq_lens,
-                context_lens=seq_lens[tokens_start:],
+                context_lens=seq_lens[reqs_start:],
                 input_positions=prefill_input_positions,
                 block_table=block_table[reqs_start:, ...],
                 max_query_len=max_query_len,
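
---

For reviewers, a minimal sketch of the failure mode being fixed. The shapes and values below are hypothetical and do not use the vllm_ascend API; they only illustrate why a token offset is the wrong index into per-request tensors once MTP speculative decoding makes `num_decode_tokens` larger than `num_decodes`:

```python
import torch

num_decodes = 3            # decode requests in the batch
num_spec_tokens = 1        # deepseek_mtp with num_speculative_tokens=1
num_decode_tokens = num_decodes * (1 + num_spec_tokens)  # 6 token slots
num_prefills = 2           # chunked-prefill requests follow the decodes

# Per-request tensors: one entry per request (decodes first, then prefills).
query_lens = torch.tensor([2, 2, 2, 512, 480])    # tokens scheduled per request
seq_lens = torch.tensor([70, 81, 65, 1024, 960])  # total context per request

reqs_start = num_decodes          # index of the first prefill *request*
tokens_start = num_decode_tokens  # index of the first prefill *token*

# Correct: per-request tensors are sliced with the request index.
print(query_lens[reqs_start:])    # tensor([512, 480]) -> the prefill requests

# Buggy: the token offset (6) runs past the end of the 5-element request
# tensor, producing an empty slice; calling .max() on it, as the old code
# did for max_query_len, would raise at runtime.
print(query_lens[tokens_start:])  # tensor([], dtype=torch.int64)
```

Without speculative tokens, `num_decode_tokens == num_decodes`, so the two offsets coincide and the original slicing happened to work; the mismatch only surfaces once chunked prefill is combined with deepseek_mtp.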