[v0.18.0][Bugfix] fix ds3.2 dcp mtp (#7681)

### What this PR does / why we need it?
Fixed an issue that occurs when DCP is used together with MTP in the
ds3.2 scenario.

### Does this PR introduce _any_ user-facing change?
No

### How was this patch tested?

cherry-pick from: https://github.com/vllm-project/vllm-ascend/pull/7617

Signed-off-by: weiguihua2 <weiguihua2@huawei.com>
This commit is contained in:
weiguihua2
2026-03-27 14:24:53 +08:00
committed by GitHub
parent 048c8d1afe
commit bc8e87f3db
10 changed files with 18 additions and 7 deletions

View File

@@ -36,7 +36,7 @@ deployment:
--no-enable-prefix-caching
--gpu-memory-utilization 0.85
--trust-remote-code
--speculative-config '{"num_speculative_tokens": 2, "method":"deepseek_mtp"}'
--speculative-config '{"num_speculative_tokens": 3, "method":"deepseek_mtp"}'
--compilation-config '{"cudagraph_capture_sizes": [3, 6, 9, 12, 15, 18, 21, 24, 27, 30, 33, 36, 39, 42, 45, 48], "cudagraph_mode": "FULL_DECODE_ONLY"}'
--additional-config '{"layer_sharding": ["q_b_proj", "o_proj"]}'
--tokenizer-mode deepseek_v32
@@ -62,7 +62,7 @@ deployment:
--no-enable-prefix-caching
--gpu-memory-utilization 0.85
--trust-remote-code
--speculative-config '{"num_speculative_tokens": 2, "method":"deepseek_mtp"}'
--speculative-config '{"num_speculative_tokens": 3, "method":"deepseek_mtp"}'
--compilation-config '{"cudagraph_capture_sizes": [3, 6, 9, 12, 15, 18, 21, 24, 27, 30, 33, 36, 39, 42, 45, 48], "cudagraph_mode": "FULL_DECODE_ONLY"}'
--additional-config '{"layer_sharding": ["q_b_proj", "o_proj"]}'
--tokenizer-mode deepseek_v32

View File

@@ -182,7 +182,7 @@ class TestAscendMLAMetadata(TestBase):
metadata = AscendMLAMetadata(
num_actual_tokens_pcp_padded, num_actual_tokens, slot_mapping,
query_start_loc, seq_lens, block_tables, num_decodes,
query_start_loc, seq_lens, seq_lens, block_tables, num_decodes,
num_decode_tokens, num_prefills, num_input_tokens, query_lens,
head_dim, attn_mask, attn_state, decode, prefill)

View File

@@ -58,6 +58,7 @@ class TestAscendSFAMetadata(TestBase):
num_actual_tokens=num_actual_tokens,
slot_mapping=slot_mapping,
seq_lens=seq_lens,
seq_lens_cpu=seq_lens,
cum_query_lens=cum_query_lens,
block_table=block_table,
sin=sin,

View File

@@ -803,6 +803,7 @@ class TestPCPDCPGraphParams(TestBase):
slot_mapping,
query_start_loc,
seq_lens,
seq_lens,
block_tables,
4,
4,