[v0.18.0][Bugfix] fix ds3.2 dcp mtp (#7681)
### What this PR does / why we need it? Fixed the issue where the DCP overlaps the MTP scenario in the ds3.2 scenario. ### Does this PR introduce _any_ user-facing change? No ### How was this patch tested? cherry-pick from: https://github.com/vllm-project/vllm-ascend/pull/7617 Signed-off-by: weiguihua2 <weiguihua2@huawei.com>
This commit is contained in:
@@ -36,7 +36,7 @@ deployment:
|
||||
--no-enable-prefix-caching
|
||||
--gpu-memory-utilization 0.85
|
||||
--trust-remote-code
|
||||
--speculative-config '{"num_speculative_tokens": 2, "method":"deepseek_mtp"}'
|
||||
--speculative-config '{"num_speculative_tokens": 3, "method":"deepseek_mtp"}'
|
||||
--compilation-config '{"cudagraph_capture_sizes": [3, 6, 9, 12, 15, 18, 21, 24, 27, 30, 33, 36, 39, 42, 45, 48], "cudagraph_mode": "FULL_DECODE_ONLY"}'
|
||||
--additional-config '{"layer_sharding": ["q_b_proj", "o_proj"]}'
|
||||
--tokenizer-mode deepseek_v32
|
||||
@@ -62,7 +62,7 @@ deployment:
|
||||
--no-enable-prefix-caching
|
||||
--gpu-memory-utilization 0.85
|
||||
--trust-remote-code
|
||||
--speculative-config '{"num_speculative_tokens": 2, "method":"deepseek_mtp"}'
|
||||
--speculative-config '{"num_speculative_tokens": 3, "method":"deepseek_mtp"}'
|
||||
--compilation-config '{"cudagraph_capture_sizes": [3, 6, 9, 12, 15, 18, 21, 24, 27, 30, 33, 36, 39, 42, 45, 48], "cudagraph_mode": "FULL_DECODE_ONLY"}'
|
||||
--additional-config '{"layer_sharding": ["q_b_proj", "o_proj"]}'
|
||||
--tokenizer-mode deepseek_v32
|
||||
|
||||
@@ -182,7 +182,7 @@ class TestAscendMLAMetadata(TestBase):
|
||||
|
||||
metadata = AscendMLAMetadata(
|
||||
num_actual_tokens_pcp_padded, num_actual_tokens, slot_mapping,
|
||||
query_start_loc, seq_lens, block_tables, num_decodes,
|
||||
query_start_loc, seq_lens, seq_lens, block_tables, num_decodes,
|
||||
num_decode_tokens, num_prefills, num_input_tokens, query_lens,
|
||||
head_dim, attn_mask, attn_state, decode, prefill)
|
||||
|
||||
|
||||
@@ -58,6 +58,7 @@ class TestAscendSFAMetadata(TestBase):
|
||||
num_actual_tokens=num_actual_tokens,
|
||||
slot_mapping=slot_mapping,
|
||||
seq_lens=seq_lens,
|
||||
seq_lens_cpu=seq_lens,
|
||||
cum_query_lens=cum_query_lens,
|
||||
block_table=block_table,
|
||||
sin=sin,
|
||||
|
||||
@@ -803,6 +803,7 @@ class TestPCPDCPGraphParams(TestBase):
|
||||
slot_mapping,
|
||||
query_start_loc,
|
||||
seq_lens,
|
||||
seq_lens,
|
||||
block_tables,
|
||||
4,
|
||||
4,
|
||||
|
||||
Reference in New Issue
Block a user