[Eagle3]enhance skipping dp allreduce and add it into eagle proposer (#6192)

### What this PR does / why we need it?
This PR:
1. Enhances the logic of `_skip_all_reduce_across_dp_group` to skip all
CPU DP all-reduces for dense models. This also serves purpose 2.
2. Adds `_skip_all_reduce_across_dp_group` to the eagle proposer, so
models like Qwen3-235B now support eagle3 speculative decoding. A
typical setup for these MoE models under PD disaggregation often uses
`dp_size > 1`, which requires `set_forward_context` to perform a CPU DP
all-reduce to retrieve `num_tokens_across_dp` in all cases. Skipping
this all-reduce greatly improves performance.
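
To illustrate the idea (this is a hypothetical sketch, not the actual
vllm-ascend code; the function name, signature, and `all_gather` stub are
assumptions): when the skip condition holds, every DP rank can replicate
its local token count instead of paying for a blocking CPU collective.

```python
def sync_num_tokens_across_dp(num_tokens: int,
                              dp_size: int,
                              skip_all_reduce: bool,
                              all_gather=None):
    """Return (max num_tokens, per-rank token counts).

    Hypothetical sketch: `all_gather` stands in for the blocking CPU
    collective that would normally gather each rank's token count.
    """
    if dp_size == 1 or skip_all_reduce:
        # No cross-rank communication: assume the local count applies
        # uniformly across all DP ranks.
        return num_tokens, [num_tokens] * dp_size
    # Otherwise gather every rank's count and pad to the maximum.
    counts = all_gather(num_tokens)
    return max(counts), counts
```

The skipped path avoids any host-side synchronization, which is what makes
the draft-model forward passes cheaper when `dp_size > 1`.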

- vLLM version: v0.14.0
- vLLM main:
d68209402d

---------

Signed-off-by: Angazenn <supperccell@163.com>
This commit is contained in:
Angazenn
2026-01-24 11:29:42 +08:00
committed by GitHub
parent 56d8f088dd
commit 019a2fe6e6
3 changed files with 35 additions and 9 deletions


@@ -382,10 +382,17 @@ class EagleProposer(VllmEagleProposer):
        model_previous_hidden_states = self.hidden_states[:num_tokens]
        batch_size = num_tokens // (self.num_speculative_tokens + 1)
        (
            num_tokens,
            num_tokens_across_dp,
            _,
        ) = self.runner._sync_metadata_across_dp(num_tokens,
                                                 is_draft_model=True)
        with set_ascend_forward_context(
                multi_steps_attn_metadata[0]
                if multi_steps_attn_metadata else None,
                self.vllm_config,
                num_tokens=num_tokens,
                num_tokens_across_dp=num_tokens_across_dp,
                num_actual_tokens=0,
                in_profile_run=is_profile,
                batch_descriptor=batch_descriptor,
@@ -531,10 +538,17 @@ class EagleProposer(VllmEagleProposer):
self.last_token_indices[:last_token_indices_len].copy_(
last_token_indices)
(
num_input_tokens,
num_tokens_across_dp,
_,
) = self.runner._sync_metadata_across_dp(num_input_tokens,
is_draft_model=True)
with set_ascend_forward_context(
multi_steps_attn_metadata[0],
self.vllm_config,
num_tokens=num_input_tokens,
num_tokens_across_dp=num_tokens_across_dp,
num_actual_tokens=num_tokens,
batch_descriptor=batch_descriptor,
aclgraph_runtime_mode=aclgraph_runtime_mode,
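
Both hunks follow the same pattern: sync token metadata across DP with
`is_draft_model=True` so the skip logic applies, then feed the result into
the forward context. A minimal sketch of that call order (the `Runner`
class, its fields, and the meaning of the third return value are
assumptions for illustration, not the real implementation):

```python
class Runner:
    """Hypothetical stand-in for the model runner used by the proposer."""

    def __init__(self, dp_size: int, skip_dp_all_reduce: bool):
        self.dp_size = dp_size
        self.skip_dp_all_reduce = skip_dp_all_reduce

    def _sync_metadata_across_dp(self, num_tokens: int,
                                 is_draft_model: bool = False):
        # Draft-model calls may skip the CPU all-reduce entirely and
        # replicate the local count across all DP ranks.
        if self.dp_size == 1 or (is_draft_model and self.skip_dp_all_reduce):
            return num_tokens, [num_tokens] * self.dp_size, False
        # Placeholder for the blocking CPU collective that would gather
        # each rank's count; here every rank reports the same value.
        counts = [num_tokens] * self.dp_size
        return max(counts), counts, True
```

The proposer then unpacks `(num_tokens, num_tokens_across_dp, _)` exactly as
in the hunks above and passes both values into `set_ascend_forward_context`,
so the draft forward pass never blocks on a host-side collective.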