[Refactor] Adjustments to moe_comm_method selection process (#3001)

### What this PR does / why we need it?
Fix issues mentioned in
https://github.com/vllm-project/vllm-ascend/pull/2791 and some minor
refactoring.
1. Use Enum instead of string.
2. Avoid setting a new property to forward_context in
AscendFusedMoE.forward().
3. Enable TokenDispatcherWithMoge.
4. Remove redundant code.

### Does this PR introduce _any_ user-facing change?
No
### How was this patch tested?

Qwen3-30B-A3B/Qwen3-30B-A3B-W8A8/DeepSeek-V3-W4A8-Pruning/deepseek-mtp/pangu-pro-moe-pruning:
1. Enable/Disable EP
2. Aclgraph & eager


- vLLM version: v0.10.2
- vLLM main:
9607d5eb44

Signed-off-by: Pr0Wh1teGivee <calvin_zhu0210@outlook.com>
Co-authored-by: weijinqian0 <12153182+weijinqian0@users.noreply.github.com>
This commit is contained in:
weichen
2025-09-22 19:12:58 +08:00
committed by GitHub
parent bb1f0d5a62
commit 37a0715eda
14 changed files with 170 additions and 351 deletions

View File

@@ -117,11 +117,11 @@ class EagleProposer(Proposer):
skip_attn: bool = False,
num_reqs: int = 0,
num_tokens_across_dp: Optional[torch.Tensor] = None):
moe_comm_method = self.runner._select_moe_comm_method(
moe_comm_type = self.runner._select_moe_comm_method(
num_tokens, with_prefill)
with set_ascend_forward_context(None,
self.vllm_config,
moe_comm_method=moe_comm_method,
moe_comm_type=moe_comm_type,
num_tokens=num_tokens):
self.model(
input_ids=self.input_ids[:num_tokens],
@@ -454,7 +454,7 @@ class EagleProposer(Proposer):
with_prefill = attn_metadata.attn_state not in [
AscendAttentionState.DecodeOnly, AscendAttentionState.SpecDecoding
]
moe_comm_method = self.runner._select_moe_comm_method(
moe_comm_type = self.runner._select_moe_comm_method(
num_input_tokens, with_prefill)
# copy inputs to buffer for cudagraph
@@ -463,7 +463,7 @@ class EagleProposer(Proposer):
attn_metadata.block_tables = block_table.to(device)
with set_ascend_forward_context(attn_metadata,
self.vllm_config,
moe_comm_method=moe_comm_method,
moe_comm_type=moe_comm_type,
num_tokens=num_input_tokens):
last_hidden_states, hidden_states = self.model(
input_ids=self.input_ids[:num_input_tokens],
@@ -495,7 +495,7 @@ class EagleProposer(Proposer):
else:
input_batch_size = batch_size
moe_comm_method = self.runner._select_moe_comm_method(
moe_comm_type = self.runner._select_moe_comm_method(
input_batch_size, False)
attn_metadata.num_actual_tokens = batch_size
@@ -568,7 +568,7 @@ class EagleProposer(Proposer):
# Run the model.
with set_ascend_forward_context(attn_metadata,
self.vllm_config,
moe_comm_method=moe_comm_method,
moe_comm_type=moe_comm_type,
num_tokens=input_batch_size):
last_hidden_states, hidden_states = self.model(