[Refactor] Adjustments to moe_comm_method selection process (#3001)

### What this PR does / why we need it?
Fix issues mentioned in
https://github.com/vllm-project/vllm-ascend/pull/2791 and some minor
refactoring.
1. Use Enum instead of string.
2. Avoid setting a new property to forward_context in
AscendFusedMoE.forward().
3. Enable TokenDispatcherWithMoge.
4. Remove redundant code.

### Does this PR introduce _any_ user-facing change?
No
### How was this patch tested?

Qwen3-30B-A3B/Qwen3-30B-A3B-W8A8/DeepSeek-V3-W4A8-Pruning/deepseek-mtp/pangu-pro-moe-pruning:
1. Enable/Disable EP
2. Aclgraph & eager


- vLLM version: v0.10.2
- vLLM main:
9607d5eb44

Signed-off-by: Pr0Wh1teGivee <calvin_zhu0210@outlook.com>
Co-authored-by: weijinqian0 <12153182+weijinqian0@users.noreply.github.com>
This commit is contained in:
weichen
2025-09-22 19:12:58 +08:00
committed by GitHub
parent bb1f0d5a62
commit 37a0715eda
14 changed files with 170 additions and 351 deletions

View File

@@ -117,11 +117,11 @@ class EagleProposer(Proposer):
skip_attn: bool = False,
num_reqs: int = 0,
num_tokens_across_dp: Optional[torch.Tensor] = None):
moe_comm_method = self.runner._select_moe_comm_method(
moe_comm_type = self.runner._select_moe_comm_method(
num_tokens, with_prefill)
with set_ascend_forward_context(None,
self.vllm_config,
moe_comm_method=moe_comm_method,
moe_comm_type=moe_comm_type,
num_tokens=num_tokens):
self.model(
input_ids=self.input_ids[:num_tokens],
@@ -454,7 +454,7 @@ class EagleProposer(Proposer):
with_prefill = attn_metadata.attn_state not in [
AscendAttentionState.DecodeOnly, AscendAttentionState.SpecDecoding
]
moe_comm_method = self.runner._select_moe_comm_method(
moe_comm_type = self.runner._select_moe_comm_method(
num_input_tokens, with_prefill)
# copy inputs to buffer for cudagraph
@@ -463,7 +463,7 @@ class EagleProposer(Proposer):
attn_metadata.block_tables = block_table.to(device)
with set_ascend_forward_context(attn_metadata,
self.vllm_config,
moe_comm_method=moe_comm_method,
moe_comm_type=moe_comm_type,
num_tokens=num_input_tokens):
last_hidden_states, hidden_states = self.model(
input_ids=self.input_ids[:num_input_tokens],
@@ -495,7 +495,7 @@ class EagleProposer(Proposer):
else:
input_batch_size = batch_size
moe_comm_method = self.runner._select_moe_comm_method(
moe_comm_type = self.runner._select_moe_comm_method(
input_batch_size, False)
attn_metadata.num_actual_tokens = batch_size
@@ -568,7 +568,7 @@ class EagleProposer(Proposer):
# Run the model.
with set_ascend_forward_context(attn_metadata,
self.vllm_config,
moe_comm_method=moe_comm_method,
moe_comm_type=moe_comm_type,
num_tokens=input_batch_size):
last_hidden_states, hidden_states = self.model(