[Main] [Refactor] Enable MoECommMethod in Eager Mode (#2791)

### What this PR does / why we need it? 1. Replace the prepare/finalize operations in fused_moe.py with moe_comm_method.prepare()/finalize(). 2. Replace unified_fused_experts with moe_comm_method.fused_experts() in fused_moe.py/w8a8_dynamic.py/w4a8_dynamic.py. 3. Call _select_moe_comm_method in the spec-decode proposers. 4. w4a8_dynamic does not currently support gatherep, so all2allv is used instead. 5. Remove redundant code. ### Does this PR introduce _any_ user-facing change? The AllgatherEP switch is disabled in aclgraph/eager mode; selection simply follows the rules in modelrunner_v1._select_moe_comm_method(). ### How was this patch tested? e2e & ut - vLLM version: v0.10.2 - vLLM main: 7f6f2c1182 Signed-off-by: Pr0Wh1teGivee <calvin_zhu0210@outlook.com> Co-authored-by: weijinqian0 <12153182+weijinqian0@users.noreply.github.com>
2025-09-16 11:06:00 +08:00
parent 0aba644633
commit 18ca7861f6
18 changed files with 523 additions and 596 deletions
--- a/vllm_ascend/ascend_forward_context.py
+++ b/vllm_ascend/ascend_forward_context.py
@@ -42,17 +42,6 @@ def _get_fused_moe_state(ep_size: int, with_prefill: bool,
        return FusedMoEState.MC2


-def get_dispatcher_name(ep_size: int, with_prefill: bool) -> str:
-    if ep_size == 1:
-        return "TokenDispatcherWithAllGather"
-    elif envs_ascend.VLLM_ENABLE_FUSED_EXPERTS_ALLGATHER_EP and ep_size > 1:
-        return "TokenDispatcherWithAllGather"
-    elif ep_size < 16 or with_prefill:
-        return "TokenDispatcherWithAll2AllV"
-    else:
-        return "TokenDispatcherWithMC2"
-
-
@contextmanager
 def set_ascend_forward_context(
        attn_metadata: Any,
@@ -97,11 +86,6 @@ def set_ascend_forward_context(
        forward_context.fused_moe_state = fused_moe_state
        forward_context.in_profile_run = in_profile_run

-        from vllm_ascend.ops.moe.token_dispatcher import get_token_dispatcher
-        dispatcher_name = get_dispatcher_name(ep_size, with_prefill)
-        dispatcher = get_token_dispatcher(dispatcher_name)
-        forward_context.token_dispatcher = dispatcher
-
        # NOTE: This cannot be set using set_forward_context
        # due to multiple warmups before actual capturing
        forward_context.capturing = False