[main] [refactor] refactor fused_moe.py to enable token_dispatchers (#2570)

### What this PR does / why we need it?
Enable token_dispatcher to replace fused_experts_with_xxx in eager mode.

### Does this PR introduce _any_ user-facing change?
No.

### How was this patch tested?
e2e & ut

- vLLM version: v0.10.1.1
- vLLM main: 704432af3c

Signed-off-by: Pr0Wh1teGivee <calvin_zhu0210@outlook.com>
Co-authored-by: sherie <963372609@qq.com>
Co-authored-by: weijinqian0 <12153182+weijinqian0@users.noreply.github.com>
Co-authored-by: shiyuan680 <72335504+shiyuan680@users.noreply.github.com>
2025-08-28 10:13:35 +08:00
parent 936c102105
commit 320edde2df
10 changed files with 1066 additions and 1639 deletions
--- a/vllm_ascend/ascend_forward_context.py
+++ b/vllm_ascend/ascend_forward_context.py
@@ -46,6 +46,18 @@ def _get_fused_moe_state(ep_size: int, with_prefill: bool,
        return FusedMoEState.MC2


+def get_dispatcher_name(ep_size: int, with_prefill: bool) -> str:
+    """Return the name of the token dispatcher to use for a forward pass.
+
+    Selection rules, evaluated in order:
+
+    * ``ep_size == 1``      -> ``"TokenDispatcherWithAllGather"``
+    * ``ep_size < 16``      -> ``"TokenDispatcherWithAll2AllV"``
+    * ``with_prefill`` true -> ``"TokenDispatcherWithAll2AllV"``
+    * otherwise             -> ``"TokenDispatcherWithMC2"``
+
+    Args:
+        ep_size: Presumably the expert-parallel group size -- TODO confirm
+            against the caller.
+        with_prefill: Presumably whether the current batch contains
+            prefill requests -- TODO confirm against the caller.
+
+    Returns:
+        One of the three dispatcher name strings listed above.
+    """
+    # ep_size of exactly 1 always selects the all-gather dispatcher.
+    if ep_size == 1:
+        return "TokenDispatcherWithAllGather"
+
+    # Any other ep_size below 16 selects All2AllV; with_prefill is not
+    # consulted on this path.
+    if ep_size < 16:
+        return "TokenDispatcherWithAll2AllV"
+
+    # From here on ep_size >= 16: with_prefill decides between All2AllV
+    # and MC2.
+    if with_prefill:
+        return "TokenDispatcherWithAll2AllV"
+    return "TokenDispatcherWithMC2"
+
+
@contextmanager
 def set_ascend_forward_context(
        attn_metadata: Any,
@@ -87,6 +99,14 @@ def set_ascend_forward_context(
        forward_context.fused_moe_state = fused_moe_state
        forward_context.in_profile_run = in_profile_run

+        with_quant = vllm_config.quant_config is not None
+        forward_context.with_quant = with_quant
+        from vllm_ascend.ops.moe_dispatcher.token_dispatcher import \
+            get_token_dispatcher
+        dispatcher_name = get_dispatcher_name(ep_size, with_prefill)
+        dispatcher = get_token_dispatcher(dispatcher_name)
+        forward_context.token_dispatcher = dispatcher
+
        # NOTE: This cannot be set using set_forward_context
        # due to multiple warmups before actual capturing
        forward_context.capturing = False