[Main] [Refactor] Enable MoECommMethod in Eager Mode (#2791)
### What this PR does / why we need it?
1. Replace the prepare/finalize operations in fused_moe.py with
moe_comm_method.prepare()/finalize().
2. Replace unified_fused_experts with moe_comm_method.fused_experts() in
fused_moe.py/w8a8_dynamic.py/w4a8_dynamic.py.
3. Call _select_moe_comm_method in the spec-decode proposers.
4. Currently, w4a8_dynamic does not support gatherep, so all2allv is used
instead.
5. Remove redundant code.
### Does this PR introduce _any_ user-facing change?
The AllgatherEP switch is disabled in aclgraph/eager mode; just follow the
rules in modelrunner_v1._select_moe_comm_method().
### How was this patch tested?
e2e & ut
- vLLM version: v0.10.2
- vLLM main:
7f6f2c1182
Signed-off-by: Pr0Wh1teGivee <calvin_zhu0210@outlook.com>
Co-authored-by: weijinqian0 <12153182+weijinqian0@users.noreply.github.com>
This commit is contained in:
@@ -42,17 +42,6 @@ def _get_fused_moe_state(ep_size: int, with_prefill: bool,
|
||||
return FusedMoEState.MC2
|
||||
|
||||
|
||||
def get_dispatcher_name(ep_size: int, with_prefill: bool) -> str:
    """Return the name of the token dispatcher matching the given settings.

    The checks run in this order and the first match wins:

    1. ``ep_size == 1`` -> ``"TokenDispatcherWithAllGather"``.
    2. ``envs_ascend.VLLM_ENABLE_FUSED_EXPERTS_ALLGATHER_EP`` is truthy and
       ``ep_size > 1`` -> ``"TokenDispatcherWithAllGather"``.
    3. ``ep_size < 16`` or ``with_prefill`` is truthy ->
       ``"TokenDispatcherWithAll2AllV"``.
    4. Anything else -> ``"TokenDispatcherWithMC2"``.

    Args:
        ep_size: Presumably the expert-parallel group size (TODO confirm
            against callers); only compared against 1 and 16 here.
        with_prefill: Flag that forces the All2AllV dispatcher whenever the
            all-gather conditions above do not apply.

    Returns:
        One of the three dispatcher name strings listed above.
    """
    all_gather = "TokenDispatcherWithAllGather"

    if ep_size == 1:
        return all_gather

    # The env switch is read only after the ep_size == 1 case has returned,
    # and before the ep_size > 1 comparison, as in the original chain.
    use_allgather_ep = (envs_ascend.VLLM_ENABLE_FUSED_EXPERTS_ALLGATHER_EP
                        and ep_size > 1)
    if use_allgather_ep:
        return all_gather

    if with_prefill or ep_size < 16:
        return "TokenDispatcherWithAll2AllV"

    return "TokenDispatcherWithMC2"
|
||||
|
||||
|
||||
@contextmanager
|
||||
def set_ascend_forward_context(
|
||||
attn_metadata: Any,
|
||||
@@ -97,11 +86,6 @@ def set_ascend_forward_context(
|
||||
forward_context.fused_moe_state = fused_moe_state
|
||||
forward_context.in_profile_run = in_profile_run
|
||||
|
||||
from vllm_ascend.ops.moe.token_dispatcher import get_token_dispatcher
|
||||
dispatcher_name = get_dispatcher_name(ep_size, with_prefill)
|
||||
dispatcher = get_token_dispatcher(dispatcher_name)
|
||||
forward_context.token_dispatcher = dispatcher
|
||||
|
||||
# NOTE: This cannot be set using set_forward_context
|
||||
# due to multiple warmups before actual capturing
|
||||
forward_context.capturing = False
|
||||
|
||||
Reference in New Issue
Block a user