[bugfix] Use FUSED_MC2 MoE comm path for the op dispatch_ffn_combine (#5156)
### What this PR does / why we need it?
- Renames the MoE comm enum value `MoECommType.FUSED_ALLTOALL` to
`MoECommType.FUSED_MC2` and updates all call sites.
- Updates `select_moe_comm_method` to optionally select `FUSED_MC2` on
Ascend A3 when:
- `enable_expert_parallel=True`
- quantization is `w8a8_dynamic`
- `EP <= 16`
- `dynamic_eplb` is disabled
- `is_mtp_model=False`
- Replaces the old “fused all-to-all” comm implementation with
`FusedMC2CommImpl`, using `TokenDispatcherWithMC2` /
`PrepareAndFinalizeWithMC2` and `dispatch_ffn_combine`.
- vLLM version: v0.12.0
- vLLM main:
ad32e3e19c
---------
Signed-off-by: Chen Chen <0109chenchen@gmail.com>
This commit is contained in:
@@ -430,7 +430,8 @@ class NPUModelRunner(GPUModelRunner):
         # moe_comm_method of each rank is MC2 and recomputation would never happen in D
         # nodes. So here we check whether recompute_scheduler_enable is True.
-        return self.is_kv_consumer and self.ascend_config.recompute_scheduler_enable and select_moe_comm_method(
-            potential_max_num_tokens, self.vllm_config) == MoECommType.MC2
+        return self.is_kv_consumer and self.ascend_config.recompute_scheduler_enable and select_moe_comm_method(
+            potential_max_num_tokens,
+            self.vllm_config) in {MoECommType.MC2, MoECommType.FUSED_MC2}

     def _sync_metadata_across_dp(
         self, num_tokens: int,
@@ -1058,7 +1059,7 @@ class NPUModelRunner(GPUModelRunner):
         # (num_reqs_d + num_reqs_p, max_num_blocks),
         # flattened block_table: [d0, d0, d1, d1, p0, p1, p2]
         # (num_reqs_d * decode_threshold + num_reqs_p, max_num_blocks),
-        ori_query_lens = self.query_start_loc_pcp_full.cpu[1:num_reqs+1] - \
+        ori_query_lens = self.query_start_loc_pcp_full.cpu[1:num_reqs + 1] - \
             self.query_start_loc_pcp_full.cpu[:num_reqs]
         num_prefill_reqs = (ori_query_lens
                             > self.decode_threshold).sum().item()
@@ -2203,7 +2204,7 @@ class NPUModelRunner(GPUModelRunner):
     def profile_run(self) -> None:
         mc2_tokens_capacity = get_mc2_tokens_capacity()
         if self.max_num_tokens > mc2_tokens_capacity and \
-            select_moe_comm_method(mc2_tokens_capacity, self.vllm_config) == MoECommType.MC2:
+            select_moe_comm_method(mc2_tokens_capacity, self.vllm_config) in {MoECommType.MC2, MoECommType.FUSED_MC2}:
             self._dummy_run(mc2_tokens_capacity,
                             with_prefill=True,
                             is_profile=True)
Reference in New Issue
Block a user