[EPLB][bugfix] Bugfix for fused mc2 (#6794)
### What this PR does / why we need it?
This pull request fixes a bug in the fused mc2 path used by EPLB (Expert
Parallelism Load Balancing), specifically in how quantization and the MoE
communication method interact. The fused `dispatch_ffn_combine` path is not
yet supported together with dynamic EPLB, so `select_moe_comm_method` now
reads `eplb_config.dynamic_eplb` from the Ascend config and keeps
`dispatch_ffn_combine` disabled while dynamic EPLB is enabled. Fused mc2
itself stays gated on both `VLLM_ASCEND_ENABLE_FUSED_MC2` and the
`w8a8_dynamic` quantization type.
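
For illustration, here is a minimal standalone sketch of the gating after this
fix. The function name `moe_comm_gates` and its plain parameters are stand-ins
for the real `select_moe_comm_method`, which pulls these values from
`envs_ascend`, `get_ep_group()`, and `get_ascend_config()`:

```python
# Standalone sketch only: the real select_moe_comm_method reads these values
# from envs_ascend, get_ep_group() and get_ascend_config(); here they are
# plain parameters so the gating can be read (and tested) in isolation.

def moe_comm_gates(
    num_tokens: int,
    mc2_tokens_capacity: int,
    ep_world_size: int,
    quant_type: str,
    is_draft_model: bool,
    dynamic_eplb: bool,
    fused_mc2_env_flag: bool,
) -> tuple[bool, bool]:
    """Return (dispatch_ffn_combine_enable, fused_decode_enable)."""
    # Fused mc2 requires the env opt-in plus w8a8 dynamic quantization.
    fused_mc2_enable = fused_mc2_env_flag and quant_type == "w8a8_dynamic"
    # dispatch_ffn_combine: EP size <= 32, not a draft model, and (the fix
    # in this PR) dynamic EPLB must be off.
    dispatch_ffn_combine_enable = (
        ep_world_size <= 32 and not is_draft_model and not dynamic_eplb
    )
    # The fused decode path only applies when the batch fits mc2 capacity.
    fused_decode_enable = fused_mc2_enable and num_tokens <= mc2_tokens_capacity
    return dispatch_ffn_combine_enable, fused_decode_enable


# With dynamic EPLB on, dispatch_ffn_combine must come back disabled:
assert moe_comm_gates(128, 512, 16, "w8a8_dynamic", False, True, True) == (False, True)
# With dynamic EPLB off, it is allowed again:
assert moe_comm_gates(128, 512, 16, "w8a8_dynamic", False, False, True) == (True, True)
```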
### Does this PR introduce _any_ user-facing change?
### How was this patch tested?
- vLLM version: v0.15.0
- vLLM main: 83b47f67b1
Signed-off-by: Spicy-Stick <873805887@qq.com>
Signed-off-by: root <root@localhost.localdomain>
```diff
@@ -9,7 +9,6 @@ from vllm.distributed import get_dp_group, get_ep_group, get_tensor_model_parall
 from vllm.forward_context import BatchDescriptor, get_forward_context, set_forward_context

 import vllm_ascend.envs as envs_ascend
 from vllm_ascend.ascend_config import get_ascend_config
 from vllm_ascend.utils import (
     AscendDeviceType,
     enable_sp,
@@ -243,11 +242,10 @@ def select_moe_comm_method(num_tokens: int, vllm_config: VllmConfig, is_draft_mo
         moe_comm_type = MoECommType.ALLGATHER

     elif soc_version in {AscendDeviceType.A3}:
+        dynamic_eplb = get_ascend_config().eplb_config.dynamic_eplb
         # TODO: drop the EP-size guard when dispatch_ffn_combine supports larger EP sizes
         # TODO: drop speculative method guard when dispatch_gmm_combine_decode supports w16a16
         fused_mc2_enable = envs_ascend.VLLM_ASCEND_ENABLE_FUSED_MC2 and quant_type == "w8a8_dynamic"
-        dispatch_ffn_combine_enable = get_ep_group().world_size <= 32 and (not is_draft_model)
+        dispatch_ffn_combine_enable = get_ep_group().world_size <= 32 and (not is_draft_model) and (not dynamic_eplb)
         if num_tokens <= mc2_tokens_capacity:
             fused_decode_enable = fused_mc2_enable
-            if envs_ascend.VLLM_ASCEND_ENABLE_FUSED_MC2 == 1:
```
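
The fused path is opted into via the `VLLM_ASCEND_ENABLE_FUSED_MC2`
environment variable named in the diff; a hedged example (set it in the
environment before the vLLM process imports `vllm_ascend`):

```python
import os

# Opt in to fused mc2; after this fix it only takes effect with
# w8a8_dynamic quantization, and dispatch_ffn_combine additionally
# requires dynamic EPLB to be disabled.
os.environ["VLLM_ASCEND_ENABLE_FUSED_MC2"] = "1"
```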