[bugfix] Enable dispatch_ffn_combine feature for Qwen3.5 (#7066)

### What this PR does / why we need it?
Qwen3.5 MoE now supports enabling the dispatch_ffn_combine fusion operator.

Problem fixed: in the w8a8 quantization scenario, the Qwen3.5 model's
config.json lacks the quantize field. The previous logic strictly required
quant_type == "w8a8_dynamic" before honoring VLLM_ASCEND_ENABLE_FUSED_MC2,
so the dispatch_ffn_combine fusion operator failed to activate even when the
environment variable was set. This change also enables the
dispatch_ffn_combine fusion operator for BF16 scenarios.

- vLLM version: v0.16.0
- vLLM main: 4034c3d32e
---------
Signed-off-by: asunxiao <asunxiao@qq.com>
This commit is contained in:
asunxiao
2026-03-17 19:53:02 +08:00
committed by GitHub
parent 83ad14c74c
commit a370dfa962
2 changed files with 46 additions and 6 deletions


```diff
@@ -245,14 +245,18 @@ def select_moe_comm_method(num_tokens: int, vllm_config: VllmConfig, is_draft_mo
     elif soc_version in {AscendDeviceType.A3}:
         # TODO: drop the EP-size guard when dispatch_ffn_combine supports larger EP sizes
+        # TODO: drop speculative method guard when dispatch_gmm_combine_decode supports w16a16
-        fused_mc2_enable = envs_ascend.VLLM_ASCEND_ENABLE_FUSED_MC2 and quant_type == "w8a8_dynamic"
+        fused_mc2_enable = envs_ascend.VLLM_ASCEND_ENABLE_FUSED_MC2
         dispatch_ffn_combine_enable = get_ep_group().world_size <= 32 and (not is_draft_model)
         if num_tokens <= mc2_tokens_capacity:
-            fused_decode_enable = fused_mc2_enable
+            if envs_ascend.VLLM_ASCEND_ENABLE_FUSED_MC2 == 1:
+                fused_decode_enable = fused_mc2_enable and dispatch_ffn_combine_enable
+            elif envs_ascend.VLLM_ASCEND_ENABLE_FUSED_MC2 == 2:
-            fused_decode_enable = fused_mc2_enable and speculative_enable_dispatch_gmm_combine_decode(vllm_config)
+                fused_decode_enable = (
+                    fused_mc2_enable
+                    and speculative_enable_dispatch_gmm_combine_decode(vllm_config)
+                    and quant_type == "w8a8_dynamic"
+                )
             moe_comm_type = MoECommType.FUSED_MC2 if fused_decode_enable else MoECommType.MC2
         else:
             fused_prefill_enable = fused_mc2_enable
```
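
Based on the two branches in the diff, the environment variable now selects
between the fusion modes; a hedged deployment fragment (the interpretation of
the two values is inferred from this diff, not from separate documentation):

```shell
# Mode 1: enable dispatch_ffn_combine; after this PR it no longer
# requires quant_type == "w8a8_dynamic", so BF16 Qwen3.5 MoE works.
export VLLM_ASCEND_ENABLE_FUSED_MC2=1

# Mode 2: speculative dispatch_gmm_combine_decode path; per the diff this
# still requires w8a8_dynamic quantization.
# export VLLM_ASCEND_ENABLE_FUSED_MC2=2
```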