From 08d7014874f4e9d3aa97b5c1ac45d919bd304139 Mon Sep 17 00:00:00 2001 From: wangqiankun13 Date: Thu, 22 Jan 2026 10:51:02 +0800 Subject: [PATCH] [Feature]Enable DispatchGmmCombineDecode when eagle is moe with w8a8 or not moe [RFC: issue 5476] (#5758) ### What this PR does / why we need it? Operator `DispatchGmmCombineDecode` does not support non-W8A8 scenarios and cannot share the same communication domain with Operator `Dispatch`/`Combine`. > For instance, when the draft model uses a non-W8A8 MOE architecture while the main model employs a W8A8 MOE architecture. Therefore, a few days ago I implemented a guard that unconditionally disables Operator `DispatchGmmCombineDecode` whenever the speculative mode is `EAGLE` or `EAGLE-3`. [PR: 5293](https://github.com/vllm-project/vllm-ascend/pull/5293) However, this approach was not precise enough. This PR further refines the logic by specifically identifying the draft model's configuration: Operator `DispatchGmmCombineDecode` will now be disabled only when the draft model uses an MOE architecture and is non-W8A8. For more information about this operator, please refer to the RFC issue: https://github.com/vllm-project/vllm-ascend/issues/5476 ### Does this PR introduce _any_ user-facing change? No ### How was this patch tested? 
Acc test qwen3-235b eplb on a single A3 node(ep16), with dispatch_gmm_combine_decode ```shell nic_name="xxxx" local_ip="xxx.xxx.xxx.xxx" export HCCL_IF_IP=$local_ip export GLOO_SOCKET_IFNAME=$nic_name export TP_SOCKET_IFNAME=$nic_name export HCCL_SOCKET_IFNAME=$nic_name export VLLM_ASCEND_ENABLE_FUSED_MC2=2 echo "VLLM_ASCEND_ENABLE_FUSED_MC2=${VLLM_ASCEND_ENABLE_FUSED_MC2}" export HCCL_OP_EXPANSION_MODE="AIV" export HCCL_BUFFSIZE=512 export PYTORCH_NPU_ALLOC_CONF=expandable_segments:True export OMP_PROC_BIND=false export OMP_NUM_THREADS=10 export LD_PRELOAD=/usr/lib/aarch64-linux-gnu/libjemalloc.so.2:$LD_PRELOAD vllm serve /dataset/Qwen3-235B-A22B-Instruct-2507-w8a8-QuaRot/ \ --served-model-name "qwen" \ --host 0.0.0.0 \ --port 8004 \ --async-scheduling \ --tensor-parallel-size 4 \ --data-parallel-size 4 \ --max-num-seqs 64 \ --max-model-len 40960 \ --max-num-batched-tokens 16384 \ --gpu-memory-utilization 0.9 \ --enable-expert-parallel \ --no-enable-prefix-caching \ --quantization "ascend" \ --trust-remote-code \ --speculative_config \ '{ "method": "eagle3", "model": "/dataset/Qwen3-235B-A22B-Instruct-2507-speculator-eagle3/", "num_speculative_tokens": 2 }' \ --compilation-config '{"cudagraph_mode": "FULL_DECODE_ONLY"}' \ 2>&1 | tee qwen3_235b_eagle3.log ``` | dataset | version | metric | mode | vllm-api-stream-chat | |----- | ----- | ----- | ----- | -----| | aime2024 | 604a78 | accuracy | gen | 80.00 | - vLLM version: v0.13.0 - vLLM main: https://github.com/vllm-project/vllm/commit/2f4e6548efec402b913ffddc8726230d9311948d Signed-off-by: wangqiankun --- vllm_ascend/utils.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/vllm_ascend/utils.py b/vllm_ascend/utils.py index 86e05841..238c5c0f 100644 --- a/vllm_ascend/utils.py +++ b/vllm_ascend/utils.py @@ -869,13 +869,22 @@ def is_drafter_moe_model(vllm_config: VllmConfig): def speculative_enable_dispatch_gmm_combine_decode(vllm_config: VllmConfig) -> bool: + """When draft contains MOE 
Arch and non-w8a8, disable dispatch_gmm_combine_decode.""" if vllm_config.speculative_config is None: return True speculative_method = getattr(vllm_config.speculative_config, "method", None) if speculative_method in [None, "ngram", "suffix"]: return True if speculative_method in ["eagle", "eagle3"]: - return False + if is_drafter_moe_model(vllm_config): + draft_model_config = vllm_config.speculative_config.draft_model_config + hf_text_config = draft_model_config.hf_text_config + quant_type = getattr(hf_text_config, "moe_quantize", None) + if quant_type is None: + quant_type = getattr(hf_text_config, "quantize", None) + return quant_type == "w8a8_dynamic" + else: + return True if speculative_method == "mtp": mtp_quant_type = getattr(vllm_config.model_config.hf_text_config, "mtp_quantize", None) return mtp_quant_type == "w8a8_dynamic"