[BugFix] Disable the dispatch_gmm_combine_decode operator when the MTP drafter model uses non-w8a8 quantization while the main model uses w8a8, or when the drafter model belongs to the EAGLE series (#5293)

…w8a8 while main model uses w8a8 ### What this PR does / why we need it? Disable the dispatch_gmm_combine_decode operator when the MTP drafter model uses non-w8a8 quantization while the main model uses w8a8, or when the drafter model belongs to the EAGLE series. For more information about this operator, please refer to the RFC issue: https://github.com/vllm-project/vllm-ascend/issues/5476 - vLLM version: release/v0.13.0 - vLLM main: ad32e3e19c Signed-off-by: wangqiankun <wangqiankun13@huawei.com>
2026-01-04 17:51:28 +08:00
parent f15dc3fa02
commit 350b95efcf
2 changed files with 23 additions and 2 deletions
--- a/vllm_ascend/utils.py
+++ b/vllm_ascend/utils.py
@@ -831,6 +831,23 @@ def is_moe_model(vllm_config: VllmConfig):
    return _IS_MOE_MODEL


+def speculative_enable_dispatch_gmm_combine_decode(
+        vllm_config: VllmConfig) -> bool:
+    if vllm_config.speculative_config is None:
+        return True
+    speculative_method = getattr(vllm_config.speculative_config, "method",
+                                 None)
+    if speculative_method in [None, "ngram", "suffix"]:
+        return True
+    if speculative_method in ["eagle", "eagle3"]:
+        return False
+    if speculative_method == "mtp":
+        mtp_quant_type = getattr(vllm_config.model_config.hf_config,
+                                 "mtp_quantize", None)
+        return mtp_quant_type == "w8a8_dynamic"
+    return False
+
+
 def _is_contain_expert(config: Any):
    if isinstance(config, dict):
        for k, v in config.items():