[BugFix]Disable dispatch_gmm_combine_decode operator when mtp drafter model uses non-w8a8 while main model uses w8a8, or drafter model is eagle series (#5293)
…w8a8 while main model uses w8a8
### What this PR does / why we need it?
Disable dispatch_gmm_combine_decode operator when mtp drafter model uses
non-w8a8 while main model uses w8a8, or drafter model is eagle series.
For more information about this operator, please refer to the RFC issue:
https://github.com/vllm-project/vllm-ascend/issues/5476
- vLLM version: release/v0.13.0
- vLLM main: ad32e3e19c
Signed-off-by: wangqiankun <wangqiankun13@huawei.com>
This commit is contained in:
@@ -831,6 +831,23 @@ def is_moe_model(vllm_config: VllmConfig):
|
||||
return _IS_MOE_MODEL
|
||||
|
||||
|
||||
def speculative_enable_dispatch_gmm_combine_decode(
        vllm_config: VllmConfig) -> bool:
    """Report whether the speculative-decoding setup permits dispatch_gmm_combine_decode.

    Decision table, keyed on ``vllm_config.speculative_config.method``:

    * no speculative config at all            -> True
    * method missing/None, "ngram", "suffix"  -> True
    * "eagle", "eagle3"                       -> False
    * "mtp"                                   -> True only when the HF config's
      ``mtp_quantize`` attribute equals "w8a8_dynamic"
    * any other method                        -> False

    Args:
        vllm_config: Configuration object exposing ``speculative_config`` and
            ``model_config.hf_config``.

    Returns:
        True when the operator may stay enabled, False otherwise.
    """
    spec_config = vllm_config.speculative_config
    if spec_config is None:
        return True

    method = getattr(spec_config, "method", None)
    if method in (None, "ngram", "suffix"):
        return True

    # "eagle"/"eagle3" and every unrecognised method fall through to False;
    # only "mtp" needs a closer look at the drafter's quantization type.
    if method != "mtp":
        return False

    # model_config is only touched on the "mtp" path, as in the original.
    drafter_quant = getattr(vllm_config.model_config.hf_config,
                            "mtp_quantize", None)
    return drafter_quant == "w8a8_dynamic"
|
||||
|
||||
|
||||
def _is_contain_expert(config: Any):
|
||||
if isinstance(config, dict):
|
||||
for k, v in config.items():
|
||||
|
||||
Reference in New Issue
Block a user