From 1ab6cd49358e0ea91ae277703f054f1df0dce2e6 Mon Sep 17 00:00:00 2001 From: Zetong Li <48438720+slippersss@users.noreply.github.com> Date: Wed, 21 Jan 2026 09:24:33 +0800 Subject: [PATCH] [Bugfix] Fix setting of `speculative_config.enforce_eager` for dsv32 (#5945) ### What this PR does / why we need it? This PR aims to fix the setting of `speculative_config.enforce_eager` in deepseek v3.2 mtp. The point is that vllm sets `speculative_config.enforce_eager` as True if using deepseek_v32 with mtp. Since we support graph mode, we simply ignore it here. However, this fix will also implicitly ignore the user setting of `speculative_config.enforce_eager`; we need to take care and remove it once vllm supports this feature. ### Does this PR introduce _any_ user-facing change? N/A ### How was this patch tested? by ci - vLLM version: v0.13.0 - vLLM main: https://github.com/vllm-project/vllm/commit/2c24bc6996cb165fce92f780b388a5e39b3f4060 Signed-off-by: Zetong Li --- vllm_ascend/platform.py | 15 +++++++++++++++ vllm_ascend/spec_decode/mtp_proposer.py | 2 +- 2 files changed, 16 insertions(+), 1 deletion(-) diff --git a/vllm_ascend/platform.py b/vllm_ascend/platform.py index e20ab2b9..db47ab21 100644 --- a/vllm_ascend/platform.py +++ b/vllm_ascend/platform.py @@ -412,6 +412,21 @@ class NPUPlatform(Platform): os.environ["PYTORCH_NPU_ALLOC_CONF"] = npu_alloc_configs logger.info("Set PYTORCH_NPU_ALLOC_CONF=%s", npu_alloc_configs) + # NOTE: vllm sets `speculative_config.enforce_eager` as True if using + # deepseek_v32 with mtp. Since we support graph mode, we simply ignore + # it here. However, this fix will also implicitly ignore user setting of + # `speculative_config.enforce_eager`, we need to take care and remove it + # once vllm supports this feature. 
+ speculative_config = vllm_config.speculative_config + if ( + model_config + and speculative_config + and hasattr(model_config.hf_text_config, "model_type") + and model_config.hf_text_config.model_type == "deepseek_v32" + and speculative_config.enforce_eager + ): + speculative_config.enforce_eager = False + @classmethod def import_kernels(cls) -> None: # Directly importing vllm_ascend_C prevents ASCEND_RT_VISIBLE_DEVICES diff --git a/vllm_ascend/spec_decode/mtp_proposer.py b/vllm_ascend/spec_decode/mtp_proposer.py index eab2846d..a5ce1e57 100644 --- a/vllm_ascend/spec_decode/mtp_proposer.py +++ b/vllm_ascend/spec_decode/mtp_proposer.py @@ -246,7 +246,7 @@ class MtpProposer(EagleProposer): -1]: num_input_tokens = self.vllm_config.pad_for_cudagraph( num_scheduled_tokens) - elif self.use_aclgraph and num_tokens <= self.runner.cudagraph_batch_sizes[ + elif self.use_aclgraph and num_tokens <= self.runner.cudagraph_batch_sizes[ -1]: # Acl graph mode, add padding to the batch size num_input_tokens = self.vllm_config.pad_for_cudagraph(num_tokens)