diff --git a/vllm_ascend/platform.py b/vllm_ascend/platform.py
index e20ab2b9..db47ab21 100644
--- a/vllm_ascend/platform.py
+++ b/vllm_ascend/platform.py
@@ -412,6 +412,21 @@ class NPUPlatform(Platform):
         os.environ["PYTORCH_NPU_ALLOC_CONF"] = npu_alloc_configs
         logger.info("Set PYTORCH_NPU_ALLOC_CONF=%s", npu_alloc_configs)
 
+        # NOTE: vLLM sets `speculative_config.enforce_eager` to True when
+        # using deepseek_v32 with MTP. Since we support graph mode, we simply
+        # ignore it here. However, this fix also implicitly overrides the
+        # user's own `speculative_config.enforce_eager` setting, so remove
+        # this workaround once vLLM supports the feature natively.
+        speculative_config = vllm_config.speculative_config
+        if (
+            model_config
+            and speculative_config
+            and hasattr(model_config.hf_text_config, "model_type")
+            and model_config.hf_text_config.model_type == "deepseek_v32"
+            and speculative_config.enforce_eager
+        ):
+            speculative_config.enforce_eager = False
+
     @classmethod
     def import_kernels(cls) -> None:
         # Directly importing vllm_ascend_C prevents ASCEND_RT_VISIBLE_DEVICES
diff --git a/vllm_ascend/spec_decode/mtp_proposer.py b/vllm_ascend/spec_decode/mtp_proposer.py
index eab2846d..a5ce1e57 100644
--- a/vllm_ascend/spec_decode/mtp_proposer.py
+++ b/vllm_ascend/spec_decode/mtp_proposer.py
@@ -246,7 +246,7 @@ class MtpProposer(EagleProposer):
                 -1]:
             num_input_tokens = self.vllm_config.pad_for_cudagraph(
                 num_scheduled_tokens)
-        elif self.use_aclgraph and num_tokens <= self.runner.cudagraph_batch_sizes[
+        elif self.use_aclgraph and num_tokens <= self.runner.cudagraph_batch_sizes[
                 -1]:
            # Acl graph mode, add padding to the batch size
            num_input_tokens = self.vllm_config.pad_for_cudagraph(num_tokens)
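
For context, the platform.py hunk flips back the flag that vLLM forced on for deepseek_v32 with MTP. Below is a minimal, self-contained sketch of that guard in isolation; the SimpleNamespace objects are hypothetical stand-ins for vLLM's real ModelConfig and SpeculativeConfig, used purely for illustration:

from types import SimpleNamespace

# Hypothetical stand-ins for vLLM's config objects (illustration only).
hf_text_config = SimpleNamespace(model_type="deepseek_v32")
model_config = SimpleNamespace(hf_text_config=hf_text_config)
speculative_config = SimpleNamespace(enforce_eager=True)

# Same guard as the platform.py hunk: the override only fires for
# deepseek_v32 with speculative decoding when enforce_eager is True.
if (
    model_config
    and speculative_config
    and hasattr(model_config.hf_text_config, "model_type")
    and model_config.hf_text_config.model_type == "deepseek_v32"
    and speculative_config.enforce_eager
):
    speculative_config.enforce_eager = False

assert speculative_config.enforce_eager is False

Note the guard only keys on model type and the flag itself, which is why any user-supplied enforce_eager=True is also dropped, as the NOTE in the diff warns.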
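The mtp_proposer.py hunk relies on padding the token count up to the nearest captured graph batch size whenever it fits under the largest one. The sketch below illustrates that round-up idea only; pad_for_graph is a hypothetical stand-in, not vLLM's actual pad_for_cudagraph implementation, and it assumes graph_batch_sizes is sorted ascending:

import bisect

def pad_for_graph(num_tokens: int, graph_batch_sizes: list[int]) -> int:
    # Beyond the largest captured size there is no graph to replay,
    # so return the count unchanged (the caller falls back to eager).
    if num_tokens > graph_batch_sizes[-1]:
        return num_tokens
    # Round up to the smallest captured batch size >= num_tokens.
    return graph_batch_sizes[bisect.bisect_left(graph_batch_sizes, num_tokens)]

assert pad_for_graph(5, [1, 2, 4, 8, 16]) == 8
assert pad_for_graph(16, [1, 2, 4, 8, 16]) == 16
assert pad_for_graph(20, [1, 2, 4, 8, 16]) == 20

Padding to a fixed set of batch sizes is what lets ACL graph mode replay a pre-captured graph instead of recompiling for every batch shape, which is why the `<= cudagraph_batch_sizes[-1]` bound matters in both branches of the diff.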