[Feat] Make full graph mode compatible with MTP (#3276)

### What this PR does / why we need it? Enable Full Graph mode to run with MTP. ### Does this PR introduce _any_ user-facing change? ### How was this patch tested? - vLLM version: v0.11.0rc3 - vLLM main: https://github.com/vllm-project/vllm/commit/v0.11.0 Signed-off-by: anon189Ty <Stari_Falcon@outlook.com>
2025-10-17 20:19:56 +08:00
parent 46e62efd44
commit 248ee7fa11
7 changed files with 103 additions and 44 deletions
--- a/vllm_ascend/worker/worker_v1.py
+++ b/vllm_ascend/worker/worker_v1.py
@@ -361,9 +361,10 @@ class NPUWorker(WorkerBase):

    def execute_dummy_batch(self) -> None:
        force_attention = self.compilation_config.cudagraph_mode == CUDAGraphMode.FULL_DECODE_ONLY
-        self.model_runner._dummy_run(num_tokens=1,
-                                     uniform_decode=True,
-                                     force_attention=force_attention)
+        self.model_runner._dummy_run(
+            num_tokens=self.model_runner.decode_token_per_req,
+            uniform_decode=True,
+            force_attention=force_attention)

    def _init_worker_distributed_environment(self) -> None:
        """Initialize the distributed environment."""