[Feat]Make full graph mode compatible with MTP (#3276)

### What this PR does / why we need it? Make Full Graph mode able to run with MTP. ### Does this PR introduce _any_ user-facing change? ### How was this patch tested? - vLLM version: v0.11.0rc3 - vLLM main: https://github.com/vllm-project/vllm/commit/v0.11.0 Signed-off-by: anon189Ty <Stari_Falcon@outlook.com>
2025-10-17 20:19:56 +08:00
parent 46e62efd44
commit 248ee7fa11
7 changed files with 103 additions and 44 deletions
--- a/tests/ut/worker/test_worker_v1.py
+++ b/tests/ut/worker/test_worker_v1.py
@@ -448,6 +448,7 @@ class TestNPUWorker(TestBase):
            worker.compilation_config = MagicMock()
            worker.compilation_config.cudagraph_mode = MagicMock()
            mock_model_runner = MagicMock()
+            mock_decode_token_per_req = mock_model_runner.decode_token_per_req
            worker.model_runner = mock_model_runner

            # Test execute_dummy_batch
@@ -455,7 +456,9 @@ class TestNPUWorker(TestBase):

            # Verify call
            mock_model_runner._dummy_run.assert_called_once_with(
-                num_tokens=1, uniform_decode=True, force_attention=False)
+                num_tokens=mock_decode_token_per_req,
+                uniform_decode=True,
+                force_attention=False)

    @patch("vllm_ascend.worker.worker_v1.envs_vllm")
    @patch("vllm_ascend.worker.worker_v1.logger")