From 1a7a34c5ec59f9f5de063a43ee35469356b7ac2e Mon Sep 17 00:00:00 2001
From: Ronald
Date: Wed, 10 Dec 2025 11:30:22 +0800
Subject: [PATCH] add e2e test for mtp async_scheduling (#4826)

### What this PR does / why we need it?
add e2e test for mtp async scheduling

### Does this PR introduce _any_ user-facing change?
no

- vLLM version: v0.12.0
- vLLM main: https://github.com/vllm-project/vllm/commit/ad32e3e19ccf0526cb6744a5fed09a138a5fb2f9

---------

Signed-off-by: Ronald1995
---
 tests/e2e/singlecard/test_async_scheduling.py | 22 +++++++++++++++++++
 1 file changed, 22 insertions(+)

diff --git a/tests/e2e/singlecard/test_async_scheduling.py b/tests/e2e/singlecard/test_async_scheduling.py
index 3bfbd0c9..4f4eb05f 100644
--- a/tests/e2e/singlecard/test_async_scheduling.py
+++ b/tests/e2e/singlecard/test_async_scheduling.py
@@ -13,6 +13,7 @@ from tests.e2e.conftest import VllmRunner
 from tests.e2e.model_utils import check_outputs_equal
 
 MODEL = "Qwen/Qwen3-0.6B"
+MTP_MODEL = "wemaster/deepseek_mtp_main_random_bf16"
 
 first_prompt = ("The following numbers of the sequence " +
                 ", ".join(str(i) for i in range(10)) + " are:")
@@ -44,6 +45,27 @@ def test_without_spec_decoding(monkeypatch: pytest.MonkeyPatch, ):
     run_tests(monkeypatch, MODEL, test_configs, test_sampling_params)
 
 
+def test_with_spec_decoding(monkeypatch: pytest.MonkeyPatch):
+    """Drive `run_tests` on MTP_MODEL using the "mtp" speculative method with
+    2 speculative tokens, over two configs that differ only in their third
+    field (True vs. False; async_scheduling per the layout comment below).
+    """
+
+    spec_config = {
+        "method": "mtp",
+        "num_speculative_tokens": 2,
+    }
+
+    # Tuple fields, in order: test_preemption, executor, async_scheduling,
+    # spec_config, test_prefill_chunking
+    test_configs = [
+        (False, "mp", True, spec_config, False),
+        (False, "mp", False, spec_config, False),
+    ]
+
+    run_tests(monkeypatch, MTP_MODEL, test_configs, [{}])
+
+
 @dynamo_config.patch(cache_size_limit=16)
 def run_tests(
     monkeypatch: pytest.MonkeyPatch,