diff --git a/tests/e2e/singlecard/test_async_scheduling.py b/tests/e2e/singlecard/test_async_scheduling.py index 3bfbd0c9..4f4eb05f 100644 --- a/tests/e2e/singlecard/test_async_scheduling.py +++ b/tests/e2e/singlecard/test_async_scheduling.py @@ -13,6 +13,7 @@ from tests.e2e.conftest import VllmRunner from tests.e2e.model_utils import check_outputs_equal MODEL = "Qwen/Qwen3-0.6B" +MTP_MODEL = "wemaster/deepseek_mtp_main_random_bf16" first_prompt = ("The following numbers of the sequence " + ", ".join(str(i) for i in range(10)) + " are:") @@ -44,6 +45,27 @@ def test_without_spec_decoding(monkeypatch: pytest.MonkeyPatch, ): run_tests(monkeypatch, MODEL, test_configs, test_sampling_params) +def test_with_spec_decoding(monkeypatch: pytest.MonkeyPatch): + """Test consistency and acceptance rates with MTP spec decoding across + configs of preemption, executor, async scheduling, and prefill chunking + (only async scheduling is varied in the configs below). + """ + + spec_config = { + "method": "mtp", + "num_speculative_tokens": 2, + } + + # Each config tuple: (test_preemption, executor, async_scheduling, + # spec_config, test_prefill_chunking) + test_configs = [ + (False, "mp", True, spec_config, False), + (False, "mp", False, spec_config, False), + ] + + run_tests(monkeypatch, MTP_MODEL, test_configs, [{}]) + + @dynamo_config.patch(cache_size_limit=16) def run_tests( monkeypatch: pytest.MonkeyPatch,