[eagle3][pcp] Fix bug when eagle3 and CP are both enabled (#7309)

### What this PR does / why we need it?

This PR fixes a bug that occurs when eagle3 and context parallelism (CP) are both enabled. The bug was introduced by the parallel speculative inference PR.

### Does this PR introduce _any_ user-facing change?

No.

### How was this patch tested?

Tests and UT.

- vLLM version: v0.17.0
- vLLM main: 4034c3d32e

---------

Signed-off-by: lilinsiman <lilinsiman@gmail.com>
2026-03-17 16:14:45 +08:00
parent 4e62a2ae15
commit 8f278fc101
2 changed files with 105 additions and 63 deletions
--- a/tests/e2e/multicard/4-cards/long_sequence/test_mtp.py
+++ b/tests/e2e/multicard/4-cards/long_sequence/test_mtp.py
@@ -29,6 +29,10 @@ prompts = [
    "The president of United States is", "AI future is"
 ]
 model = "wemaster/deepseek_mtp_main_random_bf16"
+model_eagle3 = {
+    "main": "Qwen/Qwen3-8B",
+    "spec": "RedHatAI/Qwen3-8B-speculator.eagle3",
+}

@wait_until_npu_memory_free()
 def test_pcp_dcp_mtp1_eager():
@@ -141,3 +145,24 @@ def test_dcp_mtp3_full_graph():
            async_scheduling=False,
    ) as runner:
        runner.generate_greedy(prompts, 32)
+
+
+@wait_until_npu_memory_free()
+def test_pcp_eagle3_eager():
+    with VllmRunner(
+            model_eagle3["main"],
+            max_model_len=1024,
+            tensor_parallel_size=2,
+            enforce_eager=True,
+            prefill_context_parallel_size=2,
+            decode_context_parallel_size=1,
+            max_num_batched_tokens=1024,
+            block_size=128,
+            speculative_config={
+                "num_speculative_tokens": 3,
+                "method": "eagle3",
+                "model": model_eagle3["spec"]
+            },
+            async_scheduling=False,
+    ) as runner:
+        runner.generate_greedy(prompts, 32)