[Bugfix] eagle and eagle3 spec decode failures and enable e2e test (#2979)
### What this PR does / why we need it?
- Fix the bug https://github.com/vllm-project/vllm-ascend/issues/2978
- Enable the e2e test,
- Adapt to scenarios where the number of speculative tokens is greater than 2,
- Fix the bug that causes Eagle3 inference failures under high
concurrency and improve the acceptance rate of draft models, via
https://github.com/vllm-project/vllm-ascend/pull/2794
### Does this PR introduce _any_ user-facing change?
### How was this patch tested?
CI passed with new added/existing test.
Co-authored-by: hukongyi
[hukongyi@cmbchina.com](mailto:hukongyi@cmbchina.com)
Co-authored-by: guanyuzhu
[zhuguanyu@huawei.com](mailto:zhuguanyu@huawei.com)
Co-authored-by: liumail680
[liumail680@163.com](mailto:liumail680@163.com)
- vLLM version: v0.10.2
- vLLM main:
f225ea7dd9
---------
Signed-off-by: Icey <1790571317@qq.com>
This commit is contained in:
@@ -1,6 +1,7 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
from __future__ import annotations
|
||||
|
||||
import os
|
||||
import random
|
||||
from typing import Any
|
||||
|
||||
@@ -9,6 +10,8 @@ from vllm import LLM, SamplingParams
|
||||
|
||||
from tests.e2e.conftest import VllmRunner
|
||||
|
||||
os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def test_prompts():
|
||||
@@ -99,7 +102,6 @@ def test_ngram_correctness(
|
||||
assert matches > int(0.7 * len(ref_outputs))
|
||||
|
||||
|
||||
@pytest.mark.skipif(True, reason="oom in CI, fix me")
|
||||
@pytest.mark.parametrize("use_eagle3", [False, True], ids=["eagle", "eagle3"])
|
||||
def test_eagle_correctness(
|
||||
test_prompts: list[list[dict[str, Any]]],
|
||||
@@ -111,8 +113,6 @@ def test_eagle_correctness(
|
||||
Compare the outputs of a original LLM and a speculative LLM
|
||||
should be the same when using eagle speculative decoding.
|
||||
'''
|
||||
if not use_eagle3:
|
||||
pytest.skip("Not current support for the test.")
|
||||
|
||||
ref_llm = LLM(model=model_name, max_model_len=2048, enforce_eager=True)
|
||||
ref_outputs = ref_llm.chat(test_prompts, sampling_config)
|
||||
@@ -121,7 +121,6 @@ def test_eagle_correctness(
|
||||
spec_model_name = eagle3_model_name() if use_eagle3 else eagle_model_name()
|
||||
with VllmRunner(
|
||||
model_name,
|
||||
trust_remote_code=True,
|
||||
enable_chunked_prefill=True,
|
||||
max_num_seqs=1,
|
||||
max_num_batched_tokens=2048,
|
||||
|
||||
Reference in New Issue
Block a user