[CI/UT][Refactor] move e2e spec decode and deepseek acc test to per pr (#1136)

### What this PR does / why we need it?

1. Run the DeepSeek accuracy UT per PR --- multicard CI time increased by 9 min.
2. Run the spec decode e2e test on V1 per PR --- singlecard CI time increased by 3 min (part of the test is disabled because it does not work yet).
3. ~~Align the output of whether dbo is enabled or not.~~ The generated results with and without dbo cannot be aligned; see https://github.com/vllm-project/vllm-ascend/actions/runs/15822900528/job/44600029405?pr=1136
4. Skip the V0 MTP test due to the failure in https://github.com/vllm-project/vllm-ascend/actions/runs/16012172833/job/45171988816
5. Fix some version conflicts.

### How was this patch tested?

CI passed with the newly added tests.

---------

Signed-off-by: MengqingCao <cmq0113@163.com>
2025-07-04 18:05:45 +08:00
parent 343955c7ac
commit dd22ac38b2
7 changed files with 12 additions and 26 deletions
--- a/tests/e2e/long_term/spec_decode_v1/test_v1_mtp_correctness.py
+++ b/tests/e2e/long_term/spec_decode_v1/test_v1_mtp_correctness.py
@@ -1,92 +0,0 @@
-from __future__ import annotations
-
-import random
-from typing import Any
-
-import pytest
-from vllm import LLM, SamplingParams
-
-
-@pytest.fixture
-def test_prompts():
-    prompt_types = ["repeat", "sentence"]
-    num_prompts = 10
-    prompts = []
-
-    random.seed(0)
-    random_prompt_type_choices = random.choices(prompt_types, k=num_prompts)
-
-    # Generate a mixed batch of prompts, some of which can be easily
-    # predicted by n-gram matching and some which likely cannot.
-    for kind in random_prompt_type_choices:
-        word_choices = ["test", "temp", "hello", "where"]
-        word = random.choice(word_choices)
-        if kind == "repeat":
-            prompt = f"""
-            please repeat the word '{word}' 10 times.
-            give no other output than the word at least ten times in a row,
-            in lowercase with spaces between each word and without quotes.
-            """
-        elif kind == "sentence":
-            prompt = f"""
-            please give a ten-word sentence that
-            uses the word {word} at least once.
-            give no other output than that simple sentence without quotes.
-            """
-        else:
-            raise ValueError(f"Unknown prompt type: {kind}")
-        prompts.append([{"role": "user", "content": prompt}])
-
-    return prompts
-
-
-@pytest.fixture
-def sampling_config():
-    return SamplingParams(temperature=0, max_tokens=256, ignore_eos=False)
-
-
-@pytest.fixture
-def model_name():
-    return "wemaster/deepseek_mtp_main_random_bf16"
-
-
-def test_mtp_correctness(
-    monkeypatch: pytest.MonkeyPatch,
-    test_prompts: list[list[dict[str, Any]]],
-    sampling_config: SamplingParams,
-    model_name: str,
-):
-    '''
-    Compare the outputs of a original LLM and a speculative LLM
-    should be the same when using mtp speculative decoding.
-    '''
-    with monkeypatch.context() as m:
-        m.setenv("VLLM_USE_V1", "1")
-
-        ref_llm = LLM(model=model_name, max_model_len=256, enforce_eager=True)
-        ref_outputs = ref_llm.chat(test_prompts, sampling_config)
-        del ref_llm
-
-        spec_llm = LLM(model=model_name,
-                       trust_remote_code=True,
-                       speculative_config={
-                           "method": "deepseek_mtp",
-                           "num_speculative_tokens": 1,
-                       },
-                       max_model_len=256,
-                       enforce_eager=True)
-        spec_outputs = spec_llm.chat(test_prompts, sampling_config)
-        matches = 0
-        misses = 0
-        for ref_output, spec_output in zip(ref_outputs, spec_outputs):
-            if ref_output.outputs[0].text == spec_output.outputs[0].text:
-                matches += 1
-            else:
-                misses += 1
-                print(f"ref_output: {ref_output.outputs[0].text}")
-                print(f"spec_output: {spec_output.outputs[0].text}")
-
-        # Heuristic: expect at least 66% of the prompts to match exactly
-        # Upon failure, inspect the outputs to check for inaccuracy.
-        assert matches > int(0.66 * len(ref_outputs))
-        del spec_llm
--- a/tests/e2e/long_term/spec_decode_v1/test_v1_spec_decode.py
+++ b/tests/e2e/long_term/spec_decode_v1/test_v1_spec_decode.py
@@ -1,161 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-from __future__ import annotations
-
-import random
-from typing import Any
-
-import pytest
-from vllm import LLM, SamplingParams
-
-
-@pytest.fixture
-def test_prompts():
-    prompt_types = ["repeat", "sentence"]
-    num_prompts = 10
-    prompts = []
-
-    random.seed(0)
-    random_prompt_type_choices = random.choices(prompt_types, k=num_prompts)
-
-    # Generate a mixed batch of prompts, some of which can be easily
-    # predicted by n-gram matching and some which likely cannot.
-    for kind in random_prompt_type_choices:
-        word_choices = ["test", "temp", "hello", "where"]
-        word = random.choice(word_choices)
-        if kind == "repeat":
-            prompt = f"""
-            please repeat the word '{word}' 10 times.
-            give no other output than the word at least ten times in a row,
-            in lowercase with spaces between each word and without quotes.
-            """
-        elif kind == "sentence":
-            prompt = f"""
-            please give a ten-word sentence that
-            uses the word {word} at least once.
-            give no other output than that simple sentence without quotes.
-            """
-        else:
-            raise ValueError(f"Unknown prompt type: {kind}")
-        prompts.append([{"role": "user", "content": prompt}])
-
-    return prompts
-
-
-@pytest.fixture
-def sampling_config():
-    return SamplingParams(temperature=0, max_tokens=10, ignore_eos=False)
-
-
-@pytest.fixture
-def model_name():
-    return "LLM-Research/Meta-Llama-3.1-8B-Instruct"
-
-
-def eagle_model_name():
-    return "vllm-ascend/EAGLE-LLaMA3.1-Instruct-8B"
-
-
-def eagle3_model_name():
-    return "vllm-ascend/EAGLE3-LLaMA3.1-Instruct-8B"
-
-
-def test_ngram_correctness(
-    monkeypatch: pytest.MonkeyPatch,
-    test_prompts: list[list[dict[str, Any]]],
-    sampling_config: SamplingParams,
-    model_name: str,
-):
-    '''
-    Compare the outputs of a original LLM and a speculative LLM
-    should be the same when using ngram speculative decoding.
-    '''
-    pytest.skip("Not current support for the test.")
-    with monkeypatch.context() as m:
-        m.setenv("VLLM_USE_V1", "1")
-
-        ref_llm = LLM(model=model_name, max_model_len=1024, enforce_eager=True)
-        ref_outputs = ref_llm.chat(test_prompts, sampling_config)
-        del ref_llm
-
-        spec_llm = LLM(
-            model=model_name,
-            speculative_config={
-                "method": "ngram",
-                "prompt_lookup_max": 5,
-                "prompt_lookup_min": 3,
-                "num_speculative_tokens": 3,
-            },
-            max_model_len=1024,
-            enforce_eager=True,
-        )
-        spec_outputs = spec_llm.chat(test_prompts, sampling_config)
-        matches = 0
-        misses = 0
-        for ref_output, spec_output in zip(ref_outputs, spec_outputs):
-            if ref_output.outputs[0].text == spec_output.outputs[0].text:
-                matches += 1
-            else:
-                misses += 1
-                print(f"ref_output: {ref_output.outputs[0].text}")
-                print(f"spec_output: {spec_output.outputs[0].text}")
-
-        # Heuristic: expect at least 70% of the prompts to match exactly
-        # Upon failure, inspect the outputs to check for inaccuracy.
-        assert matches > int(0.7 * len(ref_outputs))
-        del spec_llm
-
-
-@pytest.mark.parametrize("use_eagle3", [False, True], ids=["eagle", "eagle3"])
-def test_eagle_correctness(
-    monkeypatch: pytest.MonkeyPatch,
-    test_prompts: list[list[dict[str, Any]]],
-    sampling_config: SamplingParams,
-    model_name: str,
-    use_eagle3: bool,
-):
-    '''
-    Compare the outputs of a original LLM and a speculative LLM
-    should be the same when using eagle speculative decoding.
-    '''
-    if not use_eagle3:
-        pytest.skip("Not current support for the test.")
-    with monkeypatch.context() as m:
-        m.setenv("VLLM_USE_V1", "1")
-
-        ref_llm = LLM(model=model_name, max_model_len=2048, enforce_eager=True)
-        ref_outputs = ref_llm.chat(test_prompts, sampling_config)
-        del ref_llm
-
-        spec_model_name = eagle3_model_name(
-        ) if use_eagle3 else eagle_model_name()
-        spec_llm = LLM(
-            model=model_name,
-            trust_remote_code=True,
-            enable_chunked_prefill=True,
-            max_num_seqs=1,
-            max_num_batched_tokens=2048,
-            gpu_memory_utilization=0.6,
-            speculative_config={
-                "method": "eagle3" if use_eagle3 else "eagle",
-                "model": spec_model_name,
-                "num_speculative_tokens": 2,
-                "max_model_len": 128,
-            },
-            max_model_len=128,
-            enforce_eager=True,
-        )
-        spec_outputs = spec_llm.chat(test_prompts, sampling_config)
-        matches = 0
-        misses = 0
-        for ref_output, spec_output in zip(ref_outputs, spec_outputs):
-            if ref_output.outputs[0].text == spec_output.outputs[0].text:
-                matches += 1
-            else:
-                misses += 1
-                print(f"ref_output: {ref_output.outputs[0].text}")
-                print(f"spec_output: {spec_output.outputs[0].text}")
-
-        # Heuristic: expect at least 66% of the prompts to match exactly
-        # Upon failure, inspect the outputs to check for inaccuracy.
-        assert matches > int(0.66 * len(ref_outputs))
-        del spec_llm
--- a/tests/e2e/long_term/test_deepseek_v2_lite_tp2_accuracy.py
+++ b/tests/e2e/long_term/test_deepseek_v2_lite_tp2_accuracy.py
@@ -1,71 +0,0 @@
-#
-# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
-# Copyright 2023 The vLLM team.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# This file is a part of the vllm-ascend project.
-# Adapted from vllm-project/blob/main/tests/entrypoints/llm/test_accuracy.py
-#
-
-import gc
-import multiprocessing
-from multiprocessing import Queue
-
-import lm_eval
-import pytest
-import torch
-
-# pre-trained model path on Hugging Face.
-MODELS = ["deepseek-ai/DeepSeek-V2-Lite"]
-# Math reasoning benchmark (Grade School Math 8K).
-TASK = "gsm8k"
-# Answer validation requiring format consistency.
-FILTER = "exact_match,strict-match"
-# 3% relative tolerance for numerical accuracy.
-RTOL = 0.03
-# Baseline accuracy after VLLM optimization.
-EXPECTED_VALUE = 0.3843821076573162
-
-
-def run_test(model_name, queue, more_args=None):
-    model_args = f"pretrained={model_name},max_model_len=4096,trust_remote_code=True,tensor_parallel_size=4,enforce_eager=True"
-    if more_args is not None:
-        model_args = f"{model_args},{more_args}"
-    results = lm_eval.simple_evaluate(
-        model="vllm",
-        model_args=model_args,
-        tasks=TASK,
-        batch_size="auto",
-    )
-    result = results["results"][TASK][FILTER]
-    print(100 * "*", "\nThe accuracy test result:", result)
-    queue.put(result)
-    del results
-    torch.npu.empty_cache()
-    gc.collect()
-
-
-@pytest.mark.parametrize("model", MODELS)
-def test_lm_eval_accuracy(model, monkeypatch: pytest.MonkeyPatch):
-    with monkeypatch.context():
-        result_queue: Queue[float] = multiprocessing.Queue()
-        p = multiprocessing.Process(target=run_test,
-                                    args=(
-                                        model,
-                                        result_queue,
-                                    ))
-        p.start()
-        p.join()
-        result = result_queue.get()
-        assert (EXPECTED_VALUE - RTOL < result < EXPECTED_VALUE + RTOL), \
-            f"Expected: {EXPECTED_VALUE}±{RTOL} | Measured: {result}"