xc-llm-ascend/tests/e2e/multicard/test_deepseek_v2_lite_tp2_accuracy.py

#
# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
# Copyright 2023 The vLLM team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# This file is a part of the vllm-ascend project.
# Adapted from vllm-project/blob/main/tests/entrypoints/llm/test_accuracy.py
#

import gc
import multiprocessing
from multiprocessing import Queue

import lm_eval
import pytest
import torch

# pre-trained model path on Hugging Face.
MODELS = ["deepseek-ai/DeepSeek-V2-Lite"]
# Math reasoning benchmark (Grade School Math 8K).
TASK = "gsm8k"
# Answer validation requiring format consistency.
FILTER = "exact_match,strict-match"
# 3% relative tolerance for numerical accuracy.
RTOL = 0.03
# Baseline accuracy after VLLM optimization.
EXPECTED_VALUE = 0.3843821076573162


def run_test(model_name, queue, more_args=None):
    model_args = f"pretrained={model_name},max_model_len=4096,trust_remote_code=True,tensor_parallel_size=4,enforce_eager=True"
    if more_args is not None:
        model_args = f"{model_args},{more_args}"
    results = lm_eval.simple_evaluate(
        model="vllm",
        model_args=model_args,
        tasks=TASK,
        batch_size="auto",
    )
    result = results["results"][TASK][FILTER]
    print(100 * "*", "\nThe accuracy test result:", result)
    queue.put(result)
    del results
    torch.npu.empty_cache()
    gc.collect()


@pytest.mark.parametrize("model", MODELS)
def test_lm_eval_accuracy(model, monkeypatch: pytest.MonkeyPatch):
    with monkeypatch.context():
        result_queue: Queue[float] = multiprocessing.Queue()
        p = multiprocessing.Process(target=run_test,
                                    args=(
                                        model,
                                        result_queue,
                                    ))
        p.start()
        p.join()
        result = result_queue.get()
        assert (EXPECTED_VALUE - RTOL < result < EXPECTED_VALUE + RTOL), \
            f"Expected: {EXPECTED_VALUE}±{RTOL} | Measured: {result}"
[Bugfix] Fix deepseek percision issue and add acc ci for it (#905) ### What this PR does / why we need it? Fix deepseek percision issue on V0 and add acc ci for it Fixes https://github.com/vllm-project/vllm-ascend/issues/1062 ### How was this patch tested? CI passed with new added test. Signed-off-by: MengqingCao <cmq0113@163.com> 2025-06-04 20:26:44 +08:00			`#`
			`# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.`
			`# Copyright 2023 The vLLM team.`
			`#`
			`# Licensed under the Apache License, Version 2.0 (the "License");`
			`# you may not use this file except in compliance with the License.`
			`# You may obtain a copy of the License at`
			`#`
			`# http://www.apache.org/licenses/LICENSE-2.0`
			`#`
			`# Unless required by applicable law or agreed to in writing, software`
			`# distributed under the License is distributed on an "AS IS" BASIS,`
			`# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.`
			`# See the License for the specific language governing permissions and`
			`# limitations under the License.`
			`# This file is a part of the vllm-ascend project.`
			`# Adapted from vllm-project/blob/main/tests/entrypoints/llm/test_accuracy.py`
			`#`

			`import gc`
			`import multiprocessing`
			`from multiprocessing import Queue`

			`import lm_eval`
			`import pytest`
			`import torch`

			`# pre-trained model path on Hugging Face.`
			`MODELS = ["deepseek-ai/DeepSeek-V2-Lite"]`
			`# Math reasoning benchmark (Grade School Math 8K).`
			`TASK = "gsm8k"`
			`# Answer validation requiring format consistency.`
			`FILTER = "exact_match,strict-match"`
			`# 3% relative tolerance for numerical accuracy.`
			`RTOL = 0.03`
			`# Baseline accuracy after VLLM optimization.`
[Build] Move numba/quart to requirments and update DS baseline and sync graph typo fix (#1121) ### What this PR does / why we need it? 1. The dependency was introduced by https://github.com/vllm-project/vllm-ascend/pull/874 - Move numba/quart from requirements-dev to requirments - Align pyproject.toml with requirements 2. This patch also fix deepseek accuracy baseline which https://github.com/vllm-project/vllm-ascend/pull/1118 was not addressed. According to https://huggingface.co/deepseek-ai/DeepSeek-V2-Lite the gsm8k is about `41.1` 3. This also sync the vLLM upstream changes: https://github.com/vllm-project/vllm/commit/eaa2e51088d4daf36d47e566ad90e812f80e91b8 ### Does this PR introduce _any_ user-facing change? No ### How was this patch tested? CI passed vllm ascend test (basic workflow) vllm longterm test (spec decode) Closes: https://github.com/vllm-project/vllm-ascend/issues/1120 --------- Signed-off-by: Yikun Jiang <yikunkero@gmail.com> 2025-06-08 22:33:37 +08:00			`EXPECTED_VALUE = 0.3843821076573162`
[Bugfix] Fix deepseek percision issue and add acc ci for it (#905) ### What this PR does / why we need it? Fix deepseek percision issue on V0 and add acc ci for it Fixes https://github.com/vllm-project/vllm-ascend/issues/1062 ### How was this patch tested? CI passed with new added test. Signed-off-by: MengqingCao <cmq0113@163.com> 2025-06-04 20:26:44 +08:00

			`def run_test(model_name, queue, more_args=None):`
[V1][eagle3] Support eagle3 proposer for v1 (#1032) ### What this PR does / why we need it? This PR implements the Eagle Pososer feature for vLLM v1, which enables more efficient speculative decoding by using a draft model to predict potential future tokens. - The implementation includes the core Eagle algorithm integration with vLLM's existing architecture, allowing for faster inference while maintaining output quality. - This is needed to significantly improve the generation speed of large language models without compromising on the quality of generated text. ### Does this PR introduce any user-facing change? Yes, this PR introduces a new speculative decoding mode that can be enabled via configuration. - Users can now choose to use Eagle Pososer by setting appropriate flags in the inference configuration. - The API remains backward compatible, with the new functionality being opt-in. ### How was this patch tested? CI passed with new unit tests added for the Eagle Pososer functionality. - Benchmark tests were conducted comparing generation speed and quality with and without Eagle Pososer. - Integration tests were performed with various model architectures to ensure compatibility. - Manual testing was done using different prompt scenarios to verify output quality remains consistent. - we test accept rate on one Ascend 910B npu, The acceptance rate results are basically consistent with those shown here: https://github.com/vllm-project/vllm/pull/16937 - Currently, we support scenarios where num_spec_tokens <= 2. When num_spec_tokens > 2, issues such as insufficient GPU memory and operator computation errors may occur. We will address this in subsequent updates. - We will add support for Eagle v1 in future updates. ### Acceptance Test Script ```bash SCRIPT="/offline/eagle.py" DATASET="ShareGpt" MODEL=Meta-Llama-3.1-8B-Instruct DRAFT=EAGLE3-LLaMA3.1-Instruct-8B CUDA_VISIBLE_DEVICES="0" VLLM_USE_V1=1 $PYTHON $SCRIPT \ --dataset $DATASET \ --num_spec_tokens 2 \ --max_num_seqs 1 \ --model_dir $MODEL \ --eagle_dir $DRAFT \ --tp 1 \ --num_prompts 80 ``` ### Acceptance Test Results ```bash ██████████████████████████████████████████████████████████████████████████████████████████████████████████\| 80/80 [21:22<00:00, 16.03s/it, est. speed input: 4.72 toks/s, output: 13.56 toks/s] ------------------------------------------------------------------------------------- mean acceptance length: 1.63 ------------------------------------------------------------------------------------- total_counts: 8062 acceptance at token 0: 1.00 (8062 times) acceptance at token 1: 0.70 (5612 times) acceptance at token 2: 0.47 (3765 times) ``` Closes: https://github.com/vllm-project/vllm-ascend/issues/1004 --------- Signed-off-by: yuancaoyaoHW <a2749322671@gmail.com> 2025-06-20 17:19:54 +08:00			`model_args = f"pretrained={model_name},max_model_len=4096,trust_remote_code=True,tensor_parallel_size=4,enforce_eager=True"`
[Bugfix] Fix deepseek percision issue and add acc ci for it (#905) ### What this PR does / why we need it? Fix deepseek percision issue on V0 and add acc ci for it Fixes https://github.com/vllm-project/vllm-ascend/issues/1062 ### How was this patch tested? CI passed with new added test. Signed-off-by: MengqingCao <cmq0113@163.com> 2025-06-04 20:26:44 +08:00			`if more_args is not None:`
			`model_args = f"{model_args},{more_args}"`
			`results = lm_eval.simple_evaluate(`
			`model="vllm",`
			`model_args=model_args,`
			`tasks=TASK,`
			`batch_size="auto",`
			`)`
			`result = results["results"][TASK][FILTER]`
			`print(100 * "*", "\nThe accuracy test result:", result)`
			`queue.put(result)`
			`del results`
			`torch.npu.empty_cache()`
			`gc.collect()`


			`@pytest.mark.parametrize("model", MODELS)`
			`def test_lm_eval_accuracy(model, monkeypatch: pytest.MonkeyPatch):`
			`with monkeypatch.context():`
			`result_queue: Queue[float] = multiprocessing.Queue()`
			`p = multiprocessing.Process(target=run_test,`
			`args=(`
			`model,`
			`result_queue,`
			`))`
			`p.start()`
			`p.join()`
			`result = result_queue.get()`
			`assert (EXPECTED_VALUE - RTOL < result < EXPECTED_VALUE + RTOL), \`
			`f"Expected: {EXPECTED_VALUE}±{RTOL} \| Measured: {result}"`