### What this PR does / why we need it?
Replace text-match assertions with a two-tier logprob accuracy check:
- Prefill (token 0): assert token ID is identical between eager baseline
and compiled mode, then verify logprob matches within `atol`.
- Decode (tokens 1-2): if chosen tokens match, compare logprobs
directly; if they differ, cross-lookup the baseline token in the
compiled model's top-20 distribution and assert the assigned logprob is
within `decode_atol` (defaults to 2x atol). This tolerates minor argmax
drift caused by floating-point differences while still catching
distribution divergence.
### Does this PR introduce _any_ user-facing change?
### How was this patch tested?
- vLLM version: v0.17.0
- vLLM main:
8a680463fa
---------
Signed-off-by: wangli <wangli858794774@gmail.com>
69 lines · 2.3 KiB · Python
#
|
|
# Copyright (c) 2026 Huawei Technologies Co., Ltd. All Rights Reserved.
|
|
# This file is a part of the vllm-ascend project.
|
|
# Adapted from vllm/tests/basic_correctness/test_basic_correctness.py
|
|
# Copyright 2023 The vLLM team.
|
|
#
|
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
# you may not use this file except in compliance with the License.
|
|
# You may obtain a copy of the License at
|
|
#
|
|
# http://www.apache.org/licenses/LICENSE-2.0
|
|
#
|
|
# Unless required by applicable law or agreed to in writing, software
|
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
# See the License for the specific language governing permissions and
|
|
# limitations under the License.
|
|
#
|
|
|
|
"""
|
|
This file test accuracy via LMEval.
|
|
It uses local-completions, which interacts with vLLM
|
|
through the OAI API with N concurrent connections.
|
|
This simulates real-world usage of the API and makes
|
|
sure that the zmq frontend mp RPC message passing and
|
|
AsyncLLMEngine are working correctly.
|
|
"""
|
|
|
|
import lm_eval
|
|
import pytest
|
|
|
|
# Models under test: a small dense model and a W8A8-quantized MoE model.
MODEL_NAMES = ["Qwen/Qwen3-0.6B", "vllm-ascend/DeepSeek-V2-Lite-W8A8"]
# Concurrent OAI-API connections (see module docstring).
# NOTE(review): not referenced anywhere in this chunk — verify it is used elsewhere.
NUM_CONCURRENT = 500
# lm_eval task name and the metric/filter key used to read the score.
TASK = "gsm8k"
FILTER = "exact_match,strict-match"
# Allowed deviation from the expected accuracy.
# NOTE(review): despite the name, this is applied as an *absolute* tolerance below.
RTOL = 0.03
# Reference accuracy per model; run_test asserts the measured value is within RTOL.
EXPECTED_VALUES = {"Qwen/Qwen3-0.6B": 0.414, "vllm-ascend/DeepSeek-V2-Lite-W8A8": 0.34}
|
def run_test(model_name, more_args=None):
    """Run the end-to-end GSM8K accuracy test through lm_eval.

    Args:
        model_name: Model identifier; must have an entry in ``EXPECTED_VALUES``.
        more_args: Optional extra ``key=value`` pairs appended to the vLLM
            model-args string (comma-separated, no spaces).

    Raises:
        AssertionError: If the model has no registered expected value, or the
            measured accuracy is not within ``RTOL`` of the expected value.
    """
    # NOTE: Do not add any spaces to the string below, as this will cause
    # parameter parsing errors.
    model_args = f"pretrained={model_name},max_model_len=4096,enforce_eager=True"

    if more_args is not None:
        model_args = f"{model_args},{more_args}"

    results = lm_eval.simple_evaluate(
        model="vllm",
        model_args=model_args,
        # Use the TASK constant instead of repeating the literal "gsm8k",
        # so the evaluated task and the result lookup below cannot drift apart.
        tasks=TASK,
        batch_size="auto",
    )

    measured_value = results["results"][TASK][FILTER]
    assert model_name in EXPECTED_VALUES, (
        f"Cannot find the expected value for the model {model_name=}"
    )
    expected_value = EXPECTED_VALUES[model_name]
    # Equivalent to the original two-sided check: strictly within ±RTOL.
    assert abs(measured_value - expected_value) < RTOL, (
        f"Expected: {expected_value} | Measured: {measured_value}"
    )
|
|
|
|
|
|
@pytest.mark.parametrize("model", MODEL_NAMES)
def test_lm_eval_accuracy(model):
    """Check each model's GSM8K accuracy on the V1 Engine (no extra args)."""
    run_test(model, more_args=None)
|