Enable pytest and yaml style accuracy test (#2073)

### What this PR does / why we need it?

This PR enables pytest- and YAML-based accuracy tests; users can now
run an accuracy test with:

```bash
cd ~/vllm-ascend
pytest -sv ./tests/e2e/singlecard/models/test_lm_eval_correctness.py \
          --config ./tests/e2e/singlecard/models/configs/Qwen3-8B-Base.yaml \
          --report_output ./benchmarks/accuracy/Qwen3-8B-Base.md

pytest -sv ./tests/e2e/singlecard/models/test_lm_eval_correctness.py \
          --config-list-file ./tests/e2e/singlecard/models/configs/accuracy.txt
```

Closes: https://github.com/vllm-project/vllm-ascend/issues/1970

### Does this PR introduce _any_ user-facing change?
no

### How was this patch tested?


- vLLM version: v0.10.0
- vLLM main:
2836dd73f1

---------

Signed-off-by: Icey <1790571317@qq.com>
This commit is contained in:
Icey
2025-07-31 21:39:13 +08:00
committed by GitHub
parent 9c9a7cd90b
commit 86bdde1ca8
10 changed files with 336 additions and 446 deletions

View File

@@ -0,0 +1,8 @@
# Accuracy baseline for Qwen2.5-VL-7B-Instruct (vision-language model).
model_name: "Qwen/Qwen2.5-VL-7B-Instruct"
# "vllm-vlm" selects lm-eval's vision-language vLLM backend.
model: "vllm-vlm"
tasks:
  - name: "mmmu_val"
    metrics:
      - name: "acc,none"
        # Expected accuracy; compared against the measured value with
        # a relative tolerance by the test harness.
        value: 0.51
max_model_len: 8192

View File

@@ -0,0 +1,18 @@
# Accuracy baselines for Qwen3-30B-A3B (MoE model, expert parallel, TP=2).
model_name: "Qwen/Qwen3-30B-A3B"
tasks:
  - name: "gsm8k"
    metrics:
      - name: "exact_match,strict-match"
        value: 0.89
      - name: "exact_match,flexible-extract"
        value: 0.85
  - name: "ceval-valid"
    metrics:
      - name: "acc,none"
        value: 0.84
# Options below override the test defaults / CLI values when present.
num_fewshot: 5
gpu_memory_utilization: 0.6
enable_expert_parallel: True
# Overrides the --tp-size CLI option.
tensor_parallel_size: 2
apply_chat_template: False
fewshot_as_multiturn: False

View File

@@ -0,0 +1,13 @@
# Accuracy baselines for Qwen3-8B-Base (default single-card config).
model_name: "Qwen/Qwen3-8B-Base"
tasks:
  - name: "gsm8k"
    metrics:
      - name: "exact_match,strict-match"
        value: 0.82
      - name: "exact_match,flexible-extract"
        value: 0.83
  - name: "ceval-valid"
    metrics:
      - name: "acc,none"
        value: 0.82
num_fewshot: 5

View File

@@ -0,0 +1,3 @@
# Model config YAMLs (one per line, relative to this file) run when the
# suite is invoked with --config-list-file; lines starting with '#' and
# blank lines are skipped.
Qwen3-8B-Base.yaml
Qwen2.5-VL-7B-Instruct.yaml
Qwen3-30B-A3B.yaml

View File

@@ -0,0 +1,73 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from pathlib import Path
import pytest
def pytest_addoption(parser):
    """Register command-line options used by the lm-eval accuracy tests.

    Exactly one of ``--config`` (single YAML) or ``--config-list-file``
    (text file listing YAMLs) is expected per run; ``--tp-size`` and
    ``--report_output`` tune execution and output location.
    """
    parser.addoption(
        "--config-list-file",
        action="store",
        default=None,
        help="Path to the file listing model config YAMLs (one per line)",
    )
    parser.addoption(
        "--tp-size",
        action="store",
        default="1",
        help="Tensor parallel size to use for evaluation",
    )
    parser.addoption(
        "--config",
        action="store",
        # NOTE: this default means the option is never "unset"; code that
        # dispatches between --config and --config-list-file must account
        # for that.
        default="./tests/e2e/singlecard/models/configs/Qwen3-8B-Base.yaml",
        help="Path to the model config YAML file",
    )
    parser.addoption(
        "--report_output",
        action="store",
        default="./benchmarks/accuracy/Qwen3-8B-Base.md",
        help="Path to the report output file",
    )
@pytest.fixture(scope="session")
def config_list_file(pytestconfig):
    """Resolved path of --config-list-file, or None when not given.

    Fix: the original depended on a ``config_dir`` fixture that is not
    defined anywhere in this conftest, so requesting this fixture raised a
    fixture-lookup error. The path is now resolved directly.
    """
    rel_path = pytestconfig.getoption("--config-list-file")
    return Path(rel_path).resolve() if rel_path else None
@pytest.fixture(scope="session")
def tp_size(pytestconfig):
    """Tensor-parallel size requested via --tp-size (kept as a string)."""
    requested = pytestconfig.getoption("--tp-size")
    return requested
@pytest.fixture(scope="session")
def config(pytestconfig):
    """Path of the model config YAML given via --config (or its default)."""
    selected = pytestconfig.getoption("--config")
    return selected
@pytest.fixture(scope="session")
def report_output(pytestconfig):
    """Destination path for the generated markdown report (--report_output)."""
    destination = pytestconfig.getoption("--report_output")
    return destination
def pytest_generate_tests(metafunc):
    """Parametrize ``config_filename`` from the CLI options.

    ``--config-list-file`` (when explicitly provided) takes precedence over
    ``--config``. Checking the list file first is required for it to work at
    all: ``--config`` carries a non-empty default, so testing it first (as the
    original code did) made the list-file branch unreachable dead code even
    though the documented usage supports both invocation styles.
    """
    if "config_filename" not in metafunc.fixturenames:
        return

    rel_path = metafunc.config.getoption("--config-list-file")
    if rel_path:
        config_list_file = Path(rel_path).resolve()
        # Entries in the list file are relative to the list file itself.
        config_dir = config_list_file.parent
        with open(config_list_file, encoding="utf-8") as f:
            configs = [
                config_dir / line.strip() for line in f
                if line.strip() and not line.startswith("#")
            ]
        metafunc.parametrize("config_filename", configs)
        return

    single_config = metafunc.config.getoption("--config")
    if single_config:
        metafunc.parametrize("config_filename",
                             [Path(single_config).resolve()])

View File

@@ -0,0 +1,24 @@
# {{ model_name }}
**vLLM Version**: vLLM: {{ vllm_version }} ([{{ vllm_commit[:7] }}](https://github.com/vllm-project/vllm/commit/{{ vllm_commit }})),
**vLLM Ascend Version**: {{ vllm_ascend_version }} ([{{ vllm_ascend_commit[:7] }}](https://github.com/vllm-project/vllm-ascend/commit/{{ vllm_ascend_commit }}))
**Software Environment**: CANN: {{ cann_version }}, PyTorch: {{ torch_version }}, torch-npu: {{ torch_npu_version }}
**Hardware Environment**: Atlas A2 Series
**Datasets**: {{ datasets }}
**Parallel Mode**: TP
**Execution Mode**: ACLGraph
**Command**:
```bash
export MODEL_ARGS={{ model_args }}
lm_eval --model {{ model_type }} --model_args $MODEL_ARGS --tasks {{ datasets }} \
--apply_chat_template {{ apply_chat_template }} --fewshot_as_multiturn {{ fewshot_as_multiturn }} {% if num_fewshot is defined and num_fewshot != "N/A" %} --num_fewshot {{ num_fewshot }} {% endif %} \
--limit {{ limit }} --batch_size {{ batch_size }}
```
| Task | Metric | Value | Stderr |
|-----------------------|-------------|----------:|-------:|
{% for row in rows -%}
| {{ row.task.rjust(23) }} | {{ row.metric.rjust(15) }} |{{ row.value }} | ± {{ "%.4f" | format(row.stderr | float) }} |
{% endfor %}

View File

@@ -0,0 +1,148 @@
import os
from dataclasses import dataclass
import lm_eval
import numpy as np
import pytest
import yaml
from jinja2 import Environment, FileSystemLoader
# Relative tolerance used when comparing a measured metric against its
# expected value from the YAML config (see np.isclose in the test below).
RTOL = 0.03
# Directory of this test file; report_template.md is loaded from here.
TEST_DIR = os.path.dirname(__file__)
@dataclass
class EnvConfig:
    """Software-stack version info rendered into the markdown report header.

    All fields are plain strings taken from environment variables (see the
    ``env_config`` fixture); missing variables default to ``'unknown'``.
    """
    vllm_version: str        # vLLM release, e.g. "v0.10.0"
    vllm_commit: str         # vLLM git commit hash
    vllm_ascend_version: str
    vllm_ascend_commit: str  # vllm-ascend git commit hash
    cann_version: str
    torch_version: str
    torch_npu_version: str
@pytest.fixture
def env_config() -> EnvConfig:
    """Collect stack version info from environment variables.

    Any variable that is not set falls back to the string 'unknown'.
    """
    def _from_env(var: str) -> str:
        return os.getenv(var, 'unknown')

    return EnvConfig(
        vllm_version=_from_env('VLLM_VERSION'),
        vllm_commit=_from_env('VLLM_COMMIT'),
        vllm_ascend_version=_from_env('VLLM_ASCEND_VERSION'),
        vllm_ascend_commit=_from_env('VLLM_ASCEND_COMMIT'),
        cann_version=_from_env('CANN_VERSION'),
        torch_version=_from_env('TORCH_VERSION'),
        torch_npu_version=_from_env('TORCH_NPU_VERSION'),
    )
def build_model_args(eval_config, tp_size):
    """Assemble the lm-eval ``model_args`` dict from a YAML config.

    Starts from required/base settings, then applies the optional keys a
    config may define. A ``tensor_parallel_size`` in the config overrides
    the CLI-provided *tp_size*.
    """
    model_args = {
        "pretrained": eval_config["model_name"],
        "tensor_parallel_size": tp_size,
        "dtype": "auto",
        "trust_remote_code": eval_config.get("trust_remote_code", False),
        "max_model_len": eval_config.get("max_model_len", 4096),
    }

    optional_keys = (
        "max_images",
        "gpu_memory_utilization",
        "enable_expert_parallel",
        "tensor_parallel_size",
    )
    for key in optional_keys:
        override = eval_config.get(key, None)
        if override is not None:
            model_args[key] = override

    print("Model Parameters:")
    print(model_args)
    return model_args
def generate_report(tp_size, eval_config, report_data, report_output,
                    env_config):
    """Render report_template.md and write the markdown report.

    Parent directories of *report_output* are created if missing.
    """
    model_args = build_model_args(eval_config, tp_size)
    # Serialize model_args the way lm_eval expects them on the CLI: 'k=v,k=v'.
    model_args_str = f"'{','.join(f'{k}={v}' for k, v in model_args.items())}'"
    dataset_names = ",".join([task["name"] for task in eval_config["tasks"]])

    jinja_env = Environment(loader=FileSystemLoader(TEST_DIR))
    template = jinja_env.get_template("report_template.md")
    rendered = template.render(
        vllm_version=env_config.vllm_version,
        vllm_commit=env_config.vllm_commit,
        vllm_ascend_version=env_config.vllm_ascend_version,
        vllm_ascend_commit=env_config.vllm_ascend_commit,
        cann_version=env_config.cann_version,
        torch_version=env_config.torch_version,
        torch_npu_version=env_config.torch_npu_version,
        model_name=eval_config["model_name"],
        model_args=model_args_str,
        model_type=eval_config.get("model", "vllm"),
        datasets=dataset_names,
        apply_chat_template=eval_config.get("apply_chat_template", True),
        fewshot_as_multiturn=eval_config.get("fewshot_as_multiturn", True),
        limit=eval_config.get("limit", None),
        batch_size="auto",
        num_fewshot=eval_config.get("num_fewshot", "N/A"),
        rows=report_data["rows"],
    )

    os.makedirs(os.path.dirname(report_output), exist_ok=True)
    with open(report_output, 'w', encoding='utf-8') as out:
        out.write(rendered)
def test_lm_eval_correctness_param(config_filename, tp_size, report_output,
                                   env_config):
    """Run lm-eval on the model described by *config_filename* and check
    every configured metric against its expected value (rel. tol. RTOL).

    The markdown report is always generated, even when a metric misses its
    target; the final assert then fails the test.

    Fixes over the original: three conditionals had byte-identical branches
    (the success marker printed an empty string either way, the reported
    value ternary had equal arms — and keyed off the aggregate ``success``
    instead of ``task_success`` — and the stderr-key ternary had equal
    arms), so they were dead code; they are simplified and the success
    marker now prints the actual per-metric boolean.
    """
    eval_config = yaml.safe_load(config_filename.read_text(encoding="utf-8"))
    model_args = build_model_args(eval_config, tp_size)

    eval_params = {
        "model": eval_config.get("model", "vllm"),
        "model_args": model_args,
        "tasks": [task["name"] for task in eval_config["tasks"]],
        "apply_chat_template": eval_config.get("apply_chat_template", True),
        "fewshot_as_multiturn": eval_config.get("fewshot_as_multiturn", True),
        "limit": eval_config.get("limit", None),
        "batch_size": "auto",
    }
    # Per-config overrides of the defaults above.
    for key in ("num_fewshot", "fewshot_as_multiturn", "apply_chat_template"):
        val = eval_config.get(key, None)
        if val is not None:
            eval_params[key] = val
    print("Eval Parameters:")
    print(eval_params)

    results = lm_eval.simple_evaluate(**eval_params)

    success = True
    report_data: dict[str, list[dict]] = {"rows": []}
    for task in eval_config["tasks"]:
        task_name = task["name"]
        task_result = results["results"][task_name]
        for metric in task["metrics"]:
            metric_name = metric["name"]
            ground_truth = metric["value"]
            measured_value = task_result[metric_name]
            task_success = bool(
                np.isclose(ground_truth, measured_value, rtol=RTOL))
            success = success and task_success
            print(f"{task_name} | {metric_name}: "
                  f"ground_truth={ground_truth} | measured={measured_value} | "
                  f"success={task_success}")
            # lm-eval stores the stderr of "<metric>,<filter>" under the
            # key "<metric>_stderr,<filter>".
            stderr_key = metric_name.replace(',', '_stderr,')
            report_data["rows"].append({
                "task": task_name,
                "metric": metric_name,
                "value": f"{measured_value}",
                "stderr": task_result[stderr_key],
            })

    generate_report(tp_size, eval_config, report_data, report_output,
                    env_config)
    assert success