Enable pytest and yaml style accuracy test (#2073)
### What this PR does / why we need it?
This PR enables pytest- and YAML-based accuracy tests; users can now run
accuracy tests with:
```bash
cd ~/vllm-ascend
pytest -sv ./tests/e2e/singlecard/models/test_lm_eval_correctness.py \
--config ./tests/e2e/singlecard/models/configs/Qwen3-8B-Base.yaml \
--report_output ./benchmarks/accuracy/Qwen3-8B-Base.md
pytest -sv ./tests/e2e/singlecard/models/test_lm_eval_correctness.py \
--config-list-file ./tests/e2e/singlecard/models/configs/accuracy.txt
```
Closes: https://github.com/vllm-project/vllm-ascend/issues/1970
### Does this PR introduce _any_ user-facing change?
no
### How was this patch tested?
- vLLM version: v0.10.0
- vLLM main:
2836dd73f1
---------
Signed-off-by: Icey <1790571317@qq.com>
This commit is contained in:
@@ -0,0 +1,8 @@
|
||||
model_name: "Qwen/Qwen2.5-VL-7B-Instruct"
|
||||
model: "vllm-vlm"
|
||||
tasks:
|
||||
- name: "mmmu_val"
|
||||
metrics:
|
||||
- name: "acc,none"
|
||||
value: 0.51
|
||||
max_model_len: 8192
|
||||
18
tests/e2e/singlecard/models/configs/Qwen3-30B-A3B.yaml
Normal file
18
tests/e2e/singlecard/models/configs/Qwen3-30B-A3B.yaml
Normal file
@@ -0,0 +1,18 @@
|
||||
model_name: "Qwen/Qwen3-30B-A3B"
|
||||
tasks:
|
||||
- name: "gsm8k"
|
||||
metrics:
|
||||
- name: "exact_match,strict-match"
|
||||
value: 0.89
|
||||
- name: "exact_match,flexible-extract"
|
||||
value: 0.85
|
||||
- name: "ceval-valid"
|
||||
metrics:
|
||||
- name: "acc,none"
|
||||
value: 0.84
|
||||
num_fewshot: 5
|
||||
gpu_memory_utilization: 0.6
|
||||
enable_expert_parallel: True
|
||||
tensor_parallel_size: 2
|
||||
apply_chat_template: False
|
||||
fewshot_as_multiturn: False
|
||||
13
tests/e2e/singlecard/models/configs/Qwen3-8B-Base.yaml
Normal file
13
tests/e2e/singlecard/models/configs/Qwen3-8B-Base.yaml
Normal file
@@ -0,0 +1,13 @@
|
||||
model_name: "Qwen/Qwen3-8B-Base"
|
||||
tasks:
|
||||
- name: "gsm8k"
|
||||
metrics:
|
||||
- name: "exact_match,strict-match"
|
||||
value: 0.82
|
||||
- name: "exact_match,flexible-extract"
|
||||
value: 0.83
|
||||
- name: "ceval-valid"
|
||||
metrics:
|
||||
- name: "acc,none"
|
||||
value: 0.82
|
||||
num_fewshot: 5
|
||||
3
tests/e2e/singlecard/models/configs/accuracy.txt
Normal file
3
tests/e2e/singlecard/models/configs/accuracy.txt
Normal file
@@ -0,0 +1,3 @@
|
||||
Qwen3-8B-Base.yaml
|
||||
Qwen2.5-VL-7B-Instruct.yaml
|
||||
Qwen3-30B-A3B.yaml
|
||||
73
tests/e2e/singlecard/models/conftest.py
Normal file
73
tests/e2e/singlecard/models/conftest.py
Normal file
@@ -0,0 +1,73 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
from pathlib import Path
|
||||
|
||||
import pytest
|
||||
|
||||
|
||||
def pytest_addoption(parser):
    """Register the CLI options used by the accuracy-test suite.

    Options are added in a fixed order via a single spec table so the
    flag/default/help triples are easy to scan and extend.
    """
    option_specs = (
        ("--config-list-file", None,
         "Path to the file listing model config YAMLs (one per line)"),
        ("--tp-size", "1",
         "Tensor parallel size to use for evaluation"),
        ("--config",
         "./tests/e2e/singlecard/models/configs/Qwen3-8B-Base.yaml",
         "Path to the model config YAML file"),
        ("--report_output", "./benchmarks/accuracy/Qwen3-8B-Base.md",
         "Path to the report output file"),
    )
    for flag, default_value, help_text in option_specs:
        parser.addoption(
            flag,
            action="store",
            default=default_value,
            help=help_text,
        )
|
||||
|
||||
|
||||
@pytest.fixture(scope="session")
def config_list_file(pytestconfig):
    """Return the resolved Path of --config-list-file, or None if unset.

    BUGFIX: the original signature requested a `config_dir` fixture that is
    not defined anywhere in this conftest, so any test requesting this
    fixture errored at setup. The path is now resolved directly from the
    option value (relative to the current working directory).
    """
    rel_path = pytestconfig.getoption("--config-list-file")
    if rel_path is None:
        return None
    return Path(rel_path).resolve()
|
||||
|
||||
|
||||
@pytest.fixture(scope="session")
def tp_size(pytestconfig):
    """Tensor-parallel size taken from the --tp-size CLI option (a string)."""
    requested_size = pytestconfig.getoption("--tp-size")
    return requested_size
|
||||
|
||||
|
||||
@pytest.fixture(scope="session")
def config(pytestconfig):
    """Path to the model config YAML, taken from the --config CLI option."""
    config_path = pytestconfig.getoption("--config")
    return config_path
|
||||
|
||||
|
||||
@pytest.fixture(scope="session")
def report_output(pytestconfig):
    """Destination file for the markdown report (--report_output option)."""
    output_path = pytestconfig.getoption("--report_output")
    return output_path
|
||||
|
||||
|
||||
def pytest_generate_tests(metafunc):
    """Parametrize `config_filename` from the CLI options.

    BUGFIX: `--config` has a non-None default, so the original check
    `if single_config:` was always true and the `--config-list-file`
    branch was unreachable — the list-file mode advertised in the docs
    never ran. The list file now takes precedence; `--config` is the
    fallback (its default keeps single-config runs working unchanged).
    """
    if "config_filename" not in metafunc.fixturenames:
        return
    # --config-list-file wins when explicitly provided (its default is None).
    rel_path = metafunc.config.getoption("--config-list-file")
    if rel_path:
        config_list_file = Path(rel_path).resolve()
        # Entries in the list file are relative to the list file itself.
        config_dir = config_list_file.parent
        with open(config_list_file, encoding="utf-8") as f:
            configs = [
                config_dir / line.strip() for line in f
                if line.strip() and not line.startswith("#")
            ]
        metafunc.parametrize("config_filename", configs)
        return
    # Otherwise fall back to the single --config YAML.
    single_config = metafunc.config.getoption("--config")
    if single_config:
        metafunc.parametrize("config_filename",
                             [Path(single_config).resolve()])
|
||||
24
tests/e2e/singlecard/models/report_template.md
Normal file
24
tests/e2e/singlecard/models/report_template.md
Normal file
@@ -0,0 +1,24 @@
|
||||
# {{ model_name }}
|
||||
|
||||
**vLLM Version**: vLLM: {{ vllm_version }} ([{{ vllm_commit[:7] }}](https://github.com/vllm-project/vllm/commit/{{ vllm_commit }})),
|
||||
**vLLM Ascend Version**: {{ vllm_ascend_version }} ([{{ vllm_ascend_commit[:7] }}](https://github.com/vllm-project/vllm-ascend/commit/{{ vllm_ascend_commit }}))
|
||||
**Software Environment**: CANN: {{ cann_version }}, PyTorch: {{ torch_version }}, torch-npu: {{ torch_npu_version }}
|
||||
**Hardware Environment**: Atlas A2 Series
|
||||
**Datasets**: {{ datasets }}
|
||||
**Parallel Mode**: TP
|
||||
**Execution Mode**: ACLGraph
|
||||
|
||||
**Command**:
|
||||
|
||||
```bash
|
||||
export MODEL_ARGS={{ model_args }}
|
||||
lm_eval --model {{ model_type }} --model_args $MODEL_ARGS --tasks {{ datasets }} \
|
||||
--apply_chat_template {{ apply_chat_template }} --fewshot_as_multiturn {{ fewshot_as_multiturn }} {% if num_fewshot is defined and num_fewshot != "N/A" %} --num_fewshot {{ num_fewshot }} {% endif %} \
|
||||
--limit {{ limit }} --batch_size {{ batch_size }}
|
||||
```
|
||||
|
||||
| Task | Metric | Value | Stderr |
|
||||
|-----------------------|-------------|----------:|-------:|
|
||||
{% for row in rows -%}
|
||||
| {{ row.task.rjust(23) }} | {{ row.metric.rjust(15) }} |{{ row.value }} | ± {{ "%.4f" | format(row.stderr | float) }} |
|
||||
{% endfor %}
|
||||
148
tests/e2e/singlecard/models/test_lm_eval_correctness.py
Normal file
148
tests/e2e/singlecard/models/test_lm_eval_correctness.py
Normal file
@@ -0,0 +1,148 @@
|
||||
import os
|
||||
from dataclasses import dataclass
|
||||
|
||||
import lm_eval
|
||||
import numpy as np
|
||||
import pytest
|
||||
import yaml
|
||||
from jinja2 import Environment, FileSystemLoader
|
||||
|
||||
RTOL = 0.03
|
||||
TEST_DIR = os.path.dirname(__file__)
|
||||
|
||||
|
||||
@dataclass
class EnvConfig:
    """Version metadata rendered into the accuracy-report header.

    All fields are plain strings; they are populated from environment
    variables by the `env_config` fixture, with 'unknown' as the fallback
    when a variable is absent.
    """

    # vLLM release/version string (from VLLM_VERSION)
    vllm_version: str
    # vLLM git commit hash (from VLLM_COMMIT)
    vllm_commit: str
    # vllm-ascend release/version string (from VLLM_ASCEND_VERSION)
    vllm_ascend_version: str
    # vllm-ascend git commit hash (from VLLM_ASCEND_COMMIT)
    vllm_ascend_commit: str
    # Ascend CANN toolkit version (from CANN_VERSION)
    cann_version: str
    # PyTorch version (from TORCH_VERSION)
    torch_version: str
    # torch-npu plugin version (from TORCH_NPU_VERSION)
    torch_npu_version: str
|
||||
|
||||
|
||||
@pytest.fixture
def env_config() -> EnvConfig:
    """Snapshot version information from the environment.

    Every value defaults to 'unknown' when the corresponding environment
    variable is not set, so report generation never fails on missing vars.
    """

    def _from_env(var_name: str) -> str:
        return os.getenv(var_name, 'unknown')

    return EnvConfig(
        vllm_version=_from_env('VLLM_VERSION'),
        vllm_commit=_from_env('VLLM_COMMIT'),
        vllm_ascend_version=_from_env('VLLM_ASCEND_VERSION'),
        vllm_ascend_commit=_from_env('VLLM_ASCEND_COMMIT'),
        cann_version=_from_env('CANN_VERSION'),
        torch_version=_from_env('TORCH_VERSION'),
        torch_npu_version=_from_env('TORCH_NPU_VERSION'),
    )
|
||||
|
||||
|
||||
def build_model_args(eval_config, tp_size):
    """Assemble the lm_eval `model_args` dict for a vLLM-backed run.

    Starts from required defaults (model name, tp size, dtype, etc.) and
    then lets optional YAML keys override them — note that a
    `tensor_parallel_size` entry in the config deliberately overrides the
    `tp_size` passed on the command line.
    """
    model_args = {
        "pretrained": eval_config["model_name"],
        "tensor_parallel_size": tp_size,
        "dtype": "auto",
        "trust_remote_code": eval_config.get("trust_remote_code", False),
        "max_model_len": eval_config.get("max_model_len", 4096),
    }

    optional_keys = (
        "max_images",
        "gpu_memory_utilization",
        "enable_expert_parallel",
        "tensor_parallel_size",
    )
    for key in optional_keys:
        override = eval_config.get(key, None)
        if override is not None:
            model_args[key] = override

    print("Model Parameters:")
    print(model_args)

    return model_args
|
||||
|
||||
|
||||
def generate_report(tp_size, eval_config, report_data, report_output,
                    env_config):
    """Render the markdown accuracy report from `report_template.md`.

    Combines environment metadata (env_config), the evaluation config, and
    the per-metric result rows into the Jinja2 template, then writes the
    rendered markdown to `report_output` (creating parent dirs as needed).
    """
    jinja_env = Environment(loader=FileSystemLoader(TEST_DIR))
    template = jinja_env.get_template("report_template.md")
    model_args = build_model_args(eval_config, tp_size)

    # Serialized forms used inside the template's command snippet.
    model_args_str = ",".join(f"{k}={v}" for k, v in model_args.items())
    task_names = ",".join([task["name"] for task in eval_config["tasks"]])

    rendered = template.render(
        vllm_version=env_config.vllm_version,
        vllm_commit=env_config.vllm_commit,
        vllm_ascend_version=env_config.vllm_ascend_version,
        vllm_ascend_commit=env_config.vllm_ascend_commit,
        cann_version=env_config.cann_version,
        torch_version=env_config.torch_version,
        torch_npu_version=env_config.torch_npu_version,
        model_name=eval_config["model_name"],
        model_args=f"'{model_args_str}'",
        model_type=eval_config.get("model", "vllm"),
        datasets=task_names,
        apply_chat_template=eval_config.get("apply_chat_template", True),
        fewshot_as_multiturn=eval_config.get("fewshot_as_multiturn", True),
        limit=eval_config.get("limit", None),
        batch_size="auto",
        num_fewshot=eval_config.get("num_fewshot", "N/A"),
        rows=report_data["rows"],
    )

    os.makedirs(os.path.dirname(report_output), exist_ok=True)
    with open(report_output, 'w', encoding='utf-8') as f:
        f.write(rendered)
|
||||
|
||||
|
||||
def test_lm_eval_correctness_param(config_filename, tp_size, report_output,
                                   env_config):
    """Run lm_eval on the model described by `config_filename` and check
    every configured metric against its expected value within RTOL, then
    render a markdown accuracy report to `report_output`.

    The test passes only if ALL metrics of ALL tasks are within tolerance.
    """
    eval_config = yaml.safe_load(config_filename.read_text(encoding="utf-8"))
    model_args = build_model_args(eval_config, tp_size)
    success = True
    report_data: dict[str, list[dict]] = {"rows": []}

    eval_params = {
        "model": eval_config.get("model", "vllm"),
        "model_args": model_args,
        "tasks": [task["name"] for task in eval_config["tasks"]],
        "apply_chat_template": eval_config.get("apply_chat_template", True),
        "fewshot_as_multiturn": eval_config.get("fewshot_as_multiturn", True),
        "limit": eval_config.get("limit", None),
        "batch_size": "auto",
    }
    # num_fewshot is only forwarded when the YAML sets it explicitly.
    # (apply_chat_template / fewshot_as_multiturn are already handled above;
    # re-copying them here, as the original did, was a no-op.)
    if eval_config.get("num_fewshot") is not None:
        eval_params["num_fewshot"] = eval_config["num_fewshot"]

    print("Eval Parameters:")
    print(eval_params)

    results = lm_eval.simple_evaluate(**eval_params)

    for task in eval_config["tasks"]:
        task_name = task["name"]
        task_result = results["results"][task_name]
        for metric in task["metrics"]:
            metric_name = metric["name"]
            ground_truth = metric["value"]
            measured_value = task_result[metric_name]
            task_success = bool(
                np.isclose(ground_truth, measured_value, rtol=RTOL))
            success = success and task_success

            print(f"{task_name} | {metric_name}: "
                  f"ground_truth={ground_truth} | measured={measured_value} | "
                  f"success={'✅' if task_success else '❌'}")

            # lm_eval reports the stderr of "metric,filter" under the key
            # "metric_stderr,filter" — the original ternary had two
            # identical branches, so it collapses to one expression.
            stderr_key = metric_name.replace(',', '_stderr,')
            report_data["rows"].append({
                "task": task_name,
                "metric": metric_name,
                # BUGFIX: mark each row by its OWN result (task_success),
                # not the cumulative `success` flag — previously one early
                # failure mislabeled every later passing row as ❌.
                "value": (f"✅{measured_value}"
                          if task_success else f"❌{measured_value}"),
                "stderr": task_result[stderr_key],
            })
    generate_report(tp_size, eval_config, report_data, report_output,
                    env_config)
    assert success
|
||||
Reference in New Issue
Block a user