Enable pytest and yaml style accuracy test (#2073)

### What this PR does / why we need it?

This PR enables pytest- and YAML-based accuracy tests; users can now
run an accuracy test with:

```bash
cd ~/vllm-ascend
pytest -sv ./tests/e2e/singlecard/models/test_lm_eval_correctness.py \
          --config ./tests/e2e/singlecard/models/configs/Qwen3-8B-Base.yaml \
          --report_output ./benchmarks/accuracy/Qwen3-8B-Base.md

pytest -sv ./tests/e2e/singlecard/models/test_lm_eval_correctness.py \
          --config-list-file ./tests/e2e/singlecard/models/configs/accuracy.txt
```

Closes: https://github.com/vllm-project/vllm-ascend/issues/1970

### Does this PR introduce _any_ user-facing change?
no

### How was this patch tested?


- vLLM version: v0.10.0
- vLLM main:
2836dd73f1

---------

Signed-off-by: Icey <1790571317@qq.com>
This commit is contained in:
Icey
2025-07-31 21:39:13 +08:00
committed by GitHub
parent 9c9a7cd90b
commit 86bdde1ca8
10 changed files with 336 additions and 446 deletions

View File

@@ -0,0 +1,8 @@
# Accuracy baseline for Qwen2.5-VL-7B-Instruct (vision-language model).
model_name: "Qwen/Qwen2.5-VL-7B-Instruct"
# "vllm-vlm" selects lm-eval's vision-language vLLM backend.
model: "vllm-vlm"
tasks:
  - name: "mmmu_val"
    metrics:
      - name: "acc,none"
        # Expected accuracy; compared against the measured value with
        # a relative tolerance by the test harness.
        value: 0.51
max_model_len: 8192

View File

@@ -0,0 +1,18 @@
# Accuracy baselines for Qwen3-30B-A3B (MoE model, expert parallel, TP=2).
model_name: "Qwen/Qwen3-30B-A3B"
tasks:
  - name: "gsm8k"
    metrics:
      - name: "exact_match,strict-match"
        value: 0.89
      - name: "exact_match,flexible-extract"
        value: 0.85
  - name: "ceval-valid"
    metrics:
      - name: "acc,none"
        value: 0.84
# Options below override the test defaults / CLI values when present.
num_fewshot: 5
gpu_memory_utilization: 0.6
enable_expert_parallel: True
# Overrides the --tp-size CLI option.
tensor_parallel_size: 2
apply_chat_template: False
fewshot_as_multiturn: False

View File

@@ -0,0 +1,13 @@
# Accuracy baselines for Qwen3-8B-Base (default single-card config).
model_name: "Qwen/Qwen3-8B-Base"
tasks:
  - name: "gsm8k"
    metrics:
      - name: "exact_match,strict-match"
        value: 0.82
      - name: "exact_match,flexible-extract"
        value: 0.83
  - name: "ceval-valid"
    metrics:
      - name: "acc,none"
        value: 0.82
num_fewshot: 5

View File

@@ -0,0 +1,3 @@
# Model config YAMLs (one per line, relative to this file) run when the
# suite is invoked with --config-list-file; lines starting with '#' and
# blank lines are skipped.
Qwen3-8B-Base.yaml
Qwen2.5-VL-7B-Instruct.yaml
Qwen3-30B-A3B.yaml

View File

@@ -0,0 +1,73 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from pathlib import Path
import pytest
def pytest_addoption(parser):
    """Register command-line options used by the lm-eval accuracy tests.

    Exactly one of ``--config`` (single YAML) or ``--config-list-file``
    (text file listing YAMLs) is expected per run; ``--tp-size`` and
    ``--report_output`` tune execution and output location.
    """
    parser.addoption(
        "--config-list-file",
        action="store",
        default=None,
        help="Path to the file listing model config YAMLs (one per line)",
    )
    parser.addoption(
        "--tp-size",
        action="store",
        default="1",
        help="Tensor parallel size to use for evaluation",
    )
    parser.addoption(
        "--config",
        action="store",
        # NOTE: this default means the option is never "unset"; code that
        # dispatches between --config and --config-list-file must account
        # for that.
        default="./tests/e2e/singlecard/models/configs/Qwen3-8B-Base.yaml",
        help="Path to the model config YAML file",
    )
    parser.addoption(
        "--report_output",
        action="store",
        default="./benchmarks/accuracy/Qwen3-8B-Base.md",
        help="Path to the report output file",
    )
@pytest.fixture(scope="session")
def config_list_file(pytestconfig):
    """Resolved path of --config-list-file, or None when not given.

    Fix: the original depended on a ``config_dir`` fixture that is not
    defined anywhere in this conftest, so requesting this fixture raised a
    fixture-lookup error. The path is now resolved directly.
    """
    rel_path = pytestconfig.getoption("--config-list-file")
    return Path(rel_path).resolve() if rel_path else None
@pytest.fixture(scope="session")
def tp_size(pytestconfig):
    """Tensor-parallel size requested via --tp-size (kept as a string)."""
    requested = pytestconfig.getoption("--tp-size")
    return requested
@pytest.fixture(scope="session")
def config(pytestconfig):
    """Path of the model config YAML given via --config (or its default)."""
    selected = pytestconfig.getoption("--config")
    return selected
@pytest.fixture(scope="session")
def report_output(pytestconfig):
    """Destination path for the generated markdown report (--report_output)."""
    destination = pytestconfig.getoption("--report_output")
    return destination
def pytest_generate_tests(metafunc):
    """Parametrize ``config_filename`` from the CLI options.

    ``--config-list-file`` (when explicitly provided) takes precedence over
    ``--config``. Checking the list file first is required for it to work at
    all: ``--config`` carries a non-empty default, so testing it first (as the
    original code did) made the list-file branch unreachable dead code even
    though the documented usage supports both invocation styles.
    """
    if "config_filename" not in metafunc.fixturenames:
        return

    rel_path = metafunc.config.getoption("--config-list-file")
    if rel_path:
        config_list_file = Path(rel_path).resolve()
        # Entries in the list file are relative to the list file itself.
        config_dir = config_list_file.parent
        with open(config_list_file, encoding="utf-8") as f:
            configs = [
                config_dir / line.strip() for line in f
                if line.strip() and not line.startswith("#")
            ]
        metafunc.parametrize("config_filename", configs)
        return

    single_config = metafunc.config.getoption("--config")
    if single_config:
        metafunc.parametrize("config_filename",
                             [Path(single_config).resolve()])

View File

@@ -0,0 +1,24 @@
# {{ model_name }}
**vLLM Version**: vLLM: {{ vllm_version }} ([{{ vllm_commit[:7] }}](https://github.com/vllm-project/vllm/commit/{{ vllm_commit }})),
**vLLM Ascend Version**: {{ vllm_ascend_version }} ([{{ vllm_ascend_commit[:7] }}](https://github.com/vllm-project/vllm-ascend/commit/{{ vllm_ascend_commit }}))
**Software Environment**: CANN: {{ cann_version }}, PyTorch: {{ torch_version }}, torch-npu: {{ torch_npu_version }}
**Hardware Environment**: Atlas A2 Series
**Datasets**: {{ datasets }}
**Parallel Mode**: TP
**Execution Mode**: ACLGraph
**Command**:
```bash
export MODEL_ARGS={{ model_args }}
lm_eval --model {{ model_type }} --model_args $MODEL_ARGS --tasks {{ datasets }} \
--apply_chat_template {{ apply_chat_template }} --fewshot_as_multiturn {{ fewshot_as_multiturn }} {% if num_fewshot is defined and num_fewshot != "N/A" %} --num_fewshot {{ num_fewshot }} {% endif %} \
--limit {{ limit }} --batch_size {{ batch_size }}
```
| Task | Metric | Value | Stderr |
|-----------------------|-------------|----------:|-------:|
{% for row in rows -%}
| {{ row.task.rjust(23) }} | {{ row.metric.rjust(15) }} |{{ row.value }} | ± {{ "%.4f" | format(row.stderr | float) }} |
{% endfor %}

View File

@@ -0,0 +1,148 @@
import os
from dataclasses import dataclass
import lm_eval
import numpy as np
import pytest
import yaml
from jinja2 import Environment, FileSystemLoader
# Relative tolerance used when comparing a measured metric against its
# expected value from the YAML config (see np.isclose in the test below).
RTOL = 0.03
# Directory of this test file; report_template.md is loaded from here.
TEST_DIR = os.path.dirname(__file__)
@dataclass
class EnvConfig:
    """Software-stack version info rendered into the markdown report header.

    All fields are plain strings taken from environment variables (see the
    ``env_config`` fixture); missing variables default to ``'unknown'``.
    """
    vllm_version: str        # vLLM release, e.g. "v0.10.0"
    vllm_commit: str         # vLLM git commit hash
    vllm_ascend_version: str
    vllm_ascend_commit: str  # vllm-ascend git commit hash
    cann_version: str
    torch_version: str
    torch_npu_version: str
@pytest.fixture
def env_config() -> EnvConfig:
    """Collect stack version info from environment variables.

    Any variable that is not set falls back to the string 'unknown'.
    """
    def _from_env(var: str) -> str:
        return os.getenv(var, 'unknown')

    return EnvConfig(
        vllm_version=_from_env('VLLM_VERSION'),
        vllm_commit=_from_env('VLLM_COMMIT'),
        vllm_ascend_version=_from_env('VLLM_ASCEND_VERSION'),
        vllm_ascend_commit=_from_env('VLLM_ASCEND_COMMIT'),
        cann_version=_from_env('CANN_VERSION'),
        torch_version=_from_env('TORCH_VERSION'),
        torch_npu_version=_from_env('TORCH_NPU_VERSION'),
    )
def build_model_args(eval_config, tp_size):
    """Assemble the lm-eval ``model_args`` dict from a YAML config.

    Starts from required/base settings, then applies the optional keys a
    config may define. A ``tensor_parallel_size`` in the config overrides
    the CLI-provided *tp_size*.
    """
    model_args = {
        "pretrained": eval_config["model_name"],
        "tensor_parallel_size": tp_size,
        "dtype": "auto",
        "trust_remote_code": eval_config.get("trust_remote_code", False),
        "max_model_len": eval_config.get("max_model_len", 4096),
    }

    optional_keys = (
        "max_images",
        "gpu_memory_utilization",
        "enable_expert_parallel",
        "tensor_parallel_size",
    )
    for key in optional_keys:
        override = eval_config.get(key, None)
        if override is not None:
            model_args[key] = override

    print("Model Parameters:")
    print(model_args)
    return model_args
def generate_report(tp_size, eval_config, report_data, report_output,
                    env_config):
    """Render report_template.md and write the markdown report.

    Parent directories of *report_output* are created if missing.
    """
    model_args = build_model_args(eval_config, tp_size)
    # Serialize model_args the way lm_eval expects them on the CLI: 'k=v,k=v'.
    model_args_str = f"'{','.join(f'{k}={v}' for k, v in model_args.items())}'"
    dataset_names = ",".join([task["name"] for task in eval_config["tasks"]])

    jinja_env = Environment(loader=FileSystemLoader(TEST_DIR))
    template = jinja_env.get_template("report_template.md")
    rendered = template.render(
        vllm_version=env_config.vllm_version,
        vllm_commit=env_config.vllm_commit,
        vllm_ascend_version=env_config.vllm_ascend_version,
        vllm_ascend_commit=env_config.vllm_ascend_commit,
        cann_version=env_config.cann_version,
        torch_version=env_config.torch_version,
        torch_npu_version=env_config.torch_npu_version,
        model_name=eval_config["model_name"],
        model_args=model_args_str,
        model_type=eval_config.get("model", "vllm"),
        datasets=dataset_names,
        apply_chat_template=eval_config.get("apply_chat_template", True),
        fewshot_as_multiturn=eval_config.get("fewshot_as_multiturn", True),
        limit=eval_config.get("limit", None),
        batch_size="auto",
        num_fewshot=eval_config.get("num_fewshot", "N/A"),
        rows=report_data["rows"],
    )

    os.makedirs(os.path.dirname(report_output), exist_ok=True)
    with open(report_output, 'w', encoding='utf-8') as out:
        out.write(rendered)
def test_lm_eval_correctness_param(config_filename, tp_size, report_output,
                                   env_config):
    """Run lm-eval on the model described by *config_filename* and check
    every configured metric against its expected value (rel. tol. RTOL).

    The markdown report is always generated, even when a metric misses its
    target; the final assert then fails the test.

    Fixes over the original: three conditionals had byte-identical branches
    (the success marker printed an empty string either way, the reported
    value ternary had equal arms — and keyed off the aggregate ``success``
    instead of ``task_success`` — and the stderr-key ternary had equal
    arms), so they were dead code; they are simplified and the success
    marker now prints the actual per-metric boolean.
    """
    eval_config = yaml.safe_load(config_filename.read_text(encoding="utf-8"))
    model_args = build_model_args(eval_config, tp_size)

    eval_params = {
        "model": eval_config.get("model", "vllm"),
        "model_args": model_args,
        "tasks": [task["name"] for task in eval_config["tasks"]],
        "apply_chat_template": eval_config.get("apply_chat_template", True),
        "fewshot_as_multiturn": eval_config.get("fewshot_as_multiturn", True),
        "limit": eval_config.get("limit", None),
        "batch_size": "auto",
    }
    # Per-config overrides of the defaults above.
    for key in ("num_fewshot", "fewshot_as_multiturn", "apply_chat_template"):
        val = eval_config.get(key, None)
        if val is not None:
            eval_params[key] = val
    print("Eval Parameters:")
    print(eval_params)

    results = lm_eval.simple_evaluate(**eval_params)

    success = True
    report_data: dict[str, list[dict]] = {"rows": []}
    for task in eval_config["tasks"]:
        task_name = task["name"]
        task_result = results["results"][task_name]
        for metric in task["metrics"]:
            metric_name = metric["name"]
            ground_truth = metric["value"]
            measured_value = task_result[metric_name]
            task_success = bool(
                np.isclose(ground_truth, measured_value, rtol=RTOL))
            success = success and task_success
            print(f"{task_name} | {metric_name}: "
                  f"ground_truth={ground_truth} | measured={measured_value} | "
                  f"success={task_success}")
            # lm-eval stores the stderr of "<metric>,<filter>" under the
            # key "<metric>_stderr,<filter>".
            stderr_key = metric_name.replace(',', '_stderr,')
            report_data["rows"].append({
                "task": task_name,
                "metric": metric_name,
                "value": f"{measured_value}",
                "stderr": task_result[stderr_key],
            })

    generate_report(tp_size, eval_config, report_data, report_output,
                    env_config)
    assert success