v0.10.1rc1

2025-09-09 09:40:35 +08:00
parent d6f6ef41fe
commit 9149384e03
432 changed files with 84698 additions and 1 deletion

View File

@@ -0,0 +1,13 @@
model_name: "deepseek-ai/DeepSeek-V2-Lite"
tasks:
- name: "gsm8k"
metrics:
- name: "exact_match,strict-match"
value: 0.375
- name: "exact_match,flexible-extract"
value: 0.375
tensor_parallel_size: 2
apply_chat_template: False
fewshot_as_multiturn: False
trust_remote_code: True
enforce_eager: True
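These config files are plain YAML consumed by the test suite added below; a minimal sketch of parsing one (file name assumed):

```python
# Minimal sketch of reading one of the configs above (path assumed).
import yaml

with open("DeepSeek-V2-Lite.yaml", encoding="utf-8") as f:
    cfg = yaml.safe_load(f)

print(cfg["model_name"])  # deepseek-ai/DeepSeek-V2-Lite
for task in cfg["tasks"]:
    for metric in task["metrics"]:
        # e.g. gsm8k exact_match,strict-match 0.375
        print(task["name"], metric["name"], metric["value"])
```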

View File

@@ -0,0 +1,8 @@
model_name: "Qwen/Qwen2.5-VL-7B-Instruct"
model: "vllm-vlm"
tasks:
- name: "mmmu_val"
metrics:
- name: "acc,none"
value: 0.51
max_model_len: 8192
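The `model: "vllm-vlm"` key selects lm-eval's multimodal vLLM backend instead of the default `vllm`. Roughly, the test module below ends up making a call like this (a sketch only, arguments abridged):

```python
# Sketch: how the config above maps onto an lm-eval call.
import lm_eval

results = lm_eval.simple_evaluate(
    model="vllm-vlm",  # from the "model" key; defaults to "vllm" otherwise
    model_args={
        "pretrained": "Qwen/Qwen2.5-VL-7B-Instruct",
        "max_model_len": 8192,  # from max_model_len above
        "dtype": "auto",
    },
    tasks=["mmmu_val"],
    batch_size="auto",
)
print(results["results"]["mmmu_val"]["acc,none"])
```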

View File

@@ -0,0 +1,18 @@
model_name: "Qwen/Qwen3-30B-A3B"
tasks:
- name: "gsm8k"
metrics:
- name: "exact_match,strict-match"
value: 0.89
- name: "exact_match,flexible-extract"
value: 0.85
- name: "ceval-valid"
metrics:
- name: "acc,none"
value: 0.84
num_fewshot: 5
gpu_memory_utilization: 0.6
enable_expert_parallel: True
tensor_parallel_size: 2
apply_chat_template: False
fewshot_as_multiturn: False
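With `enable_expert_parallel` and `tensor_parallel_size` both set, the harness labels the run "TP2 + EP" and flattens the engine arguments into the `MODEL_ARGS` string shown in the report. A sketch of that flattening, mirroring `generate_report` below:

```python
# Sketch: flattening the config above into an lm-eval model_args string.
model_args = {
    "pretrained": "Qwen/Qwen3-30B-A3B",
    "tensor_parallel_size": 2,
    "dtype": "auto",
    "gpu_memory_utilization": 0.6,
    "enable_expert_parallel": True,
}
flat = ",".join(f"{k}={v}" for k, v in model_args.items())
# pretrained=Qwen/Qwen3-30B-A3B,tensor_parallel_size=2,dtype=auto,...
print(flat)
```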

View File

@@ -0,0 +1,13 @@
model_name: "Qwen/Qwen3-8B-Base"
tasks:
- name: "gsm8k"
metrics:
- name: "exact_match,strict-match"
value: 0.82
- name: "exact_match,flexible-extract"
value: 0.83
- name: "ceval-valid"
metrics:
- name: "acc,none"
value: 0.82
num_fewshot: 5

View File

@@ -0,0 +1,3 @@
Qwen3-8B-Base.yaml
Qwen2.5-VL-7B-Instruct.yaml
Qwen3-30B-A3B.yaml

View File

@@ -0,0 +1,72 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from pathlib import Path

import pytest


def pytest_addoption(parser):
    parser.addoption(
        "--config-list-file",
        action="store",
        default=None,
        help="Path to the file listing model config YAMLs (one per line)",
    )
    parser.addoption(
        "--tp-size",
        action="store",
        default="1",
        help="Tensor parallel size to use for evaluation",
    )
    parser.addoption(
        "--config",
        action="store",
        default="./tests/e2e/models/configs/Qwen3-8B-Base.yaml",
        help="Path to the model config YAML file",
    )
    parser.addoption(
        "--report-dir",
        action="store",
        default="./benchmarks/accuracy",
        help="Directory to store report files",
    )


@pytest.fixture(scope="session")
def config_list_file(pytestconfig):
    # Resolve the list file relative to the invocation directory.
    rel_path = pytestconfig.getoption("--config-list-file")
    return Path(rel_path).resolve() if rel_path else None


@pytest.fixture(scope="session")
def tp_size(pytestconfig):
    return pytestconfig.getoption("--tp-size")


@pytest.fixture(scope="session")
def config(pytestconfig):
    return pytestconfig.getoption("--config")


@pytest.fixture(scope="session")
def report_dir(pytestconfig):
    return pytestconfig.getoption("--report-dir")


def pytest_generate_tests(metafunc):
    if "config_filename" in metafunc.fixturenames:
        if metafunc.config.getoption("--config-list-file"):
            # Parametrize over every config listed in the file; entries are
            # resolved relative to the list file's own directory.
            rel_path = metafunc.config.getoption("--config-list-file")
            config_list_file = Path(rel_path).resolve()
            config_dir = config_list_file.parent
            with open(config_list_file, encoding="utf-8") as f:
                configs = [
                    config_dir / line.strip() for line in f
                    if line.strip() and not line.strip().startswith("#")
                ]
            metafunc.parametrize("config_filename", configs)
        else:
            single_config = metafunc.config.getoption("--config")
            config_path = Path(single_config).resolve()
            metafunc.parametrize("config_filename", [config_path])
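With these options in place, a run can target a single config or a list file. A hypothetical invocation through pytest's Python entry point (the test module path and list file name are assumptions):

```python
# Hypothetical invocation sketch; module and list file names are assumed.
import pytest

pytest.main([
    "tests/e2e/models/test_lm_eval_correctness.py",
    "--config-list-file=tests/e2e/models/configs/accuracy.txt",
    "--tp-size=2",
    "--report-dir=./benchmarks/accuracy",
    "-s",
])
```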

View File

@@ -0,0 +1,21 @@
# {{ model_name }}

- **vLLM Version**: {{ vllm_version }} ([{{ vllm_commit[:7] }}](https://github.com/vllm-project/vllm/commit/{{ vllm_commit }})), **vLLM Ascend Version**: {{ vllm_ascend_version }} ([{{ vllm_ascend_commit[:7] }}](https://github.com/vllm-project/vllm-ascend/commit/{{ vllm_ascend_commit }}))
- **Software Environment**: **CANN**: {{ cann_version }}, **PyTorch**: {{ torch_version }}, **torch-npu**: {{ torch_npu_version }}
- **Hardware Environment**: Atlas A2 Series
- **Parallel mode**: {{ parallel_mode }}
- **Execution mode**: ACLGraph

**Command**:

```bash
export MODEL_ARGS={{ model_args }}
lm_eval --model {{ model_type }} --model_args $MODEL_ARGS --tasks {{ datasets }} \
{% if apply_chat_template %} --apply_chat_template {{ apply_chat_template }} {% endif %} {% if fewshot_as_multiturn %} --fewshot_as_multiturn {{ fewshot_as_multiturn }} {% endif %} {% if num_fewshot is defined and num_fewshot != "N/A" %} --num_fewshot {{ num_fewshot }} {% endif %} {% if limit is defined and limit != "N/A" %} --limit {{ limit }} {% endif %} --batch_size {{ batch_size }}
```

| Task | Metric | Value | Stderr |
|-----------------------|-------------|----------:|-------:|
{% for row in rows -%}
| {{ row.task }} | {{ row.metric }} | {{ row.value }} | ± {{ "%.4f" | format(row.stderr | float) }} |
{% endfor %}
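The template can also be exercised standalone; a small sketch rendering it with illustrative placeholder values (assuming the template sits in the current directory, and no values below are real results):

```python
# Sketch: rendering report_template.md outside the test run.
# All values below are illustrative placeholders, not measurements.
from jinja2 import Environment, FileSystemLoader

env = Environment(loader=FileSystemLoader("."))
template = env.get_template("report_template.md")
print(template.render(
    model_name="Qwen/Qwen3-8B-Base",
    vllm_version="v0.10.1", vllm_commit="0" * 40,
    vllm_ascend_version="v0.10.1rc1", vllm_ascend_commit="1" * 40,
    cann_version="unknown", torch_version="unknown",
    torch_npu_version="unknown", parallel_mode="TP1",
    model_type="vllm", datasets="gsm8k",
    model_args="'pretrained=Qwen/Qwen3-8B-Base,tensor_parallel_size=1'",
    apply_chat_template=False, fewshot_as_multiturn=False,
    num_fewshot=5, limit="N/A", batch_size="auto",
    rows=[{"task": "gsm8k", "metric": "exact_match,strict-match",
           "value": "0.82", "stderr": "0.0106"}],
))
```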

View File

@@ -0,0 +1,153 @@
import os
from dataclasses import dataclass

import lm_eval
import numpy as np
import pytest
import yaml
from jinja2 import Environment, FileSystemLoader

# Relative tolerance for comparing measured scores against ground truth.
RTOL = 0.03
TEST_DIR = os.path.dirname(__file__)


@dataclass
class EnvConfig:
    vllm_version: str
    vllm_commit: str
    vllm_ascend_version: str
    vllm_ascend_commit: str
    cann_version: str
    torch_version: str
    torch_npu_version: str


@pytest.fixture
def env_config() -> EnvConfig:
    # Version metadata is injected by CI through environment variables.
    return EnvConfig(
        vllm_version=os.getenv("VLLM_VERSION", "unknown"),
        vllm_commit=os.getenv("VLLM_COMMIT", "unknown"),
        vllm_ascend_version=os.getenv("VLLM_ASCEND_VERSION", "unknown"),
        vllm_ascend_commit=os.getenv("VLLM_ASCEND_COMMIT", "unknown"),
        cann_version=os.getenv("CANN_VERSION", "unknown"),
        torch_version=os.getenv("TORCH_VERSION", "unknown"),
        torch_npu_version=os.getenv("TORCH_NPU_VERSION", "unknown"),
    )


def build_model_args(eval_config, tp_size):
    # Required engine arguments first, then any optional overrides that are
    # present in the YAML config.
    trust_remote_code = eval_config.get("trust_remote_code", False)
    max_model_len = eval_config.get("max_model_len", 4096)
    model_args = {
        "pretrained": eval_config["model_name"],
        "tensor_parallel_size": tp_size,
        "dtype": "auto",
        "trust_remote_code": trust_remote_code,
        "max_model_len": max_model_len,
    }
    for s in [
            "max_images", "gpu_memory_utilization", "enable_expert_parallel",
            "tensor_parallel_size", "enforce_eager"
    ]:
        val = eval_config.get(s, None)
        if val is not None:
            model_args[s] = val
    print("Model Parameters:")
    print(model_args)
    return model_args


def generate_report(tp_size, eval_config, report_data, report_dir, env_config):
    env = Environment(loader=FileSystemLoader(TEST_DIR))
    template = env.get_template("report_template.md")
    model_args = build_model_args(eval_config, tp_size)
    parallel_mode = f"TP{model_args.get('tensor_parallel_size', 1)}"
    if model_args.get("enable_expert_parallel", False):
        parallel_mode += " + EP"
    report_content = template.render(
        vllm_version=env_config.vllm_version,
        vllm_commit=env_config.vllm_commit,
        vllm_ascend_version=env_config.vllm_ascend_version,
        vllm_ascend_commit=env_config.vllm_ascend_commit,
        cann_version=env_config.cann_version,
        torch_version=env_config.torch_version,
        torch_npu_version=env_config.torch_npu_version,
        model_name=eval_config["model_name"],
        model_args=f"'{','.join(f'{k}={v}' for k, v in model_args.items())}'",
        model_type=eval_config.get("model", "vllm"),
        datasets=",".join([task["name"] for task in eval_config["tasks"]]),
        apply_chat_template=eval_config.get("apply_chat_template", True),
        fewshot_as_multiturn=eval_config.get("fewshot_as_multiturn", True),
        limit=eval_config.get("limit", "N/A"),
        batch_size="auto",
        num_fewshot=eval_config.get("num_fewshot", "N/A"),
        rows=report_data["rows"],
        parallel_mode=parallel_mode)
    report_output = os.path.join(
        report_dir, f"{os.path.basename(eval_config['model_name'])}.md")
    os.makedirs(os.path.dirname(report_output), exist_ok=True)
    with open(report_output, 'w', encoding='utf-8') as f:
        f.write(report_content)


def test_lm_eval_correctness_param(config_filename, tp_size, report_dir,
                                   env_config):
    eval_config = yaml.safe_load(config_filename.read_text(encoding="utf-8"))
    model_args = build_model_args(eval_config, tp_size)
    success = True
    report_data: dict[str, list[dict]] = {"rows": []}
    eval_params = {
        "model": eval_config.get("model", "vllm"),
        "model_args": model_args,
        "tasks": [task["name"] for task in eval_config["tasks"]],
        "apply_chat_template": eval_config.get("apply_chat_template", True),
        "fewshot_as_multiturn": eval_config.get("fewshot_as_multiturn", True),
        "limit": eval_config.get("limit", None),
        "batch_size": "auto",
    }
    for s in ["num_fewshot", "fewshot_as_multiturn", "apply_chat_template"]:
        val = eval_config.get(s, None)
        if val is not None:
            eval_params[s] = val
    print("Eval Parameters:")
    print(eval_params)
    results = lm_eval.simple_evaluate(**eval_params)
    for task in eval_config["tasks"]:
        task_name = task["name"]
        task_result = results["results"][task_name]
        for metric in task["metrics"]:
            metric_name = metric["name"]
            ground_truth = metric["value"]
            measured_value = round(task_result[metric_name], 4)
            # Pass when |ground_truth - measured| <= RTOL * |measured|
            # (plus np.isclose's tiny default atol).
            task_success = bool(
                np.isclose(ground_truth, measured_value, rtol=RTOL))
            success = success and task_success
            print(f"{task_name} | {metric_name}: "
                  f"ground_truth={ground_truth} | measured={measured_value} | "
                  f"success={'✅' if task_success else '❌'}")
            report_data["rows"].append({
                "task": task_name,
                "metric": metric_name,
                "value": f"✅ {measured_value}"
                if task_success else f"❌ {measured_value}",
                # lm-eval stores stderr under e.g.
                # "exact_match_stderr,strict-match" / "acc_stderr,none".
                "stderr": task_result[metric_name.replace(',', '_stderr,')],
            })
    generate_report(tp_size, eval_config, report_data, report_dir, env_config)
    assert success
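As a concrete illustration of the RTOL = 0.03 gate used above, with made-up numbers:

```python
# Illustration of the acceptance window; the numbers are made up.
import numpy as np

ground_truth, measured = 0.375, 0.365
# np.isclose checks |a - b| <= atol + rtol * |b|:
# |0.375 - 0.365| = 0.010 <= 1e-8 + 0.03 * 0.365 ≈ 0.01095 → passes
print(np.isclose(ground_truth, measured, rtol=0.03))  # True
print(np.isclose(ground_truth, 0.360, rtol=0.03))     # False (gap 0.015)
```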