v0.10.1rc1

2025-09-09 09:40:35 +08:00
parent d6f6ef41fe
commit 9149384e03
432 changed files with 84698 additions and 1 deletion

View File

@@ -0,0 +1,13 @@
model_name: "deepseek-ai/DeepSeek-V2-Lite"
tasks:
- name: "gsm8k"
metrics:
- name: "exact_match,strict-match"
value: 0.375
- name: "exact_match,flexible-extract"
value: 0.375
tensor_parallel_size: 2
apply_chat_template: False
fewshot_as_multiturn: False
trust_remote_code: True
enforce_eager: True
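These config files are plain YAML consumed by the test suite added below; a minimal sketch of parsing one (file name assumed):

```python
# Minimal sketch of reading one of the configs above (path assumed).
import yaml

with open("DeepSeek-V2-Lite.yaml", encoding="utf-8") as f:
    cfg = yaml.safe_load(f)

print(cfg["model_name"])  # deepseek-ai/DeepSeek-V2-Lite
for task in cfg["tasks"]:
    for metric in task["metrics"]:
        # e.g. gsm8k exact_match,strict-match 0.375
        print(task["name"], metric["name"], metric["value"])
```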

View File

@@ -0,0 +1,8 @@
model_name: "Qwen/Qwen2.5-VL-7B-Instruct"
model: "vllm-vlm"
tasks:
- name: "mmmu_val"
metrics:
- name: "acc,none"
value: 0.51
max_model_len: 8192
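The `model: "vllm-vlm"` key selects lm-eval's multimodal vLLM backend instead of the default `vllm`. Roughly, the test module below ends up making a call like this (a sketch only, arguments abridged):

```python
# Sketch: how the config above maps onto an lm-eval call.
import lm_eval

results = lm_eval.simple_evaluate(
    model="vllm-vlm",  # from the "model" key; defaults to "vllm" otherwise
    model_args={
        "pretrained": "Qwen/Qwen2.5-VL-7B-Instruct",
        "max_model_len": 8192,  # from max_model_len above
        "dtype": "auto",
    },
    tasks=["mmmu_val"],
    batch_size="auto",
)
print(results["results"]["mmmu_val"]["acc,none"])
```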

View File

@@ -0,0 +1,18 @@
model_name: "Qwen/Qwen3-30B-A3B"
tasks:
- name: "gsm8k"
metrics:
- name: "exact_match,strict-match"
value: 0.89
- name: "exact_match,flexible-extract"
value: 0.85
- name: "ceval-valid"
metrics:
- name: "acc,none"
value: 0.84
num_fewshot: 5
gpu_memory_utilization: 0.6
enable_expert_parallel: True
tensor_parallel_size: 2
apply_chat_template: False
fewshot_as_multiturn: False
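With `enable_expert_parallel` and `tensor_parallel_size` both set, the harness labels the run "TP2 + EP" and flattens the engine arguments into the `MODEL_ARGS` string shown in the report. A sketch of that flattening, mirroring `generate_report` below:

```python
# Sketch: flattening the config above into an lm-eval model_args string.
model_args = {
    "pretrained": "Qwen/Qwen3-30B-A3B",
    "tensor_parallel_size": 2,
    "dtype": "auto",
    "gpu_memory_utilization": 0.6,
    "enable_expert_parallel": True,
}
flat = ",".join(f"{k}={v}" for k, v in model_args.items())
# pretrained=Qwen/Qwen3-30B-A3B,tensor_parallel_size=2,dtype=auto,...
print(flat)
```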

View File

@@ -0,0 +1,13 @@
model_name: "Qwen/Qwen3-8B-Base"
tasks:
- name: "gsm8k"
metrics:
- name: "exact_match,strict-match"
value: 0.82
- name: "exact_match,flexible-extract"
value: 0.83
- name: "ceval-valid"
metrics:
- name: "acc,none"
value: 0.82
num_fewshot: 5

View File

@@ -0,0 +1,3 @@
Qwen3-8B-Base.yaml
Qwen2.5-VL-7B-Instruct.yaml
Qwen3-30B-A3B.yaml

View File

@@ -0,0 +1,72 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from pathlib import Path

import pytest


def pytest_addoption(parser):
    parser.addoption(
        "--config-list-file",
        action="store",
        default=None,
        help="Path to the file listing model config YAMLs (one per line)",
    )
    parser.addoption(
        "--tp-size",
        action="store",
        default="1",
        help="Tensor parallel size to use for evaluation",
    )
    parser.addoption(
        "--config",
        action="store",
        default="./tests/e2e/models/configs/Qwen3-8B-Base.yaml",
        help="Path to the model config YAML file",
    )
    parser.addoption(
        "--report-dir",
        action="store",
        default="./benchmarks/accuracy",
        help="Directory to store report files",
    )


@pytest.fixture(scope="session")
def config_list_file(pytestconfig):
    # Resolve the list file relative to the invocation directory.
    rel_path = pytestconfig.getoption("--config-list-file")
    return Path(rel_path).resolve() if rel_path else None


@pytest.fixture(scope="session")
def tp_size(pytestconfig):
    return pytestconfig.getoption("--tp-size")


@pytest.fixture(scope="session")
def config(pytestconfig):
    return pytestconfig.getoption("--config")


@pytest.fixture(scope="session")
def report_dir(pytestconfig):
    return pytestconfig.getoption("--report-dir")


def pytest_generate_tests(metafunc):
    if "config_filename" in metafunc.fixturenames:
        if metafunc.config.getoption("--config-list-file"):
            # Parametrize over every config listed in the file; entries are
            # resolved relative to the list file's own directory.
            rel_path = metafunc.config.getoption("--config-list-file")
            config_list_file = Path(rel_path).resolve()
            config_dir = config_list_file.parent
            with open(config_list_file, encoding="utf-8") as f:
                configs = [
                    config_dir / line.strip() for line in f
                    if line.strip() and not line.strip().startswith("#")
                ]
            metafunc.parametrize("config_filename", configs)
        else:
            single_config = metafunc.config.getoption("--config")
            config_path = Path(single_config).resolve()
            metafunc.parametrize("config_filename", [config_path])
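With these options in place, a run can target a single config or a list file. A hypothetical invocation through pytest's Python entry point (the test module path and list file name are assumptions):

```python
# Hypothetical invocation sketch; module and list file names are assumed.
import pytest

pytest.main([
    "tests/e2e/models/test_lm_eval_correctness.py",
    "--config-list-file=tests/e2e/models/configs/accuracy.txt",
    "--tp-size=2",
    "--report-dir=./benchmarks/accuracy",
    "-s",
])
```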

View File

@@ -0,0 +1,21 @@
# {{ model_name }}

- **vLLM Version**: {{ vllm_version }} ([{{ vllm_commit[:7] }}](https://github.com/vllm-project/vllm/commit/{{ vllm_commit }})), **vLLM Ascend Version**: {{ vllm_ascend_version }} ([{{ vllm_ascend_commit[:7] }}](https://github.com/vllm-project/vllm-ascend/commit/{{ vllm_ascend_commit }}))
- **Software Environment**: **CANN**: {{ cann_version }}, **PyTorch**: {{ torch_version }}, **torch-npu**: {{ torch_npu_version }}
- **Hardware Environment**: Atlas A2 Series
- **Parallel mode**: {{ parallel_mode }}
- **Execution mode**: ACLGraph

**Command**:

```bash
export MODEL_ARGS={{ model_args }}
lm_eval --model {{ model_type }} --model_args $MODEL_ARGS --tasks {{ datasets }} \
{% if apply_chat_template %} --apply_chat_template {{ apply_chat_template }} {% endif %} {% if fewshot_as_multiturn %} --fewshot_as_multiturn {{ fewshot_as_multiturn }} {% endif %} {% if num_fewshot is defined and num_fewshot != "N/A" %} --num_fewshot {{ num_fewshot }} {% endif %} {% if limit is defined and limit != "N/A" %} --limit {{ limit }} {% endif %} --batch_size {{ batch_size }}
```

| Task | Metric | Value | Stderr |
|-----------------------|-------------|----------:|-------:|
{% for row in rows -%}
| {{ row.task }} | {{ row.metric }} | {{ row.value }} | ± {{ "%.4f" | format(row.stderr | float) }} |
{% endfor %}
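The template can also be exercised standalone; a small sketch rendering it with illustrative placeholder values (assuming the template sits in the current directory, and no values below are real results):

```python
# Sketch: rendering report_template.md outside the test run.
# All values below are illustrative placeholders, not measurements.
from jinja2 import Environment, FileSystemLoader

env = Environment(loader=FileSystemLoader("."))
template = env.get_template("report_template.md")
print(template.render(
    model_name="Qwen/Qwen3-8B-Base",
    vllm_version="v0.10.1", vllm_commit="0" * 40,
    vllm_ascend_version="v0.10.1rc1", vllm_ascend_commit="1" * 40,
    cann_version="unknown", torch_version="unknown",
    torch_npu_version="unknown", parallel_mode="TP1",
    model_type="vllm", datasets="gsm8k",
    model_args="'pretrained=Qwen/Qwen3-8B-Base,tensor_parallel_size=1'",
    apply_chat_template=False, fewshot_as_multiturn=False,
    num_fewshot=5, limit="N/A", batch_size="auto",
    rows=[{"task": "gsm8k", "metric": "exact_match,strict-match",
           "value": "0.82", "stderr": "0.0106"}],
))
```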

View File

@@ -0,0 +1,153 @@
import os
from dataclasses import dataclass

import lm_eval
import numpy as np
import pytest
import yaml
from jinja2 import Environment, FileSystemLoader

# Relative tolerance for comparing measured scores against ground truth.
RTOL = 0.03
TEST_DIR = os.path.dirname(__file__)


@dataclass
class EnvConfig:
    vllm_version: str
    vllm_commit: str
    vllm_ascend_version: str
    vllm_ascend_commit: str
    cann_version: str
    torch_version: str
    torch_npu_version: str


@pytest.fixture
def env_config() -> EnvConfig:
    # Version metadata is injected by CI through environment variables.
    return EnvConfig(
        vllm_version=os.getenv("VLLM_VERSION", "unknown"),
        vllm_commit=os.getenv("VLLM_COMMIT", "unknown"),
        vllm_ascend_version=os.getenv("VLLM_ASCEND_VERSION", "unknown"),
        vllm_ascend_commit=os.getenv("VLLM_ASCEND_COMMIT", "unknown"),
        cann_version=os.getenv("CANN_VERSION", "unknown"),
        torch_version=os.getenv("TORCH_VERSION", "unknown"),
        torch_npu_version=os.getenv("TORCH_NPU_VERSION", "unknown"),
    )


def build_model_args(eval_config, tp_size):
    # Required engine arguments first, then any optional overrides that are
    # present in the YAML config.
    trust_remote_code = eval_config.get("trust_remote_code", False)
    max_model_len = eval_config.get("max_model_len", 4096)
    model_args = {
        "pretrained": eval_config["model_name"],
        "tensor_parallel_size": tp_size,
        "dtype": "auto",
        "trust_remote_code": trust_remote_code,
        "max_model_len": max_model_len,
    }
    for s in [
            "max_images", "gpu_memory_utilization", "enable_expert_parallel",
            "tensor_parallel_size", "enforce_eager"
    ]:
        val = eval_config.get(s, None)
        if val is not None:
            model_args[s] = val
    print("Model Parameters:")
    print(model_args)
    return model_args


def generate_report(tp_size, eval_config, report_data, report_dir, env_config):
    env = Environment(loader=FileSystemLoader(TEST_DIR))
    template = env.get_template("report_template.md")
    model_args = build_model_args(eval_config, tp_size)
    parallel_mode = f"TP{model_args.get('tensor_parallel_size', 1)}"
    if model_args.get("enable_expert_parallel", False):
        parallel_mode += " + EP"
    report_content = template.render(
        vllm_version=env_config.vllm_version,
        vllm_commit=env_config.vllm_commit,
        vllm_ascend_version=env_config.vllm_ascend_version,
        vllm_ascend_commit=env_config.vllm_ascend_commit,
        cann_version=env_config.cann_version,
        torch_version=env_config.torch_version,
        torch_npu_version=env_config.torch_npu_version,
        model_name=eval_config["model_name"],
        model_args=f"'{','.join(f'{k}={v}' for k, v in model_args.items())}'",
        model_type=eval_config.get("model", "vllm"),
        datasets=",".join([task["name"] for task in eval_config["tasks"]]),
        apply_chat_template=eval_config.get("apply_chat_template", True),
        fewshot_as_multiturn=eval_config.get("fewshot_as_multiturn", True),
        limit=eval_config.get("limit", "N/A"),
        batch_size="auto",
        num_fewshot=eval_config.get("num_fewshot", "N/A"),
        rows=report_data["rows"],
        parallel_mode=parallel_mode)
    report_output = os.path.join(
        report_dir, f"{os.path.basename(eval_config['model_name'])}.md")
    os.makedirs(os.path.dirname(report_output), exist_ok=True)
    with open(report_output, 'w', encoding='utf-8') as f:
        f.write(report_content)


def test_lm_eval_correctness_param(config_filename, tp_size, report_dir,
                                   env_config):
    eval_config = yaml.safe_load(config_filename.read_text(encoding="utf-8"))
    model_args = build_model_args(eval_config, tp_size)
    success = True
    report_data: dict[str, list[dict]] = {"rows": []}
    eval_params = {
        "model": eval_config.get("model", "vllm"),
        "model_args": model_args,
        "tasks": [task["name"] for task in eval_config["tasks"]],
        "apply_chat_template": eval_config.get("apply_chat_template", True),
        "fewshot_as_multiturn": eval_config.get("fewshot_as_multiturn", True),
        "limit": eval_config.get("limit", None),
        "batch_size": "auto",
    }
    for s in ["num_fewshot", "fewshot_as_multiturn", "apply_chat_template"]:
        val = eval_config.get(s, None)
        if val is not None:
            eval_params[s] = val
    print("Eval Parameters:")
    print(eval_params)
    results = lm_eval.simple_evaluate(**eval_params)
    for task in eval_config["tasks"]:
        task_name = task["name"]
        task_result = results["results"][task_name]
        for metric in task["metrics"]:
            metric_name = metric["name"]
            ground_truth = metric["value"]
            measured_value = round(task_result[metric_name], 4)
            # Pass when |ground_truth - measured| <= RTOL * |measured|
            # (plus np.isclose's tiny default atol).
            task_success = bool(
                np.isclose(ground_truth, measured_value, rtol=RTOL))
            success = success and task_success
            print(f"{task_name} | {metric_name}: "
                  f"ground_truth={ground_truth} | measured={measured_value} | "
                  f"success={'✅' if task_success else '❌'}")
            report_data["rows"].append({
                "task": task_name,
                "metric": metric_name,
                "value": f"✅ {measured_value}"
                if task_success else f"❌ {measured_value}",
                # lm-eval stores stderr under e.g.
                # "exact_match_stderr,strict-match" / "acc_stderr,none".
                "stderr": task_result[metric_name.replace(',', '_stderr,')],
            })
    generate_report(tp_size, eval_config, report_data, report_dir, env_config)
    assert success
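As a concrete illustration of the RTOL = 0.03 gate used above, with made-up numbers:

```python
# Illustration of the acceptance window; the numbers are made up.
import numpy as np

ground_truth, measured = 0.375, 0.365
# np.isclose checks |a - b| <= atol + rtol * |b|:
# |0.375 - 0.365| = 0.010 <= 1e-8 + 0.03 * 0.365 ≈ 0.01095 → passes
print(np.isclose(ground_truth, measured, rtol=0.03))  # True
print(np.isclose(ground_truth, 0.360, rtol=0.03))     # False (gap 0.015)
```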