From 891b2bfe715619affac64c6fa339944d47ebcf1a Mon Sep 17 00:00:00 2001 From: Icey <1790571317@qq.com> Date: Mon, 25 Aug 2025 09:39:30 +0800 Subject: [PATCH] Accuracy report formatting (#2279) ### What this PR does / why we need it? Accuracy report formatting ### Does this PR introduce _any_ user-facing change? no ### How was this patch tested? CI passed with existing test. - vLLM version: v0.10.0 - vLLM main: https://github.com/vllm-project/vllm/commit/53415653ff24be03e7c90f5b42ef9cb3f72aad71 --------- Signed-off-by: Icey <1790571317@qq.com> --- tests/e2e/models/report_template.md | 17 +++++++---------- tests/e2e/models/test_lm_eval_correctness.py | 11 ++++++++--- 2 files changed, 15 insertions(+), 13 deletions(-) diff --git a/tests/e2e/models/report_template.md b/tests/e2e/models/report_template.md index ddaa9c7..8402545 100644 --- a/tests/e2e/models/report_template.md +++ b/tests/e2e/models/report_template.md @@ -1,24 +1,21 @@ # {{ model_name }} -**vLLM Version**: vLLM: {{ vllm_version }} ([{{ vllm_commit[:7] }}](https://github.com/vllm-project/vllm/commit/{{ vllm_commit }})), -**vLLM Ascend Version**: {{ vllm_ascend_version }} ([{{ vllm_ascend_commit[:7] }}](https://github.com/vllm-project/vllm-ascend/commit/{{ vllm_ascend_commit }})) -**Software Environment**: CANN: {{ cann_version }}, PyTorch: {{ torch_version }}, torch-npu: {{ torch_npu_version }} -**Hardware Environment**: Atlas A2 Series -**Datasets**: {{ datasets }} -**Parallel Mode**: TP -**Execution Mode**: ACLGraph +- **vLLM Version**: vLLM: {{ vllm_version }} ([{{ vllm_commit[:7] }}](https://github.com/vllm-project/vllm/commit/{{ vllm_commit }})), **vLLM Ascend Version**: {{ vllm_ascend_version }} ([{{ vllm_ascend_commit[:7] }}](https://github.com/vllm-project/vllm-ascend/commit/{{ vllm_ascend_commit }})) +- **Software Environment**: **CANN**: {{ cann_version }}, **PyTorch**: {{ torch_version }}, **torch-npu**: {{ torch_npu_version }} +- **Hardware Environment**: Atlas A2 Series +- **Parallel mode**: {{ parallel_mode }} +- **Execution mode**: ACLGraph **Command**: ```bash export MODEL_ARGS={{ model_args }} lm_eval --model {{ model_type }} --model_args $MODEL_ARGS --tasks {{ datasets }} \ ---apply_chat_template {{ apply_chat_template }} --fewshot_as_multiturn {{ fewshot_as_multiturn }} {% if num_fewshot is defined and num_fewshot != "N/A" %} --num_fewshot {{ num_fewshot }} {% endif %} \ ---limit {{ limit }} --batch_size {{ batch_size}} +{% if apply_chat_template %} --apply_chat_template {{ apply_chat_template }} {% endif %} {% if fewshot_as_multiturn %} --fewshot_as_multiturn {{ fewshot_as_multiturn }} {% endif %} {% if num_fewshot is defined and num_fewshot != "N/A" %} --num_fewshot {{ num_fewshot }} {% endif %} {% if limit is defined and limit != "N/A" %} --limit {{ limit }} {% endif %} --batch_size {{ batch_size}} ``` | Task | Metric | Value | Stderr | |-----------------------|-------------|----------:|-------:| {% for row in rows -%} -| {{ row.task.rjust(23) }} | {{ row.metric.rjust(15) }} |{{ row.value }} | ± {{ "%.4f" | format(row.stderr | float) }} | +| {{ row.task }} | {{ row.metric }} | {{ row.value }} | ± {{ "%.4f" | format(row.stderr | float) }} | {% endfor %} diff --git a/tests/e2e/models/test_lm_eval_correctness.py b/tests/e2e/models/test_lm_eval_correctness.py index 567d3de..18768e1 100644 --- a/tests/e2e/models/test_lm_eval_correctness.py +++ b/tests/e2e/models/test_lm_eval_correctness.py @@ -65,6 +65,10 @@ def generate_report(tp_size, eval_config, report_data, report_dir, env_config): template = env.get_template("report_template.md") model_args = build_model_args(eval_config, tp_size) + parallel_mode = f"TP{model_args.get('tensor_parallel_size', 1)}" + if model_args.get('enable_expert_parallel', False): + parallel_mode += " + EP" + report_content = template.render( vllm_version=env_config.vllm_version, vllm_commit=env_config.vllm_commit, @@ -79,10 +83,11 @@ def generate_report(tp_size, eval_config, report_data, report_dir, env_config): datasets=",".join([task["name"] for task in eval_config["tasks"]]), apply_chat_template=eval_config.get("apply_chat_template", True), fewshot_as_multiturn=eval_config.get("fewshot_as_multiturn", True), - limit=eval_config.get("limit", None), + limit=eval_config.get("limit", "N/A"), batch_size="auto", num_fewshot=eval_config.get("num_fewshot", "N/A"), - rows=report_data["rows"]) + rows=report_data["rows"], + parallel_mode=parallel_mode) report_output = os.path.join( report_dir, f"{os.path.basename(eval_config['model_name'])}.md") @@ -123,7 +128,7 @@ def test_lm_eval_correctness_param(config_filename, tp_size, report_dir, for metric in task["metrics"]: metric_name = metric["name"] ground_truth = metric["value"] - measured_value = task_result[metric_name] + measured_value = round(task_result[metric_name], 4) task_success = bool( np.isclose(ground_truth, measured_value, rtol=RTOL)) success = success and task_success