Accuracy report formatting (#2279)
### What this PR does / why we need it?
Accuracy report formatting
### Does this PR introduce _any_ user-facing change?
no
### How was this patch tested?
CI passed with existing tests.
- vLLM version: v0.10.0
- vLLM main: 53415653ff
---------
Signed-off-by: Icey <1790571317@qq.com>
This commit is contained in:
@@ -1,24 +1,21 @@
|
|||||||
# {{ model_name }}
|
# {{ model_name }}
|
||||||
|
|
||||||
**vLLM Version**: vLLM: {{ vllm_version }} ([{{ vllm_commit[:7] }}](https://github.com/vllm-project/vllm/commit/{{ vllm_commit }})),
|
- **vLLM Version**: vLLM: {{ vllm_version }} ([{{ vllm_commit[:7] }}](https://github.com/vllm-project/vllm/commit/{{ vllm_commit }})), **vLLM Ascend Version**: {{ vllm_ascend_version }} ([{{ vllm_ascend_commit[:7] }}](https://github.com/vllm-project/vllm-ascend/commit/{{ vllm_ascend_commit }}))
|
||||||
**vLLM Ascend Version**: {{ vllm_ascend_version }} ([{{ vllm_ascend_commit[:7] }}](https://github.com/vllm-project/vllm-ascend/commit/{{ vllm_ascend_commit }}))
|
- **Software Environment**: **CANN**: {{ cann_version }}, **PyTorch**: {{ torch_version }}, **torch-npu**: {{ torch_npu_version }}
|
||||||
**Software Environment**: CANN: {{ cann_version }}, PyTorch: {{ torch_version }}, torch-npu: {{ torch_npu_version }}
|
- **Hardware Environment**: Atlas A2 Series
|
||||||
**Hardware Environment**: Atlas A2 Series
|
- **Parallel mode**: {{ parallel_mode }}
|
||||||
**Datasets**: {{ datasets }}
|
- **Execution mode**: ACLGraph
|
||||||
**Parallel Mode**: TP
|
|
||||||
**Execution Mode**: ACLGraph
|
|
||||||
|
|
||||||
**Command**:
|
**Command**:
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
export MODEL_ARGS={{ model_args }}
|
export MODEL_ARGS={{ model_args }}
|
||||||
lm_eval --model {{ model_type }} --model_args $MODEL_ARGS --tasks {{ datasets }} \
|
lm_eval --model {{ model_type }} --model_args $MODEL_ARGS --tasks {{ datasets }} \
|
||||||
--apply_chat_template {{ apply_chat_template }} --fewshot_as_multiturn {{ fewshot_as_multiturn }} {% if num_fewshot is defined and num_fewshot != "N/A" %} --num_fewshot {{ num_fewshot }} {% endif %} \
|
{% if apply_chat_template %} --apply_chat_template {{ apply_chat_template }} {% endif %} {% if fewshot_as_multiturn %} --fewshot_as_multiturn {{ fewshot_as_multiturn }} {% endif %} {% if num_fewshot is defined and num_fewshot != "N/A" %} --num_fewshot {{ num_fewshot }} {% endif %} {% if limit is defined and limit != "N/A" %} --limit {{ limit }} {% endif %} --batch_size {{ batch_size}}
|
||||||
--limit {{ limit }} --batch_size {{ batch_size}}
|
|
||||||
```
|
```
|
||||||
|
|
||||||
| Task | Metric | Value | Stderr |
|
| Task | Metric | Value | Stderr |
|
||||||
|-----------------------|-------------|----------:|-------:|
|
|-----------------------|-------------|----------:|-------:|
|
||||||
{% for row in rows -%}
|
{% for row in rows -%}
|
||||||
| {{ row.task.rjust(23) }} | {{ row.metric.rjust(15) }} |{{ row.value }} | ± {{ "%.4f" | format(row.stderr | float) }} |
|
| {{ row.task }} | {{ row.metric }} | {{ row.value }} | ± {{ "%.4f" | format(row.stderr | float) }} |
|
||||||
{% endfor %}
|
{% endfor %}
|
||||||
|
|||||||
@@ -65,6 +65,10 @@ def generate_report(tp_size, eval_config, report_data, report_dir, env_config):
|
|||||||
template = env.get_template("report_template.md")
|
template = env.get_template("report_template.md")
|
||||||
model_args = build_model_args(eval_config, tp_size)
|
model_args = build_model_args(eval_config, tp_size)
|
||||||
|
|
||||||
|
parallel_mode = f"TP{model_args.get('tensor_parallel_size', 1)}"
|
||||||
|
if model_args.get('enable_expert_parallel', False):
|
||||||
|
parallel_mode += " + EP"
|
||||||
|
|
||||||
report_content = template.render(
|
report_content = template.render(
|
||||||
vllm_version=env_config.vllm_version,
|
vllm_version=env_config.vllm_version,
|
||||||
vllm_commit=env_config.vllm_commit,
|
vllm_commit=env_config.vllm_commit,
|
||||||
@@ -79,10 +83,11 @@ def generate_report(tp_size, eval_config, report_data, report_dir, env_config):
|
|||||||
datasets=",".join([task["name"] for task in eval_config["tasks"]]),
|
datasets=",".join([task["name"] for task in eval_config["tasks"]]),
|
||||||
apply_chat_template=eval_config.get("apply_chat_template", True),
|
apply_chat_template=eval_config.get("apply_chat_template", True),
|
||||||
fewshot_as_multiturn=eval_config.get("fewshot_as_multiturn", True),
|
fewshot_as_multiturn=eval_config.get("fewshot_as_multiturn", True),
|
||||||
limit=eval_config.get("limit", None),
|
limit=eval_config.get("limit", "N/A"),
|
||||||
batch_size="auto",
|
batch_size="auto",
|
||||||
num_fewshot=eval_config.get("num_fewshot", "N/A"),
|
num_fewshot=eval_config.get("num_fewshot", "N/A"),
|
||||||
rows=report_data["rows"])
|
rows=report_data["rows"],
|
||||||
|
parallel_mode=parallel_mode)
|
||||||
|
|
||||||
report_output = os.path.join(
|
report_output = os.path.join(
|
||||||
report_dir, f"{os.path.basename(eval_config['model_name'])}.md")
|
report_dir, f"{os.path.basename(eval_config['model_name'])}.md")
|
||||||
@@ -123,7 +128,7 @@ def test_lm_eval_correctness_param(config_filename, tp_size, report_dir,
|
|||||||
for metric in task["metrics"]:
|
for metric in task["metrics"]:
|
||||||
metric_name = metric["name"]
|
metric_name = metric["name"]
|
||||||
ground_truth = metric["value"]
|
ground_truth = metric["value"]
|
||||||
measured_value = task_result[metric_name]
|
measured_value = round(task_result[metric_name], 4)
|
||||||
task_success = bool(
|
task_success = bool(
|
||||||
np.isclose(ground_truth, measured_value, rtol=RTOL))
|
np.isclose(ground_truth, measured_value, rtol=RTOL))
|
||||||
success = success and task_success
|
success = success and task_success
|
||||||
|
|||||||
Reference in New Issue
Block a user