初始化项目,由ModelHub XC社区提供模型
Model: OdaxAI/DANTE-Mosaic-3.5B Source: Original Platform
This commit is contained in:
212
evaluation/parse_canonical_results.py
Normal file
212
evaluation/parse_canonical_results.py
Normal file
@@ -0,0 +1,212 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
parse_canonical_results.py
|
||||
==========================
|
||||
Reads all lm-eval and bigcode JSON outputs from evaluation/results/canonical/
|
||||
and produces:
|
||||
1. A clean summary table printed to stdout
|
||||
2. evaluation/results/canonical/CANONICAL_SUMMARY.json
|
||||
3. evaluation/results/canonical/CANONICAL_PROVENANCE.md
|
||||
|
||||
Usage:
|
||||
python3 evaluation/parse_canonical_results.py
|
||||
python3 evaluation/parse_canonical_results.py --results-dir evaluation/results/canonical
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
import argparse
|
||||
import json
|
||||
import glob
|
||||
import os
|
||||
import sys
|
||||
from datetime import datetime
|
||||
from pathlib import Path
|
||||
|
||||
# ─── Metric keys per task ────────────────────────────────────────────────────
# lm-evaluation-harness tasks. Each record maps the harness task key to:
#   label   - human-readable benchmark name used in tables,
#   metric  - the exact key read from the harness results JSON,
#   fewshot - few-shot count the run was configured with,
#   n       - number of evaluation items in the task.
TASK_META = {
    "mmlu": {"label": "MMLU", "metric": "acc,none", "fewshot": 5, "n": 14042},
    "mmlu_pro": {"label": "MMLU-Pro", "metric": "acc,none", "fewshot": 5, "n": 4500},
    "gsm8k": {"label": "GSM8K", "metric": "exact_match,strict-match", "fewshot": 8, "n": 1319},
    "arc_challenge": {"label": "ARC-Challenge", "metric": "acc_norm,none", "fewshot": 25, "n": 1172},
    "hellaswag": {"label": "HellaSwag", "metric": "acc_norm,none", "fewshot": 10, "n": 10042},
    "truthfulqa_mc2": {"label": "TruthfulQA MC2", "metric": "mc2,none", "fewshot": 0, "n": 817},
    "winogrande": {"label": "Winogrande", "metric": "acc,none", "fewshot": 5, "n": 1267},
    "ifeval": {"label": "IFEval", "metric": "prompt_level_strict_acc,none", "fewshot": 0, "n": 541},
}

# bigcode-evaluation-harness code-generation tasks; same record shape as above.
CODE_TASKS = {
    "humaneval": {"label": "HumanEval", "metric": "pass@1", "fewshot": 0, "n": 164},
    "mbpp": {"label": "MBPP", "metric": "pass@1", "fewshot": 0, "n": 374},
}
|
||||
|
||||
|
||||
def find_latest(pattern: str) -> str | None:
|
||||
files = sorted(glob.glob(pattern))
|
||||
return files[-1] if files else None
|
||||
|
||||
|
||||
def parse_lm_eval(results_dir: str) -> dict:
    """Collect the newest lm-eval result JSON for every task in TASK_META.

    Returns a mapping of task key -> score record (label, score as a
    percentage, metric key, few-shot count, item count, provenance fields).
    Tasks with no result file, a missing metric, or an unparsable JSON are
    skipped; parse failures emit a warning on stderr.
    """
    collected: dict = {}
    for task, info in TASK_META.items():
        newest = find_latest(f"{results_dir}/{task}_*.json")
        if not newest:
            continue
        try:
            with open(newest) as fh:
                payload = json.load(fh)
            task_results = payload.get("results", {})
            if task not in task_results:
                continue
            value = task_results[task].get(info["metric"])
            if value is None:
                continue
            # File mtime serves as the run date for provenance.
            stamp = os.path.getmtime(newest)
            collected[task] = {
                "label": info["label"],
                "score": round(value * 100, 2),
                "metric": info["metric"],
                "fewshot": info["fewshot"],
                "n": info["n"],
                "source": newest,
                "date": datetime.fromtimestamp(stamp).strftime("%Y-%m-%d"),
                "harness": "lm-evaluation-harness 0.4.5",
                "model": payload.get("config", {}).get("model_args", "OdaxAI/DANTE-Mosaic-3.5B"),
                "dtype": "bfloat16",
                "device": "NVIDIA A100-40GB",
                "seed": 42,
            }
        except Exception as exc:
            print(f" [WARNING] parse error {newest}: {exc}", file=sys.stderr)
    return collected
|
||||
|
||||
|
||||
def parse_code_eval(results_dir: str) -> dict:
    """Collect the newest bigcode-harness metrics JSON for every CODE_TASKS task.

    Returns a mapping of task key -> score record with the same shape as
    parse_lm_eval's output, so both can be merged into one table. Missing or
    unparsable results are skipped; parse failures warn on stderr.
    """
    scores = {}
    for task_key, meta in CODE_TASKS.items():
        # bigcode saves to subdir
        pattern = f"{results_dir}/{task_key}_*/{task_key}_metrics.json"
        latest = find_latest(pattern)
        if not latest:
            continue
        try:
            with open(latest) as f:
                data = json.load(f)
            # Read the metric key declared in CODE_TASKS instead of
            # hard-coding "pass@1", keeping this parser consistent with
            # parse_lm_eval (behavior unchanged for the current table).
            raw = data.get(meta["metric"])
            if raw is not None:
                mtime = os.path.getmtime(latest)
                scores[task_key] = {
                    "label": meta["label"],
                    "score": round(raw * 100, 2),
                    "metric": meta["metric"],
                    "fewshot": meta["fewshot"],
                    "n": meta["n"],
                    "source": latest,
                    "date": datetime.fromtimestamp(mtime).strftime("%Y-%m-%d"),
                    "harness": "bigcode-evaluation-harness",
                    "model": "OdaxAI/DANTE-Mosaic-3.5B",
                    "dtype": "bfloat16",
                    "device": "NVIDIA A100-40GB",
                    "seed": 0,
                }
        except Exception as e:
            print(f" [WARNING] parse error {latest}: {e}", file=sys.stderr)
    return scores
|
||||
|
||||
|
||||
def print_table(all_scores: dict) -> None:
    """Print the merged benchmark table to stdout, sorted by display label."""
    rule = "=" * 78
    rows = [
        f"\n{rule}",
        " CANONICAL BENCHMARK RESULTS — OdaxAI/DANTE-Mosaic-3.5B",
        " All results produced by official evaluation harnesses on Leonardo HPC",
        rule,
        f" {'Benchmark':<20} {'N':>6} {'Few-shot':>8} {'Metric':<20} {'Score':>8}",
        " " + "-" * 72,
    ]
    for _, rec in sorted(all_scores.items(), key=lambda item: item[1]["label"]):
        rows.append(
            f" {rec['label']:<20} {rec['n']:>6} {rec['fewshot']:>8}-shot "
            f"{rec['metric']:<20} {rec['score']:>7.2f}%"
        )
    rows.append(f"{rule}\n")
    for line in rows:
        print(line)
|
||||
|
||||
|
||||
def write_summary(all_scores: dict, results_dir: str) -> None:
    """Write CANONICAL_SUMMARY.json: run metadata plus all merged scores."""
    path = f"{results_dir}/CANONICAL_SUMMARY.json"
    payload = {
        "model": "OdaxAI/DANTE-Mosaic-3.5B",
        "type": "REAL_CANONICAL_RUN",
        "harness": "lm-evaluation-harness 0.4.5 + bigcode-evaluation-harness",
        "hardware": "NVIDIA A100-40GB, BF16",
        "cluster": "CINECA Leonardo Booster",
        "seed": 42,
        "results": all_scores,
    }
    with open(path, "w") as fh:
        json.dump(payload, fh, indent=2)
    print(f" Summary JSON -> {path}")
|
||||
|
||||
|
||||
def write_provenance(all_scores: dict, results_dir: str) -> None:
    """Write CANONICAL_PROVENANCE.md: one table row per benchmark plus
    hardware/software metadata and a comparability note."""
    header = [
        "# Canonical Benchmark Provenance — DANTE-Mosaic-3.5B",
        "",
        "All results in this table are **REAL_CANONICAL_RUN** — produced by official",
        "evaluation harnesses on the `OdaxAI/DANTE-Mosaic-3.5B` checkpoint.",
        "They are directly comparable to published leaderboard scores.",
        "",
        "| Benchmark | Harness | Task name | N | Few-shot | Metric | Score | Date | Source JSON |",
        "|-----------|---------|-----------|---|----------|--------|-------|------|-------------|",
    ]
    rows = []
    for task, rec in sorted(all_scores.items(), key=lambda item: item[1]["label"]):
        basename = os.path.basename(rec["source"])
        rows.append(
            f"| {rec['label']} | {rec['harness']} | `{task}` | {rec['n']} | "
            f"{rec['fewshot']}-shot | `{rec['metric']}` | **{rec['score']:.2f}%** | "
            f"{rec['date']} | `{basename}` |"
        )
    footer = [
        "",
        "## Hardware & Software",
        "",
        "| Property | Value |",
        "|----------|-------|",
        "| GPU | NVIDIA A100-SXM-40GB |",
        "| Precision | BF16 |",
        "| Cluster | CINECA Leonardo Booster |",
        "| lm-eval version | 0.4.5 |",
        "| Seed | 42 |",
        "",
        "## Comparability Note",
        "",
        "These canonical scores are produced under standard protocols and are directly",
        "comparable to published scores from the same harness versions.",
        "Internal offline subset scores (30/40/25 problems from `_benchmark_dante_offline.py`)",
        "are **separate** and must not be mixed with these canonical results.",
    ]
    path = f"{results_dir}/CANONICAL_PROVENANCE.md"
    with open(path, "w") as fh:
        fh.write("\n".join(header + rows + footer) + "\n")
    print(f" Provenance -> {path}")
|
||||
|
||||
|
||||
def main() -> None:
    """CLI entry point: parse all canonical results, print the table, and
    write the summary JSON and provenance markdown artifacts."""
    cli = argparse.ArgumentParser()
    cli.add_argument("--results-dir", default="evaluation/results/canonical")
    opts = cli.parse_args()

    results_dir = opts.results_dir
    print(f"\nParsing results from: {results_dir}")

    # lm-eval and code-eval tasks use disjoint keys, so a plain merge is safe.
    merged = {**parse_lm_eval(results_dir), **parse_code_eval(results_dir)}

    if not merged:
        print("No canonical results found yet.")
        print("Run evaluation/slurm_lm_eval.slurm and slurm_code_eval.slurm on Leonardo first.")
        sys.exit(0)

    print_table(merged)
    write_summary(merged, results_dir)
    write_provenance(merged, results_dir)
    print("\nDone.\n")
|
||||
|
||||
|
||||
# Script entry point when run directly (not on import).
if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user