#!/usr/bin/env python3
"""
parse_canonical_results.py
==========================

Reads all lm-eval and bigcode JSON outputs from evaluation/results/canonical/
and produces:

1. A clean summary table printed to stdout
2. evaluation/results/canonical/CANONICAL_SUMMARY.json
3. evaluation/results/canonical/CANONICAL_PROVENANCE.md

Usage:
    python3 evaluation/parse_canonical_results.py
    python3 evaluation/parse_canonical_results.py --results-dir evaluation/results/canonical
"""

from __future__ import annotations

import argparse
import glob
import json
import os
import sys
from datetime import datetime

# ─── Metric keys per task ────────────────────────────────────────────────────
TASK_META = {
    "mmlu": {"label": "MMLU", "metric": "acc,none", "fewshot": 5, "n": 14042},
    "mmlu_pro": {"label": "MMLU-Pro", "metric": "acc,none", "fewshot": 5, "n": 4500},
    "gsm8k": {"label": "GSM8K", "metric": "exact_match,strict-match", "fewshot": 8, "n": 1319},
    "arc_challenge": {"label": "ARC-Challenge", "metric": "acc_norm,none", "fewshot": 25, "n": 1172},
    "hellaswag": {"label": "HellaSwag", "metric": "acc_norm,none", "fewshot": 10, "n": 10042},
    "truthfulqa_mc2": {"label": "TruthfulQA MC2", "metric": "mc2,none", "fewshot": 0, "n": 817},
    "winogrande": {"label": "Winogrande", "metric": "acc,none", "fewshot": 5, "n": 1267},
    "ifeval": {"label": "IFEval", "metric": "prompt_level_strict_acc,none", "fewshot": 0, "n": 541},
}

CODE_TASKS = {
    "humaneval": {"label": "HumanEval", "metric": "pass@1", "fewshot": 0, "n": 164},
    "mbpp": {"label": "MBPP", "metric": "pass@1", "fewshot": 0, "n": 374},
}

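# Directory layout assumed from the glob patterns used below (filenames illustrative):
#
#   evaluation/results/canonical/
#       mmlu_<timestamp>.json                           # lm-eval: one JSON per task run
#       gsm8k_<timestamp>.json
#       humaneval_<timestamp>/humaneval_metrics.json    # bigcode: per-run subdirectory
#       mbpp_<timestamp>/mbpp_metrics.json
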
def find_latest(pattern: str) -> str | None:
    """Return the lexically last file matching *pattern* (timestamped names sort in run order)."""
    files = sorted(glob.glob(pattern))
    return files[-1] if files else None

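# Minimal sketch of the lm-eval result JSON read below (values illustrative):
#   {
#     "results": {"gsm8k": {"exact_match,strict-match": 0.55, ...}},
#     "config":  {"model_args": "pretrained=OdaxAI/DANTE-Mosaic-3.5B,..."}
#   }
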
def parse_lm_eval(results_dir: str) -> dict:
    """Collect the newest lm-eval result per task into a provenance-rich score dict."""
    scores = {}
    for task_key, meta in TASK_META.items():
        pattern = f"{results_dir}/{task_key}_*.json"
        latest = find_latest(pattern)
        if not latest:
            continue
        try:
            with open(latest) as f:
                data = json.load(f)
            res = data.get("results", {})
            if task_key in res:
                raw = res[task_key].get(meta["metric"])
                if raw is not None:
                    mtime = os.path.getmtime(latest)
                    scores[task_key] = {
                        "label": meta["label"],
                        "score": round(raw * 100, 2),
                        "metric": meta["metric"],
                        "fewshot": meta["fewshot"],
                        "n": meta["n"],
                        "source": latest,
                        "date": datetime.fromtimestamp(mtime).strftime("%Y-%m-%d"),
                        "harness": "lm-evaluation-harness 0.4.5",
                        "model": data.get("config", {}).get("model_args", "OdaxAI/DANTE-Mosaic-3.5B"),
                        "dtype": "bfloat16",
                        "device": "NVIDIA A100-40GB",
                        "seed": 42,
                    }
        except Exception as e:
            print(f" [WARNING] parse error {latest}: {e}", file=sys.stderr)
    return scores

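# The bigcode metrics JSON is assumed to carry a flat top-level "pass@1" key,
# e.g. {"pass@1": 0.30, "config": {...}} (value illustrative); only that key is read.
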
def parse_code_eval(results_dir: str) -> dict:
    """Collect the newest bigcode-evaluation-harness result per code task."""
    scores = {}
    for task_key, meta in CODE_TASKS.items():
        # bigcode saves metrics into a per-run subdirectory
        pattern = f"{results_dir}/{task_key}_*/{task_key}_metrics.json"
        latest = find_latest(pattern)
        if not latest:
            continue
        try:
            with open(latest) as f:
                data = json.load(f)
            raw = data.get("pass@1")
            if raw is not None:
                mtime = os.path.getmtime(latest)
                scores[task_key] = {
                    "label": meta["label"],
                    "score": round(raw * 100, 2),
                    "metric": "pass@1",
                    "fewshot": 0,
                    "n": meta["n"],
                    "source": latest,
                    "date": datetime.fromtimestamp(mtime).strftime("%Y-%m-%d"),
                    "harness": "bigcode-evaluation-harness",
                    "model": "OdaxAI/DANTE-Mosaic-3.5B",
                    "dtype": "bfloat16",
                    "device": "NVIDIA A100-40GB",
                    "seed": 0,
                }
        except Exception as e:
            print(f" [WARNING] parse error {latest}: {e}", file=sys.stderr)
    return scores


def print_table(all_scores: dict) -> None:
    """Print a fixed-width summary table of all canonical scores, sorted by label."""
    SEP = "=" * 78
    print(f"\n{SEP}")
    print(" CANONICAL BENCHMARK RESULTS — OdaxAI/DANTE-Mosaic-3.5B")
    print(" All results produced by official evaluation harnesses on Leonardo HPC")
    print(f"{SEP}")
    print(f" {'Benchmark':<20} {'N':>6} {'Few-shot':>8} {'Metric':<20} {'Score':>8}")
    print(" " + "-" * 72)
    for v in sorted(all_scores.values(), key=lambda x: x["label"]):
        print(f" {v['label']:<20} {v['n']:>6} {v['fewshot']:>8}-shot "
              f"{v['metric']:<20} {v['score']:>7.2f}%")
    print(f"{SEP}\n")

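# Rendered rows look roughly like this (scores shown as placeholders):
#   Benchmark                 N Few-shot Metric                  Score
#   GSM8K                  1319   8-shot exact_match,strict-match NN.NN%
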
def write_summary(all_scores: dict, results_dir: str) -> None:
    """Write the machine-readable CANONICAL_SUMMARY.json next to the raw results."""
    out = {
        "model": "OdaxAI/DANTE-Mosaic-3.5B",
        "type": "REAL_CANONICAL_RUN",
        "harness": "lm-evaluation-harness 0.4.5 + bigcode-evaluation-harness",
        "hardware": "NVIDIA A100-40GB, BF16",
        "cluster": "CINECA Leonardo Booster",
        "seed": 42,
        "results": all_scores,
    }
    path = f"{results_dir}/CANONICAL_SUMMARY.json"
    with open(path, "w") as f:
        json.dump(out, f, indent=2)
    print(f" Summary JSON -> {path}")

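# Shape of CANONICAL_SUMMARY.json (per-task entries come from the parsers above):
#   {"model": "...", "type": "REAL_CANONICAL_RUN", ..., "results": {"gsm8k": {...}, ...}}
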
def write_provenance(all_scores: dict, results_dir: str) -> None:
    """Write CANONICAL_PROVENANCE.md: a Markdown audit trail for every score."""
    lines = [
        "# Canonical Benchmark Provenance — DANTE-Mosaic-3.5B",
        "",
        "All results in this table are **REAL_CANONICAL_RUN** — produced by official",
        "evaluation harnesses on the `OdaxAI/DANTE-Mosaic-3.5B` checkpoint.",
        "They are directly comparable to published leaderboard scores.",
        "",
        "| Benchmark | Harness | Task name | N | Few-shot | Metric | Score | Date | Source JSON |",
        "|-----------|---------|-----------|---|----------|--------|-------|------|-------------|",
    ]
    for k, v in sorted(all_scores.items(), key=lambda x: x[1]["label"]):
        src = os.path.basename(v["source"])
        lines.append(
            f"| {v['label']} | {v['harness']} | `{k}` | {v['n']} | "
            f"{v['fewshot']}-shot | `{v['metric']}` | **{v['score']:.2f}%** | "
            f"{v['date']} | `{src}` |"
        )
    lines += [
        "",
        "## Hardware & Software",
        "",
        "| Property | Value |",
        "|----------|-------|",
        "| GPU | NVIDIA A100-SXM-40GB |",
        "| Precision | BF16 |",
        "| Cluster | CINECA Leonardo Booster |",
        "| lm-eval version | 0.4.5 |",
        "| Seed | 42 |",
        "",
        "## Comparability Note",
        "",
        "These canonical scores are produced under standard protocols and are directly",
        "comparable to published scores from the same harness versions.",
        "Internal offline subset scores (30/40/25 problems from `_benchmark_dante_offline.py`)",
        "are **separate** and must not be mixed with these canonical results.",
    ]
    path = f"{results_dir}/CANONICAL_PROVENANCE.md"
    with open(path, "w") as f:
        f.write("\n".join(lines) + "\n")
    print(f" Provenance -> {path}")

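# Each benchmark row in CANONICAL_PROVENANCE.md renders like (placeholders):
#   | GSM8K | lm-evaluation-harness 0.4.5 | `gsm8k` | 1319 | 8-shot
#   | `exact_match,strict-match` | **NN.NN%** | YYYY-MM-DD | `gsm8k_<ts>.json` |
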
def main() -> None:
    parser = argparse.ArgumentParser()
    parser.add_argument("--results-dir", default="evaluation/results/canonical")
    args = parser.parse_args()

    rd = args.results_dir
    print(f"\nParsing results from: {rd}")

    lm = parse_lm_eval(rd)
    code = parse_code_eval(rd)
    all_scores = {**lm, **code}

    if not all_scores:
        print("No canonical results found yet.")
        print("Run evaluation/slurm_lm_eval.slurm and slurm_code_eval.slurm on Leonardo first.")
        sys.exit(0)

    print_table(all_scores)
    write_summary(all_scores, rd)
    write_provenance(all_scores, rd)
    print("\nDone.\n")


if __name__ == "__main__":
    main()