#!/usr/bin/env python3
"""
parse_canonical_results.py
===========================

Reads all lm-eval and bigcode JSON outputs from evaluation/results/canonical/
and produces:

1. A clean summary table printed to stdout
2. evaluation/results/canonical/CANONICAL_SUMMARY.json
3. evaluation/results/canonical/CANONICAL_PROVENANCE.md

Usage:
    python3 evaluation/parse_canonical_results.py
    python3 evaluation/parse_canonical_results.py --results-dir evaluation/results/canonical
"""
from __future__ import annotations

import argparse
import glob
import json
import os
import sys
from datetime import datetime

# ─── Metric keys per task ────────────────────────────────────────────────────
TASK_META = {
    "mmlu": {"label": "MMLU", "metric": "acc,none", "fewshot": 5, "n": 14042},
    "mmlu_pro": {"label": "MMLU-Pro", "metric": "acc,none", "fewshot": 5, "n": 4500},
    "gsm8k": {"label": "GSM8K", "metric": "exact_match,strict-match", "fewshot": 8, "n": 1319},
    "arc_challenge": {"label": "ARC-Challenge", "metric": "acc_norm,none", "fewshot": 25, "n": 1172},
    "hellaswag": {"label": "HellaSwag", "metric": "acc_norm,none", "fewshot": 10, "n": 10042},
    "truthfulqa_mc2": {"label": "TruthfulQA MC2", "metric": "mc2,none", "fewshot": 0, "n": 817},
    "winogrande": {"label": "Winogrande", "metric": "acc,none", "fewshot": 5, "n": 1267},
    "ifeval": {"label": "IFEval", "metric": "prompt_level_strict_acc,none", "fewshot": 0, "n": 541},
}

CODE_TASKS = {
    "humaneval": {"label": "HumanEval", "metric": "pass@1", "fewshot": 0, "n": 164},
    "mbpp": {"label": "MBPP", "metric": "pass@1", "fewshot": 0, "n": 374},
}


def find_latest(pattern: str) -> str | None:
    """Return the last glob match in lexicographic order (relies on
    timestamped filenames so the last one sorted is the most recent run)."""
    files = sorted(glob.glob(pattern))
    return files[-1] if files else None


def parse_lm_eval(results_dir: str) -> dict:
    """Collect scores from lm-evaluation-harness outputs (<task>_*.json)."""
    scores = {}
    for task_key, meta in TASK_META.items():
        pattern = f"{results_dir}/{task_key}_*.json"
        latest = find_latest(pattern)
        if not latest:
            continue
        try:
            with open(latest) as f:
                data = json.load(f)
            res = data.get("results", {})
            if task_key in res:
                raw = res[task_key].get(meta["metric"])
                if raw is not None:
                    mtime = os.path.getmtime(latest)
                    scores[task_key] = {
                        "label": meta["label"],
                        "score": round(raw * 100, 2),
                        "metric": meta["metric"],
                        "fewshot": meta["fewshot"],
                        "n": meta["n"],
                        "source": latest,
                        "date": datetime.fromtimestamp(mtime).strftime("%Y-%m-%d"),
                        "harness": "lm-evaluation-harness 0.4.5",
                        "model": data.get("config", {}).get("model_args", "OdaxAI/DANTE-Mosaic-3.5B"),
                        "dtype": "bfloat16",
                        "device": "NVIDIA A100-40GB",
                        "seed": 42,
                    }
        except Exception as e:
            print(f" [WARNING] parse error {latest}: {e}", file=sys.stderr)
    return scores


def parse_code_eval(results_dir: str) -> dict:
    """Collect pass@1 scores from bigcode-evaluation-harness metric files."""
    scores = {}
    for task_key, meta in CODE_TASKS.items():
        # bigcode saves to subdir
        pattern = f"{results_dir}/{task_key}_*/{task_key}_metrics.json"
        latest = find_latest(pattern)
        if not latest:
            continue
        try:
            with open(latest) as f:
                data = json.load(f)
            raw = data.get("pass@1")
            if raw is not None:
                mtime = os.path.getmtime(latest)
                scores[task_key] = {
                    "label": meta["label"],
                    "score": round(raw * 100, 2),
                    "metric": "pass@1",
                    "fewshot": 0,
                    "n": meta["n"],
                    "source": latest,
                    "date": datetime.fromtimestamp(mtime).strftime("%Y-%m-%d"),
                    "harness": "bigcode-evaluation-harness",
                    "model": "OdaxAI/DANTE-Mosaic-3.5B",
                    "dtype": "bfloat16",
                    "device": "NVIDIA A100-40GB",
                    "seed": 0,
                }
        except Exception as e:
            print(f" [WARNING] parse error {latest}: {e}", file=sys.stderr)
    return scores


def print_table(all_scores: dict) -> None:
    """Print a fixed-width summary table of all collected scores."""
    SEP = "=" * 78
    print(f"\n{SEP}")
    print(" CANONICAL BENCHMARK RESULTS — OdaxAI/DANTE-Mosaic-3.5B")
    print(" All results produced by official evaluation harnesses on Leonardo HPC")
    print(f"{SEP}")
    print(f" {'Benchmark':<20} {'N':>6} {'Few-shot':>8} {'Metric':<20} {'Score':>8}")
    print(" " + "-" * 72)
    for k, v in sorted(all_scores.items(), key=lambda x: x[1]["label"]):
        fewshot = f"{v['fewshot']}-shot"
        print(f" {v['label']:<20} {v['n']:>6} {fewshot:>8} "
              f"{v['metric']:<20} {v['score']:>7.2f}%")
    print(f"{SEP}\n")


def write_summary(all_scores: dict, results_dir: str) -> None:
    """Write the machine-readable summary JSON next to the raw results."""
    out = {
        "model": "OdaxAI/DANTE-Mosaic-3.5B",
        "type": "REAL_CANONICAL_RUN",
        "harness": "lm-evaluation-harness 0.4.5 + bigcode-evaluation-harness",
        "hardware": "NVIDIA A100-40GB, BF16",
        "cluster": "CINECA Leonardo Booster",
        "seed": 42,
        "results": all_scores,
    }
    path = f"{results_dir}/CANONICAL_SUMMARY.json"
    with open(path, "w") as f:
        json.dump(out, f, indent=2)
    print(f" Summary JSON -> {path}")


def write_provenance(all_scores: dict, results_dir: str) -> None:
    """Write the human-readable provenance table as Markdown."""
    lines = [
        "# Canonical Benchmark Provenance — DANTE-Mosaic-3.5B",
        "",
        "All results in this table are **REAL_CANONICAL_RUN** — produced by official",
        "evaluation harnesses on the `OdaxAI/DANTE-Mosaic-3.5B` checkpoint.",
        "They are directly comparable to published leaderboard scores.",
        "",
        "| Benchmark | Harness | Task name | N | Few-shot | Metric | Score | Date | Source JSON |",
        "|-----------|---------|-----------|---|----------|--------|-------|------|-------------|",
    ]
    for k, v in sorted(all_scores.items(), key=lambda x: x[1]["label"]):
        src = os.path.basename(v["source"])
        lines.append(
            f"| {v['label']} | {v['harness']} | `{k}` | {v['n']} | "
            f"{v['fewshot']}-shot | `{v['metric']}` | **{v['score']:.2f}%** | "
            f"{v['date']} | `{src}` |"
        )
    lines += [
        "",
        "## Hardware & Software",
        "",
        "| Property | Value |",
        "|----------|-------|",
        "| GPU | NVIDIA A100-SXM-40GB |",
        "| Precision | BF16 |",
        "| Cluster | CINECA Leonardo Booster |",
        "| lm-eval version | 0.4.5 |",
        "| Seed | 42 |",
        "",
        "## Comparability Note",
        "",
        "These canonical scores are produced under standard protocols and are directly",
        "comparable to published scores from the same harness versions.",
        "Internal offline subset scores (30/40/25 problems from `_benchmark_dante_offline.py`)",
        "are **separate** and must not be mixed with these canonical results.",
    ]
    path = f"{results_dir}/CANONICAL_PROVENANCE.md"
    with open(path, "w") as f:
        f.write("\n".join(lines) + "\n")
    print(f" Provenance -> {path}")


def main() -> None:
    parser = argparse.ArgumentParser()
    parser.add_argument("--results-dir", default="evaluation/results/canonical")
    args = parser.parse_args()
    rd = args.results_dir

    print(f"\nParsing results from: {rd}")
    lm = parse_lm_eval(rd)
    code = parse_code_eval(rd)
    all_scores = {**lm, **code}

    if not all_scores:
        print("No canonical results found yet.")
        print("Run evaluation/slurm_lm_eval.slurm and slurm_code_eval.slurm on Leonardo first.")
        sys.exit(0)

    print_table(all_scores)
    write_summary(all_scores, rd)
    write_provenance(all_scores, rd)
    print("\nDone.\n")


if __name__ == "__main__":
    main()
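# ─── Reference: expected input shapes (sketch) ───────────────────────────────
# The sketch below is derived from the parsing logic above and is only meant
# as a reminder of what the harness outputs are assumed to look like; the
# numeric values are placeholders, not real scores, and exact file names
# depend on how the SLURM jobs name their outputs.
#
#   lm-evaluation-harness  -> <results-dir>/<task>_*.json, e.g. gsm8k_*.json:
#       {
#         "results": {"gsm8k": {"exact_match,strict-match": 0.0}},
#         "config":  {"model_args": "..."}
#       }
#
#   bigcode-evaluation-harness -> <results-dir>/<task>_*/<task>_metrics.json:
#       {"pass@1": 0.0}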