#!/usr/bin/env python3
"""
parse_canonical_results.py
==========================

Reads all lm-eval and bigcode JSON outputs from evaluation/results/canonical/
and produces:

1. A clean summary table printed to stdout
2. evaluation/results/canonical/CANONICAL_SUMMARY.json
3. evaluation/results/canonical/CANONICAL_PROVENANCE.md

Usage:
    python3 evaluation/parse_canonical_results.py
    python3 evaluation/parse_canonical_results.py --results-dir evaluation/results/canonical
"""

from __future__ import annotations

import argparse
import glob
import json
import os
import sys
from datetime import datetime

# ─── Metric keys per task ────────────────────────────────────────────────────
TASK_META = {
    "mmlu": {"label": "MMLU", "metric": "acc,none", "fewshot": 5, "n": 14042},
    "mmlu_pro": {"label": "MMLU-Pro", "metric": "acc,none", "fewshot": 5, "n": 4500},
    "gsm8k": {"label": "GSM8K", "metric": "exact_match,strict-match", "fewshot": 8, "n": 1319},
    "arc_challenge": {"label": "ARC-Challenge", "metric": "acc_norm,none", "fewshot": 25, "n": 1172},
    "hellaswag": {"label": "HellaSwag", "metric": "acc_norm,none", "fewshot": 10, "n": 10042},
    "truthfulqa_mc2": {"label": "TruthfulQA MC2", "metric": "mc2,none", "fewshot": 0, "n": 817},
    "winogrande": {"label": "Winogrande", "metric": "acc,none", "fewshot": 5, "n": 1267},
    "ifeval": {"label": "IFEval", "metric": "prompt_level_strict_acc,none", "fewshot": 0, "n": 541},
}

CODE_TASKS = {
    "humaneval": {"label": "HumanEval", "metric": "pass@1", "fewshot": 0, "n": 164},
    "mbpp": {"label": "MBPP", "metric": "pass@1", "fewshot": 0, "n": 374},
}

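# Directory layout assumed from the glob patterns used below (filenames illustrative):
#
#   evaluation/results/canonical/
#       mmlu_<timestamp>.json                           # lm-eval: one JSON per task run
#       gsm8k_<timestamp>.json
#       humaneval_<timestamp>/humaneval_metrics.json    # bigcode: per-run subdirectory
#       mbpp_<timestamp>/mbpp_metrics.json
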
def find_latest(pattern: str) -> str | None:
    """Return the lexically last file matching *pattern* (timestamped names sort in run order)."""
    files = sorted(glob.glob(pattern))
    return files[-1] if files else None

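# Minimal sketch of the lm-eval result JSON read below (values illustrative):
#   {
#     "results": {"gsm8k": {"exact_match,strict-match": 0.55, ...}},
#     "config":  {"model_args": "pretrained=OdaxAI/DANTE-Mosaic-3.5B,..."}
#   }
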
def parse_lm_eval(results_dir: str) -> dict:
    """Collect the newest lm-eval result per task into a provenance-rich score dict."""
    scores = {}
    for task_key, meta in TASK_META.items():
        pattern = f"{results_dir}/{task_key}_*.json"
        latest = find_latest(pattern)
        if not latest:
            continue
        try:
            with open(latest) as f:
                data = json.load(f)
            res = data.get("results", {})
            if task_key in res:
                raw = res[task_key].get(meta["metric"])
                if raw is not None:
                    mtime = os.path.getmtime(latest)
                    scores[task_key] = {
                        "label": meta["label"],
                        "score": round(raw * 100, 2),
                        "metric": meta["metric"],
                        "fewshot": meta["fewshot"],
                        "n": meta["n"],
                        "source": latest,
                        "date": datetime.fromtimestamp(mtime).strftime("%Y-%m-%d"),
                        "harness": "lm-evaluation-harness 0.4.5",
                        "model": data.get("config", {}).get("model_args", "OdaxAI/DANTE-Mosaic-3.5B"),
                        "dtype": "bfloat16",
                        "device": "NVIDIA A100-40GB",
                        "seed": 42,
                    }
        except Exception as e:
            print(f" [WARNING] parse error {latest}: {e}", file=sys.stderr)
    return scores

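# The bigcode metrics JSON is assumed to carry a flat top-level "pass@1" key,
# e.g. {"pass@1": 0.30, "config": {...}} (value illustrative); only that key is read.
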
def parse_code_eval(results_dir: str) -> dict:
    """Collect the newest bigcode-evaluation-harness result per code task."""
    scores = {}
    for task_key, meta in CODE_TASKS.items():
        # bigcode saves metrics into a per-run subdirectory
        pattern = f"{results_dir}/{task_key}_*/{task_key}_metrics.json"
        latest = find_latest(pattern)
        if not latest:
            continue
        try:
            with open(latest) as f:
                data = json.load(f)
            raw = data.get("pass@1")
            if raw is not None:
                mtime = os.path.getmtime(latest)
                scores[task_key] = {
                    "label": meta["label"],
                    "score": round(raw * 100, 2),
                    "metric": "pass@1",
                    "fewshot": 0,
                    "n": meta["n"],
                    "source": latest,
                    "date": datetime.fromtimestamp(mtime).strftime("%Y-%m-%d"),
                    "harness": "bigcode-evaluation-harness",
                    "model": "OdaxAI/DANTE-Mosaic-3.5B",
                    "dtype": "bfloat16",
                    "device": "NVIDIA A100-40GB",
                    "seed": 0,
                }
        except Exception as e:
            print(f" [WARNING] parse error {latest}: {e}", file=sys.stderr)
    return scores


def print_table(all_scores: dict) -> None:
    """Print a fixed-width summary table of all canonical scores, sorted by label."""
    SEP = "=" * 78
    print(f"\n{SEP}")
    print(" CANONICAL BENCHMARK RESULTS — OdaxAI/DANTE-Mosaic-3.5B")
    print(" All results produced by official evaluation harnesses on Leonardo HPC")
    print(f"{SEP}")
    print(f" {'Benchmark':<20} {'N':>6} {'Few-shot':>8} {'Metric':<20} {'Score':>8}")
    print(" " + "-" * 72)
    for v in sorted(all_scores.values(), key=lambda x: x["label"]):
        print(f" {v['label']:<20} {v['n']:>6} {v['fewshot']:>8}-shot "
              f"{v['metric']:<20} {v['score']:>7.2f}%")
    print(f"{SEP}\n")

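# Rendered rows look roughly like this (scores shown as placeholders):
#   Benchmark                 N Few-shot Metric                  Score
#   GSM8K                  1319   8-shot exact_match,strict-match NN.NN%
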
def write_summary(all_scores: dict, results_dir: str) -> None:
    """Write the machine-readable CANONICAL_SUMMARY.json next to the raw results."""
    out = {
        "model": "OdaxAI/DANTE-Mosaic-3.5B",
        "type": "REAL_CANONICAL_RUN",
        "harness": "lm-evaluation-harness 0.4.5 + bigcode-evaluation-harness",
        "hardware": "NVIDIA A100-40GB, BF16",
        "cluster": "CINECA Leonardo Booster",
        "seed": 42,
        "results": all_scores,
    }
    path = f"{results_dir}/CANONICAL_SUMMARY.json"
    with open(path, "w") as f:
        json.dump(out, f, indent=2)
    print(f" Summary JSON -> {path}")

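# Shape of CANONICAL_SUMMARY.json (per-task entries come from the parsers above):
#   {"model": "...", "type": "REAL_CANONICAL_RUN", ..., "results": {"gsm8k": {...}, ...}}
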
def write_provenance(all_scores: dict, results_dir: str) -> None:
    """Write CANONICAL_PROVENANCE.md: a Markdown audit trail for every score."""
    lines = [
        "# Canonical Benchmark Provenance — DANTE-Mosaic-3.5B",
        "",
        "All results in this table are **REAL_CANONICAL_RUN** — produced by official",
        "evaluation harnesses on the `OdaxAI/DANTE-Mosaic-3.5B` checkpoint.",
        "They are directly comparable to published leaderboard scores.",
        "",
        "| Benchmark | Harness | Task name | N | Few-shot | Metric | Score | Date | Source JSON |",
        "|-----------|---------|-----------|---|----------|--------|-------|------|-------------|",
    ]
    for k, v in sorted(all_scores.items(), key=lambda x: x[1]["label"]):
        src = os.path.basename(v["source"])
        lines.append(
            f"| {v['label']} | {v['harness']} | `{k}` | {v['n']} | "
            f"{v['fewshot']}-shot | `{v['metric']}` | **{v['score']:.2f}%** | "
            f"{v['date']} | `{src}` |"
        )
    lines += [
        "",
        "## Hardware & Software",
        "",
        "| Property | Value |",
        "|----------|-------|",
        "| GPU | NVIDIA A100-SXM-40GB |",
        "| Precision | BF16 |",
        "| Cluster | CINECA Leonardo Booster |",
        "| lm-eval version | 0.4.5 |",
        "| Seed | 42 |",
        "",
        "## Comparability Note",
        "",
        "These canonical scores are produced under standard protocols and are directly",
        "comparable to published scores from the same harness versions.",
        "Internal offline subset scores (30/40/25 problems from `_benchmark_dante_offline.py`)",
        "are **separate** and must not be mixed with these canonical results.",
    ]
    path = f"{results_dir}/CANONICAL_PROVENANCE.md"
    with open(path, "w") as f:
        f.write("\n".join(lines) + "\n")
    print(f" Provenance -> {path}")

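# Each benchmark row in CANONICAL_PROVENANCE.md renders like (placeholders):
#   | GSM8K | lm-evaluation-harness 0.4.5 | `gsm8k` | 1319 | 8-shot
#   | `exact_match,strict-match` | **NN.NN%** | YYYY-MM-DD | `gsm8k_<ts>.json` |
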
def main() -> None:
    parser = argparse.ArgumentParser()
    parser.add_argument("--results-dir", default="evaluation/results/canonical")
    args = parser.parse_args()

    rd = args.results_dir
    print(f"\nParsing results from: {rd}")

    lm = parse_lm_eval(rd)
    code = parse_code_eval(rd)
    all_scores = {**lm, **code}

    if not all_scores:
        print("No canonical results found yet.")
        print("Run evaluation/slurm_lm_eval.slurm and slurm_code_eval.slurm on Leonardo first.")
        sys.exit(0)

    print_table(all_scores)
    write_summary(all_scores, rd)
    write_provenance(all_scores, rd)
    print("\nDone.\n")


if __name__ == "__main__":
    main()