Files
DANTE-Mosaic-3.5B/evaluation/parse_canonical_results.py
ModelHub XC b0ba87406b Initialize project; model provided by the ModelHub XC community
Model: OdaxAI/DANTE-Mosaic-3.5B
Source: Original Platform
2026-05-14 15:44:10 +08:00

213 lines
8.3 KiB
Python

#!/usr/bin/env python3
"""
parse_canonical_results.py
==========================
Reads all lm-eval and bigcode JSON outputs from evaluation/results/canonical/
and produces:
1. A clean summary table printed to stdout
2. evaluation/results/canonical/CANONICAL_SUMMARY.json
3. evaluation/results/canonical/CANONICAL_PROVENANCE.md
Usage:
python3 evaluation/parse_canonical_results.py
python3 evaluation/parse_canonical_results.py --results-dir evaluation/results/canonical
"""
from __future__ import annotations
import argparse
import json
import glob
import os
import sys
from datetime import datetime
from pathlib import Path
# ─── Metric keys per task ────────────────────────────────────────────────────
# lm-evaluation-harness tasks. For each task key (matching the harness task
# name and the result-file prefix): the display label, the metric key in the
# harness results dict that holds the headline score, the few-shot count the
# run was configured with, and the benchmark's example count (n).
TASK_META = {
"mmlu": {"label": "MMLU", "metric": "acc,none", "fewshot": 5, "n": 14042},
"mmlu_pro": {"label": "MMLU-Pro", "metric": "acc,none", "fewshot": 5, "n": 4500},
"gsm8k": {"label": "GSM8K", "metric": "exact_match,strict-match", "fewshot": 8, "n": 1319},
"arc_challenge": {"label": "ARC-Challenge", "metric": "acc_norm,none", "fewshot": 25, "n": 1172},
"hellaswag": {"label": "HellaSwag", "metric": "acc_norm,none", "fewshot": 10, "n": 10042},
"truthfulqa_mc2": {"label": "TruthfulQA MC2", "metric": "mc2,none", "fewshot": 0, "n": 817},
"winogrande": {"label": "Winogrande", "metric": "acc,none", "fewshot": 5, "n": 1267},
"ifeval": {"label": "IFEval", "metric": "prompt_level_strict_acc,none", "fewshot": 0, "n": 541},
}
# bigcode-evaluation-harness code-generation tasks; both report pass@1 and
# are run zero-shot. Parsed separately because the harness writes metrics
# into per-run subdirectories (see parse_code_eval).
CODE_TASKS = {
"humaneval": {"label": "HumanEval", "metric": "pass@1", "fewshot": 0, "n": 164},
"mbpp": {"label": "MBPP", "metric": "pass@1", "fewshot": 0, "n": 374},
}
def find_latest(pattern: str) -> str | None:
    """Return the glob match for *pattern* that sorts last, or None.

    "Last in lexicographic order" stands in for "most recent" here;
    presumably the result filenames embed sortable timestamps — verify
    against the harness output naming if that assumption matters.
    """
    matches = glob.glob(pattern)
    if not matches:
        return None
    return max(matches)
def parse_lm_eval(results_dir: str) -> dict:
    """Collect the newest lm-eval result per task from *results_dir*.

    For each task in TASK_META, load the most recent ``<task>_*.json``
    file, pull the task's headline metric, and build a provenance record.

    Args:
        results_dir: directory containing the lm-eval JSON outputs.

    Returns:
        Mapping of task key -> record with the score as a percentage,
        metric name, few-shot count, example count, source file, run date
        (file mtime), and fixed run metadata. Tasks with no result file
        are omitted; unparsable files are skipped with a stderr warning.
    """
    scores: dict = {}
    for task_key, meta in TASK_META.items():
        latest = find_latest(f"{results_dir}/{task_key}_*.json")
        if not latest:
            continue
        try:
            # JSON is UTF-8 by specification; don't rely on the locale default.
            with open(latest, encoding="utf-8") as f:
                data = json.load(f)
            raw = data.get("results", {}).get(task_key, {}).get(meta["metric"])
            if raw is None:
                continue
            mtime = os.path.getmtime(latest)
            scores[task_key] = {
                "label": meta["label"],
                # Harness reports fractions in [0, 1]; store as percentage.
                "score": round(raw * 100, 2),
                "metric": meta["metric"],
                "fewshot": meta["fewshot"],
                "n": meta["n"],
                "source": latest,
                "date": datetime.fromtimestamp(mtime).strftime("%Y-%m-%d"),
                "harness": "lm-evaluation-harness 0.4.5",
                "model": data.get("config", {}).get("model_args", "OdaxAI/DANTE-Mosaic-3.5B"),
                "dtype": "bfloat16",
                "device": "NVIDIA A100-40GB",
                "seed": 42,
            }
        except Exception as e:  # best-effort: warn and continue with other tasks
            print(f" [WARNING] parse error {latest}: {e}", file=sys.stderr)
    return scores
def parse_code_eval(results_dir: str) -> dict:
    """Collect the newest bigcode-evaluation-harness result per code task.

    The bigcode harness writes metrics into a per-run subdirectory, so the
    glob pattern descends one level: ``<task>_*/<task>_metrics.json``.

    Args:
        results_dir: directory containing the bigcode run subdirectories.

    Returns:
        Mapping of task key -> provenance record with the pass@1 score as
        a percentage plus fixed run metadata. Tasks with no metrics file
        are omitted; unparsable files are skipped with a stderr warning.
    """
    scores: dict = {}
    for task_key, meta in CODE_TASKS.items():
        latest = find_latest(f"{results_dir}/{task_key}_*/{task_key}_metrics.json")
        if not latest:
            continue
        try:
            # JSON is UTF-8 by specification; don't rely on the locale default.
            with open(latest, encoding="utf-8") as f:
                data = json.load(f)
            raw = data.get("pass@1")
            if raw is None:
                continue
            mtime = os.path.getmtime(latest)
            scores[task_key] = {
                "label": meta["label"],
                # Harness reports a fraction in [0, 1]; store as percentage.
                "score": round(raw * 100, 2),
                "metric": "pass@1",
                "fewshot": 0,
                "n": meta["n"],
                "source": latest,
                "date": datetime.fromtimestamp(mtime).strftime("%Y-%m-%d"),
                "harness": "bigcode-evaluation-harness",
                "model": "OdaxAI/DANTE-Mosaic-3.5B",
                "dtype": "bfloat16",
                "device": "NVIDIA A100-40GB",
                "seed": 0,
            }
        except Exception as e:  # best-effort: warn and continue with other tasks
            print(f" [WARNING] parse error {latest}: {e}", file=sys.stderr)
    return scores
def print_table(all_scores: dict) -> None:
    """Print every benchmark record as one aligned text table on stdout."""
    rule = "=" * 78
    banner = (
        f"\n{rule}\n"
        " CANONICAL BENCHMARK RESULTS — OdaxAI/DANTE-Mosaic-3.5B\n"
        " All results produced by official evaluation harnesses on Leonardo HPC\n"
        f"{rule}"
    )
    print(banner)
    print(f" {'Benchmark':<20} {'N':>6} {'Few-shot':>8} {'Metric':<20} {'Score':>8}")
    print(" " + "-" * 72)
    # Rows sorted by display label so the table order is stable across runs.
    for record in sorted(all_scores.values(), key=lambda rec: rec["label"]):
        row = (
            f" {record['label']:<20} {record['n']:>6} {record['fewshot']:>8}-shot "
            f"{record['metric']:<20} {record['score']:>7.2f}%"
        )
        print(row)
    print(f"{rule}\n")
def write_summary(all_scores: dict, results_dir: str) -> None:
    """Serialize all scores plus fixed run metadata to CANONICAL_SUMMARY.json.

    Args:
        all_scores: task key -> provenance record, as built by the parsers.
        results_dir: directory the summary file is written into.
    """
    summary = {
        "model": "OdaxAI/DANTE-Mosaic-3.5B",
        "type": "REAL_CANONICAL_RUN",
        "harness": "lm-evaluation-harness 0.4.5 + bigcode-evaluation-harness",
        "hardware": "NVIDIA A100-40GB, BF16",
        "cluster": "CINECA Leonardo Booster",
        "seed": 42,
        "results": all_scores,
    }
    path = f"{results_dir}/CANONICAL_SUMMARY.json"
    # Write UTF-8 explicitly rather than relying on the platform locale.
    with open(path, "w", encoding="utf-8") as f:
        json.dump(summary, f, indent=2)
    print(f" Summary JSON -> {path}")
def write_provenance(all_scores: dict, results_dir: str) -> None:
    """Render the provenance Markdown report for all canonical scores.

    Emits one table row per benchmark (sorted by display label) between a
    fixed preamble and fixed hardware/comparability sections, then writes
    the document to ``<results_dir>/CANONICAL_PROVENANCE.md``.
    """
    preamble = [
        "# Canonical Benchmark Provenance — DANTE-Mosaic-3.5B",
        "",
        "All results in this table are **REAL_CANONICAL_RUN** — produced by official",
        "evaluation harnesses on the `OdaxAI/DANTE-Mosaic-3.5B` checkpoint.",
        "They are directly comparable to published leaderboard scores.",
        "",
        "| Benchmark | Harness | Task name | N | Few-shot | Metric | Score | Date | Source JSON |",
        "|-----------|---------|-----------|---|----------|--------|-------|------|-------------|",
    ]
    # One Markdown table row per benchmark, ordered by display label.
    rows = [
        f"| {rec['label']} | {rec['harness']} | `{task}` | {rec['n']} | "
        f"{rec['fewshot']}-shot | `{rec['metric']}` | **{rec['score']:.2f}%** | "
        f"{rec['date']} | `{os.path.basename(rec['source'])}` |"
        for task, rec in sorted(all_scores.items(), key=lambda item: item[1]["label"])
    ]
    footer = [
        "",
        "## Hardware & Software",
        "",
        "| Property | Value |",
        "|----------|-------|",
        "| GPU | NVIDIA A100-SXM-40GB |",
        "| Precision | BF16 |",
        "| Cluster | CINECA Leonardo Booster |",
        "| lm-eval version | 0.4.5 |",
        "| Seed | 42 |",
        "",
        "## Comparability Note",
        "",
        "These canonical scores are produced under standard protocols and are directly",
        "comparable to published scores from the same harness versions.",
        "Internal offline subset scores (30/40/25 problems from `_benchmark_dante_offline.py`)",
        "are **separate** and must not be mixed with these canonical results.",
    ]
    path = f"{results_dir}/CANONICAL_PROVENANCE.md"
    with open(path, "w") as f:
        f.write("\n".join(preamble + rows + footer) + "\n")
    print(f" Provenance -> {path}")
def main() -> None:
    """CLI entry point: parse all harness outputs and emit summary artifacts."""
    parser = argparse.ArgumentParser()
    parser.add_argument("--results-dir", default="evaluation/results/canonical")
    results_dir = parser.parse_args().results_dir

    print(f"\nParsing results from: {results_dir}")
    all_scores = {**parse_lm_eval(results_dir), **parse_code_eval(results_dir)}
    if not all_scores:
        # Nothing to summarize yet — not an error, just nothing produced.
        print("No canonical results found yet.")
        print("Run evaluation/slurm_lm_eval.slurm and slurm_code_eval.slurm on Leonardo first.")
        sys.exit(0)

    print_table(all_scores)
    write_summary(all_scores, results_dir)
    write_provenance(all_scores, results_dir)
    print("\nDone.\n")

if __name__ == "__main__":
    main()