初始化项目,由ModelHub XC社区提供模型
Model: OdaxAI/DANTE-Mosaic-3.5B Source: Original Platform
This commit is contained in:
212
evaluation/parse_canonical_results.py
Normal file
212
evaluation/parse_canonical_results.py
Normal file
@@ -0,0 +1,212 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
parse_canonical_results.py
|
||||
==========================
|
||||
Reads all lm-eval and bigcode JSON outputs from evaluation/results/canonical/
|
||||
and produces:
|
||||
1. A clean summary table printed to stdout
|
||||
2. evaluation/results/canonical/CANONICAL_SUMMARY.json
|
||||
3. evaluation/results/canonical/CANONICAL_PROVENANCE.md
|
||||
|
||||
Usage:
|
||||
python3 evaluation/parse_canonical_results.py
|
||||
python3 evaluation/parse_canonical_results.py --results-dir evaluation/results/canonical
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
import argparse
|
||||
import json
|
||||
import glob
|
||||
import os
|
||||
import sys
|
||||
from datetime import datetime
|
||||
from pathlib import Path
|
||||
|
||||
# ─── Metric keys per task ────────────────────────────────────────────────────
# lm-evaluation-harness tasks. Each record maps the harness task key to:
#   label   - human-readable benchmark name used in tables,
#   metric  - the exact key read from the harness results JSON,
#   fewshot - few-shot count the run was configured with,
#   n       - number of evaluation items in the task.
TASK_META = {
    "mmlu": {"label": "MMLU", "metric": "acc,none", "fewshot": 5, "n": 14042},
    "mmlu_pro": {"label": "MMLU-Pro", "metric": "acc,none", "fewshot": 5, "n": 4500},
    "gsm8k": {"label": "GSM8K", "metric": "exact_match,strict-match", "fewshot": 8, "n": 1319},
    "arc_challenge": {"label": "ARC-Challenge", "metric": "acc_norm,none", "fewshot": 25, "n": 1172},
    "hellaswag": {"label": "HellaSwag", "metric": "acc_norm,none", "fewshot": 10, "n": 10042},
    "truthfulqa_mc2": {"label": "TruthfulQA MC2", "metric": "mc2,none", "fewshot": 0, "n": 817},
    "winogrande": {"label": "Winogrande", "metric": "acc,none", "fewshot": 5, "n": 1267},
    "ifeval": {"label": "IFEval", "metric": "prompt_level_strict_acc,none", "fewshot": 0, "n": 541},
}

# bigcode-evaluation-harness code-generation tasks; same record shape as above.
CODE_TASKS = {
    "humaneval": {"label": "HumanEval", "metric": "pass@1", "fewshot": 0, "n": 164},
    "mbpp": {"label": "MBPP", "metric": "pass@1", "fewshot": 0, "n": 374},
}
|
||||
|
||||
|
||||
def find_latest(pattern: str) -> str | None:
|
||||
files = sorted(glob.glob(pattern))
|
||||
return files[-1] if files else None
|
||||
|
||||
|
||||
def parse_lm_eval(results_dir: str) -> dict:
    """Collect the newest lm-eval result JSON for every task in TASK_META.

    Returns a mapping of task key -> score record (label, score as a
    percentage, metric key, few-shot count, item count, provenance fields).
    Tasks with no result file, a missing metric, or an unparsable JSON are
    skipped; parse failures emit a warning on stderr.
    """
    collected: dict = {}
    for task, info in TASK_META.items():
        newest = find_latest(f"{results_dir}/{task}_*.json")
        if not newest:
            continue
        try:
            with open(newest) as fh:
                payload = json.load(fh)
            task_results = payload.get("results", {})
            if task not in task_results:
                continue
            value = task_results[task].get(info["metric"])
            if value is None:
                continue
            # File mtime serves as the run date for provenance.
            stamp = os.path.getmtime(newest)
            collected[task] = {
                "label": info["label"],
                "score": round(value * 100, 2),
                "metric": info["metric"],
                "fewshot": info["fewshot"],
                "n": info["n"],
                "source": newest,
                "date": datetime.fromtimestamp(stamp).strftime("%Y-%m-%d"),
                "harness": "lm-evaluation-harness 0.4.5",
                "model": payload.get("config", {}).get("model_args", "OdaxAI/DANTE-Mosaic-3.5B"),
                "dtype": "bfloat16",
                "device": "NVIDIA A100-40GB",
                "seed": 42,
            }
        except Exception as exc:
            print(f" [WARNING] parse error {newest}: {exc}", file=sys.stderr)
    return collected
|
||||
|
||||
|
||||
def parse_code_eval(results_dir: str) -> dict:
    """Collect the newest bigcode-harness metrics JSON for every CODE_TASKS task.

    Returns a mapping of task key -> score record with the same shape as
    parse_lm_eval's output, so both can be merged into one table. Missing or
    unparsable results are skipped; parse failures warn on stderr.
    """
    scores = {}
    for task_key, meta in CODE_TASKS.items():
        # bigcode saves to subdir
        pattern = f"{results_dir}/{task_key}_*/{task_key}_metrics.json"
        latest = find_latest(pattern)
        if not latest:
            continue
        try:
            with open(latest) as f:
                data = json.load(f)
            # Read the metric key declared in CODE_TASKS instead of
            # hard-coding "pass@1", keeping this parser consistent with
            # parse_lm_eval (behavior unchanged for the current table).
            raw = data.get(meta["metric"])
            if raw is not None:
                mtime = os.path.getmtime(latest)
                scores[task_key] = {
                    "label": meta["label"],
                    "score": round(raw * 100, 2),
                    "metric": meta["metric"],
                    "fewshot": meta["fewshot"],
                    "n": meta["n"],
                    "source": latest,
                    "date": datetime.fromtimestamp(mtime).strftime("%Y-%m-%d"),
                    "harness": "bigcode-evaluation-harness",
                    "model": "OdaxAI/DANTE-Mosaic-3.5B",
                    "dtype": "bfloat16",
                    "device": "NVIDIA A100-40GB",
                    "seed": 0,
                }
        except Exception as e:
            print(f" [WARNING] parse error {latest}: {e}", file=sys.stderr)
    return scores
|
||||
|
||||
|
||||
def print_table(all_scores: dict) -> None:
    """Print the merged benchmark table to stdout, sorted by display label."""
    rule = "=" * 78
    rows = [
        f"\n{rule}",
        " CANONICAL BENCHMARK RESULTS — OdaxAI/DANTE-Mosaic-3.5B",
        " All results produced by official evaluation harnesses on Leonardo HPC",
        rule,
        f" {'Benchmark':<20} {'N':>6} {'Few-shot':>8} {'Metric':<20} {'Score':>8}",
        " " + "-" * 72,
    ]
    for _, rec in sorted(all_scores.items(), key=lambda item: item[1]["label"]):
        rows.append(
            f" {rec['label']:<20} {rec['n']:>6} {rec['fewshot']:>8}-shot "
            f"{rec['metric']:<20} {rec['score']:>7.2f}%"
        )
    rows.append(f"{rule}\n")
    for line in rows:
        print(line)
|
||||
|
||||
|
||||
def write_summary(all_scores: dict, results_dir: str) -> None:
    """Write CANONICAL_SUMMARY.json: run metadata plus all merged scores."""
    path = f"{results_dir}/CANONICAL_SUMMARY.json"
    payload = {
        "model": "OdaxAI/DANTE-Mosaic-3.5B",
        "type": "REAL_CANONICAL_RUN",
        "harness": "lm-evaluation-harness 0.4.5 + bigcode-evaluation-harness",
        "hardware": "NVIDIA A100-40GB, BF16",
        "cluster": "CINECA Leonardo Booster",
        "seed": 42,
        "results": all_scores,
    }
    with open(path, "w") as fh:
        json.dump(payload, fh, indent=2)
    print(f" Summary JSON -> {path}")
|
||||
|
||||
|
||||
def write_provenance(all_scores: dict, results_dir: str) -> None:
    """Write CANONICAL_PROVENANCE.md: one table row per benchmark plus
    hardware/software metadata and a comparability note."""
    header = [
        "# Canonical Benchmark Provenance — DANTE-Mosaic-3.5B",
        "",
        "All results in this table are **REAL_CANONICAL_RUN** — produced by official",
        "evaluation harnesses on the `OdaxAI/DANTE-Mosaic-3.5B` checkpoint.",
        "They are directly comparable to published leaderboard scores.",
        "",
        "| Benchmark | Harness | Task name | N | Few-shot | Metric | Score | Date | Source JSON |",
        "|-----------|---------|-----------|---|----------|--------|-------|------|-------------|",
    ]
    rows = []
    for task, rec in sorted(all_scores.items(), key=lambda item: item[1]["label"]):
        basename = os.path.basename(rec["source"])
        rows.append(
            f"| {rec['label']} | {rec['harness']} | `{task}` | {rec['n']} | "
            f"{rec['fewshot']}-shot | `{rec['metric']}` | **{rec['score']:.2f}%** | "
            f"{rec['date']} | `{basename}` |"
        )
    footer = [
        "",
        "## Hardware & Software",
        "",
        "| Property | Value |",
        "|----------|-------|",
        "| GPU | NVIDIA A100-SXM-40GB |",
        "| Precision | BF16 |",
        "| Cluster | CINECA Leonardo Booster |",
        "| lm-eval version | 0.4.5 |",
        "| Seed | 42 |",
        "",
        "## Comparability Note",
        "",
        "These canonical scores are produced under standard protocols and are directly",
        "comparable to published scores from the same harness versions.",
        "Internal offline subset scores (30/40/25 problems from `_benchmark_dante_offline.py`)",
        "are **separate** and must not be mixed with these canonical results.",
    ]
    path = f"{results_dir}/CANONICAL_PROVENANCE.md"
    with open(path, "w") as fh:
        fh.write("\n".join(header + rows + footer) + "\n")
    print(f" Provenance -> {path}")
|
||||
|
||||
|
||||
def main() -> None:
    """CLI entry point: parse all canonical results, print the table, and
    write the summary JSON and provenance markdown artifacts."""
    cli = argparse.ArgumentParser()
    cli.add_argument("--results-dir", default="evaluation/results/canonical")
    opts = cli.parse_args()

    results_dir = opts.results_dir
    print(f"\nParsing results from: {results_dir}")

    # lm-eval and code-eval tasks use disjoint keys, so a plain merge is safe.
    merged = {**parse_lm_eval(results_dir), **parse_code_eval(results_dir)}

    if not merged:
        print("No canonical results found yet.")
        print("Run evaluation/slurm_lm_eval.slurm and slurm_code_eval.slurm on Leonardo first.")
        sys.exit(0)

    print_table(merged)
    write_summary(merged, results_dir)
    write_provenance(merged, results_dir)
    print("\nDone.\n")
|
||||
|
||||
|
||||
# Script entry point when run directly (not on import).
if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user