Initialize project; model provided by the ModelHub XC community

Model: OdaxAI/DANTE-Mosaic-3.5B
Source: Original Platform
ModelHub XC
2026-05-14 15:44:10 +08:00
commit b0ba87406b
41 changed files with 1638 additions and 0 deletions

@@ -0,0 +1,156 @@
#!/bin/bash
#SBATCH --job-name=dante_lm_eval
#SBATCH --account=AIFAC_F02_254_0
#SBATCH --partition=boost_usr_prod
#SBATCH --nodes=1
#SBATCH --ntasks-per-node=1
#SBATCH --gpus-per-node=1
#SBATCH --cpus-per-task=8
#SBATCH --mem=64G
#SBATCH --time=08:00:00
#SBATCH --output=evaluation/slurm_logs/lm_eval_%j.out
#SBATCH --error=evaluation/slurm_logs/lm_eval_%j.err
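#
# NOTE: Slurm does not create the --output/--error directories itself; if
# evaluation/slurm_logs is missing at submission time, the job logs are lost
# (or the job fails to start, depending on cluster config). Create it first,
# e.g. (the script name here is illustrative):
#   mkdir -p evaluation/slurm_logs
#   sbatch evaluation/run_lm_eval.sbatch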
# ============================================================================
# DANTE-Mosaic-3.5B — Canonical lm-evaluation-harness benchmark
# Leonardo Booster, 1x A100-40GB
#
# Tasks: MMLU, MMLU-Pro, GSM8K, ARC-Challenge, HellaSwag, TruthfulQA, Winogrande, IFEval
# Harness: EleutherAI lm-evaluation-harness 0.4.5
# All outputs -> evaluation/results/canonical/
# ============================================================================
set -euo pipefail
# ─── Environment ─────────────────────────────────────────────────────────────
module purge
module load cuda/12.4 python/3.11.7
export HF_HOME="/leonardo_scratch/large/userexternal/nsavioli/hf_cache"
export HF_DATASETS_CACHE="${HF_HOME}/datasets"
export TRANSFORMERS_CACHE="${HF_HOME}/transformers"
export HF_HUB_CACHE="${HF_HOME}/hub"
export TOKENIZERS_PARALLELISM=false
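# Leonardo compute nodes usually have no outbound internet access, so pre-fetch
# the model and benchmark datasets into ${HF_HOME} from a login node first.
# Once the cache is warm, offline mode can be forced (optional):
# export HF_HUB_OFFLINE=1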
# ─── Config ──────────────────────────────────────────────────────────────────
MODEL="OdaxAI/DANTE-Mosaic-3.5B"
MODEL_ARGS="pretrained=${MODEL},dtype=bfloat16,trust_remote_code=True"
BATCH=8
SEED=42
TIMESTAMP=$(date +%Y%m%d_%H%M%S)
RESULTS="evaluation/results/canonical"
mkdir -p "${RESULTS}"
echo "======================================="
echo "DANTE-Mosaic-3.5B — Canonical lm-eval"
echo "Model: ${MODEL}"
echo "Job ID: ${SLURM_JOB_ID}"
echo "GPU: $(nvidia-smi --query-gpu=name --format=csv,noheader | head -1)"
echo "Timestamp: ${TIMESTAMP}"
echo "======================================="
# ─── Install harness if needed ────────────────────────────────────────────────
pip install --quiet lm-eval==0.4.5
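# Optional smoke test before spending the full GPU allocation: --limit caps
# the number of examples per task (sketch, run interactively):
#   lm_eval --model hf --model_args "${MODEL_ARGS}" \
#     --tasks hellaswag --num_fewshot 10 --batch_size "${BATCH}" --limit 8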
# ─── Helper: run one task ─────────────────────────────────────────────────────
run_task() {
    local TASK=$1
    local FEWSHOT=$2
    local EXTRA="${3:-}"
    local OUT="${RESULTS}/${TASK}_${TIMESTAMP}.json"
    echo ""
    echo ">>> ${TASK} (${FEWSHOT}-shot) ..."
    lm_eval \
        --model hf \
        --model_args "${MODEL_ARGS}" \
        --tasks "${TASK}" \
        --num_fewshot "${FEWSHOT}" \
        --batch_size "${BATCH}" \
        --seed "${SEED}" \
        --output_path "${OUT}" \
        --log_samples \
        ${EXTRA} \
        2>&1 | tee "${RESULTS}/${TASK}_${TIMESTAMP}.log"
    echo ">>> DONE ${TASK} -> ${OUT}"
}
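# The optional third argument is forwarded verbatim to lm_eval, e.g.
# (hypothetical) to pin generation settings on a generative task:
#   run_task "gsm8k" 8 "--gen_kwargs temperature=0"
#
# Note (assumption): lm-eval >= 0.4.3 treats --output_path as a directory and
# writes results to <path>/<model>/results_<date>.json; the summary parser
# below globs recursively so either layout is picked up.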
# ─── Run all canonical tasks ──────────────────────────────────────────────────
# MMLU: 57 subjects, 5-shot, accuracy
run_task "mmlu" 5
# MMLU-Pro: harder 10-option MCQ, 5-shot
run_task "mmlu_pro" 5
# GSM8K: 8-shot chain-of-thought, exact match on final answer
run_task "gsm8k" 8
# ARC-Challenge: 25-shot, normalised accuracy
run_task "arc_challenge" 25
# HellaSwag: 10-shot, normalised accuracy
run_task "hellaswag" 10
# TruthfulQA MC2: 0-shot, mc2 (probability mass assigned to the set of true answers)
run_task "truthfulqa_mc2" 0
# Winogrande: 5-shot, accuracy
run_task "winogrande" 5
# IFEval: 0-shot, prompt-level strict accuracy
run_task "ifeval" 0
# ─── Summary ─────────────────────────────────────────────────────────────────
echo ""
echo "======================================="
echo "ALL TASKS COMPLETE"
echo "Results saved to: ${RESULTS}/"
ls -lh "${RESULTS}/"
echo "======================================="
# ─── Parse and print summary ─────────────────────────────────────────────────
python3 - <<'PYEOF'
import json, glob, os, sys
results_dir = "evaluation/results/canonical"
# Glob recursively: newer harness versions nest results in per-model subdirs.
files = sorted(f for f in glob.glob(f"{results_dir}/**/*.json", recursive=True)
               if os.path.isfile(f))
if not files:
    print("No result JSON files found.")
    sys.exit(0)
# One score per task
METRIC_MAP = {
    "mmlu": ("acc,none", "MMLU"),
    "mmlu_pro": ("acc,none", "MMLU-Pro"),
    "gsm8k": ("exact_match,strict-match", "GSM8K"),
    "arc_challenge": ("acc_norm,none", "ARC-Challenge"),
    "hellaswag": ("acc_norm,none", "HellaSwag"),
    "truthfulqa_mc2": ("mc2,none", "TruthfulQA"),
    "winogrande": ("acc,none", "Winogrande"),
    "ifeval": ("prompt_level_strict_acc,none", "IFEval"),
}
print("\n" + "="*60)
print(" CANONICAL BENCHMARK RESULTS — DANTE-Mosaic-3.5B")
print(" lm-evaluation-harness 0.4.5 | BF16 | A100-40GB | seed=42")
print("="*60)
print(f" {'Benchmark':<20} {'Metric':<35} {'Score':>8}")
print(" " + "-"*58)
for f in files:
    try:
        with open(f) as fh:
            data = json.load(fh)
        results = data.get("results", {})
        for task_key, (metric_key, label) in METRIC_MAP.items():
            if task_key in results:
                score = results[task_key].get(metric_key, None)
                if score is not None:
                    print(f" {label:<20} {metric_key:<35} {score*100:>7.2f}%")
    except Exception as e:
        print(f" [parse error: {f}: {e}]")
print("="*60)
print(f" Source: {results_dir}/")
print("="*60 + "\n")
PYEOF