#!/bin/bash
#SBATCH --job-name=dante_lm_eval
#SBATCH --account=AIFAC_F02_254_0
#SBATCH --partition=boost_usr_prod
#SBATCH --nodes=1
#SBATCH --ntasks-per-node=1
#SBATCH --gpus-per-node=1
#SBATCH --cpus-per-task=8
#SBATCH --mem=64G
#SBATCH --time=08:00:00
#SBATCH --output=evaluation/slurm_logs/lm_eval_%j.out
#SBATCH --error=evaluation/slurm_logs/lm_eval_%j.err

# ============================================================================
# DANTE-Mosaic-3.5B — Canonical lm-evaluation-harness benchmark
# Leonardo Booster, 1x A100-40GB
#
# Tasks: MMLU, MMLU-Pro, GSM8K, ARC-Challenge, HellaSwag, TruthfulQA, Winogrande, IFEval
# Harness: EleutherAI lm-evaluation-harness 0.4.5
# All outputs -> evaluation/results/canonical/
# ============================================================================
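
# Submit with (script path is illustrative; adjust to the repo layout):
#   mkdir -p evaluation/slurm_logs   # SLURM does not create the --output directory
#   sbatch evaluation/run_canonical_eval.sh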

set -euo pipefail

# ─── Environment ─────────────────────────────────────────────────────────────
module purge
module load cuda/12.4 python/3.11.7

export HF_HOME="/leonardo_scratch/large/userexternal/nsavioli/hf_cache"
export HF_DATASETS_CACHE="${HF_HOME}/datasets"
export TRANSFORMERS_CACHE="${HF_HOME}/transformers"
export HF_HUB_CACHE="${HF_HOME}/hub"
export TOKENIZERS_PARALLELISM=false
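
# Compute nodes may lack outbound internet access; if the model and datasets
# are already cached under HF_HOME, offline mode avoids Hub lookups (optional):
# export HF_HUB_OFFLINE=1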

# ─── Config ──────────────────────────────────────────────────────────────────
MODEL="OdaxAI/DANTE-Mosaic-3.5B"
MODEL_ARGS="pretrained=${MODEL},dtype=bfloat16,trust_remote_code=True"
BATCH=8
SEED=42
TIMESTAMP=$(date +%Y%m%d_%H%M%S)
RESULTS="evaluation/results/canonical"
mkdir -p "${RESULTS}"
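
# The HF backend also accepts a local directory for pretrained=, so a local
# checkpoint can be evaluated by pointing MODEL at its path instead of the
# Hub ID (path illustrative):
#   MODEL="/leonardo_scratch/large/userexternal/nsavioli/checkpoints/dante-mosaic-3.5b"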

echo "======================================="
echo "DANTE-Mosaic-3.5B — Canonical lm-eval"
echo "Model: ${MODEL}"
echo "Job ID: ${SLURM_JOB_ID}"
echo "GPU: $(nvidia-smi --query-gpu=name --format=csv,noheader | head -1)"
echo "Timestamp: ${TIMESTAMP}"
echo "======================================="

# ─── Install harness if needed ────────────────────────────────────────────────
pip install --quiet lm-eval==0.4.5
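
# If the module-provided Python is not user-writable, install into a scratch
# venv instead (paths illustrative):
#   python3 -m venv "${HF_HOME}/lm_eval_venv"
#   source "${HF_HOME}/lm_eval_venv/bin/activate"
#   pip install --quiet lm-eval==0.4.5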

# ─── Helper: run one task ─────────────────────────────────────────────────────
run_task() {
    local TASK=$1
    local FEWSHOT=$2
    local EXTRA="${3:-}"
    local OUT="${RESULTS}/${TASK}_${TIMESTAMP}.json"

    echo ""
    echo ">>> ${TASK} (${FEWSHOT}-shot) ..."
    lm_eval \
        --model hf \
        --model_args "${MODEL_ARGS}" \
        --tasks "${TASK}" \
        --num_fewshot "${FEWSHOT}" \
        --batch_size "${BATCH}" \
        --seed "${SEED}" \
        --output_path "${OUT}" \
        --log_samples \
        ${EXTRA} \
        2>&1 | tee "${RESULTS}/${TASK}_${TIMESTAMP}.log"
    echo ">>> DONE ${TASK} -> ${OUT}"
}
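
# None of the calls below use the optional third argument; it forwards extra
# flags to lm_eval, e.g. to pin generation settings (illustrative):
#   run_task "gsm8k" 8 "--gen_kwargs temperature=0"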

# ─── Run all canonical tasks ──────────────────────────────────────────────────
# MMLU: 57 subjects, 5-shot, accuracy
run_task "mmlu" 5

# MMLU-Pro: harder 10-option MCQ, 5-shot
run_task "mmlu_pro" 5

# GSM8K: 8-shot chain-of-thought, exact match on final answer
run_task "gsm8k" 8

# ARC-Challenge: 25-shot, normalised accuracy
run_task "arc_challenge" 25

# HellaSwag: 10-shot, normalised accuracy
run_task "hellaswag" 10

# TruthfulQA MC2: 0-shot, MC2 scoring over multiple true answers
run_task "truthfulqa_mc2" 0

# Winogrande: 5-shot, accuracy
run_task "winogrande" 5

# IFEval: 0-shot, prompt-level strict accuracy
run_task "ifeval" 0

# ─── Summary ─────────────────────────────────────────────────────────────────
echo ""
echo "======================================="
echo "ALL TASKS COMPLETE"
echo "Results saved to: ${RESULTS}/"
ls -lh "${RESULTS}/"
echo "======================================="

# ─── Parse and print summary ─────────────────────────────────────────────────
python3 - <<'PYEOF'
import json, glob, sys

results_dir = "evaluation/results/canonical"
# Search recursively: depending on the harness version, --output_path may be
# treated as a file or as a directory that gets a model-name subfolder.
files = sorted(glob.glob(f"{results_dir}/**/*.json", recursive=True))
if not files:
    print("No result JSON files found.")
    sys.exit(0)

# One score per task; result keys follow the harness "metric,filter" convention
METRIC_MAP = {
    "mmlu": ("acc,none", "MMLU"),
    "mmlu_pro": ("acc,none", "MMLU-Pro"),
    "gsm8k": ("exact_match,strict-match", "GSM8K"),
    "arc_challenge": ("acc_norm,none", "ARC-Challenge"),
    "hellaswag": ("acc_norm,none", "HellaSwag"),
    "truthfulqa_mc2": ("acc,none", "TruthfulQA"),  # MC2 score is reported under "acc" in 0.4.x
    "winogrande": ("acc,none", "Winogrande"),
    "ifeval": ("prompt_level_strict_acc,none", "IFEval"),
}

print("\n" + "="*60)
print(" CANONICAL BENCHMARK RESULTS — DANTE-Mosaic-3.5B")
print(" lm-evaluation-harness 0.4.5 | BF16 | A100-40GB | seed=42")
print("="*60)
print(f" {'Benchmark':<20} {'Metric':<35} {'Score':>8}")
print(" " + "-"*58)

# Collect one score per task; if a task appears in several runs, the file that
# sorts last (i.e. the newest timestamp) wins, so re-runs do not print
# duplicate rows.
scores = {}
for f in files:
    try:
        with open(f) as fh:
            data = json.load(fh)
        results = data.get("results", {})
        for task_key, (metric_key, label) in METRIC_MAP.items():
            if task_key in results:
                score = results[task_key].get(metric_key)
                if score is not None:
                    scores[task_key] = score
    except (OSError, json.JSONDecodeError) as e:
        print(f" [parse error: {f}: {e}]")

for task_key, (metric_key, label) in METRIC_MAP.items():
    if task_key in scores:
        print(f" {label:<20} {metric_key:<35} {scores[task_key]*100:>7.2f}%")

print("="*60)
print(f" Source: {results_dir}/")
print("="*60 + "\n")
PYEOF
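
# For a quick manual look at any single result file (requires jq; glob
# illustrative, as the harness may nest results under a model subdirectory):
#   jq '.results' evaluation/results/canonical/mmlu_*.json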