Initialize project; model provided by the ModelHub XC community
Model: OdaxAI/DANTE-Mosaic-3.5B Source: Original Platform
This commit is contained in:
156
evaluation/slurm_lm_eval.slurm
Normal file
156
evaluation/slurm_lm_eval.slurm
Normal file
@@ -0,0 +1,156 @@
#!/bin/bash
#SBATCH --job-name=dante_lm_eval
#SBATCH --account=AIFAC_F02_254_0
#SBATCH --partition=boost_usr_prod
#SBATCH --nodes=1
#SBATCH --ntasks-per-node=1
#SBATCH --gpus-per-node=1
#SBATCH --cpus-per-task=8
#SBATCH --mem=64G
#SBATCH --time=08:00:00
# NOTE(review): Slurm opens the stdout/stderr files before the job script
# runs, so evaluation/slurm_logs/ must already exist at submission time
# (create it once with `mkdir -p evaluation/slurm_logs`), otherwise the
# job fails to launch on most Slurm configurations.
#SBATCH --output=evaluation/slurm_logs/lm_eval_%j.out
#SBATCH --error=evaluation/slurm_logs/lm_eval_%j.err
# ============================================================================
# DANTE-Mosaic-3.5B — Canonical lm-evaluation-harness benchmark
# Leonardo Booster, 1x A100-40GB
#
# Tasks: MMLU, GSM8K, ARC-Challenge, HellaSwag, TruthfulQA, Winogrande, IFEval
# Harness: EleutherAI lm-evaluation-harness 0.4.5
# All outputs -> evaluation/results/canonical/
# ============================================================================

# Abort on any error, on use of an unset variable, and on a failure
# anywhere in a pipeline (so a failed lm_eval | tee stops the job).
set -euo pipefail

# ─── Environment ─────────────────────────────────────────────────────────────
module purge
module load cuda/12.4 python/3.11.7

# Route every Hugging Face cache to scratch storage.
export HF_HOME="/leonardo_scratch/large/userexternal/nsavioli/hf_cache"
export HF_DATASETS_CACHE="${HF_HOME}/datasets"
# TRANSFORMERS_CACHE is deprecated in recent transformers releases but is
# kept here for backward compatibility; HF_HUB_CACHE is the current variable.
export TRANSFORMERS_CACHE="${HF_HOME}/transformers"
export HF_HUB_CACHE="${HF_HOME}/hub"
# Silence the tokenizers fork-after-parallelism warning in subprocesses.
export TOKENIZERS_PARALLELISM=false
# ─── Config ──────────────────────────────────────────────────────────────────
MODEL="OdaxAI/DANTE-Mosaic-3.5B"
# trust_remote_code=True is required because the model ships custom code;
# only safe because the model source is our own organization.
MODEL_ARGS="pretrained=${MODEL},dtype=bfloat16,trust_remote_code=True"
BATCH=8
SEED=42
# One timestamp shared by every task in this job, so all outputs of a run
# sort together and never overwrite an earlier run.
TIMESTAMP=$(date +%Y%m%d_%H%M%S)
RESULTS="evaluation/results/canonical"
mkdir -p "${RESULTS}"
echo "======================================="
echo "DANTE-Mosaic-3.5B — Canonical lm-eval"
echo "Model: ${MODEL}"
# Default keeps `set -u` from aborting when the script is run interactively
# (outside sbatch SLURM_JOB_ID is unset).
echo "Job ID: ${SLURM_JOB_ID:-interactive}"
echo "GPU: $(nvidia-smi --query-gpu=name --format=csv,noheader | head -1)"
echo "Timestamp: ${TIMESTAMP}"
echo "======================================="

# ─── Install harness if needed ────────────────────────────────────────────────
# Pinned so results stay comparable across runs; no-op if already installed.
pip install --quiet lm-eval==0.4.5
# ─── Helper: run one task ─────────────────────────────────────────────────────
# run_task TASK FEWSHOT [EXTRA]
#   TASK    - lm-eval task name (e.g. "mmlu")
#   FEWSHOT - number of few-shot examples passed to --num_fewshot
#   EXTRA   - optional extra CLI flags; deliberately left unquoted below so
#             a space-separated string expands into multiple arguments
# Writes ${RESULTS}/${TASK}_${TIMESTAMP}.json plus a full .log of the run.
# Under `set -o pipefail`, a failing lm_eval aborts the whole job.
run_task() {
    local TASK=$1
    local FEWSHOT=$2
    local EXTRA="${3:-}"
    local OUT="${RESULTS}/${TASK}_${TIMESTAMP}.json"

    echo ""
    echo ">>> ${TASK} (${FEWSHOT}-shot) ..."
    lm_eval \
        --model hf \
        --model_args "${MODEL_ARGS}" \
        --tasks "${TASK}" \
        --num_fewshot "${FEWSHOT}" \
        --batch_size "${BATCH}" \
        --seed "${SEED}" \
        --output_path "${OUT}" \
        --log_samples \
        ${EXTRA} \
        2>&1 | tee "${RESULTS}/${TASK}_${TIMESTAMP}.log"
    echo ">>> DONE ${TASK} -> ${OUT}"
}
# ─── Run all canonical tasks ──────────────────────────────────────────────────
# Shot counts follow the Open LLM Leaderboard / harness conventions.

# MMLU: 57 subjects, 5-shot, accuracy
run_task "mmlu" 5

# MMLU-Pro: harder 10-option MCQ, 5-shot
run_task "mmlu_pro" 5

# GSM8K: 8-shot chain-of-thought, exact match on final answer
run_task "gsm8k" 8

# ARC-Challenge: 25-shot, normalised accuracy
run_task "arc_challenge" 25

# HellaSwag: 10-shot, normalised accuracy
run_task "hellaswag" 10

# TruthfulQA MC2: 0-shot, mc2 multiple true answers
run_task "truthfulqa_mc2" 0

# Winogrande: 5-shot, accuracy
run_task "winogrande" 5

# IFEval: 0-shot, instruction-level strict accuracy
run_task "ifeval" 0
# ─── Summary ─────────────────────────────────────────────────────────────────
echo ""
echo "======================================="
echo "ALL TASKS COMPLETE"
echo "Results saved to: ${RESULTS}/"
ls -lh "${RESULTS}/"
echo "======================================="
# ─── Parse and print summary ─────────────────────────────────────────────────
|
||||
python3 - <<'PYEOF'
|
||||
import json, glob, os, sys
|
||||
|
||||
results_dir = "evaluation/results/canonical"
|
||||
files = sorted(glob.glob(f"{results_dir}/*.json"))
|
||||
if not files:
|
||||
print("No result JSON files found.")
|
||||
sys.exit(0)
|
||||
|
||||
# One score per task
|
||||
METRIC_MAP = {
|
||||
"mmlu": ("acc,none", "MMLU"),
|
||||
"mmlu_pro": ("acc,none", "MMLU-Pro"),
|
||||
"gsm8k": ("exact_match,strict-match", "GSM8K"),
|
||||
"arc_challenge": ("acc_norm,none", "ARC-Challenge"),
|
||||
"hellaswag": ("acc_norm,none", "HellaSwag"),
|
||||
"truthfulqa_mc2": ("mc2,none", "TruthfulQA"),
|
||||
"winogrande": ("acc,none", "Winogrande"),
|
||||
"ifeval": ("prompt_level_strict_acc,none", "IFEval"),
|
||||
}
|
||||
|
||||
print("\n" + "="*60)
|
||||
print(" CANONICAL BENCHMARK RESULTS — DANTE-Mosaic-3.5B")
|
||||
print(" lm-evaluation-harness 0.4.5 | BF16 | A100-40GB | seed=42")
|
||||
print("="*60)
|
||||
print(f" {'Benchmark':<20} {'Metric':<35} {'Score':>8}")
|
||||
print(" " + "-"*58)
|
||||
|
||||
for f in files:
|
||||
try:
|
||||
with open(f) as fh:
|
||||
data = json.load(fh)
|
||||
results = data.get("results", {})
|
||||
for task_key, (metric_key, label) in METRIC_MAP.items():
|
||||
if task_key in results:
|
||||
score = results[task_key].get(metric_key, None)
|
||||
if score is not None:
|
||||
print(f" {label:<20} {metric_key:<35} {score*100:>7.2f}%")
|
||||
except Exception as e:
|
||||
print(f" [parse error: {f}: {e}]")
|
||||
|
||||
print("="*60)
|
||||
print(f" Source: {results_dir}/")
|
||||
print("="*60 + "\n")
|
||||
PYEOF
|
||||
Reference in New Issue
Block a user