#!/bin/bash
#SBATCH --job-name=dante_lm_eval
#SBATCH --account=AIFAC_F02_254_0
#SBATCH --partition=boost_usr_prod
#SBATCH --nodes=1
#SBATCH --ntasks-per-node=1
#SBATCH --gpus-per-node=1
#SBATCH --cpus-per-task=8
#SBATCH --mem=64G
#SBATCH --time=08:00:00
#SBATCH --output=evaluation/slurm_logs/lm_eval_%j.out
#SBATCH --error=evaluation/slurm_logs/lm_eval_%j.err

# ============================================================================
# DANTE-Mosaic-3.5B — Canonical lm-evaluation-harness benchmark
# Leonardo Booster, 1x A100-40GB
#
# Tasks: MMLU, GSM8K, ARC-Challenge, HellaSwag, TruthfulQA, Winogrande, IFEval
# Harness: EleutherAI lm-evaluation-harness 0.4.5
# All outputs -> evaluation/results/canonical/
# ============================================================================

set -euo pipefail

# ─── Environment ─────────────────────────────────────────────────────────────
module purge
module load cuda/12.4 python/3.11.7

export HF_HOME="/leonardo_scratch/large/userexternal/nsavioli/hf_cache"
export HF_DATASETS_CACHE="${HF_HOME}/datasets"
export TRANSFORMERS_CACHE="${HF_HOME}/transformers"
export HF_HUB_CACHE="${HF_HOME}/hub"
export TOKENIZERS_PARALLELISM=false

# ─── Config ──────────────────────────────────────────────────────────────────
MODEL="OdaxAI/DANTE-Mosaic-3.5B"
MODEL_ARGS="pretrained=${MODEL},dtype=bfloat16,trust_remote_code=True"
BATCH=8
SEED=42
TIMESTAMP=$(date +%Y%m%d_%H%M%S)
RESULTS="evaluation/results/canonical"

mkdir -p "${RESULTS}"

echo "======================================="
echo "DANTE-Mosaic-3.5B — Canonical lm-eval"
echo "Model: ${MODEL}"
echo "Job ID: ${SLURM_JOB_ID}"
echo "GPU: $(nvidia-smi --query-gpu=name --format=csv,noheader | head -1)"
echo "Timestamp: ${TIMESTAMP}"
echo "======================================="

# ─── Install harness if needed ───────────────────────────────────────────────
pip install --quiet lm-eval==0.4.5

#######################################
# Run a single lm-eval task.
# Globals:   MODEL_ARGS, BATCH, SEED, RESULTS, TIMESTAMP (all read-only here)
# Arguments: $1 - harness task name (e.g. "mmlu")
#            $2 - number of few-shot examples
#            $3 - (optional) extra whitespace-separated lm_eval flags
# Outputs:   JSON results + full log under ${RESULTS}/
# Returns:   non-zero if lm_eval fails (pipefail makes the failure propagate
#            through the tee pipeline, so set -e aborts the job).
#######################################
run_task() {
  local task=$1
  local fewshot=$2
  local extra="${3:-}"
  # NOTE(review): lm-eval 0.4.x may treat --output_path as a directory when
  # --log_samples is set — confirm the .json file actually lands at ${out}.
  local out="${RESULTS}/${task}_${TIMESTAMP}.json"

  echo ""
  echo ">>> ${task} (${fewshot}-shot) ..."

  # shellcheck disable=SC2086 — ${extra} is intentionally unquoted: it may
  # carry zero or more additional flags that must word-split.
  lm_eval \
    --model hf \
    --model_args "${MODEL_ARGS}" \
    --tasks "${task}" \
    --num_fewshot "${fewshot}" \
    --batch_size "${BATCH}" \
    --seed "${SEED}" \
    --output_path "${out}" \
    --log_samples \
    ${extra} \
    2>&1 | tee "${RESULTS}/${task}_${TIMESTAMP}.log"

  echo ">>> DONE ${task} -> ${out}"
}

# ─── Run all canonical tasks ─────────────────────────────────────────────────
# Few-shot counts follow the Open LLM Leaderboard / harness conventions.

# MMLU: 57 subjects, 5-shot, accuracy
run_task "mmlu" 5

# MMLU-Pro: harder 10-option MCQ, 5-shot
run_task "mmlu_pro" 5

# GSM8K: 8-shot chain-of-thought, exact match on final answer
run_task "gsm8k" 8

# ARC-Challenge: 25-shot, normalised accuracy
run_task "arc_challenge" 25

# HellaSwag: 10-shot, normalised accuracy
run_task "hellaswag" 10

# TruthfulQA MC2: 0-shot, mc2 multiple true answers
run_task "truthfulqa_mc2" 0

# Winogrande: 5-shot, accuracy
run_task "winogrande" 5

# IFEval: 0-shot, instruction-level strict accuracy
run_task "ifeval" 0

# ─── Summary ─────────────────────────────────────────────────────────────────
echo ""
echo "======================================="
echo "ALL TASKS COMPLETE"
echo "Results saved to: ${RESULTS}/"
ls -lh "${RESULTS}/"
echo "======================================="

# ─── Parse and print summary ─────────────────────────────────────────────────
# Quoted heredoc delimiter: the Python source is passed through literally,
# with no shell expansion.
python3 - <<'PYEOF'
import json, glob, os, sys

results_dir = "evaluation/results/canonical"
files = sorted(glob.glob(f"{results_dir}/*.json"))
if not files:
    print("No result JSON files found.")
    sys.exit(0)

# One score per task: harness metric key -> human-readable label.
METRIC_MAP = {
    "mmlu": ("acc,none", "MMLU"),
    "mmlu_pro": ("acc,none", "MMLU-Pro"),
    "gsm8k": ("exact_match,strict-match", "GSM8K"),
    "arc_challenge": ("acc_norm,none", "ARC-Challenge"),
    "hellaswag": ("acc_norm,none", "HellaSwag"),
    "truthfulqa_mc2": ("mc2,none", "TruthfulQA"),
    "winogrande": ("acc,none", "Winogrande"),
    "ifeval": ("prompt_level_strict_acc,none", "IFEval"),
}

print("\n" + "="*60)
print("  CANONICAL BENCHMARK RESULTS — DANTE-Mosaic-3.5B")
print("  lm-evaluation-harness 0.4.5 | BF16 | A100-40GB | seed=42")
print("="*60)
print(f"  {'Benchmark':<20} {'Metric':<35} {'Score':>8}")
print("  " + "-"*58)

# NOTE(review): result files from previous runs (older timestamps) in the
# same directory will also be printed — one row per matching file.
for f in files:
    try:
        with open(f) as fh:
            data = json.load(fh)
        results = data.get("results", {})
        for task_key, (metric_key, label) in METRIC_MAP.items():
            if task_key in results:
                score = results[task_key].get(metric_key, None)
                if score is not None:
                    print(f"  {label:<20} {metric_key:<35} {score*100:>7.2f}%")
    except Exception as e:
        print(f"  [parse error: {f}: {e}]")

print("="*60)
print(f"  Source: {results_dir}/")
print("="*60 + "\n")
PYEOF