#!/bin/bash
#SBATCH --job-name=dante_code_eval
#SBATCH --account=AIFAC_F02_254_0
#SBATCH --partition=boost_usr_prod
#SBATCH --nodes=1
#SBATCH --ntasks-per-node=1
#SBATCH --gpus-per-node=1
#SBATCH --cpus-per-task=8
#SBATCH --mem=64G
#SBATCH --time=04:00:00
#SBATCH --output=evaluation/slurm_logs/code_eval_%j.out
#SBATCH --error=evaluation/slurm_logs/code_eval_%j.err
# NOTE(review): SLURM does not create log directories itself — make sure
# evaluation/slurm_logs/ exists before `sbatch` submission, otherwise the job
# will likely fail at dispatch. It cannot be created inside this script
# (the script only runs after the log files are opened).
# ============================================================================
# DANTE-Mosaic-3.5B — Canonical code evaluation
# Leonardo Booster, 1x A100-40GB
#
# Tasks: HumanEval (pass@1), MBPP (pass@1)
# Harness: bigcode-evaluation-harness
# All outputs -> evaluation/results/canonical/
#
# WARNING: HumanEval executes generated Python code.
# bigcode-evaluation-harness uses a sandboxed subprocess per sample.
# Do NOT run --allow_code_execution outside of a secure environment.
# Leonardo compute nodes are isolated — this is acceptable here.
# ============================================================================
set -euo pipefail

# ─── Environment ─────────────────────────────────────────────────────────────
module purge
module load cuda/12.4 python/3.11.7

# All Hugging Face caches live under a single scratch root; the derived paths
# are exported so child processes (accelerate, the harness) inherit them.
# NOTE(review): TRANSFORMERS_CACHE is presumably kept for older transformers
# versions — newer releases derive it from HF_HOME; harmless either way.
HF_HOME="/leonardo_scratch/large/userexternal/nsavioli/hf_cache"
HF_DATASETS_CACHE="${HF_HOME}/datasets"
TRANSFORMERS_CACHE="${HF_HOME}/transformers"
HF_HUB_CACHE="${HF_HOME}/hub"
TOKENIZERS_PARALLELISM=false
export HF_HOME HF_DATASETS_CACHE TRANSFORMERS_CACHE HF_HUB_CACHE TOKENIZERS_PARALLELISM
# ─── Config ──────────────────────────────────────────────────────────────────
# Constants are readonly so later code cannot clobber them accidentally.
readonly MODEL="OdaxAI/DANTE-Mosaic-3.5B"
# Split declaration from command substitution so a failing `date` is not
# masked by the assignment's exit status.
TIMESTAMP=$(date +%Y%m%d_%H%M%S)
readonly TIMESTAMP
readonly RESULTS="evaluation/results/canonical"
readonly BIGCODE_DIR="/leonardo_scratch/large/userexternal/nsavioli/bigcode-evaluation-harness"
mkdir -p "${RESULTS}"
echo "======================================="
echo "DANTE-Mosaic-3.5B — Canonical code eval"
echo "Model: ${MODEL}"
# Default keeps the banner working under `set -u` in local dry-runs where
# SLURM_JOB_ID is unset; under SLURM the value is unchanged.
echo "Job ID: ${SLURM_JOB_ID:-local}"
echo "Timestamp: ${TIMESTAMP}"
echo "======================================="
# ─── Install bigcode harness if not present ───────────────────────────────────
# Clone only when missing; the editable install runs in both cases, so it is
# hoisted out of the branches instead of being duplicated in each one.
if [ ! -d "${BIGCODE_DIR}" ]; then
  echo "Cloning bigcode-evaluation-harness..."
  git clone https://github.com/bigcode-project/bigcode-evaluation-harness.git \
    "${BIGCODE_DIR}"
else
  echo "bigcode-evaluation-harness found at ${BIGCODE_DIR}"
fi
# `python -m pip` guarantees the install targets the interpreter from the
# loaded python/3.11.7 module, not whatever `pip` happens to resolve to.
python -m pip install --quiet -e "${BIGCODE_DIR}"
# ─── HumanEval (pass@1, 164 problems, greedy, 0-shot) ────────────────────────
echo ""
echo ">>> HumanEval pass@1 (164 problems, greedy decoding)..."
HE_OUT="${RESULTS}/humaneval_${TIMESTAMP}"
mkdir -p "${HE_OUT}"
# Harness arguments collected in an array: identical argv, easier to audit.
he_args=(
  --model "${MODEL}"
  --tasks humaneval
  --do_sample False
  --temperature 0.0
  --n_samples 1
  --batch_size 8
  --allow_code_execution
  --precision bf16
  --trust_remote_code
  --save_generations
  --save_generations_path "${HE_OUT}/humaneval_generations.json"
  --metric_output_path "${HE_OUT}/humaneval_metrics.json"
)
accelerate launch "${BIGCODE_DIR}/main.py" "${he_args[@]}" \
  2>&1 | tee "${HE_OUT}/humaneval.log"
echo ">>> HumanEval done -> ${HE_OUT}/humaneval_metrics.json"
# ─── MBPP (pass@1, 374 problems, greedy, 0-shot) ─────────────────────────────
echo ""
echo ">>> MBPP pass@1 (374 problems, greedy decoding)..."
MBPP_OUT="${RESULTS}/mbpp_${TIMESTAMP}"
mkdir -p "${MBPP_OUT}"
# Harness arguments collected in an array: identical argv, easier to audit.
mbpp_args=(
  --model "${MODEL}"
  --tasks mbpp
  --do_sample False
  --temperature 0.0
  --n_samples 1
  --batch_size 8
  --allow_code_execution
  --precision bf16
  --trust_remote_code
  --save_generations
  --save_generations_path "${MBPP_OUT}/mbpp_generations.json"
  --metric_output_path "${MBPP_OUT}/mbpp_metrics.json"
)
accelerate launch "${BIGCODE_DIR}/main.py" "${mbpp_args[@]}" \
  2>&1 | tee "${MBPP_OUT}/mbpp.log"
echo ">>> MBPP done -> ${MBPP_OUT}/mbpp_metrics.json"
# ─── Summary ─────────────────────────────────────────────────────────────────
# Print the latest pass@1 for each task from the metric JSON files.
echo ""
echo "======================================="
echo "CODE EVAL COMPLETE"
python3 - <<'PYEOF'
import glob
import json

for label, pat in [
    ("HumanEval", "evaluation/results/canonical/humaneval_*/humaneval_metrics.json"),
    ("MBPP", "evaluation/results/canonical/mbpp_*/mbpp_metrics.json"),
]:
    files = sorted(glob.glob(pat))
    if not files:
        print(f" {label}: no result file found")
        continue
    with open(files[-1]) as f:
        d = json.load(f)
    # The harness typically nests scores under the task name, e.g.
    # {"humaneval": {"pass@1": ...}, "config": {...}} — check the top level
    # first, then one level down, rather than dumping the whole dict.
    score = d.get("pass@1")
    if score is None:
        for v in d.values():
            if isinstance(v, dict) and "pass@1" in v:
                score = v["pass@1"]
                break
    print(f" {label}: pass@1 = {score if score is not None else d}")
PYEOF
echo "======================================="