#!/bin/bash
#SBATCH --job-name=dante_code_eval
#SBATCH --account=AIFAC_F02_254_0
#SBATCH --partition=boost_usr_prod
#SBATCH --nodes=1
#SBATCH --ntasks-per-node=1
#SBATCH --gpus-per-node=1
#SBATCH --cpus-per-task=8
#SBATCH --mem=64G
#SBATCH --time=04:00:00
#SBATCH --output=evaluation/slurm_logs/code_eval_%j.out
#SBATCH --error=evaluation/slurm_logs/code_eval_%j.err

# ============================================================================
# DANTE-Mosaic-3.5B — Canonical code evaluation
# Leonardo Booster, 1x A100-40GB
#
# Tasks: HumanEval (pass@1), MBPP (pass@1)
# Harness: bigcode-evaluation-harness
# All outputs -> evaluation/results/canonical/
#
# NOTE(review): SLURM opens the --output/--error files BEFORE this script
# runs, so evaluation/slurm_logs/ must already exist at *submit* time —
# `mkdir -p evaluation/slurm_logs` before calling sbatch, or the job fails
# to launch. A mkdir inside this script cannot fix that.
#
# WARNING: HumanEval executes generated Python code.
# bigcode-evaluation-harness uses a sandboxed subprocess per sample.
# Do NOT run --allow_code_execution outside of a secure environment.
# Leonardo compute nodes are isolated — this is acceptable here.
# ============================================================================

set -euo pipefail

# ─── Environment ─────────────────────────────────────────────────────────────
module purge
module load cuda/12.4 python/3.11.7

export HF_HOME="/leonardo_scratch/large/userexternal/nsavioli/hf_cache"
export HF_DATASETS_CACHE="${HF_HOME}/datasets"
# NOTE(review): TRANSFORMERS_CACHE is deprecated in recent transformers
# releases in favor of HF_HOME/HF_HUB_CACHE; kept for back-compat with
# whatever version the harness pins.
export TRANSFORMERS_CACHE="${HF_HOME}/transformers"
export HF_HUB_CACHE="${HF_HOME}/hub"
export TOKENIZERS_PARALLELISM=false

# ─── Config ──────────────────────────────────────────────────────────────────
MODEL="OdaxAI/DANTE-Mosaic-3.5B"
TIMESTAMP=$(date +%Y%m%d_%H%M%S)
RESULTS="evaluation/results/canonical"
BIGCODE_DIR="/leonardo_scratch/large/userexternal/nsavioli/bigcode-evaluation-harness"

mkdir -p "${RESULTS}"

echo "======================================="
echo "DANTE-Mosaic-3.5B — Canonical code eval"
echo "Model: ${MODEL}"
echo "Job ID: ${SLURM_JOB_ID}"
echo "Timestamp: ${TIMESTAMP}"
echo "======================================="

# ─── Install bigcode harness if not present ──────────────────────────────────
if [ ! -d "${BIGCODE_DIR}" ]; then
  echo "Cloning bigcode-evaluation-harness..."
  git clone https://github.com/bigcode-project/bigcode-evaluation-harness.git \
    "${BIGCODE_DIR}"
else
  echo "bigcode-evaluation-harness found at ${BIGCODE_DIR}"
fi
# Editable install is idempotent — run it unconditionally instead of
# duplicating the same command in both branches above.
pip install --quiet -e "${BIGCODE_DIR}"

# ─── HumanEval (pass@1, 164 problems, greedy, 0-shot) ────────────────────────
echo ""
echo ">>> HumanEval pass@1 (164 problems, greedy decoding)..."

HE_OUT="${RESULTS}/humaneval_${TIMESTAMP}"
mkdir -p "${HE_OUT}"

# NOTE(review): argparse `type=bool` treats ANY non-empty string ("False"
# included) as True — verify the harness actually parses `--do_sample False`
# as greedy; temperature 0.0 is the reliable greedy switch here.
accelerate launch "${BIGCODE_DIR}/main.py" \
  --model "${MODEL}" \
  --tasks humaneval \
  --do_sample False \
  --temperature 0.0 \
  --n_samples 1 \
  --batch_size 8 \
  --allow_code_execution \
  --precision bf16 \
  --trust_remote_code \
  --save_generations \
  --save_generations_path "${HE_OUT}/humaneval_generations.json" \
  --metric_output_path "${HE_OUT}/humaneval_metrics.json" \
  2>&1 | tee "${HE_OUT}/humaneval.log"

echo ">>> HumanEval done -> ${HE_OUT}/humaneval_metrics.json"

# ─── MBPP (pass@1, 374 problems, greedy, 0-shot) ─────────────────────────────
echo ""
echo ">>> MBPP pass@1 (374 problems, greedy decoding)..."
MBPP_OUT="${RESULTS}/mbpp_${TIMESTAMP}"
mkdir -p "${MBPP_OUT}"

# Same greedy pass@1 configuration as the HumanEval run above.
accelerate launch "${BIGCODE_DIR}/main.py" \
  --model "${MODEL}" \
  --tasks mbpp \
  --do_sample False \
  --temperature 0.0 \
  --n_samples 1 \
  --batch_size 8 \
  --allow_code_execution \
  --precision bf16 \
  --trust_remote_code \
  --save_generations \
  --save_generations_path "${MBPP_OUT}/mbpp_generations.json" \
  --metric_output_path "${MBPP_OUT}/mbpp_metrics.json" \
  2>&1 | tee "${MBPP_OUT}/mbpp.log"

echo ">>> MBPP done -> ${MBPP_OUT}/mbpp_metrics.json"

# ─── Summary ─────────────────────────────────────────────────────────────────
echo ""
echo "======================================="
echo "CODE EVAL COMPLETE"

python3 - <<'PYEOF'
import json, glob


def find_pass1(d):
    """Locate pass@1 in a harness metrics dict.

    The metric JSON may carry pass@1 at the top level or nested one level
    down under the task name (e.g. {"humaneval": {"pass@1": ...}}), so
    check both instead of dumping the whole dict when the top-level key
    is missing.
    """
    if isinstance(d, dict):
        if "pass@1" in d:
            return d["pass@1"]
        for v in d.values():
            if isinstance(v, dict) and "pass@1" in v:
                return v["pass@1"]
    return None


for label, pat in [
    ("HumanEval", "evaluation/results/canonical/humaneval_*/humaneval_metrics.json"),
    ("MBPP", "evaluation/results/canonical/mbpp_*/mbpp_metrics.json"),
]:
    files = sorted(glob.glob(pat))
    if files:
        # Lexicographic sort of timestamped dirs -> last entry is newest.
        with open(files[-1]) as f:
            data = json.load(f)
        score = find_pass1(data)
        if score is None:
            print(f" {label}: pass@1 not found in {files[-1]}")
        else:
            print(f" {label}: pass@1 = {score}")
    else:
        print(f" {label}: no result file found")
PYEOF

echo "======================================="