#!/bin/bash
#SBATCH --job-name=dante_code_eval
#SBATCH --account=AIFAC_F02_254_0
#SBATCH --partition=boost_usr_prod
#SBATCH --nodes=1
#SBATCH --ntasks-per-node=1
#SBATCH --gpus-per-node=1
#SBATCH --cpus-per-task=8
#SBATCH --mem=64G
#SBATCH --time=04:00:00
#SBATCH --output=evaluation/slurm_logs/code_eval_%j.out
#SBATCH --error=evaluation/slurm_logs/code_eval_%j.err
# NOTE(review): sbatch does not create the slurm_logs directory — it must
# exist before submission or the job fails to write its logs. Confirm it is
# created by the submission workflow.

# ============================================================================
# DANTE-Mosaic-3.5B — Canonical code evaluation
# Leonardo Booster, 1x A100-40GB
#
# Tasks: HumanEval (pass@1), MBPP (pass@1)
# Harness: bigcode-evaluation-harness
# All outputs -> evaluation/results/canonical/
#
# WARNING: HumanEval executes generated Python code.
# bigcode-evaluation-harness uses a sandboxed subprocess per sample.
# Do NOT run --allow_code_execution outside of a secure environment.
# Leonardo compute nodes are isolated — this is acceptable here.
# ============================================================================

# Fail fast: -e aborts on any error, -u on unset variables, and pipefail
# makes `accelerate launch ... | tee` fail when the launcher fails.
set -euo pipefail
# ─── Environment ─────────────────────────────────────────────────────────────
module purge
module load cuda/12.4 python/3.11.7

# Route every Hugging Face cache to scratch (home quotas are small on Leonardo).
export HF_HOME="/leonardo_scratch/large/userexternal/nsavioli/hf_cache"
export HF_DATASETS_CACHE="${HF_HOME}/datasets"
# TRANSFORMERS_CACHE is deprecated in favor of HF_HOME/HF_HUB_CACHE; kept for
# compatibility with older transformers versions that still read it.
export TRANSFORMERS_CACHE="${HF_HOME}/transformers"
export HF_HUB_CACHE="${HF_HOME}/hub"
# Avoid tokenizers' fork-related parallelism warnings/deadlocks under launchers.
export TOKENIZERS_PARALLELISM=false
# ─── Config ──────────────────────────────────────────────────────────────────
MODEL="OdaxAI/DANTE-Mosaic-3.5B"
# Assign before marking readonly so a failing command substitution is not
# masked under `set -e`.
TIMESTAMP=$(date +%Y%m%d_%H%M%S)
RESULTS="evaluation/results/canonical"
BIGCODE_DIR="/leonardo_scratch/large/userexternal/nsavioli/bigcode-evaluation-harness"
readonly MODEL TIMESTAMP RESULTS BIGCODE_DIR

mkdir -p "${RESULTS}"
echo "======================================="
echo "DANTE-Mosaic-3.5B — Canonical code eval"
echo "Model: ${MODEL}"
# Default guards against `set -u` aborting when the script is run outside
# sbatch (e.g. a manual debug run on a login node, where SLURM_JOB_ID is unset).
echo "Job ID: ${SLURM_JOB_ID:-local}"
echo "Timestamp: ${TIMESTAMP}"
echo "======================================="
# ─── Install bigcode harness if not present ───────────────────────────────────
# Clone on first run; always (re)install in editable mode so the currently
# checked-out harness revision is what actually runs. The pip install was
# duplicated in both branches — hoisted after the if.
if [ ! -d "${BIGCODE_DIR}" ]; then
  echo "Cloning bigcode-evaluation-harness..."
  git clone https://github.com/bigcode-project/bigcode-evaluation-harness.git \
    "${BIGCODE_DIR}"
else
  echo "bigcode-evaluation-harness found at ${BIGCODE_DIR}"
fi
pip install --quiet -e "${BIGCODE_DIR}"
# ─── HumanEval (pass@1, 164 problems, greedy, 0-shot) ────────────────────────
echo ""
echo ">>> HumanEval pass@1 (164 problems, greedy decoding)..."
HE_OUT="${RESULTS}/humaneval_${TIMESTAMP}"
mkdir -p "${HE_OUT}"

# Greedy decoding (do_sample=False, temperature 0) with one sample per problem
# gives a deterministic pass@1.
# --allow_code_execution is required: the harness executes generated code in a
# sandboxed subprocess per sample (see header warning before changing this).
accelerate launch "${BIGCODE_DIR}/main.py" \
  --model "${MODEL}" \
  --tasks humaneval \
  --do_sample False \
  --temperature 0.0 \
  --n_samples 1 \
  --batch_size 8 \
  --allow_code_execution \
  --precision bf16 \
  --trust_remote_code \
  --save_generations \
  --save_generations_path "${HE_OUT}/humaneval_generations.json" \
  --metric_output_path "${HE_OUT}/humaneval_metrics.json" \
  2>&1 | tee "${HE_OUT}/humaneval.log"

echo ">>> HumanEval done -> ${HE_OUT}/humaneval_metrics.json"
# ─── MBPP (pass@1, 374 problems, greedy, 0-shot) ─────────────────────────────
echo ""
echo ">>> MBPP pass@1 (374 problems, greedy decoding)..."
MBPP_OUT="${RESULTS}/mbpp_${TIMESTAMP}"
mkdir -p "${MBPP_OUT}"

# Same deterministic greedy setup as HumanEval; MBPP tests are likewise run
# in the harness's sandboxed subprocess (--allow_code_execution).
accelerate launch "${BIGCODE_DIR}/main.py" \
  --model "${MODEL}" \
  --tasks mbpp \
  --do_sample False \
  --temperature 0.0 \
  --n_samples 1 \
  --batch_size 8 \
  --allow_code_execution \
  --precision bf16 \
  --trust_remote_code \
  --save_generations \
  --save_generations_path "${MBPP_OUT}/mbpp_generations.json" \
  --metric_output_path "${MBPP_OUT}/mbpp_metrics.json" \
  2>&1 | tee "${MBPP_OUT}/mbpp.log"

echo ">>> MBPP done -> ${MBPP_OUT}/mbpp_metrics.json"
# ─── Summary ─────────────────────────────────────────────────────────────────
echo ""
echo "======================================="
echo "CODE EVAL COMPLETE"
python3 - <<'PYEOF'
import glob
import json


def latest_pass1(pattern):
    """Return pass@1 from the newest metrics file matching pattern, else None.

    bigcode-evaluation-harness typically nests metrics under the task name,
    e.g. {"humaneval": {"pass@1": 0.31}, "config": {...}}, so look one level
    down if the key is not at the top level.
    """
    files = sorted(glob.glob(pattern))
    if not files:
        return None
    with open(files[-1]) as f:
        d = json.load(f)
    if "pass@1" in d:
        return d["pass@1"]
    for value in d.values():
        if isinstance(value, dict) and "pass@1" in value:
            return value["pass@1"]
    return d  # unknown layout: print the raw payload rather than nothing


for label, pat in [
    ("HumanEval", "evaluation/results/canonical/humaneval_*/humaneval_metrics.json"),
    ("MBPP", "evaluation/results/canonical/mbpp_*/mbpp_metrics.json"),
]:
    score = latest_pass1(pat)
    if score is None:
        print(f" {label}: no result file found")
    else:
        print(f" {label}: pass@1 = {score}")
PYEOF
echo "======================================="