#!/bin/bash
#SBATCH --job-name=dante_code_eval
#SBATCH --account=AIFAC_F02_254_0
#SBATCH --partition=boost_usr_prod
#SBATCH --nodes=1
#SBATCH --ntasks-per-node=1
#SBATCH --gpus-per-node=1
#SBATCH --cpus-per-task=8
#SBATCH --mem=64G
#SBATCH --time=04:00:00
#SBATCH --output=evaluation/slurm_logs/code_eval_%j.out
#SBATCH --error=evaluation/slurm_logs/code_eval_%j.err

# ============================================================================
# DANTE-Mosaic-3.5B — Canonical code evaluation
# Leonardo Booster, 1x A100-40GB
#
# Tasks: HumanEval (pass@1), MBPP (pass@1)
# Harness: bigcode-evaluation-harness
# All outputs -> evaluation/results/canonical/
#
# NOTE(review): SLURM opens the --output/--error files BEFORE this script
# runs, so evaluation/slurm_logs/ must already exist at *submit* time —
# `mkdir -p evaluation/slurm_logs` before calling sbatch, or the job fails
# to launch. A mkdir inside this script cannot fix that.
#
# WARNING: HumanEval executes generated Python code.
# bigcode-evaluation-harness uses a sandboxed subprocess per sample.
# Do NOT run --allow_code_execution outside of a secure environment.
# Leonardo compute nodes are isolated — this is acceptable here.
# ============================================================================

set -euo pipefail

# ─── Environment ─────────────────────────────────────────────────────────────
module purge
module load cuda/12.4 python/3.11.7

export HF_HOME="/leonardo_scratch/large/userexternal/nsavioli/hf_cache"
export HF_DATASETS_CACHE="${HF_HOME}/datasets"
# NOTE(review): TRANSFORMERS_CACHE is deprecated in recent transformers
# releases in favor of HF_HOME/HF_HUB_CACHE; kept for back-compat with
# whatever version the harness pins.
export TRANSFORMERS_CACHE="${HF_HOME}/transformers"
export HF_HUB_CACHE="${HF_HOME}/hub"
export TOKENIZERS_PARALLELISM=false

# ─── Config ──────────────────────────────────────────────────────────────────
MODEL="OdaxAI/DANTE-Mosaic-3.5B"
TIMESTAMP=$(date +%Y%m%d_%H%M%S)
RESULTS="evaluation/results/canonical"
BIGCODE_DIR="/leonardo_scratch/large/userexternal/nsavioli/bigcode-evaluation-harness"

mkdir -p "${RESULTS}"

echo "======================================="
echo "DANTE-Mosaic-3.5B — Canonical code eval"
echo "Model: ${MODEL}"
echo "Job ID: ${SLURM_JOB_ID}"
echo "Timestamp: ${TIMESTAMP}"
echo "======================================="

# ─── Install bigcode harness if not present ──────────────────────────────────
if [ ! -d "${BIGCODE_DIR}" ]; then
  echo "Cloning bigcode-evaluation-harness..."
  git clone https://github.com/bigcode-project/bigcode-evaluation-harness.git \
    "${BIGCODE_DIR}"
else
  echo "bigcode-evaluation-harness found at ${BIGCODE_DIR}"
fi
# Editable install is idempotent — run it unconditionally instead of
# duplicating the same command in both branches above.
pip install --quiet -e "${BIGCODE_DIR}"

# ─── HumanEval (pass@1, 164 problems, greedy, 0-shot) ────────────────────────
echo ""
echo ">>> HumanEval pass@1 (164 problems, greedy decoding)..."

HE_OUT="${RESULTS}/humaneval_${TIMESTAMP}"
mkdir -p "${HE_OUT}"

# NOTE(review): argparse `type=bool` treats ANY non-empty string ("False"
# included) as True — verify the harness actually parses `--do_sample False`
# as greedy; temperature 0.0 is the reliable greedy switch here.
accelerate launch "${BIGCODE_DIR}/main.py" \
  --model "${MODEL}" \
  --tasks humaneval \
  --do_sample False \
  --temperature 0.0 \
  --n_samples 1 \
  --batch_size 8 \
  --allow_code_execution \
  --precision bf16 \
  --trust_remote_code \
  --save_generations \
  --save_generations_path "${HE_OUT}/humaneval_generations.json" \
  --metric_output_path "${HE_OUT}/humaneval_metrics.json" \
  2>&1 | tee "${HE_OUT}/humaneval.log"

echo ">>> HumanEval done -> ${HE_OUT}/humaneval_metrics.json"

# ─── MBPP (pass@1, 374 problems, greedy, 0-shot) ─────────────────────────────
echo ""
echo ">>> MBPP pass@1 (374 problems, greedy decoding)..."
MBPP_OUT="${RESULTS}/mbpp_${TIMESTAMP}"
mkdir -p "${MBPP_OUT}"

# Same greedy pass@1 configuration as the HumanEval run above.
accelerate launch "${BIGCODE_DIR}/main.py" \
  --model "${MODEL}" \
  --tasks mbpp \
  --do_sample False \
  --temperature 0.0 \
  --n_samples 1 \
  --batch_size 8 \
  --allow_code_execution \
  --precision bf16 \
  --trust_remote_code \
  --save_generations \
  --save_generations_path "${MBPP_OUT}/mbpp_generations.json" \
  --metric_output_path "${MBPP_OUT}/mbpp_metrics.json" \
  2>&1 | tee "${MBPP_OUT}/mbpp.log"

echo ">>> MBPP done -> ${MBPP_OUT}/mbpp_metrics.json"

# ─── Summary ─────────────────────────────────────────────────────────────────
echo ""
echo "======================================="
echo "CODE EVAL COMPLETE"

python3 - <<'PYEOF'
import json, glob


def find_pass1(d):
    """Locate pass@1 in a harness metrics dict.

    The metric JSON may carry pass@1 at the top level or nested one level
    down under the task name (e.g. {"humaneval": {"pass@1": ...}}), so
    check both instead of dumping the whole dict when the top-level key
    is missing.
    """
    if isinstance(d, dict):
        if "pass@1" in d:
            return d["pass@1"]
        for v in d.values():
            if isinstance(v, dict) and "pass@1" in v:
                return v["pass@1"]
    return None


for label, pat in [
    ("HumanEval", "evaluation/results/canonical/humaneval_*/humaneval_metrics.json"),
    ("MBPP", "evaluation/results/canonical/mbpp_*/mbpp_metrics.json"),
]:
    files = sorted(glob.glob(pat))
    if files:
        # Lexicographic sort of timestamped dirs -> last entry is newest.
        with open(files[-1]) as f:
            data = json.load(f)
        score = find_pass1(data)
        if score is None:
            print(f" {label}: pass@1 not found in {files[-1]}")
        else:
            print(f" {label}: pass@1 = {score}")
    else:
        print(f" {label}: no result file found")
PYEOF

echo "======================================="