#!/bin/bash
#SBATCH --job-name=dante_code_eval
#SBATCH --account=AIFAC_F02_254_0
#SBATCH --partition=boost_usr_prod
#SBATCH --nodes=1
#SBATCH --ntasks-per-node=1
#SBATCH --gpus-per-node=1
#SBATCH --cpus-per-task=8
#SBATCH --mem=64G
#SBATCH --time=04:00:00
#SBATCH --output=evaluation/slurm_logs/code_eval_%j.out
#SBATCH --error=evaluation/slurm_logs/code_eval_%j.err
# NOTE(review): sbatch does not create the slurm_logs directory — it must
# exist before submission or the job fails to write its logs. Confirm it is
# created by the submission workflow.

# ============================================================================
# DANTE-Mosaic-3.5B — Canonical code evaluation
# Leonardo Booster, 1x A100-40GB
#
# Tasks: HumanEval (pass@1), MBPP (pass@1)
# Harness: bigcode-evaluation-harness
# All outputs -> evaluation/results/canonical/
#
# WARNING: HumanEval executes generated Python code.
# bigcode-evaluation-harness uses a sandboxed subprocess per sample.
# Do NOT run --allow_code_execution outside of a secure environment.
# Leonardo compute nodes are isolated — this is acceptable here.
# ============================================================================

# Fail fast: -e aborts on any error, -u on unset variables, and pipefail
# makes `accelerate launch ... | tee` fail when the launcher fails.
set -euo pipefail
# ─── Environment ─────────────────────────────────────────────────────────────
module purge
module load cuda/12.4 python/3.11.7

# Route every Hugging Face cache to scratch (home quotas are small on Leonardo).
export HF_HOME="/leonardo_scratch/large/userexternal/nsavioli/hf_cache"
export HF_DATASETS_CACHE="${HF_HOME}/datasets"
# TRANSFORMERS_CACHE is deprecated in favor of HF_HOME/HF_HUB_CACHE; kept for
# compatibility with older transformers versions that still read it.
export TRANSFORMERS_CACHE="${HF_HOME}/transformers"
export HF_HUB_CACHE="${HF_HOME}/hub"
# Avoid tokenizers' fork-related parallelism warnings/deadlocks under launchers.
export TOKENIZERS_PARALLELISM=false
# ─── Config ──────────────────────────────────────────────────────────────────
MODEL="OdaxAI/DANTE-Mosaic-3.5B"
# Assign before marking readonly so a failing command substitution is not
# masked under `set -e`.
TIMESTAMP=$(date +%Y%m%d_%H%M%S)
RESULTS="evaluation/results/canonical"
BIGCODE_DIR="/leonardo_scratch/large/userexternal/nsavioli/bigcode-evaluation-harness"
readonly MODEL TIMESTAMP RESULTS BIGCODE_DIR

mkdir -p "${RESULTS}"
echo "======================================="
echo "DANTE-Mosaic-3.5B — Canonical code eval"
echo "Model: ${MODEL}"
# Default guards against `set -u` aborting when the script is run outside
# sbatch (e.g. a manual debug run on a login node, where SLURM_JOB_ID is unset).
echo "Job ID: ${SLURM_JOB_ID:-local}"
echo "Timestamp: ${TIMESTAMP}"
echo "======================================="
# ─── Install bigcode harness if not present ───────────────────────────────────
# Clone on first run; always (re)install in editable mode so the currently
# checked-out harness revision is what actually runs. The pip install was
# duplicated in both branches — hoisted after the if.
if [ ! -d "${BIGCODE_DIR}" ]; then
  echo "Cloning bigcode-evaluation-harness..."
  git clone https://github.com/bigcode-project/bigcode-evaluation-harness.git \
    "${BIGCODE_DIR}"
else
  echo "bigcode-evaluation-harness found at ${BIGCODE_DIR}"
fi
pip install --quiet -e "${BIGCODE_DIR}"
# ─── HumanEval (pass@1, 164 problems, greedy, 0-shot) ────────────────────────
echo ""
echo ">>> HumanEval pass@1 (164 problems, greedy decoding)..."
HE_OUT="${RESULTS}/humaneval_${TIMESTAMP}"
mkdir -p "${HE_OUT}"

# Greedy decoding (do_sample=False, temperature 0) with one sample per problem
# gives a deterministic pass@1.
# --allow_code_execution is required: the harness executes generated code in a
# sandboxed subprocess per sample (see header warning before changing this).
accelerate launch "${BIGCODE_DIR}/main.py" \
  --model "${MODEL}" \
  --tasks humaneval \
  --do_sample False \
  --temperature 0.0 \
  --n_samples 1 \
  --batch_size 8 \
  --allow_code_execution \
  --precision bf16 \
  --trust_remote_code \
  --save_generations \
  --save_generations_path "${HE_OUT}/humaneval_generations.json" \
  --metric_output_path "${HE_OUT}/humaneval_metrics.json" \
  2>&1 | tee "${HE_OUT}/humaneval.log"

echo ">>> HumanEval done -> ${HE_OUT}/humaneval_metrics.json"
# ─── MBPP (pass@1, 374 problems, greedy, 0-shot) ─────────────────────────────
echo ""
echo ">>> MBPP pass@1 (374 problems, greedy decoding)..."
MBPP_OUT="${RESULTS}/mbpp_${TIMESTAMP}"
mkdir -p "${MBPP_OUT}"

# Same deterministic greedy setup as HumanEval; MBPP tests are likewise run
# in the harness's sandboxed subprocess (--allow_code_execution).
accelerate launch "${BIGCODE_DIR}/main.py" \
  --model "${MODEL}" \
  --tasks mbpp \
  --do_sample False \
  --temperature 0.0 \
  --n_samples 1 \
  --batch_size 8 \
  --allow_code_execution \
  --precision bf16 \
  --trust_remote_code \
  --save_generations \
  --save_generations_path "${MBPP_OUT}/mbpp_generations.json" \
  --metric_output_path "${MBPP_OUT}/mbpp_metrics.json" \
  2>&1 | tee "${MBPP_OUT}/mbpp.log"

echo ">>> MBPP done -> ${MBPP_OUT}/mbpp_metrics.json"
# ─── Summary ─────────────────────────────────────────────────────────────────
echo ""
echo "======================================="
echo "CODE EVAL COMPLETE"
python3 - <<'PYEOF'
import glob
import json


def latest_pass1(pattern):
    """Return pass@1 from the newest metrics file matching pattern, else None.

    bigcode-evaluation-harness typically nests metrics under the task name,
    e.g. {"humaneval": {"pass@1": 0.31}, "config": {...}}, so look one level
    down if the key is not at the top level.
    """
    files = sorted(glob.glob(pattern))
    if not files:
        return None
    with open(files[-1]) as f:
        d = json.load(f)
    if "pass@1" in d:
        return d["pass@1"]
    for value in d.values():
        if isinstance(value, dict) and "pass@1" in value:
            return value["pass@1"]
    return d  # unknown layout: print the raw payload rather than nothing


for label, pat in [
    ("HumanEval", "evaluation/results/canonical/humaneval_*/humaneval_metrics.json"),
    ("MBPP", "evaluation/results/canonical/mbpp_*/mbpp_metrics.json"),
]:
    score = latest_pass1(pat)
    if score is None:
        print(f" {label}: no result file found")
    else:
        print(f" {label}: pass@1 = {score}")
PYEOF
echo "======================================="