Initialize project — model provided by the ModelHub XC community
Model: OdaxAI/DANTE-Mosaic-3.5B Source: Original Platform
This commit is contained in:
131
evaluation/slurm_code_eval.slurm
Normal file
131
evaluation/slurm_code_eval.slurm
Normal file
@@ -0,0 +1,131 @@
#!/bin/bash
#SBATCH --job-name=dante_code_eval
#SBATCH --account=AIFAC_F02_254_0
#SBATCH --partition=boost_usr_prod
#SBATCH --nodes=1
#SBATCH --ntasks-per-node=1
#SBATCH --gpus-per-node=1
#SBATCH --cpus-per-task=8
#SBATCH --mem=64G
#SBATCH --time=04:00:00
#SBATCH --output=evaluation/slurm_logs/code_eval_%j.out
#SBATCH --error=evaluation/slurm_logs/code_eval_%j.err

# ============================================================================
# DANTE-Mosaic-3.5B — Canonical code evaluation
# Leonardo Booster, 1x A100-40GB
#
# Tasks:   HumanEval (pass@1), MBPP (pass@1)
# Harness: bigcode-evaluation-harness
# All outputs -> evaluation/results/canonical/
#
# NOTE: Slurm opens the --output/--error files BEFORE this script executes,
# so the log directory must exist at submission time:
#     mkdir -p evaluation/slurm_logs && sbatch evaluation/slurm_code_eval.slurm
#
# WARNING: HumanEval/MBPP execute generated Python code.
# bigcode-evaluation-harness uses a sandboxed subprocess per sample.
# Do NOT run --allow_code_execution outside of a secure environment.
# Leonardo compute nodes are isolated — this is acceptable here.
# ============================================================================

set -euo pipefail

# ─── Environment ─────────────────────────────────────────────────────────────
module purge
module load cuda/12.4 python/3.11.7

export HF_HOME="/leonardo_scratch/large/userexternal/nsavioli/hf_cache"
export HF_DATASETS_CACHE="${HF_HOME}/datasets"
# TRANSFORMERS_CACHE is deprecated upstream in favor of HF_HUB_CACHE; kept
# for compatibility with older transformers versions on the cluster.
export TRANSFORMERS_CACHE="${HF_HOME}/transformers"
export HF_HUB_CACHE="${HF_HOME}/hub"
export TOKENIZERS_PARALLELISM=false

# ─── Config ──────────────────────────────────────────────────────────────────
readonly MODEL="OdaxAI/DANTE-Mosaic-3.5B"
readonly RESULTS="evaluation/results/canonical"
readonly BIGCODE_DIR="/leonardo_scratch/large/userexternal/nsavioli/bigcode-evaluation-harness"
TIMESTAMP=$(date +%Y%m%d_%H%M%S)
readonly TIMESTAMP
mkdir -p "${RESULTS}"

echo "======================================="
echo "DANTE-Mosaic-3.5B — Canonical code eval"
echo "Model: ${MODEL}"
echo "Job ID: ${SLURM_JOB_ID}"
echo "Timestamp: ${TIMESTAMP}"
echo "======================================="

# ─── Install bigcode harness if not present ───────────────────────────────────
if [ ! -d "${BIGCODE_DIR}" ]; then
  echo "Cloning bigcode-evaluation-harness..."
  git clone https://github.com/bigcode-project/bigcode-evaluation-harness.git \
    "${BIGCODE_DIR}"
else
  echo "bigcode-evaluation-harness found at ${BIGCODE_DIR}"
fi
pip install --quiet -e "${BIGCODE_DIR}"

#######################################
# Run one harness task with greedy decoding (pass@1, n_samples=1).
# Globals:   MODEL, RESULTS, TIMESTAMP, BIGCODE_DIR (read)
# Arguments: $1 - task name (humaneval | mbpp)
#            $2 - human-readable problem-count label for logging
# Outputs:   generations, metrics JSON, and a tee'd log under
#            ${RESULTS}/<task>_${TIMESTAMP}/
#######################################
run_task() {
  local task=$1
  local label=$2
  local out="${RESULTS}/${task}_${TIMESTAMP}"
  mkdir -p "${out}"

  echo ""
  echo ">>> ${task} pass@1 (${label}, greedy decoding)..."

  # NOTE(review): the harness's main.py historically declares --do_sample with
  # argparse type=bool, where the string "False" still parses truthy; the
  # --temperature 0.0 flag is what actually forces greedy decoding. Confirm
  # against the harness version checked out in ${BIGCODE_DIR}.
  accelerate launch "${BIGCODE_DIR}/main.py" \
    --model "${MODEL}" \
    --tasks "${task}" \
    --do_sample False \
    --temperature 0.0 \
    --n_samples 1 \
    --batch_size 8 \
    --allow_code_execution \
    --precision bf16 \
    --trust_remote_code \
    --save_generations \
    --save_generations_path "${out}/${task}_generations.json" \
    --metric_output_path "${out}/${task}_metrics.json" \
    2>&1 | tee "${out}/${task}.log"

  echo ">>> ${task} done -> ${out}/${task}_metrics.json"
}

# ─── HumanEval (pass@1, 164 problems, greedy, 0-shot) ────────────────────────
run_task humaneval "164 problems"

# ─── MBPP (pass@1, 374 problems, greedy, 0-shot) ─────────────────────────────
run_task mbpp "374 problems"

# ─── Summary ─────────────────────────────────────────────────────────────────
echo ""
echo "======================================="
echo "CODE EVAL COMPLETE"
python3 - <<'PYEOF'
import glob
import json


def find_pass1(obj):
    """Depth-first search for a 'pass@1' value in (possibly nested) metrics.

    bigcode-evaluation-harness nests scores under the task name, e.g.
    {"humaneval": {"pass@1": 0.42}, "config": {...}} — a flat d.get("pass@1")
    would miss it and fall back to dumping the whole dict.
    """
    if isinstance(obj, dict):
        if "pass@1" in obj:
            return obj["pass@1"]
        for value in obj.values():
            found = find_pass1(value)
            if found is not None:
                return found
    return None


for label, pattern in [
    ("HumanEval", "evaluation/results/canonical/humaneval_*/humaneval_metrics.json"),
    ("MBPP", "evaluation/results/canonical/mbpp_*/mbpp_metrics.json"),
]:
    files = sorted(glob.glob(pattern))
    if files:
        with open(files[-1]) as f:
            data = json.load(f)
        score = find_pass1(data)
        # Fall back to the raw dict so a schema change is still visible in logs.
        print(f" {label}: pass@1 = {score if score is not None else data}")
    else:
        print(f" {label}: no result file found")
PYEOF
echo "======================================="
Reference in New Issue
Block a user