Initialize project — model provided by the ModelHub XC community
Model: OdaxAI/DANTE-Mosaic-3.5B Source: Original Platform
This commit is contained in:
131
evaluation/slurm_code_eval.slurm
Normal file
131
evaluation/slurm_code_eval.slurm
Normal file
@@ -0,0 +1,131 @@
#!/bin/bash
#SBATCH --job-name=dante_code_eval
#SBATCH --account=AIFAC_F02_254_0
#SBATCH --partition=boost_usr_prod
#SBATCH --nodes=1
#SBATCH --ntasks-per-node=1
#SBATCH --gpus-per-node=1
#SBATCH --cpus-per-task=8
#SBATCH --mem=64G
#SBATCH --time=04:00:00
#SBATCH --output=evaluation/slurm_logs/code_eval_%j.out
#SBATCH --error=evaluation/slurm_logs/code_eval_%j.err

# ============================================================================
# DANTE-Mosaic-3.5B — Canonical code evaluation
# Leonardo Booster, 1x A100-40GB
#
# Tasks:   HumanEval (pass@1), MBPP (pass@1)
# Harness: bigcode-evaluation-harness
# All outputs -> evaluation/results/canonical/
#
# NOTE: Slurm opens the --output/--error files BEFORE this script executes,
# so the log directory must exist at submission time:
#     mkdir -p evaluation/slurm_logs && sbatch evaluation/slurm_code_eval.slurm
#
# WARNING: HumanEval/MBPP execute generated Python code.
# bigcode-evaluation-harness uses a sandboxed subprocess per sample.
# Do NOT run --allow_code_execution outside of a secure environment.
# Leonardo compute nodes are isolated — this is acceptable here.
# ============================================================================

set -euo pipefail

# ─── Environment ─────────────────────────────────────────────────────────────
module purge
module load cuda/12.4 python/3.11.7

export HF_HOME="/leonardo_scratch/large/userexternal/nsavioli/hf_cache"
export HF_DATASETS_CACHE="${HF_HOME}/datasets"
# TRANSFORMERS_CACHE is deprecated upstream in favor of HF_HUB_CACHE; kept
# for compatibility with older transformers versions on the cluster.
export TRANSFORMERS_CACHE="${HF_HOME}/transformers"
export HF_HUB_CACHE="${HF_HOME}/hub"
export TOKENIZERS_PARALLELISM=false

# ─── Config ──────────────────────────────────────────────────────────────────
readonly MODEL="OdaxAI/DANTE-Mosaic-3.5B"
readonly RESULTS="evaluation/results/canonical"
readonly BIGCODE_DIR="/leonardo_scratch/large/userexternal/nsavioli/bigcode-evaluation-harness"
TIMESTAMP=$(date +%Y%m%d_%H%M%S)
readonly TIMESTAMP
mkdir -p "${RESULTS}"

echo "======================================="
echo "DANTE-Mosaic-3.5B — Canonical code eval"
echo "Model: ${MODEL}"
echo "Job ID: ${SLURM_JOB_ID}"
echo "Timestamp: ${TIMESTAMP}"
echo "======================================="

# ─── Install bigcode harness if not present ───────────────────────────────────
if [ ! -d "${BIGCODE_DIR}" ]; then
  echo "Cloning bigcode-evaluation-harness..."
  git clone https://github.com/bigcode-project/bigcode-evaluation-harness.git \
    "${BIGCODE_DIR}"
else
  echo "bigcode-evaluation-harness found at ${BIGCODE_DIR}"
fi
pip install --quiet -e "${BIGCODE_DIR}"

#######################################
# Run one harness task with greedy decoding (pass@1, n_samples=1).
# Globals:   MODEL, RESULTS, TIMESTAMP, BIGCODE_DIR (read)
# Arguments: $1 - task name (humaneval | mbpp)
#            $2 - human-readable problem-count label for logging
# Outputs:   generations, metrics JSON, and a tee'd log under
#            ${RESULTS}/<task>_${TIMESTAMP}/
#######################################
run_task() {
  local task=$1
  local label=$2
  local out="${RESULTS}/${task}_${TIMESTAMP}"
  mkdir -p "${out}"

  echo ""
  echo ">>> ${task} pass@1 (${label}, greedy decoding)..."

  # NOTE(review): the harness's main.py historically declares --do_sample with
  # argparse type=bool, where the string "False" still parses truthy; the
  # --temperature 0.0 flag is what actually forces greedy decoding. Confirm
  # against the harness version checked out in ${BIGCODE_DIR}.
  accelerate launch "${BIGCODE_DIR}/main.py" \
    --model "${MODEL}" \
    --tasks "${task}" \
    --do_sample False \
    --temperature 0.0 \
    --n_samples 1 \
    --batch_size 8 \
    --allow_code_execution \
    --precision bf16 \
    --trust_remote_code \
    --save_generations \
    --save_generations_path "${out}/${task}_generations.json" \
    --metric_output_path "${out}/${task}_metrics.json" \
    2>&1 | tee "${out}/${task}.log"

  echo ">>> ${task} done -> ${out}/${task}_metrics.json"
}

# ─── HumanEval (pass@1, 164 problems, greedy, 0-shot) ────────────────────────
run_task humaneval "164 problems"

# ─── MBPP (pass@1, 374 problems, greedy, 0-shot) ─────────────────────────────
run_task mbpp "374 problems"

# ─── Summary ─────────────────────────────────────────────────────────────────
echo ""
echo "======================================="
echo "CODE EVAL COMPLETE"
python3 - <<'PYEOF'
import glob
import json


def find_pass1(obj):
    """Depth-first search for a 'pass@1' value in (possibly nested) metrics.

    bigcode-evaluation-harness nests scores under the task name, e.g.
    {"humaneval": {"pass@1": 0.42}, "config": {...}} — a flat d.get("pass@1")
    would miss it and fall back to dumping the whole dict.
    """
    if isinstance(obj, dict):
        if "pass@1" in obj:
            return obj["pass@1"]
        for value in obj.values():
            found = find_pass1(value)
            if found is not None:
                return found
    return None


for label, pattern in [
    ("HumanEval", "evaluation/results/canonical/humaneval_*/humaneval_metrics.json"),
    ("MBPP", "evaluation/results/canonical/mbpp_*/mbpp_metrics.json"),
]:
    files = sorted(glob.glob(pattern))
    if files:
        with open(files[-1]) as f:
            data = json.load(f)
        score = find_pass1(data)
        # Fall back to the raw dict so a schema change is still visible in logs.
        print(f" {label}: pass@1 = {score if score is not None else data}")
    else:
        print(f" {label}: no result file found")
PYEOF
echo "======================================="
Reference in New Issue
Block a user