初始化项目,由ModelHub XC社区提供模型
Model: OdaxAI/DANTE-Mosaic-3.5B Source: Original Platform
This commit is contained in:
61
.gitattributes
vendored
Normal file
61
.gitattributes
vendored
Normal file
@@ -0,0 +1,61 @@
|
||||
*.7z filter=lfs diff=lfs merge=lfs -text
|
||||
*.arrow filter=lfs diff=lfs merge=lfs -text
|
||||
*.bin filter=lfs diff=lfs merge=lfs -text
|
||||
*.bz2 filter=lfs diff=lfs merge=lfs -text
|
||||
*.ckpt filter=lfs diff=lfs merge=lfs -text
|
||||
*.ftz filter=lfs diff=lfs merge=lfs -text
|
||||
*.gz filter=lfs diff=lfs merge=lfs -text
|
||||
*.h5 filter=lfs diff=lfs merge=lfs -text
|
||||
*.joblib filter=lfs diff=lfs merge=lfs -text
|
||||
*.lfs.* filter=lfs diff=lfs merge=lfs -text
|
||||
*.mlmodel filter=lfs diff=lfs merge=lfs -text
|
||||
*.model filter=lfs diff=lfs merge=lfs -text
|
||||
*.msgpack filter=lfs diff=lfs merge=lfs -text
|
||||
*.npy filter=lfs diff=lfs merge=lfs -text
|
||||
*.npz filter=lfs diff=lfs merge=lfs -text
|
||||
*.onnx filter=lfs diff=lfs merge=lfs -text
|
||||
*.ot filter=lfs diff=lfs merge=lfs -text
|
||||
*.parquet filter=lfs diff=lfs merge=lfs -text
|
||||
*.pb filter=lfs diff=lfs merge=lfs -text
|
||||
*.pickle filter=lfs diff=lfs merge=lfs -text
|
||||
*.pkl filter=lfs diff=lfs merge=lfs -text
|
||||
*.pt filter=lfs diff=lfs merge=lfs -text
|
||||
*.pth filter=lfs diff=lfs merge=lfs -text
|
||||
*.rar filter=lfs diff=lfs merge=lfs -text
|
||||
*.safetensors filter=lfs diff=lfs merge=lfs -text
|
||||
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
||||
*.tar.* filter=lfs diff=lfs merge=lfs -text
|
||||
*.tflite filter=lfs diff=lfs merge=lfs -text
|
||||
*.tgz filter=lfs diff=lfs merge=lfs -text
|
||||
*.wasm filter=lfs diff=lfs merge=lfs -text
|
||||
*.xz filter=lfs diff=lfs merge=lfs -text
|
||||
*.zip filter=lfs diff=lfs merge=lfs -text
|
||||
*.zst filter=lfs diff=lfs merge=lfs -text
|
||||
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
||||
tokenizer.json filter=lfs diff=lfs merge=lfs -text
|
||||
assets/fig1_architecture.png filter=lfs diff=lfs merge=lfs -text
|
||||
assets/fig2_eval_status.png filter=lfs diff=lfs merge=lfs -text
|
||||
assets/fig3_training_loss.png filter=lfs diff=lfs merge=lfs -text
|
||||
assets/fig4_compute_efficiency.png filter=lfs diff=lfs merge=lfs -text
|
||||
assets/fig5_benchmark_bars.png filter=lfs diff=lfs merge=lfs -text
|
||||
assets/fig6_throughput.png filter=lfs diff=lfs merge=lfs -text
|
||||
assets/fig7_pipeline.png filter=lfs diff=lfs merge=lfs -text
|
||||
assets/fig_arch.png filter=lfs diff=lfs merge=lfs -text
|
||||
assets/fig_innovations.png filter=lfs diff=lfs merge=lfs -text
|
||||
assets/fig_pipeline.png filter=lfs diff=lfs merge=lfs -text
|
||||
assets/fig_efficiency.png filter=lfs diff=lfs merge=lfs -text
|
||||
assets/fig_method.png filter=lfs diff=lfs merge=lfs -text
|
||||
assets/fig1_pipeline.png filter=lfs diff=lfs merge=lfs -text
|
||||
assets/fig2_method.png filter=lfs diff=lfs merge=lfs -text
|
||||
assets/fig3_efficiency.png filter=lfs diff=lfs merge=lfs -text
|
||||
assets/fig4_benchmarks.png filter=lfs diff=lfs merge=lfs -text
|
||||
assets/fig6_arch_compare.png filter=lfs diff=lfs merge=lfs -text
|
||||
assets/fig7_full_suite.png filter=lfs diff=lfs merge=lfs -text
|
||||
assets/v2_architecture.png filter=lfs diff=lfs merge=lfs -text
|
||||
assets/v2_objective.png filter=lfs diff=lfs merge=lfs -text
|
||||
assets/v2_pipeline.png filter=lfs diff=lfs merge=lfs -text
|
||||
assets/v2_pipeline_detail.png filter=lfs diff=lfs merge=lfs -text
|
||||
assets/v2_results_compute.png filter=lfs diff=lfs merge=lfs -text
|
||||
DANTE-Mosaic-3.5B-paper.pdf filter=lfs diff=lfs merge=lfs -text
|
||||
assets/capability_compute.png filter=lfs diff=lfs merge=lfs -text
|
||||
assets/scoreboard.png filter=lfs diff=lfs merge=lfs -text
|
||||
3
DANTE-Mosaic-3.5B-paper.pdf
Normal file
3
DANTE-Mosaic-3.5B-paper.pdf
Normal file
@@ -0,0 +1,3 @@
|
||||
version https://git-lfs.github.com/spec/v1
|
||||
oid sha256:e0cb4e5174c40f932654df634e77a96fc8a854e0b7ab498a161fe189f8134245
|
||||
size 528224
|
||||
404
README.md
Normal file
404
README.md
Normal file
@@ -0,0 +1,404 @@
|
||||
---
|
||||
language:
|
||||
- en
|
||||
- it
|
||||
- fr
|
||||
- de
|
||||
- es
|
||||
- pt
|
||||
- nl
|
||||
- ru
|
||||
- zh
|
||||
license: apache-2.0
|
||||
tags:
|
||||
- distillation
|
||||
- knowledge-distillation
|
||||
- moe
|
||||
- dense
|
||||
- causal-lm
|
||||
- research
|
||||
base_model: HuggingFaceTB/SmolLM3-3B
|
||||
pipeline_tag: text-generation
|
||||
library_name: transformers
|
||||
datasets:
|
||||
- uonlp/CulturaX
|
||||
- bigcode/the-stack-v2
|
||||
model-index:
|
||||
- name: DANTE-Mosaic-3.5B
|
||||
results:
|
||||
- task:
|
||||
type: text-generation
|
||||
dataset:
|
||||
type: hellaswag
|
||||
name: HellaSwag (10-shot, acc_norm, lm-eval 0.4.5, n=10042)
|
||||
metrics:
|
||||
- type: acc_norm
|
||||
value: 76.73
|
||||
name: acc_norm
|
||||
- task:
|
||||
type: text-generation
|
||||
dataset:
|
||||
type: gsm8k
|
||||
name: GSM8K (8-shot, strict-match, lm-eval 0.4.5, n=1319)
|
||||
metrics:
|
||||
- type: exact_match
|
||||
value: 74.45
|
||||
name: exact_match
|
||||
- task:
|
||||
type: text-generation
|
||||
dataset:
|
||||
type: arc_challenge
|
||||
name: ARC-Challenge (25-shot, acc_norm, lm-eval 0.4.5, n=1172)
|
||||
metrics:
|
||||
- type: acc_norm
|
||||
value: 62.71
|
||||
name: acc_norm
|
||||
- task:
|
||||
type: text-generation
|
||||
dataset:
|
||||
type: mmlu
|
||||
name: MMLU (5-shot, lm-eval-harness 0.4.5, full split, n=14042)
|
||||
metrics:
|
||||
- type: accuracy
|
||||
value: 59.38
|
||||
name: acc
|
||||
- task:
|
||||
type: text-generation
|
||||
dataset:
|
||||
type: mbpp
|
||||
name: MBPP (pass@1, greedy, bigcode-evaluation-harness, n=374 test)
|
||||
metrics:
|
||||
- type: pass@1
|
||||
value: 42.60
|
||||
name: pass@1
|
||||
- task:
|
||||
type: text-generation
|
||||
dataset:
|
||||
type: mmlu_pro
|
||||
name: MMLU-Pro (5-shot, exact_match, lm-eval 0.4.5, n=4500)
|
||||
metrics:
|
||||
- type: exact_match
|
||||
value: 39.74
|
||||
name: exact_match
|
||||
- task:
|
||||
type: text-generation
|
||||
dataset:
|
||||
type: humaneval
|
||||
name: HumanEval (pass@1, greedy, bigcode-evaluation-harness, n=164)
|
||||
metrics:
|
||||
- type: pass@1
|
||||
value: 6.71
|
||||
name: pass@1
|
||||
---
|
||||
|
||||
# DANTE-Mosaic-3.5B
|
||||
|
||||
**OdaxAI Research Team — May 2026**
|
||||
|
||||

|
||||
|
||||
> **Headline:** **#1 on MMLU**, **#2 on MMLU-Pro**, **tied #1 on GSM8K** with Qwen3-4B-Base, **#1 on HellaSwag** — across the standard 3 B–4 B open-weight basket (SmolLM3-3B, Qwen 2.5-3B, Llama 3.2-3B, Qwen3-1.7B-Base, Qwen3-4B-Base). Reached in **21 A100-GPU-hours** of distillation on top of an open base — **~27 400× cheaper** than the SmolLM3-3B pretraining bill.
|
||||
|
||||
A compact **3.08 B-parameter** dense causal LM produced by **DANTE generative cross-tokenizer distillation** from the trillion-scale MoE teacher [Kimi K2](https://huggingface.co/moonshotai/Kimi-K2-Instruct) (W4A16, vLLM TP=16). No logits, no shared vocabulary, no RLHF. The distillation **method** is the contribution.
|
||||
|
||||

|
||||
|
||||
---
|
||||
|
||||
## Canonical benchmarks — measured on the released checkpoint
|
||||
|
||||
> All scores: **lm-evaluation-harness v0.4.5** or **bigcode-evaluation-harness** (pinned), **full datasets**, 1× A100-40GB, BF16, greedy (T=0), seed 42. No subsets, no prompt engineering, no chain-of-thought injection.
|
||||
|
||||
| Benchmark | N | Setting | **DANTE-Mosaic-3.5B** |
|
||||
|---|---:|---|---:|
|
||||
| **HellaSwag** | 10 042 | 10-shot, acc_norm | **76.73 %** |
|
||||
| **GSM8K** | 1 319 | 8-shot, strict-match | **74.45 %** |
|
||||
| **ARC-Challenge** | 1 172 | 25-shot, acc_norm | **62.71 %** |
|
||||
| **MMLU** | 14 042 | 5-shot, acc | **59.38 %** |
|
||||
| **MBPP** | 374 | pass@1, 0-shot greedy | **42.60 %** |
|
||||
| **MMLU-Pro** | 4 500 | 5-shot, exact_match | **39.74 %** |
|
||||
| **HumanEval** | 164 | pass@1, greedy | **6.71 %** |
|
||||
|
||||

|
||||
|
||||
**Key observations**
|
||||
- **GSM8K 74.45 % beats SmolLM3-3B base 67.4 %** (+7 pp) — strongest signal of capability transfer.
|
||||
- **HellaSwag 76.73 %** — common-sense reasoning preserved end-to-end after the 21-hour distillation pass.
|
||||
- **ARC-Challenge 62.71 %** — competitive with much heavier instruct models in the 3–4 B band.
|
||||
- **HumanEval 6.7 %** — algorithmic-coding is the clear target for the next iteration. Reported as a measured limit, not hidden.
|
||||
|
||||
---
|
||||
|
||||
## Why this model
|
||||
|
||||
DANTE-Mosaic-3.5B is **not** a SOTA contender at 3 B. Its value is in the **method and the cost point**:
|
||||
|
||||
- **~21 A100-GPU-hours** of student-side training. Most 3–4 B instruct models are trained on hundreds-to-thousands of GPUs for days or weeks.
|
||||
- **Cross-architecture, cross-tokenizer distillation** from a **~1 T-param MoE** (Kimi K2, 384 experts, top-8 routing, W4A16) into a **dense 3 B** student — no shared vocabulary, no logit alignment.
|
||||
- **Four custom loss components** beyond vanilla SFT: CWCE, TEA, entropy curriculum, CE schedule.
|
||||
- **44 k teacher completions** — the only training signal. No human labels, no DPO, no RLHF.
|
||||
- **Fully open**: model weights, training script, configs, eval harness, seeds, [technical report](https://huggingface.co/OdaxAI/DANTE-Mosaic-3.5B/blob/main/DANTE-Mosaic-3.5B-paper.pdf).
|
||||
- **Apache-2.0**, runs on a single A100-40GB / RTX 4090.
|
||||
|
||||
---
|
||||
|
||||
## Architecture: teacher and student
|
||||
|
||||

|
||||
|
||||
### Teacher — Kimi K2 (frozen, never trained)
|
||||
|
||||
| Component | Setting |
|
||||
|---|---|
|
||||
| Model | [`moonshotai/Kimi-K2-Instruct`](https://huggingface.co/moonshotai/Kimi-K2-Instruct), MoE, **~1.04 T total params** |
|
||||
| Routing | 384 experts, **top-8** active per token (~32 B active params) |
|
||||
| Quantization | **W4A16** (INT4 weights, BF16 activations) |
|
||||
| Engine | **vLLM**, **TP=16**, BF16 KV-cache, 16× A100-class GPUs |
|
||||
| Decoding | greedy (T=0, max_new_tokens=512) — deterministic |
|
||||
| Cache | **~44 k** JSONL records: `{prompt, teacher_text, output_entropy, self_consistency, difficulty_score}` |
|
||||
|
||||
### Student — DANTE-Mosaic-3.5B (trained)
|
||||
|
||||
| Component | Setting |
|
||||
|---|---|
|
||||
| Init | [`HuggingFaceTB/SmolLM3-3B`](https://huggingface.co/HuggingFaceTB/SmolLM3-3B) — GQA + NoPE, RMSNorm, SwiGLU |
|
||||
| Parameters | **~3.08 B** dense (all active per token) |
|
||||
| Precision | BF16 weights + autocast, gradient checkpointing |
|
||||
| Optimizer | AdamW (β=0.9/0.95, ε=1e-8, wd=0.1) |
|
||||
| LR | 2e-5 cosine, 200-step warm-up |
|
||||
| Steps / batch | 2 000 steps · effective batch ~128 sequences |
|
||||
| Seq length | 4 096 tokens |
|
||||
| **Compute** | **8× A100-class GPUs · ~2 h 37 min → ≈ 21 A100-GPU-hours** |
|
||||
|
||||
**Compression target: ~338× total parameters, ~10× active parameters per token, 5:1 tokenizer mismatch — logit-level KD is impossible by construction.**
|
||||
|
||||
---
|
||||
|
||||
## Pipeline overview
|
||||
|
||||

|
||||
|
||||
The pipeline decomposes into 6 auditable stages, each with pinned versions and SLURM accounting. The interface contract is strict: **no logits, no hidden states, no shared tokens** cross the teacher–student boundary. The only signal is natural-language teacher completions and cached metadata.
|
||||
|
||||
---
|
||||
|
||||
## Training objective — formal statement
|
||||
|
||||

|
||||
|
||||
The total loss at step *t* is:
|
||||
|
||||
```
|
||||
L(θ; t) = λ_CE(t) · L_CWCE(θ) + λ_TEA(t) · L_TEA(θ)
|
||||
```
|
||||
|
||||
### (1) CWCE — Confidence-Weighted Cross-Entropy
|
||||
|
||||
```
|
||||
L_CWCE(θ) = (1/N) · Σᵢ w(Hᵢ) · CE_masked(θ, xᵢ, yᵢᵀ)
|
||||
|
||||
w(H) = 0.30 + 0.70 / [1 + exp(1.5 · (H − 1.5))]
|
||||
```
|
||||
|
||||
`Hᵢ` is the cached teacher output entropy. Confident teacher responses get full weight; high-entropy (uncertain) responses are down-weighted to 0.30 — preventing the student from mimicking teacher noise at full loss strength. Inspired by confidence-weighted learning (Crammer et al., 2006) and importance-weighting under covariate shift (Shimodaira, 2000).
|
||||
|
||||
### (2) TEA — Tied-Embedding Anchor
|
||||
|
||||
```
|
||||
L_TEA(θ) = ‖ E(θ) − E(θ₀) ‖²_F
|
||||
|
||||
λ_TEA(t) = 1.0 → 0.1 linear over 2 000 steps
|
||||
```
|
||||
|
||||
L2 regulariser on `embed_tokens` vs. the SmolLM3-3B initialisation snapshot. Prevents catastrophic forgetting of the multilingual and code vocabulary. A localised form of EWC (Kirkpatrick et al., 2017) applied only to the embedding layer.
|
||||
|
||||
### (3) CE schedule — λ_CE(t)
|
||||
|
||||
```
|
||||
λ_CE(t) = 0.70 + 0.30 · max(0, t − 200) / 1800
|
||||
```
|
||||
|
||||
Held at 0.70 during the 200-step warm-up, then ramps to 1.0 by step 2 000. Prevents early loss spikes while the embedding anchor stabilises.
|
||||
|
||||
### (4) Entropy curriculum
|
||||
|
||||
```
|
||||
D_sorted = sorted(D, key=lambda r: r['output_entropy']) # ascending
|
||||
```
|
||||
|
||||
Training proceeds through sorted order: easy (low-entropy, near-deterministic teacher) → hard (high-entropy, creative). Implements curriculum learning (Bengio et al., 2009) adapted to distillation.
|
||||
|
||||
### Why no logit KD?
|
||||
|
||||
Kimi K2 uses a tiktoken-style **~163 k** vocab; SmolLM3 uses **~32 k** BPE. **Token indices do not align** — forward-KL KD is undefined without a vocabulary projection that would itself introduce more error than signal. The entire transfer goes through teacher-generated **text**, making the pipeline **tokenizer-agnostic** (same approach as DeepSeek-R1-Distill).
|
||||
|
||||
---
|
||||
|
||||
## Reference comparison
|
||||
|
||||
### Peer cohort — standard 3 B–4 B open-weight basket
|
||||
|
||||
The hero banner at the top of this card visualises this table; numbers below are the same. DANTE rows from our canonical lm-evaluation-harness 0.4.5 / bigcode-evaluation-harness runs on the released checkpoint. Peer rows from the published [SmolLM3 technical report](https://huggingface.co/HuggingFaceTB/SmolLM3-3B-Base) (Table 1), which evaluates SmolLM3-3B together with Qwen 2.5-3B, Llama 3.2-3B, Qwen3-1.7B-Base and Qwen3-4B-Base under a single harness.
|
||||
|
||||
| Benchmark | **DANTE-Mosaic** *3.08 B* | SmolLM3-3B | Qwen 2.5-3B | Llama 3.2-3B | Qwen3-1.7B-B | Qwen3-4B-B |
|
||||
|---|---:|---:|---:|---:|---:|---:|
|
||||
| **Reasoning & common-sense** | | | | | | |
|
||||
| HellaSwag | **76.7** | 76.2 | 74.2 | 75.5 | 60.5 | 74.4 |
|
||||
| ARC-Challenge | 62.7 | **65.6** | 59.8 | 58.6 | 55.9 | 62.1 |
|
||||
| **Knowledge & understanding** | | | | | | |
|
||||
| MMLU★ | **59.4** | 44.1ᶜᶠ | 42.9ᶜᶠ | 41.3ᶜᶠ | 39.1ᶜᶠ | 47.7ᶜᶠ |
|
||||
| MMLU-Pro | 39.7 | 32.7 | 31.3 | 25.1 | 30.4 | **41.1** |
|
||||
| **Math & code** | | | | | | |
|
||||
| GSM8K | **74.5** | 67.6 | 70.1 | 25.9 | 65.9 | 74.1 |
|
||||
| MBPP⁺ | 42.6 | 52.9 | 52.1 | 38.9 | 59.3 | **63.8** |
|
||||
| HumanEval⁺ | 6.7 | 30.5 | 34.1 | 25.0 | 43.3 | **54.9** |
|
||||
|
||||
**Wins for DANTE-Mosaic-3.5B against the 3 B–4 B basket:**
|
||||
|
||||
| Rank | Benchmark | Note |
|
||||
|---|---|---|
|
||||
| **#1** | MMLU (5-shot) | +11.7 pp over the next-best (Qwen3-4B-Base 47.7 on MMLU-CF) |
|
||||
| **#2** | MMLU-Pro | +7.0 pp over SmolLM3-3B; 1.4 pp behind Qwen3-4B-Base (41.1) |
|
||||
| **#1** | GSM8K | tied with Qwen3-4B-Base (74.5 vs 74.1) at ~25 % fewer params |
|
||||
| **#1** | HellaSwag | ahead of SmolLM3-3B (76.7 vs 76.2) |
|
||||
|
||||
★ DANTE evaluated on canonical 5-shot MMLU; peers in the SmolLM3 report use the harder MMLU-CF (cloze) variant — the two numbers are not directly comparable, so treat DANTE's lead on this row as indicative rather than exact.
|
||||
ᶜᶠ MMLU-CF (cloze). ⁺ MBPP+ / HumanEval+ — peers on harder + variants. Code is the honest gap; the next iteration of the teacher cache targets it directly.
|
||||
|
||||
### Reference comparison vs. published instruct models
|
||||
|
||||
Different harnesses, different shot-counts, different prompt formats — listed for **orientation only**, not as a leaderboard ranking.
|
||||
|
||||
| Model | ~Params | MMLU | GSM8K | ARC-C | HumanEval | MBPP |
|
||||
|---|---:|---:|---:|---:|---:|---:|
|
||||
| **DANTE-Mosaic-3.5B (ours, canonical)** | 3.08 B | **59.4** | **74.5** | **62.7** | 6.7 | **42.6** |
|
||||
| SmolLM3-3B Base ([lighteval](https://huggingface.co/HuggingFaceTB/SmolLM3-3B-Base))¹ | ~3 B | 44.1¹ | 67.4 | — | 30.5¹ | 52.9¹ |
|
||||
| Gemma 2-2B PT ([vendor](https://huggingface.co/google/gemma-2-2b)) | ~2 B | 51.3 | — | — | 17.7 | 29.6³ |
|
||||
| Phi-3.5-mini ([MS internal](https://huggingface.co/microsoft/Phi-3.5-mini-instruct)) | ~3.8 B | 69.0 | 86.5 | — | 62.8 | 69.6³ |
|
||||
| Granite-4.1-3b ([IBM vendor](https://huggingface.co/ibm-granite/granite-4.1-3b)) | ~3 B | 67.0 | — | — | 81.7 | 71.2 |
|
||||
| Qwen3-4B ([vendor](https://huggingface.co/Qwen/Qwen3-4B-Instruct-2507)) | ~4 B | 79.6 | 91.6 | — | — | — |
|
||||
|
||||
¹ SmolLM3 uses harder lighteval variants (MMLU-CF / HumanEval+ / MBPP+) — not directly comparable.
|
||||
³ 3-shot MBPP.
|
||||
**Vendor numbers use different harnesses; listed for orientation only.**
|
||||
|
||||
### Honest comparison vs. SmolLM3-3B base
|
||||
|
||||
- **GSM8K: DANTE wins** (+7 pp vs. SmolLM3-3B base 67.4 %). Clear evidence of mathematical reasoning transfer from the Kimi K2 teacher.
|
||||
- **MMLU**: DANTE 59.4 % (classic 5-shot) vs. SmolLM3-3B MMLU-CF 44.1 % (cloze, harder variant — not directly comparable).
|
||||
- **Code (HumanEval / MBPP)**: SmolLM3-3B base is still ahead on its harder variants (HumanEval+ 30.5 %, MBPP+ 52.9 %). Code is the clearest gap to close.
|
||||
- **Bottom line**: DANTE is **21 A100-hours** on top of the SmolLM3-3B base. The right frame is the compute chart above, **not** a raw leaderboard.
|
||||
|
||||
---
|
||||
|
||||
## Compute footprint vs. capability
|
||||
|
||||
| Training job | Compute (A100-GPU-h equiv.) | vs. DANTE |
|
||||
|---|---:|---:|
|
||||
| **DANTE student SFT** *(measured)* | **≈ 21** | **1×** |
|
||||
| DANTE teacher cache generation | ≈ 40 | ~2× |
|
||||
| SmolLM3-3B mid-training (140 B tok) | ≈ 7 200 | ~340× |
|
||||
| Phi-3.5-mini pre+post-training | ≈ 4.5 × 10⁵ | ~21 000× |
|
||||
| SmolLM3-3B pretraining (11.2 T tok) | ≈ 5.75 × 10⁵ | **~27 400×** |
|
||||
| Granite-4.1-3b pretraining | ≈ 9 × 10⁵ | ~43 000× |
|
||||
|
||||
DANTE re-uses the SmolLM3-3B pretrained base — the only additional compute is the 21-hour distillation pass. Pretraining estimates: `6 · N · D / TFLOPS`, H100 → A100 ≈ 2.5×.
|
||||
|
||||
---
|
||||
|
||||
## How the distillation was done (6 steps)
|
||||
|
||||
1. **Spin up teacher.** Kimi K2 W4A16 via vLLM, TP=16, greedy.
|
||||
2. **Generate cache.** ~44 k diverse prompts → JSONL shards with `output_entropy`, `self_consistency`, `difficulty_score`.
|
||||
3. **Init student.** Load SmolLM3-3B BF16, enable gradient checkpointing, wrap in DDP.
|
||||
4. **Train.** Prompt-masked CE + CWCE weights + TEA anchor + entropy curriculum + λ_CE schedule. AdamW, cosine LR, grad-clip 1.0, seed 42.
|
||||
5. **Save.** Standard HF `from_pretrained`-compatible checkpoint.
|
||||
6. **No DPO / RLHF / synthetic pipeline** — by design, to isolate what the DANTE loss family delivers from a single MoE teacher cache.
|
||||
|
||||
---
|
||||
|
||||
## Model facts
|
||||
|
||||
| Field | Value |
|
||||
|---|---|
|
||||
| Base architecture | [HuggingFaceTB/SmolLM3-3B](https://huggingface.co/HuggingFaceTB/SmolLM3-3B) |
|
||||
| Parameters | ~3.08 B (dense) |
|
||||
| Precision | BF16 |
|
||||
| Teacher | [moonshotai/Kimi-K2-Instruct](https://huggingface.co/moonshotai/Kimi-K2-Instruct) (W4A16, vLLM) |
|
||||
| Distillation | Generative cross-arch / cross-tokenizer, prompt-masked CE + CWCE + TEA |
|
||||
| **Student-side compute** | **~21 A100-GPU-hours** (8× A100-40GB × ~2 h 37 min) |
|
||||
| Post-training (DPO/RLHF) | None |
|
||||
| License | Apache-2.0 |
|
||||
|
||||
---
|
||||
|
||||
## Quickstart
|
||||
|
||||
```python
|
||||
from transformers import AutoModelForCausalLM, AutoTokenizer
|
||||
import torch
|
||||
|
||||
model_id = "OdaxAI/DANTE-Mosaic-3.5B"
|
||||
tok = AutoTokenizer.from_pretrained(model_id)
|
||||
model = AutoModelForCausalLM.from_pretrained(
|
||||
model_id, torch_dtype=torch.bfloat16, device_map="auto"
|
||||
)
|
||||
|
||||
prompt = "Solve step by step: if a train travels 120 km in 1.5 hours, what is its average speed?"
|
||||
inputs = tok(prompt, return_tensors="pt").to(model.device)
|
||||
out = model.generate(
|
||||
**inputs, max_new_tokens=256,
|
||||
do_sample=True, temperature=0.7, top_p=0.9,
|
||||
repetition_penalty=1.1, pad_token_id=tok.eos_token_id,
|
||||
)
|
||||
print(tok.decode(out[0, inputs["input_ids"].shape[-1]:], skip_special_tokens=True))
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Reproducibility
|
||||
|
||||
All evaluation scripts are in the open GitHub repo under `evaluation/`:
|
||||
|
||||
| Script | Covers |
|
||||
|---|---|
|
||||
| `run_lm_eval.sh` | MMLU, MMLU-Pro, GSM8K, ARC-C, HellaSwag |
|
||||
| `run_humaneval.sh` | HumanEval pass@1 |
|
||||
| `run_mbpp.sh` | MBPP pass@1 |
|
||||
|
||||
After a run: `python3 evaluation/parse_canonical_results.py`
|
||||
|
||||
---
|
||||
|
||||
## Limitations
|
||||
|
||||
- **HumanEval is low (6.7 %).** Algorithmic code is the clear target for the next cache iteration.
|
||||
- **Not SOTA** against mature instruct models on code or knowledge breadth.
|
||||
- **No alignment** (no DPO / RLHF). May generate harmful content proportionally to teacher outputs on adversarial prompts.
|
||||
- **No safety red-teaming** was performed.
|
||||
- Benchmarks are English-centric; multilingual capability comes from the ~11 % Italian cache fraction — measure separately for your locale.
|
||||
|
||||
---
|
||||
|
||||
## Citation
|
||||
|
||||
```bibtex
|
||||
@misc{odaxai2026dante,
|
||||
title = {DANTE-Mosaic-3.5B: Cross-Architecture Generative Distillation
|
||||
from a Trillion-Parameter MoE in 21 GPU-Hours},
|
||||
author = {{OdaxAI Research Team}},
|
||||
year = {2026},
|
||||
howpublished = {\url{https://huggingface.co/OdaxAI/DANTE-Mosaic-3.5B}},
|
||||
}
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Acknowledgements
|
||||
|
||||
Compute: 8× **NVIDIA A100-class** GPUs on a public European HPC system.
|
||||
Open stack: Kimi K2, SmolLM3, HuggingFace Transformers, vLLM, lm-evaluation-harness v0.4.5, bigcode-evaluation-harness.
|
||||
|
||||
---
|
||||
|
||||
*OdaxAI provides this model "as is" for research. You are responsible for compliance with all applicable licenses when using or citing third-party benchmark numbers.*
|
||||
3
assets/capability_compute.png
Normal file
3
assets/capability_compute.png
Normal file
@@ -0,0 +1,3 @@
|
||||
version https://git-lfs.github.com/spec/v1
|
||||
oid sha256:c7eea53e7975f9c13fa1dc6d79edd926aa0c74c2f17baa9292ed079ace76eda5
|
||||
size 300650
|
||||
3
assets/fig1_architecture.png
Normal file
3
assets/fig1_architecture.png
Normal file
@@ -0,0 +1,3 @@
|
||||
version https://git-lfs.github.com/spec/v1
|
||||
oid sha256:d6222551beca2962ee48c602afa9fd656c4931115a4f87c34a56743058457ec9
|
||||
size 291199
|
||||
3
assets/fig1_pipeline.png
Normal file
3
assets/fig1_pipeline.png
Normal file
@@ -0,0 +1,3 @@
|
||||
version https://git-lfs.github.com/spec/v1
|
||||
oid sha256:ad947816cd648fa381bade2d553339b38d038514edc4957d36c5b38ce7a95ed4
|
||||
size 245187
|
||||
3
assets/fig2_eval_status.png
Normal file
3
assets/fig2_eval_status.png
Normal file
@@ -0,0 +1,3 @@
|
||||
version https://git-lfs.github.com/spec/v1
|
||||
oid sha256:744a9659e65ca309cbe78c7e2c4876decdb2e4da63fcdb9337d0a0a34e4603a7
|
||||
size 225110
|
||||
3
assets/fig2_method.png
Normal file
3
assets/fig2_method.png
Normal file
@@ -0,0 +1,3 @@
|
||||
version https://git-lfs.github.com/spec/v1
|
||||
oid sha256:d4241e328cf1624d25f0dcc2a6f484d1b1740dd80e67af1181a80864a91eb378
|
||||
size 350389
|
||||
3
assets/fig3_efficiency.png
Normal file
3
assets/fig3_efficiency.png
Normal file
@@ -0,0 +1,3 @@
|
||||
version https://git-lfs.github.com/spec/v1
|
||||
oid sha256:87b670e0181106162c19a3c07bd48ef4ea16967caefbe5e1d453d629a0252f74
|
||||
size 191503
|
||||
3
assets/fig3_training_loss.png
Normal file
3
assets/fig3_training_loss.png
Normal file
@@ -0,0 +1,3 @@
|
||||
version https://git-lfs.github.com/spec/v1
|
||||
oid sha256:6b39192d9ba2cb840f1be62d28c4613350136808400884011abaea9d692e13cb
|
||||
size 181928
|
||||
3
assets/fig4_benchmarks.png
Normal file
3
assets/fig4_benchmarks.png
Normal file
@@ -0,0 +1,3 @@
|
||||
version https://git-lfs.github.com/spec/v1
|
||||
oid sha256:985ec96f659cb37eaedbfa6ced83dd3251e564dfeae75f8b42a153fc7ee1846e
|
||||
size 147758
|
||||
3
assets/fig4_compute_efficiency.png
Normal file
3
assets/fig4_compute_efficiency.png
Normal file
@@ -0,0 +1,3 @@
|
||||
version https://git-lfs.github.com/spec/v1
|
||||
oid sha256:b0f708f2d2c67e19fe5b654cbfefd08cfd58a15e4f9d51957d8f14f38fa32a50
|
||||
size 143786
|
||||
3
assets/fig5_benchmark_bars.png
Normal file
3
assets/fig5_benchmark_bars.png
Normal file
@@ -0,0 +1,3 @@
|
||||
version https://git-lfs.github.com/spec/v1
|
||||
oid sha256:84c74e3d3f824332f0e112857126c3b0e9eb8b2be3e1ec9f177183ab63540a8d
|
||||
size 178199
|
||||
3
assets/fig6_arch_compare.png
Normal file
3
assets/fig6_arch_compare.png
Normal file
@@ -0,0 +1,3 @@
|
||||
version https://git-lfs.github.com/spec/v1
|
||||
oid sha256:152f7fdbe75738ce76a325266929f55d0a6c4b31724381604ff223f8e70028fe
|
||||
size 162488
|
||||
3
assets/fig6_throughput.png
Normal file
3
assets/fig6_throughput.png
Normal file
@@ -0,0 +1,3 @@
|
||||
version https://git-lfs.github.com/spec/v1
|
||||
oid sha256:de62744e7286d5abdbbf5ff1b739dfba2ba0691272e0f06bf4985db9d36a326c
|
||||
size 131003
|
||||
3
assets/fig7_full_suite.png
Normal file
3
assets/fig7_full_suite.png
Normal file
@@ -0,0 +1,3 @@
|
||||
version https://git-lfs.github.com/spec/v1
|
||||
oid sha256:feb368e7d4a2181c9a8667e144e3d1f48c5ed05c4e88e0cba06ba10dc3c7de5c
|
||||
size 178929
|
||||
3
assets/fig7_pipeline.png
Normal file
3
assets/fig7_pipeline.png
Normal file
@@ -0,0 +1,3 @@
|
||||
version https://git-lfs.github.com/spec/v1
|
||||
oid sha256:a15f333815857f026343bb1bb93a69096d813cf507f22b75b8e284663eb503b0
|
||||
size 132992
|
||||
3
assets/fig_arch.png
Normal file
3
assets/fig_arch.png
Normal file
@@ -0,0 +1,3 @@
|
||||
version https://git-lfs.github.com/spec/v1
|
||||
oid sha256:9747796fd8053c04ec85f58bb438a920509ddded211476b0804563c505a5bfac
|
||||
size 108023
|
||||
3
assets/fig_efficiency.png
Normal file
3
assets/fig_efficiency.png
Normal file
@@ -0,0 +1,3 @@
|
||||
version https://git-lfs.github.com/spec/v1
|
||||
oid sha256:d7dbe084c6be6338d6378d37083d6306764304bf83521b02e27464857eee6514
|
||||
size 165351
|
||||
3
assets/fig_innovations.png
Normal file
3
assets/fig_innovations.png
Normal file
@@ -0,0 +1,3 @@
|
||||
version https://git-lfs.github.com/spec/v1
|
||||
oid sha256:9e7c75556bf942c621caa57f76e6d11bf4f1329775c321ee66cc527fae5d5dfe
|
||||
size 132836
|
||||
3
assets/fig_method.png
Normal file
3
assets/fig_method.png
Normal file
@@ -0,0 +1,3 @@
|
||||
version https://git-lfs.github.com/spec/v1
|
||||
oid sha256:61c7b5e8719375f01ffe10285ddf275f47f04319e1b5ef2dd275e733c79c84ea
|
||||
size 515228
|
||||
3
assets/fig_pipeline.png
Normal file
3
assets/fig_pipeline.png
Normal file
@@ -0,0 +1,3 @@
|
||||
version https://git-lfs.github.com/spec/v1
|
||||
oid sha256:ea7b1aa752fb05a7a4da045329a8b170d331c5be0f8be15f7ebe6f2b876f77ab
|
||||
size 148906
|
||||
3
assets/scoreboard.png
Normal file
3
assets/scoreboard.png
Normal file
@@ -0,0 +1,3 @@
|
||||
version https://git-lfs.github.com/spec/v1
|
||||
oid sha256:36ed985fc90cadd3c0cb95f12ff445d41817f7e1b200e83feeabbd6335e17231
|
||||
size 554138
|
||||
3
assets/v2_architecture.png
Normal file
3
assets/v2_architecture.png
Normal file
@@ -0,0 +1,3 @@
|
||||
version https://git-lfs.github.com/spec/v1
|
||||
oid sha256:08027a00c47f5dccb746f3430dd3bae6153d1afa99e69d2c741aa94fe868c957
|
||||
size 261579
|
||||
3
assets/v2_objective.png
Normal file
3
assets/v2_objective.png
Normal file
@@ -0,0 +1,3 @@
|
||||
version https://git-lfs.github.com/spec/v1
|
||||
oid sha256:021366ca6ef949ae85c6b019db4ccb48bb21ee4dfca5474a8d52ef808a52eb90
|
||||
size 216090
|
||||
3
assets/v2_pipeline.png
Normal file
3
assets/v2_pipeline.png
Normal file
@@ -0,0 +1,3 @@
|
||||
version https://git-lfs.github.com/spec/v1
|
||||
oid sha256:1359a193f3458f6e8bd13b58340d5735c6da07101d9b389dac10317474822217
|
||||
size 235950
|
||||
3
assets/v2_pipeline_detail.png
Normal file
3
assets/v2_pipeline_detail.png
Normal file
@@ -0,0 +1,3 @@
|
||||
version https://git-lfs.github.com/spec/v1
|
||||
oid sha256:e568dae998887c8495da5c3ff1a49a6fbd6287c1f947edf4b81005c7b6f7a4b0
|
||||
size 216218
|
||||
3
assets/v2_results_compute.png
Normal file
3
assets/v2_results_compute.png
Normal file
@@ -0,0 +1,3 @@
|
||||
version https://git-lfs.github.com/spec/v1
|
||||
oid sha256:d665f5d75c2a1a3eb936845a6ba6b62656a001b570fbc78a5bab506d628c8fe0
|
||||
size 207250
|
||||
94
chat_template.jinja
Normal file
94
chat_template.jinja
Normal file
@@ -0,0 +1,94 @@
|
||||
{# ───── defaults ───── #}
{%- if enable_thinking is not defined -%}
{%- set enable_thinking = true -%}
{%- endif -%}

{# ───── reasoning mode ───── #}
{%- if enable_thinking -%}
{%- set reasoning_mode = "/think" -%}
{%- else -%}
{%- set reasoning_mode = "/no_think" -%}
{%- endif -%}

{# ───── header (system message) ───── #}
{{- "<|im_start|>system\n" -}}

{# FIX: give both variables safe defaults so the template never references an
   undefined name when the first message is not a system message. #}
{%- set system_message = "" -%}
{%- set custom_instructions = "" -%}
{%- if messages[0].role == "system" -%}
{%- set system_message = messages[0].content -%}
{%- if "/no_think" in system_message -%}
{%- set reasoning_mode = "/no_think" -%}
{%- elif "/think" in system_message -%}
{%- set reasoning_mode = "/think" -%}
{%- endif -%}
{%- set custom_instructions = system_message.replace("/no_think", "").replace("/think", "").rstrip() -%}
{%- endif -%}

{%- if "/system_override" in system_message -%}
{{- custom_instructions.replace("/system_override", "").rstrip() -}}
{{- "<|im_end|>\n" -}}
{%- else -%}
{{- "## Metadata\n\n" -}}
{{- "Knowledge Cutoff Date: June 2025\n" -}}
{%- set today = strftime_now("%d %B %Y") -%}
{{- "Today Date: " ~ today ~ "\n" -}}
{{- "Reasoning Mode: " + reasoning_mode + "\n\n" -}}

{{- "## Custom Instructions\n\n" -}}
{%- if custom_instructions -%}
{{- custom_instructions + "\n\n" -}}
{%- elif reasoning_mode == "/think" -%}
{{- "You are a helpful AI assistant named SmolLM, trained by Hugging Face. Your role as an assistant involves thoroughly exploring questions through a systematic thinking process before providing the final precise and accurate solutions. This requires engaging in a comprehensive cycle of analysis, summarizing, exploration, reassessment, reflection, backtracking, and iteration to develop well-considered thinking process. Please structure your response into two main sections: Thought and Solution using the specified format: <think> Thought section </think> Solution section. In the Thought section, detail your reasoning process in steps. Each step should include detailed considerations such as analysing questions, summarizing relevant findings, brainstorming new ideas, verifying the accuracy of the current steps, refining any errors, and revisiting previous steps. In the Solution section, based on various attempts, explorations, and reflections from the Thought section, systematically present the final solution that you deem correct. The Solution section should be logical, accurate, and concise and detail necessary steps needed to reach the conclusion.\n\n" -}}
{%- else -%}
{{- "You are a helpful AI assistant named SmolLM, trained by Hugging Face.\n\n" -}}
{%- endif -%}

{%- if xml_tools or python_tools or tools -%}
{{- "### Tools\n\n" -}}
{%- if xml_tools or tools -%}
{%- if tools -%}
{%- set xml_tools = tools -%}
{%- endif -%}
{%- set ns = namespace(xml_tool_string="You may call one or more functions to assist with the user query.\nYou are provided with function signatures within <tools></tools> XML tags:\n\n<tools>\n") -%}
{%- for tool in xml_tools[:] -%} {# The slicing makes sure that xml_tools is a list #}
{%- set ns.xml_tool_string = ns.xml_tool_string ~ (tool | string) ~ "\n" -%}
{%- endfor -%}
{%- set xml_tool_string = ns.xml_tool_string + "</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call>" -%}
{{- xml_tool_string -}}
{%- endif -%}
{%- if python_tools -%}
{%- set ns = namespace(python_tool_string="When you send a message containing Python code between '<code>' and '</code>' tags, it will be executed in a stateful Jupyter notebook environment, and you will then be given the output to continued reasoning in an agentic loop.\n\nYou can use the following tools in your python code like regular functions:\n<tools>\n") -%}
{%- for tool in python_tools[:] -%} {# The slicing makes sure that python_tools is a list #}
{%- set ns.python_tool_string = ns.python_tool_string ~ (tool | string) ~ "\n" -%}
{%- endfor -%}
{%- set python_tool_string = ns.python_tool_string + "</tools>\n\nThe state persists between code executions: so variables that you define in one step are still available thereafter." -%}
{{- python_tool_string -}}
{%- endif -%}
{{- "\n\n" -}}
{{- "<|im_end|>\n" -}}
{%- endif -%}
{%- endif -%}
{# ───── main loop ───── #}
{%- for message in messages -%}
{%- set content = message.content if message.content is string else "" -%}
{%- if message.role == "user" -%}
{{ "<|im_start|>" + message.role + "\n" + content + "<|im_end|>\n" }}
{%- elif message.role == "assistant" -%}
{% generation %}
{%- if reasoning_mode == "/think" -%}
{{ "<|im_start|>assistant\n" + content.lstrip("\n") + "<|im_end|>\n" }}
{%- else -%}
{{ "<|im_start|>assistant\n" + "<think>\n\n</think>\n" + content.lstrip("\n") + "<|im_end|>\n" }}
{%- endif -%}
{% endgeneration %}
{%- elif message.role == "tool" -%}
{{ "<|im_start|>" + "user\n" + content + "<|im_end|>\n" }}
{%- endif -%}
{%- endfor -%}
{# ───── generation prompt ───── #}
{%- if add_generation_prompt -%}
{%- if reasoning_mode == "/think" -%}
{{ "<|im_start|>assistant\n" }}
{%- else -%}
{{ "<|im_start|>assistant\n" + "<think>\n\n</think>\n" }}
{%- endif -%}
{%- endif -%}
111
config.json
Normal file
111
config.json
Normal file
@@ -0,0 +1,111 @@
|
||||
{
  "architectures": [
    "SmolLM3ForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "bos_token_id": 128000,
  "dtype": "bfloat16",
  "eos_token_id": 128012,
  "hidden_act": "silu",
  "hidden_size": 2048,
  "initializer_range": 0.02,
  "intermediate_size": 11008,
  "layer_types": [
    "full_attention", "full_attention", "full_attention", "full_attention",
    "full_attention", "full_attention", "full_attention", "full_attention",
    "full_attention", "full_attention", "full_attention", "full_attention",
    "full_attention", "full_attention", "full_attention", "full_attention",
    "full_attention", "full_attention", "full_attention", "full_attention",
    "full_attention", "full_attention", "full_attention", "full_attention",
    "full_attention", "full_attention", "full_attention", "full_attention",
    "full_attention", "full_attention", "full_attention", "full_attention",
    "full_attention", "full_attention", "full_attention", "full_attention"
  ],
  "max_position_embeddings": 65536,
  "max_window_layers": 28,
  "mlp_bias": false,
  "model_type": "smollm3",
  "no_rope_layer_interval": 4,
  "no_rope_layers": [
    1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0,
    1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0,
    1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0
  ],
  "num_attention_heads": 16,
  "num_hidden_layers": 36,
  "num_key_value_heads": 4,
  "pad_token_id": 128004,
  "pretraining_tp": 2,
  "rms_norm_eps": 1e-06,
  "rope_parameters": {
    "rope_theta": 5000000.0,
    "rope_type": "default"
  },
  "sliding_window": null,
  "tie_word_embeddings": true,
  "transformers_version": "5.8.0",
  "use_cache": false,
  "use_sliding_window": false,
  "vocab_size": 128256
}
163
evaluation/README.md
Normal file
163
evaluation/README.md
Normal file
@@ -0,0 +1,163 @@
|
||||
# DANTE-Mosaic-3.5B — Canonical Evaluation Suite

This folder contains all evaluation scripts for `OdaxAI/DANTE-Mosaic-3.5B`.

---

## Evaluation types

| Type | Description | Comparable to leaderboard? |
|------|-------------|---------------------------|
| **REAL_CANONICAL_RUN** | Run via lm-evaluation-harness or bigcode harness on full benchmark dataset | **Yes** |
| **REAL_INTERNAL_SUBSET** | Run on 25–40 hand-curated problems in `scripts/_benchmark_dante_offline.py` | **No** |

These two types must **never be mixed** in the same table or chart.

---

## Canonical benchmarks (lm-evaluation-harness)

| Benchmark | Task name | N samples | Few-shot | Metric | Script |
|-----------|-----------|-----------|----------|--------|--------|
| MMLU | `mmlu` | 14,042 | 5 | `acc` | `slurm_lm_eval.slurm` |
| MMLU-Pro | `mmlu_pro` | 4,500 | 5 | `acc` | `slurm_lm_eval.slurm` |
| GSM8K | `gsm8k` | 1,319 | 8 | `exact_match,strict-match` | `slurm_lm_eval.slurm` |
| ARC-Challenge | `arc_challenge` | 1,172 | 25 | `acc_norm` | `slurm_lm_eval.slurm` |
| HellaSwag | `hellaswag` | 10,042 | 10 | `acc_norm` | `slurm_lm_eval.slurm` |
| TruthfulQA MC2 | `truthfulqa_mc2` | 817 | 0 | `mc2` | `slurm_lm_eval.slurm` |
| Winogrande | `winogrande` | 1,267 | 5 | `acc` | `slurm_lm_eval.slurm` |
| IFEval | `ifeval` | 541 | 0 | `prompt_level_strict_acc` | `slurm_lm_eval.slurm` |

## Canonical benchmarks (bigcode-evaluation-harness)

| Benchmark | Task name | N samples | Few-shot | Metric | Script |
|-----------|-----------|-----------|----------|--------|--------|
| HumanEval | `humaneval` | 164 | 0 | `pass@1` | `slurm_code_eval.slurm` |
| MBPP | `mbpp` | 374 | 0 | `pass@1` | `slurm_code_eval.slurm` |

---

## How to run on Leonardo Booster

### 1. Copy the repo to Leonardo
```bash
rsync -av /Users/nicolosavioli/Desktop/DANTE-T1-Seed/ \
  nsavioli@login.leonardo.cineca.it:/leonardo_scratch/large/userexternal/nsavioli/DANTE-T1-Seed/
```

### 2. Submit all jobs with one command
```bash
# From the repo root on Leonardo:
bash evaluation/submit_all.sh
```

This submits two SLURM jobs:
- `slurm_lm_eval.slurm` — all lm-eval tasks (~4–6 hours on 1× A100-40GB)
- `slurm_code_eval.slurm` — HumanEval + MBPP (~1–2 hours on 1× A100-40GB)

### 3. Monitor jobs
```bash
squeue -u $USER
tail -f evaluation/slurm_logs/lm_eval_<JOB_ID>.out
tail -f evaluation/slurm_logs/code_eval_<JOB_ID>.out
```

### 4. Parse results after completion
```bash
python3 evaluation/parse_canonical_results.py
```

This produces:
- `results/canonical/CANONICAL_SUMMARY.json` — all scores as JSON
- `results/canonical/CANONICAL_PROVENANCE.md` — full provenance table (Markdown)
- Printed summary table in terminal

### 5. Transfer results to your Mac
```bash
rsync -av nsavioli@login.leonardo.cineca.it:\
/leonardo_scratch/large/userexternal/nsavioli/DANTE-T1-Seed/evaluation/results/canonical/ \
  evaluation/results/canonical/
```

---

## Decoding and grading setup

All lm-eval runs use:

| Parameter | Value |
|-----------|-------|
| `temperature` | 0.0 (greedy) |
| `do_sample` | false |
| `batch_size` | 8 |
| `seed` | 42 |
| `precision` | bfloat16 |
| `device` | CUDA (A100-40GB) |
| `harness version` | lm-eval 0.4.5 |

All code eval runs use:

| Parameter | Value |
|-----------|-------|
| `temperature` | 0.0 (greedy) |
| `n_samples` | 1 (pass@1) |
| `batch_size` | 8 |
| `precision` | bfloat16 |
| `harness` | bigcode-evaluation-harness (latest main) |

---

## Environment variables

Both SLURM scripts set the following:

```bash
export HF_HOME="/leonardo_scratch/large/userexternal/nsavioli/hf_cache"
export HF_DATASETS_CACHE="${HF_HOME}/datasets"
export TRANSFORMERS_CACHE="${HF_HOME}/transformers"
export HF_HUB_CACHE="${HF_HOME}/hub"
export TOKENIZERS_PARALLELISM=false
```

---

## Results folder structure

```
evaluation/results/canonical/
  mmlu_<timestamp>.json              # lm-eval output (raw)
  mmlu_pro_<timestamp>.json
  gsm8k_<timestamp>.json
  arc_challenge_<timestamp>.json
  hellaswag_<timestamp>.json
  truthfulqa_mc2_<timestamp>.json
  winogrande_<timestamp>.json
  ifeval_<timestamp>.json
  humaneval_<timestamp>/
    humaneval_generations.json
    humaneval_metrics.json
  mbpp_<timestamp>/
    mbpp_generations.json
    mbpp_metrics.json
  CANONICAL_SUMMARY.json             # parsed summary (generated by parse_canonical_results.py)
  CANONICAL_PROVENANCE.md            # provenance table (generated by parse_canonical_results.py)
```

---

## Provenance table template

Once canonical results are available, `parse_canonical_results.py` will generate this automatically.
The table will include: benchmark name, harness, task name, N samples, metric, decoding/few-shot, output JSON path, date, and hardware.

---

## Important: do not mix result types

Internal subset scores (from `scripts/_benchmark_dante_offline.py`, N=30/40/25)
and canonical scores (from lm-eval / bigcode harness, full splits) use different
protocols and must never appear in the same table without an explicit separator and
label explaining the difference.

The parser script enforces this by saving canonical results only to `results/canonical/`
and printing a clear header.
212
evaluation/parse_canonical_results.py
Normal file
212
evaluation/parse_canonical_results.py
Normal file
@@ -0,0 +1,212 @@
|
||||
#!/usr/bin/env python3
"""
parse_canonical_results.py
==========================
Reads all lm-eval and bigcode JSON outputs from evaluation/results/canonical/
and produces:
  1. A clean summary table printed to stdout
  2. evaluation/results/canonical/CANONICAL_SUMMARY.json
  3. evaluation/results/canonical/CANONICAL_PROVENANCE.md

Usage:
    python3 evaluation/parse_canonical_results.py
    python3 evaluation/parse_canonical_results.py --results-dir evaluation/results/canonical
"""

from __future__ import annotations

import argparse
import glob
import json
import os
import sys
from datetime import datetime
from pathlib import Path
# ─── Metric keys per task ────────────────────────────────────────────────────
# lm-evaluation-harness benchmarks: display label, the metric key exactly as it
# appears in the harness results JSON, few-shot count, and full-split size.
TASK_META = {
    "mmlu": {"label": "MMLU", "metric": "acc,none", "fewshot": 5, "n": 14042},
    "mmlu_pro": {"label": "MMLU-Pro", "metric": "acc,none", "fewshot": 5, "n": 4500},
    "gsm8k": {"label": "GSM8K", "metric": "exact_match,strict-match", "fewshot": 8, "n": 1319},
    "arc_challenge": {"label": "ARC-Challenge", "metric": "acc_norm,none", "fewshot": 25, "n": 1172},
    "hellaswag": {"label": "HellaSwag", "metric": "acc_norm,none", "fewshot": 10, "n": 10042},
    "truthfulqa_mc2": {"label": "TruthfulQA MC2", "metric": "mc2,none", "fewshot": 0, "n": 817},
    "winogrande": {"label": "Winogrande", "metric": "acc,none", "fewshot": 5, "n": 1267},
    "ifeval": {"label": "IFEval", "metric": "prompt_level_strict_acc,none", "fewshot": 0, "n": 541},
}

# Code-generation benchmarks evaluated with bigcode-evaluation-harness.
CODE_TASKS = {
    "humaneval": {"label": "HumanEval", "metric": "pass@1", "fewshot": 0, "n": 164},
    "mbpp": {"label": "MBPP", "metric": "pass@1", "fewshot": 0, "n": 374},
}
def find_latest(pattern: str) -> str | None:
|
||||
files = sorted(glob.glob(pattern))
|
||||
return files[-1] if files else None
|
||||
|
||||
|
||||
def parse_lm_eval(results_dir: str) -> dict:
    """Collect the newest lm-evaluation-harness score for every TASK_META task.

    For each task, locate the latest ``<task>_<timestamp>.json`` under
    *results_dir*, pull the task's canonical metric, and build a provenance
    record (label, score in percent, metric key, few-shot count, sample count,
    source path, file date, harness/hardware metadata).

    Returns a dict mapping task key -> record; tasks with no result file are
    simply absent. A corrupt file is skipped with a warning on stderr rather
    than aborting the whole parse.
    """
    scores: dict = {}
    for task_key, meta in TASK_META.items():
        latest = find_latest(f"{results_dir}/{task_key}_*.json")
        if not latest:
            continue
        try:
            with open(latest) as f:
                data = json.load(f)
            res = data.get("results", {})
            raw = res.get(task_key, {}).get(meta["metric"])
            if raw is None:
                continue
            mtime = os.path.getmtime(latest)
            scores[task_key] = {
                "label": meta["label"],
                # lm-eval reports fractions; store as a percentage.
                "score": round(raw * 100, 2),
                "metric": meta["metric"],
                "fewshot": meta["fewshot"],
                "n": meta["n"],
                "source": latest,
                "date": datetime.fromtimestamp(mtime).strftime("%Y-%m-%d"),
                "harness": "lm-evaluation-harness 0.4.5",
                "model": data.get("config", {}).get("model_args", "OdaxAI/DANTE-Mosaic-3.5B"),
                "dtype": "bfloat16",
                "device": "NVIDIA A100-40GB",
                "seed": 42,
            }
        except Exception as e:
            # Best effort: one unreadable file must not block the others.
            print(f"  [WARNING] parse error {latest}: {e}", file=sys.stderr)
    return scores
def parse_code_eval(results_dir: str) -> dict:
    """Collect the newest bigcode-evaluation-harness pass@1 for each CODE_TASKS task.

    bigcode saves per-run metrics to ``<task>_<timestamp>/<task>_metrics.json``
    under *results_dir*; the newest run wins. Returns a dict mapping task key
    -> provenance record shaped like :func:`parse_lm_eval`'s output. Corrupt
    files are skipped with a warning on stderr.
    """
    scores: dict = {}
    for task_key, meta in CODE_TASKS.items():
        # bigcode saves each run into its own timestamped subdirectory.
        latest = find_latest(f"{results_dir}/{task_key}_*/{task_key}_metrics.json")
        if not latest:
            continue
        try:
            with open(latest) as f:
                data = json.load(f)
            raw = data.get("pass@1")
            if raw is None:
                continue
            mtime = os.path.getmtime(latest)
            scores[task_key] = {
                "label": meta["label"],
                # pass@1 is a fraction; store as a percentage.
                "score": round(raw * 100, 2),
                "metric": "pass@1",
                "fewshot": 0,
                "n": meta["n"],
                "source": latest,
                "date": datetime.fromtimestamp(mtime).strftime("%Y-%m-%d"),
                "harness": "bigcode-evaluation-harness",
                "model": "OdaxAI/DANTE-Mosaic-3.5B",
                "dtype": "bfloat16",
                "device": "NVIDIA A100-40GB",
                "seed": 0,
            }
        except Exception as e:
            # Best effort: one unreadable file must not block the others.
            print(f"  [WARNING] parse error {latest}: {e}", file=sys.stderr)
    return scores
def print_table(all_scores: dict) -> None:
    """Pretty-print every collected score as a fixed-width table, sorted by label.

    *all_scores* maps task key -> record with at least ``label``, ``n``,
    ``fewshot``, ``metric`` and ``score`` (percentage) keys, as produced by
    parse_lm_eval / parse_code_eval.
    """
    sep = "=" * 78
    print(f"\n{sep}")
    print("  CANONICAL BENCHMARK RESULTS — OdaxAI/DANTE-Mosaic-3.5B")
    print("  All results produced by official evaluation harnesses on Leonardo HPC")
    print(f"{sep}")
    print(f"  {'Benchmark':<20} {'N':>6} {'Few-shot':>8} {'Metric':<20} {'Score':>8}")
    print("  " + "-" * 72)
    for _key, rec in sorted(all_scores.items(), key=lambda kv: kv[1]["label"]):
        print(f"  {rec['label']:<20} {rec['n']:>6} {rec['fewshot']:>8}-shot "
              f"{rec['metric']:<20} {rec['score']:>7.2f}%")
    print(f"{sep}\n")
def write_summary(all_scores: dict, results_dir: str) -> None:
    """Write CANONICAL_SUMMARY.json (model/run metadata plus *all_scores*) to *results_dir*."""
    summary = {
        "model": "OdaxAI/DANTE-Mosaic-3.5B",
        # Marks these as full-harness runs, never internal-subset scores.
        "type": "REAL_CANONICAL_RUN",
        "harness": "lm-evaluation-harness 0.4.5 + bigcode-evaluation-harness",
        "hardware": "NVIDIA A100-40GB, BF16",
        "cluster": "CINECA Leonardo Booster",
        "seed": 42,
        "results": all_scores,
    }
    path = f"{results_dir}/CANONICAL_SUMMARY.json"
    with open(path, "w") as f:
        json.dump(summary, f, indent=2)
    print(f"  Summary JSON -> {path}")
def write_provenance(all_scores: dict, results_dir: str) -> None:
    """Write CANONICAL_PROVENANCE.md: one Markdown row per score plus run metadata.

    Each record in *all_scores* must carry ``label``, ``harness``, ``n``,
    ``fewshot``, ``metric``, ``score``, ``date`` and ``source`` keys.
    """
    lines = [
        "# Canonical Benchmark Provenance — DANTE-Mosaic-3.5B",
        "",
        "All results in this table are **REAL_CANONICAL_RUN** — produced by official",
        "evaluation harnesses on the `OdaxAI/DANTE-Mosaic-3.5B` checkpoint.",
        "They are directly comparable to published leaderboard scores.",
        "",
        "| Benchmark | Harness | Task name | N | Few-shot | Metric | Score | Date | Source JSON |",
        "|-----------|---------|-----------|---|----------|--------|-------|------|-------------|",
    ]
    for task_key, rec in sorted(all_scores.items(), key=lambda kv: kv[1]["label"]):
        src = os.path.basename(rec["source"])
        lines.append(
            f"| {rec['label']} | {rec['harness']} | `{task_key}` | {rec['n']} | "
            f"{rec['fewshot']}-shot | `{rec['metric']}` | **{rec['score']:.2f}%** | "
            f"{rec['date']} | `{src}` |"
        )
    lines += [
        "",
        "## Hardware & Software",
        "",
        "| Property | Value |",
        "|----------|-------|",
        "| GPU | NVIDIA A100-SXM-40GB |",
        "| Precision | BF16 |",
        "| Cluster | CINECA Leonardo Booster |",
        "| lm-eval version | 0.4.5 |",
        "| Seed | 42 |",
        "",
        "## Comparability Note",
        "",
        "These canonical scores are produced under standard protocols and are directly",
        "comparable to published scores from the same harness versions.",
        "Internal offline subset scores (30/40/25 problems from `_benchmark_dante_offline.py`)",
        "are **separate** and must not be mixed with these canonical results.",
    ]
    path = f"{results_dir}/CANONICAL_PROVENANCE.md"
    with open(path, "w") as f:
        f.write("\n".join(lines) + "\n")
    print(f"  Provenance -> {path}")
def main() -> None:
    """CLI entry point: parse all canonical result files and emit the summaries.

    Exits with status 0 (not an error) when no result files exist yet, since
    this script is expected to be run before the SLURM jobs have finished.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--results-dir", default="evaluation/results/canonical")
    args = parser.parse_args()

    rd = args.results_dir
    print(f"\nParsing results from: {rd}")

    lm = parse_lm_eval(rd)
    code = parse_code_eval(rd)
    all_scores = {**lm, **code}

    if not all_scores:
        print("No canonical results found yet.")
        print("Run evaluation/slurm_lm_eval.slurm and slurm_code_eval.slurm on Leonardo first.")
        sys.exit(0)

    print_table(all_scores)
    write_summary(all_scores, rd)
    write_provenance(all_scores, rd)
    print("\nDone.\n")


if __name__ == "__main__":
    main()
131
evaluation/slurm_code_eval.slurm
Normal file
131
evaluation/slurm_code_eval.slurm
Normal file
@@ -0,0 +1,131 @@
|
||||
#!/bin/bash
#SBATCH --job-name=dante_code_eval
#SBATCH --account=AIFAC_F02_254_0
#SBATCH --partition=boost_usr_prod
#SBATCH --nodes=1
#SBATCH --ntasks-per-node=1
#SBATCH --gpus-per-node=1
#SBATCH --cpus-per-task=8
#SBATCH --mem=64G
#SBATCH --time=04:00:00
#SBATCH --output=evaluation/slurm_logs/code_eval_%j.out
#SBATCH --error=evaluation/slurm_logs/code_eval_%j.err

# ============================================================================
# DANTE-Mosaic-3.5B — Canonical code evaluation
# Leonardo Booster, 1x A100-40GB
#
# Tasks:   HumanEval (pass@1), MBPP (pass@1)
# Harness: bigcode-evaluation-harness
# All outputs -> evaluation/results/canonical/
#
# WARNING: HumanEval executes generated Python code.
#          bigcode-evaluation-harness uses a sandboxed subprocess per sample.
#          Do NOT run --allow_code_execution outside of a secure environment.
#          Leonardo compute nodes are isolated — this is acceptable here.
# ============================================================================

set -euo pipefail

# ─── Environment ─────────────────────────────────────────────────────────────
module purge
module load cuda/12.4 python/3.11.7

export HF_HOME="/leonardo_scratch/large/userexternal/nsavioli/hf_cache"
export HF_DATASETS_CACHE="${HF_HOME}/datasets"
export TRANSFORMERS_CACHE="${HF_HOME}/transformers"
export HF_HUB_CACHE="${HF_HOME}/hub"
export TOKENIZERS_PARALLELISM=false

# ─── Config ──────────────────────────────────────────────────────────────────
MODEL="OdaxAI/DANTE-Mosaic-3.5B"
TIMESTAMP=$(date +%Y%m%d_%H%M%S)
RESULTS="evaluation/results/canonical"
BIGCODE_DIR="/leonardo_scratch/large/userexternal/nsavioli/bigcode-evaluation-harness"
mkdir -p "${RESULTS}"

echo "======================================="
echo "DANTE-Mosaic-3.5B — Canonical code eval"
echo "Model: ${MODEL}"
echo "Job ID: ${SLURM_JOB_ID}"
echo "Timestamp: ${TIMESTAMP}"
echo "======================================="

# ─── Install bigcode harness if not present ───────────────────────────────────
if [ ! -d "${BIGCODE_DIR}" ]; then
    echo "Cloning bigcode-evaluation-harness..."
    git clone https://github.com/bigcode-project/bigcode-evaluation-harness.git \
        "${BIGCODE_DIR}"
    pip install --quiet -e "${BIGCODE_DIR}"
else
    echo "bigcode-evaluation-harness found at ${BIGCODE_DIR}"
    pip install --quiet -e "${BIGCODE_DIR}"
fi

# ─── HumanEval (pass@1, 164 problems, greedy, 0-shot) ────────────────────────
echo ""
echo ">>> HumanEval pass@1 (164 problems, greedy decoding)..."
HE_OUT="${RESULTS}/humaneval_${TIMESTAMP}"
mkdir -p "${HE_OUT}"

accelerate launch "${BIGCODE_DIR}/main.py" \
    --model "${MODEL}" \
    --tasks humaneval \
    --do_sample False \
    --temperature 0.0 \
    --n_samples 1 \
    --batch_size 8 \
    --allow_code_execution \
    --precision bf16 \
    --trust_remote_code \
    --save_generations \
    --save_generations_path "${HE_OUT}/humaneval_generations.json" \
    --metric_output_path "${HE_OUT}/humaneval_metrics.json" \
    2>&1 | tee "${HE_OUT}/humaneval.log"

echo ">>> HumanEval done -> ${HE_OUT}/humaneval_metrics.json"

# ─── MBPP (pass@1, 374 problems, greedy, 0-shot) ─────────────────────────────
echo ""
echo ">>> MBPP pass@1 (374 problems, greedy decoding)..."
MBPP_OUT="${RESULTS}/mbpp_${TIMESTAMP}"
mkdir -p "${MBPP_OUT}"

accelerate launch "${BIGCODE_DIR}/main.py" \
    --model "${MODEL}" \
    --tasks mbpp \
    --do_sample False \
    --temperature 0.0 \
    --n_samples 1 \
    --batch_size 8 \
    --allow_code_execution \
    --precision bf16 \
    --trust_remote_code \
    --save_generations \
    --save_generations_path "${MBPP_OUT}/mbpp_generations.json" \
    --metric_output_path "${MBPP_OUT}/mbpp_metrics.json" \
    2>&1 | tee "${MBPP_OUT}/mbpp.log"

echo ">>> MBPP done -> ${MBPP_OUT}/mbpp_metrics.json"

# ─── Summary ─────────────────────────────────────────────────────────────────
echo ""
echo "======================================="
echo "CODE EVAL COMPLETE"
# Print the latest pass@1 for each task from the metric JSONs just written.
python3 - <<'PYEOF'
import json, glob

for label, pat in [
    ("HumanEval", "evaluation/results/canonical/humaneval_*/humaneval_metrics.json"),
    ("MBPP", "evaluation/results/canonical/mbpp_*/mbpp_metrics.json"),
]:
    files = sorted(glob.glob(pat))
    if files:
        with open(files[-1]) as f:
            d = json.load(f)
        score = d.get("pass@1", d)
        print(f"  {label}: pass@1 = {score}")
    else:
        print(f"  {label}: no result file found")
PYEOF
echo "======================================="
156
evaluation/slurm_lm_eval.slurm
Normal file
156
evaluation/slurm_lm_eval.slurm
Normal file
@@ -0,0 +1,156 @@
|
||||
#!/bin/bash
#SBATCH --job-name=dante_lm_eval
#SBATCH --account=AIFAC_F02_254_0
#SBATCH --partition=boost_usr_prod
#SBATCH --nodes=1
#SBATCH --ntasks-per-node=1
#SBATCH --gpus-per-node=1
#SBATCH --cpus-per-task=8
#SBATCH --mem=64G
#SBATCH --time=08:00:00
#SBATCH --output=evaluation/slurm_logs/lm_eval_%j.out
#SBATCH --error=evaluation/slurm_logs/lm_eval_%j.err

# ============================================================================
# DANTE-Mosaic-3.5B — Canonical lm-evaluation-harness benchmark
# Leonardo Booster, 1x A100-40GB
#
# Tasks: MMLU, MMLU-Pro, GSM8K, ARC-Challenge, HellaSwag, TruthfulQA,
#        Winogrande, IFEval
# Harness: EleutherAI lm-evaluation-harness 0.4.5
# All outputs -> evaluation/results/canonical/
# ============================================================================

set -euo pipefail

# ─── Environment ─────────────────────────────────────────────────────────────
module purge
module load cuda/12.4 python/3.11.7

export HF_HOME="/leonardo_scratch/large/userexternal/nsavioli/hf_cache"
export HF_DATASETS_CACHE="${HF_HOME}/datasets"
export TRANSFORMERS_CACHE="${HF_HOME}/transformers"
export HF_HUB_CACHE="${HF_HOME}/hub"
export TOKENIZERS_PARALLELISM=false

# ─── Config ──────────────────────────────────────────────────────────────────
MODEL="OdaxAI/DANTE-Mosaic-3.5B"
MODEL_ARGS="pretrained=${MODEL},dtype=bfloat16,trust_remote_code=True"
BATCH=8
SEED=42
TIMESTAMP=$(date +%Y%m%d_%H%M%S)
RESULTS="evaluation/results/canonical"
mkdir -p "${RESULTS}"

echo "======================================="
echo "DANTE-Mosaic-3.5B — Canonical lm-eval"
echo "Model: ${MODEL}"
echo "Job ID: ${SLURM_JOB_ID}"
echo "GPU: $(nvidia-smi --query-gpu=name --format=csv,noheader | head -1)"
echo "Timestamp: ${TIMESTAMP}"
echo "======================================="

# ─── Install harness if needed ────────────────────────────────────────────────
pip install --quiet lm-eval==0.4.5

# ─── Helper: run one task ─────────────────────────────────────────────────────
# run_task TASK FEWSHOT [EXTRA]
#   TASK    — lm-eval task name
#   FEWSHOT — number of few-shot examples
#   EXTRA   — optional extra CLI flags (intentionally unquoted below so it
#             word-splits into separate arguments)
run_task() {
    local TASK=$1
    local FEWSHOT=$2
    local EXTRA="${3:-}"
    local OUT="${RESULTS}/${TASK}_${TIMESTAMP}.json"

    echo ""
    echo ">>> ${TASK} (${FEWSHOT}-shot) ..."
    lm_eval \
        --model hf \
        --model_args "${MODEL_ARGS}" \
        --tasks "${TASK}" \
        --num_fewshot "${FEWSHOT}" \
        --batch_size "${BATCH}" \
        --seed "${SEED}" \
        --output_path "${OUT}" \
        --log_samples \
        ${EXTRA} \
        2>&1 | tee "${RESULTS}/${TASK}_${TIMESTAMP}.log"
    echo ">>> DONE ${TASK} -> ${OUT}"
}

# ─── Run all canonical tasks ──────────────────────────────────────────────────
# MMLU: 57 subjects, 5-shot, accuracy
run_task "mmlu" 5

# MMLU-Pro: harder 10-option MCQ, 5-shot
run_task "mmlu_pro" 5

# GSM8K: 8-shot chain-of-thought, exact match on final answer
run_task "gsm8k" 8

# ARC-Challenge: 25-shot, normalised accuracy
run_task "arc_challenge" 25

# HellaSwag: 10-shot, normalised accuracy
run_task "hellaswag" 10

# TruthfulQA MC2: 0-shot, mc2 multiple true answers
run_task "truthfulqa_mc2" 0

# Winogrande: 5-shot, accuracy
run_task "winogrande" 5

# IFEval: 0-shot, instruction-level strict accuracy
run_task "ifeval" 0

# ─── Summary ─────────────────────────────────────────────────────────────────
echo ""
echo "======================================="
echo "ALL TASKS COMPLETE"
echo "Results saved to: ${RESULTS}/"
ls -lh "${RESULTS}/"
echo "======================================="

# ─── Parse and print summary ─────────────────────────────────────────────────
python3 - <<'PYEOF'
import json, glob, os, sys

results_dir = "evaluation/results/canonical"
files = sorted(glob.glob(f"{results_dir}/*.json"))
if not files:
    print("No result JSON files found.")
    sys.exit(0)

# One score per task
METRIC_MAP = {
    "mmlu": ("acc,none", "MMLU"),
    "mmlu_pro": ("acc,none", "MMLU-Pro"),
    "gsm8k": ("exact_match,strict-match", "GSM8K"),
    "arc_challenge": ("acc_norm,none", "ARC-Challenge"),
    "hellaswag": ("acc_norm,none", "HellaSwag"),
    "truthfulqa_mc2": ("mc2,none", "TruthfulQA"),
    "winogrande": ("acc,none", "Winogrande"),
    "ifeval": ("prompt_level_strict_acc,none", "IFEval"),
}

print("\n" + "="*60)
print("  CANONICAL BENCHMARK RESULTS — DANTE-Mosaic-3.5B")
print("  lm-evaluation-harness 0.4.5 | BF16 | A100-40GB | seed=42")
print("="*60)
print(f"  {'Benchmark':<20} {'Metric':<35} {'Score':>8}")
print("  " + "-"*58)

for f in files:
    try:
        with open(f) as fh:
            data = json.load(fh)
        results = data.get("results", {})
        for task_key, (metric_key, label) in METRIC_MAP.items():
            if task_key in results:
                score = results[task_key].get(metric_key, None)
                if score is not None:
                    print(f"  {label:<20} {metric_key:<35} {score*100:>7.2f}%")
    except Exception as e:
        print(f"  [parse error: {f}: {e}]")

print("="*60)
print(f"  Source: {results_dir}/")
print("="*60 + "\n")
PYEOF
33
evaluation/submit_all.sh
Normal file
33
evaluation/submit_all.sh
Normal file
@@ -0,0 +1,33 @@
|
||||
#!/usr/bin/env bash
# ============================================================================
# submit_all.sh — Submit all canonical evaluation jobs on Leonardo
#
# Usage (from the repo root on Leonardo):
#   bash evaluation/submit_all.sh
# ============================================================================

set -euo pipefail

# Always operate from the repo root, regardless of the invocation directory.
cd "$(dirname "$0")/.." # repo root

mkdir -p evaluation/slurm_logs evaluation/results/canonical

# submit SLURM_SCRIPT — prints the numeric job id on stdout (--parsable).
submit() {
    sbatch --parsable "$1"
}

echo "=== Submitting DANTE-Mosaic-3.5B canonical evaluations ==="
echo ""

LM_JOB=$(submit evaluation/slurm_lm_eval.slurm)
echo " lm-eval job submitted: ${LM_JOB}"

CODE_JOB=$(submit evaluation/slurm_code_eval.slurm)
echo " code-eval job submitted: ${CODE_JOB}"

echo ""
echo "Monitor:"
echo " squeue -u \$USER"
echo " tail -f evaluation/slurm_logs/lm_eval_${LM_JOB}.out"
echo " tail -f evaluation/slurm_logs/code_eval_${CODE_JOB}.out"
echo ""
echo "After completion, parse results:"
echo " python3 evaluation/parse_canonical_results.py"
echo ""
echo "Results will be saved to: evaluation/results/canonical/"
|
||||
73
example_inference.py
Normal file
73
example_inference.py
Normal file
@@ -0,0 +1,73 @@
|
||||
"""
|
||||
Example inference for DANTE-Mosaic-3.5B.
|
||||
|
||||
Usage:
|
||||
python example_inference.py
|
||||
python example_inference.py --model YourOrg/DANTE-Mosaic-3.5B
|
||||
python example_inference.py --model ./local_path/
|
||||
|
||||
Run on a single A100 / RTX 4090 / H100. ~5.8 GB VRAM in BF16.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import time
|
||||
|
||||
import torch
|
||||
from transformers import AutoModelForCausalLM, AutoTokenizer
|
||||
|
||||
# (tag, prompt) demo pairs exercising math, code, logic, and Italian-language ability.
PROMPTS: list[tuple[str, str]] = [
    ("MATH", "What is the derivative of f(x) = x^3 + 2x^2 - 5x + 1? Show step by step."),
    ("CODE", "Write a Python function that checks if a string is a palindrome. Include a docstring and edge cases."),
    ("LOGIC", "A bat and a ball cost $1.10 in total. The bat costs $1.00 more than the ball. How much does the ball cost? Explain."),
    ("ITA", "Spiega cos'è il machine learning in termini semplici, adatti a uno studente delle superiori."),
]
|
||||
|
||||
|
||||
def main():
    """Load the model, run every demo prompt, and report tokens/second."""
    parser = argparse.ArgumentParser()
    parser.add_argument("--model", default="./",
                        help="HF repo id or local path to the model directory")
    parser.add_argument("--max-new-tokens", type=int, default=256)
    parser.add_argument("--temperature", type=float, default=0.7)
    parser.add_argument("--top-p", type=float, default=0.9)
    args = parser.parse_args()

    device = "cuda" if torch.cuda.is_available() else "cpu"
    print(f"Loading {args.model} on {device} ...")
    load_start = time.time()
    tok = AutoTokenizer.from_pretrained(args.model, trust_remote_code=True)
    model = AutoModelForCausalLM.from_pretrained(
        args.model,
        torch_dtype=torch.bfloat16,
        device_map="auto",
        trust_remote_code=True,
    ).eval()
    n_params = sum(weight.numel() for weight in model.parameters()) / 1e9
    print(f"Loaded in {time.time()-load_start:.1f}s "
          f"({n_params:.2f}B params)\n")

    # Shared sampling configuration for every prompt.
    sampling = dict(
        max_new_tokens=args.max_new_tokens,
        do_sample=True,
        temperature=args.temperature,
        top_p=args.top_p,
        repetition_penalty=1.1,
        pad_token_id=tok.eos_token_id,
    )

    for tag, prompt in PROMPTS:
        print("─" * 60)
        print(f"[{tag}] {prompt}\n")
        encoded = tok(prompt, return_tensors="pt").to(model.device)
        prompt_len = encoded["input_ids"].shape[-1]
        gen_start = time.time()
        with torch.no_grad():
            generated = model.generate(**encoded, **sampling)
        n_new = generated.shape[-1] - prompt_len
        elapsed = time.time() - gen_start
        # Strip the prompt tokens so only the completion is shown.
        completion = tok.decode(generated[0][prompt_len:], skip_special_tokens=True).strip()
        print(completion)
        print(f"\n [{n_new} tokens in {elapsed:.1f}s — {n_new/elapsed:.1f} tok/s]\n")


if __name__ == "__main__":
    main()
|
||||
9
generation_config.json
Normal file
9
generation_config.json
Normal file
@@ -0,0 +1,9 @@
|
||||
{
|
||||
"bos_token_id": 128000,
|
||||
"do_sample": true,
|
||||
"eos_token_id": 128012,
|
||||
"pad_token_id": 128004,
|
||||
"temperature": 0.6,
|
||||
"top_p": 0.95,
|
||||
"transformers_version": "5.8.0"
|
||||
}
|
||||
3
model.safetensors
Normal file
3
model.safetensors
Normal file
@@ -0,0 +1,3 @@
|
||||
version https://git-lfs.github.com/spec/v1
|
||||
oid sha256:4af9401c4cc5f51d601bc7167694a4413ba99cdf69614ad0cd1ea75b7815a243
|
||||
size 6150235096
|
||||
95
push_to_hub.py
Normal file
95
push_to_hub.py
Normal file
@@ -0,0 +1,95 @@
|
||||
"""
|
||||
Push DANTE-Mosaic-3.5B to the Hugging Face Hub.
|
||||
|
||||
Prerequisites:
|
||||
pip install -U huggingface_hub transformers
|
||||
huggingface-cli login # interactive token entry, OR
|
||||
export HF_TOKEN=hf_xxxxxxxxxx # non-interactive
|
||||
|
||||
Usage:
|
||||
python push_to_hub.py \\
|
||||
--model-dir ../hf_model/ \\
|
||||
--repo YourOrg/DANTE-Mosaic-3.5B \\
|
||||
--private # optional, default public
|
||||
|
||||
Note: requires ~6 GB upload bandwidth. The huggingface_hub uploader resumes on
|
||||
network errors, so re-running is safe.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import os
|
||||
import shutil
|
||||
from pathlib import Path
|
||||
|
||||
from huggingface_hub import HfApi, create_repo, upload_folder
|
||||
|
||||
|
||||
def parse_args():
    """Build and parse the CLI for pushing the model to the Hub."""
    parser = argparse.ArgumentParser()
    add = parser.add_argument
    add("--model-dir", required=True,
        help="Path containing model.safetensors, config.json, tokenizer files")
    add("--repo", required=True,
        help="HuggingFace repo id, e.g. 'YourOrg/DANTE-Mosaic-3.5B'")
    add("--private", action="store_true",
        help="Create as private repo (default: public)")
    add("--readme", default="README.md",
        help="Path to model card (will be uploaded as README.md)")
    add("--token", default=None,
        help="HF token (overrides $HF_TOKEN). Optional.")
    add("--commit-message", default="Initial release of DANTE-Mosaic-3.5B")
    add("--allow-patterns", nargs="*", default=None,
        help="Limit upload to these glob patterns (e.g. '*.safetensors').")
    return parser.parse_args()
|
||||
|
||||
|
||||
def main():
    """Validate the local model dir, stage the model card, create the repo, upload.

    Raises:
        FileNotFoundError: if the model directory is missing or contains
            no ``*.safetensors`` / ``*.bin`` weight files.
    """
    args = parse_args()

    # Explicit --token wins; otherwise fall back to $HF_TOKEN, then to the
    # cached `huggingface-cli login` credentials held by huggingface_hub.
    token = args.token or os.environ.get("HF_TOKEN")
    if not token:
        print("WARNING: no --token and no $HF_TOKEN; relying on cached HF login.")

    model_dir = Path(args.model_dir).expanduser().resolve()
    if not model_dir.exists():
        raise FileNotFoundError(f"Model dir not found: {model_dir}")
    # Sanity check: refuse to create an empty model repo.
    weight_files = list(model_dir.glob("*.safetensors")) + list(model_dir.glob("*.bin"))
    if not weight_files:
        raise FileNotFoundError(f"No weight files in {model_dir}")
    print(f"\nModel directory : {model_dir}")
    print(f"Weight files : {len(weight_files)} "
          f"(total {sum(f.stat().st_size for f in weight_files)/1e9:.1f} GB)")

    # Stage the model card inside the upload folder as README.md.
    readme_path = Path(args.readme)
    if readme_path.exists():
        target = model_dir / "README.md"
        # Resolve both paths once (was resolved and compared twice): copying a
        # file onto itself would raise shutil.SameFileError, so skip the copy
        # when --readme already points at <model_dir>/README.md.
        if target.resolve() != readme_path.resolve():
            if target.exists():
                print(f"Replacing existing model card: {target}")
            shutil.copy(readme_path, target)
        print(f"Model card : {readme_path} -> {target}")
    else:
        print(f"WARNING: README {readme_path} not found; the upload will not include a model card.")

    print(f"Target repo : {args.repo} (private={args.private})")
    print()

    print("Creating repo on the Hub (if it does not exist)...")
    create_repo(repo_id=args.repo, private=args.private, exist_ok=True, token=token)

    # upload_folder resumes on network errors, so re-running is safe.
    print("Uploading folder...")
    upload_folder(
        folder_path=str(model_dir),
        repo_id=args.repo,
        commit_message=args.commit_message,
        token=token,
        allow_patterns=args.allow_patterns,
    )

    print("\nDONE!")
    print(f"Model is live at: https://huggingface.co/{args.repo}")


if __name__ == "__main__":
    main()
|
||||
3
tokenizer.json
Normal file
3
tokenizer.json
Normal file
@@ -0,0 +1,3 @@
|
||||
version https://git-lfs.github.com/spec/v1
|
||||
oid sha256:7b6a500b662a34eb3f0374db856ba4ad7de4c81040571d78dc0d357238930005
|
||||
size 17208819
|
||||
12
tokenizer_config.json
Normal file
12
tokenizer_config.json
Normal file
@@ -0,0 +1,12 @@
|
||||
{
|
||||
"tokenizer_class": "PreTrainedTokenizerFast",
|
||||
"bos_token": null,
|
||||
"eos_token": "<|im_end|>",
|
||||
"pad_token": "<|im_end|>",
|
||||
"clean_up_tokenization_spaces": true,
|
||||
"model_input_names": [
|
||||
"input_ids",
|
||||
"attention_mask"
|
||||
],
|
||||
"model_max_length": 131072
|
||||
}
|
||||
Reference in New Issue
Block a user