commit 9f76a9e9f3467d071c063044e0cba5a9264f772f Author: ModelHub XC Date: Wed May 13 02:58:32 2026 +0800 初始化项目,由ModelHub XC社区提供模型 Model: PleIAs/Baguettotron Source: Original Platform diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 0000000..98a1fbc --- /dev/null +++ b/.gitattributes @@ -0,0 +1,50 @@ +*.7z filter=lfs diff=lfs merge=lfs -text +*.arrow filter=lfs diff=lfs merge=lfs -text +*.bin filter=lfs diff=lfs merge=lfs -text +*.bin.* filter=lfs diff=lfs merge=lfs -text +*.bz2 filter=lfs diff=lfs merge=lfs -text +*.ftz filter=lfs diff=lfs merge=lfs -text +*.gz filter=lfs diff=lfs merge=lfs -text +*.h5 filter=lfs diff=lfs merge=lfs -text +*.joblib filter=lfs diff=lfs merge=lfs -text +*.lfs.* filter=lfs diff=lfs merge=lfs -text +*.model filter=lfs diff=lfs merge=lfs -text +*.msgpack filter=lfs diff=lfs merge=lfs -text +*.onnx filter=lfs diff=lfs merge=lfs -text +*.ot filter=lfs diff=lfs merge=lfs -text +*.parquet filter=lfs diff=lfs merge=lfs -text +*.pb filter=lfs diff=lfs merge=lfs -text +*.pt filter=lfs diff=lfs merge=lfs -text +*.pth filter=lfs diff=lfs merge=lfs -text +*.rar filter=lfs diff=lfs merge=lfs -text +saved_model/**/* filter=lfs diff=lfs merge=lfs -text +*.tar.* filter=lfs diff=lfs merge=lfs -text +*.tflite filter=lfs diff=lfs merge=lfs -text +*.tgz filter=lfs diff=lfs merge=lfs -text +*.xz filter=lfs diff=lfs merge=lfs -text +*.zip filter=lfs diff=lfs merge=lfs -text +*.zstandard filter=lfs diff=lfs merge=lfs -text +*.tfevents* filter=lfs diff=lfs merge=lfs -text +*.db* filter=lfs diff=lfs merge=lfs -text +*.ark* filter=lfs diff=lfs merge=lfs -text +**/*ckpt*data* filter=lfs diff=lfs merge=lfs -text +**/*ckpt*.meta filter=lfs diff=lfs merge=lfs -text +**/*ckpt*.index filter=lfs diff=lfs merge=lfs -text + +*.ckpt filter=lfs diff=lfs merge=lfs -text +*.gguf* filter=lfs diff=lfs merge=lfs -text +*.ggml filter=lfs diff=lfs merge=lfs -text +*.llamafile* filter=lfs diff=lfs merge=lfs -text +*.pt2 filter=lfs diff=lfs merge=lfs -text 
+*.mlmodel filter=lfs diff=lfs merge=lfs -text +*.npy filter=lfs diff=lfs merge=lfs -text +*.npz filter=lfs diff=lfs merge=lfs -text +*.pickle filter=lfs diff=lfs merge=lfs -text +*.pkl filter=lfs diff=lfs merge=lfs -text +*.tar filter=lfs diff=lfs merge=lfs -text +*.wasm filter=lfs diff=lfs merge=lfs -text +*.zst filter=lfs diff=lfs merge=lfs -text +*tfevents* filter=lfs diff=lfs merge=lfs -text + +model.safetensors filter=lfs diff=lfs merge=lfs -text +tokenizer.json filter=lfs diff=lfs merge=lfs -text \ No newline at end of file diff --git a/README.md b/README.md new file mode 100644 index 0000000..1a4a728 --- /dev/null +++ b/README.md @@ -0,0 +1,168 @@ +--- +language: +- en +- fr +- it +- de +- es +- pl +license: apache-2.0 +pipeline_tag: text-generation +tags: +- transformers +library_name: transformers +datasets: +- PleIAs/SYNTH +--- + +# 🥖 Baguettotron + +
+ Pleias +
+ +

+ Blog announcement +

+ +**Baguettotron** is a 321-million-parameter generalist Small Reasoning Model, trained on 200 billion tokens from SYNTH, a fully open generalist dataset. + +Despite being trained on considerably less data, Baguettotron outperforms most SLMs of the same size range on non-code industry benchmarks, providing an unprecedented balance between memory, general reasoning, math and retrieval performance. + +

+ +

+ +The name is both a nod to French origins and to the unusual shape of the model: with 80 layers, Baguettotron is currently the deepest SLM in its size range. + +## Features +Baguettotron has been natively trained for instructions with thinking traces. We implemented a series of dedicated pipelines for: +* Memorization of encyclopedic knowledge (50,000 vital articles from Wikipedia) +* Retrieval-Augmented Generation with grounding (following on our initial experiments with Pleias-RAG series) +* Arithmetic and simple math resolution problem +* Editing tasks +* Information extraction +* Creative writing, including unusual synthetic exercises like lipograms or layout poems. +* Cooking (the model wouldn't deserve its name otherwise) + +Baguettotron is able to read and write in the main European languages: French, German, Italian, Spanish, Polish and, to a lesser extent, Latin and Dutch. Reasoning traces are exclusively written in English. + +Full synthetic training makes it relatively straightforward to expand language support and we look forward to either bringing more languages or creating language-specific variants. + +## Model design and training +Baguettotron is a 321M-parameter decoder with a standard Qwen/Llama-like design, except for extreme depth with 80 layers (a type of model we internally nicknamed "baguette") +

+ +

+ +Baguettotron was trained on 16 H100 GPUs from Jean Zay (compute plan n°A0191016886). An unusual feature of training on SYNTH was having reasoning signals from MMLU and other major industry benchmarks very early on. We were able to empirically measure consistent improvements from stacking more layers. + +

+ +

+ +Our current hypothesis is that deeper architecture benefits more from dense reasoning data, as the model is more commonly exposed to string sequences requiring intensive computation or knowledge interconnection. + +## Reasoning style +The reasoning traces use an entirely new reasoning style with dense, short frequently non-verbal sentences, designed by Pleias and made possible thanks to the use of fine-tuning models for synthetic generation. + +Traces use the following stenographic notation integrated into the special tokens of the model: + +### Logical markers + +| Token | Meaning | Usage | +| ----- | ---------------------------------- | ---------------------------------------------------------- | +| **→** | derivation / implication | For very short causal/logical flow | +| **↺** | iterative return / refinement loop | For backtracking, reconsidering priors, RAG re-querying. | +| **?** | uncertainty/questions to resolve | Could be appended to short expressions/word, not just interrogative sentences | +| **!/※** | insight/breakthroughs | Emphatic mark for knowledge discovery | +| **≈** | approximation/estimates | For intermediary hypothesis/uncertain preliminary statements | +| **∴** | therefore / final step | Use sparingly to mark stable conclusions. | + +### Uncertainty + +| Token | Meaning | Usage | +| ----- | ------------------------- | ------------------------------------------------------------- | +| **●** | high confidence | well-supported empirical/theoretical ground; “anchor points.” | +| **◐** | medium/partial confidence | incomplete data; plausible but unverified links. | +| **○** | low confidence | speculation, missing context, weak inference chain. | +| **⚠** | bias/premise risk | domain mismatch, cultural assumptions, language-switch artifacts. 
| +| **?maybe?** | soft speculation | marks tentative ideas, reasoning branches that might collapse later | + +### Verification process + +| Token | Meaning | Usage | +| ----- | ------------------------- | ---------------------------------------- | +| **☐** | unverified hypothesis | raw claim, no cross-check yet. | +| **☑** | intermediate verification | one source/argument supports it. | +| **✓** | confirmed/validated | multiple independent supports (●-level). | + +The model can also use a variety of graphic notations for causality/problem decomposition at times. Things like: + +``` +Initial query: +├─ feature1: *lorem ipsum* +├─ feature2: *lorem ipsum* +└─ feature3: *lorem ipsum* +``` + +### Simulated entropy + +Baguettotron uses a range of special tokens **⟨H≈X.X⟩** to introduce higher entropy sequences, a bit similarly to temperature control. +* **⟨H≈0.3–0.5⟩**: still grounded sequences with a slightly higher token entropy +* **⟨H≈0.5–1.0⟩**: exploratory, multi-path reasoning +* **⟨H≈1.5–1.8⟩**: fragmented, oneiric, literary stream-of-consciousness drift + +It remains a pure simulation since the model obviously does not have access to inference controls. Yet it still allows for more token exploration/diversification. Inspiration for this method came from the Entropix project. + +## Evaluation + +We evaluated Baguettotron on three major industry benchmarks: MMLU (general reasoning and memorization), math (gsm8k) and retrieval (HotPotQA). With only 321M parameters, Baguettotron gets close to Qwen-0.6B performance and significantly outperforms similarly sized Gemma. + +

+ +

+ +## Inference +Baguettotron has been trained on the standard instruction style from Qwen. + +```xml +<|im_start|>user +Who are you?<|im_end|> +<|im_start|>assistant + +``` + +Baguettotron has support for multi-turn. We recommend using "rolling" thinking: systematically append thinking traces for each new generation but discard the past ones. + +It's possible to remove thinking traces by swapping with a closing tag. + +```xml +<|im_start|>user +Who are you?<|im_end|> +<|im_start|>assistant + +``` + +Yet, our current tests show a significantly decreased performance for most tasks, especially memorization of encyclopedic knowledge. + +For RAG, Baguettotron uses a special syntax to pass on references: + +```xml +<|im_start|>user +Who are you? + +[…] +[…] +<|im_end|> +<|im_start|>assistant + +``` + +Afterwards the model will return an answer with grounding references ([quote]). The draft will be affected as well and focus on source synthesis rather than recall from its internal knowledge base. + +## Fine-Tuning/RL + +Baguettotron has been successfully fine-tuned for a variety of tasks including text classification and poetry writing. + +Since it's a reasoning model, it should train well with reinforcement learning methods like GRPO, either for verifiable tasks or with an LLM-as-a-judge. 
\ No newline at end of file diff --git a/chat_template.json b/chat_template.json new file mode 100644 index 0000000..26de49b --- /dev/null +++ b/chat_template.json @@ -0,0 +1,7 @@ +{ + "chat_template": "{% for m in messages %}<|im_start|>{{ m['role'] }}\n{{ m['content'] }}<|im_end|>\n{% endfor %}{% if add_generation_prompt %}<|im_start|>assistant\n\n{% endif %}", + "eos_token": "<|im_end|>", + "bos_token": "<|im_start|>", + "stop": ["<|im_end|>"], + "roles": { "user": "user", "assistant": "assistant", "system": "system" } +} \ No newline at end of file diff --git a/config.json b/config.json new file mode 100644 index 0000000..77ffe62 --- /dev/null +++ b/config.json @@ -0,0 +1,29 @@ +{ + "architectures": [ + "LlamaForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "bos_token_id": 1, + "eos_token_id": 2, + "head_dim": 64, + "hidden_act": "silu", + "hidden_size": 576, + "initializer_range": 0.02, + "intermediate_size": 1536, + "max_position_embeddings": 4096, + "mlp_bias": false, + "model_type": "llama", + "num_attention_heads": 9, + "num_hidden_layers": 80, + "num_key_value_heads": 3, + "pretraining_tp": 1, + "rms_norm_eps": 1e-05, + "rope_scaling": null, + "rope_theta": 10000, + "tie_word_embeddings": true, + "torch_dtype": "bfloat16", + "transformers_version": "4.51.3", + "use_cache": true, + "vocab_size": 65536 +} diff --git a/configuration.json b/configuration.json new file mode 100644 index 0000000..bbeeda1 --- /dev/null +++ b/configuration.json @@ -0,0 +1 @@ +{"framework": "pytorch", "task": "text-generation", "allow_remote": true} \ No newline at end of file diff --git a/figures/baguettotron_structure.png b/figures/baguettotron_structure.png new file mode 100644 index 0000000..3d2c223 Binary files /dev/null and b/figures/baguettotron_structure.png differ diff --git a/figures/comparison_models.png b/figures/comparison_models.png new file mode 100644 index 0000000..2142d53 Binary files /dev/null and b/figures/comparison_models.png differ 
diff --git a/figures/pleias.jpg b/figures/pleias.jpg new file mode 100644 index 0000000..8451fb5 Binary files /dev/null and b/figures/pleias.jpg differ diff --git a/figures/table_evaluation.png b/figures/table_evaluation.png new file mode 100644 index 0000000..7d72745 Binary files /dev/null and b/figures/table_evaluation.png differ diff --git a/figures/training_baguettotron.png b/figures/training_baguettotron.png new file mode 100644 index 0000000..f569690 Binary files /dev/null and b/figures/training_baguettotron.png differ diff --git a/figures/training_efficiency.jpeg b/figures/training_efficiency.jpeg new file mode 100644 index 0000000..ab9d002 Binary files /dev/null and b/figures/training_efficiency.jpeg differ diff --git a/generation_config.json b/generation_config.json new file mode 100644 index 0000000..8df1deb --- /dev/null +++ b/generation_config.json @@ -0,0 +1,6 @@ +{ + "_from_model_config": true, + "bos_token_id": 1, + "eos_token_id": 2, + "transformers_version": "4.51.3" +} diff --git a/model.safetensors b/model.safetensors new file mode 100644 index 0000000..4535c5e --- /dev/null +++ b/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dbb3fe0fd0d97a28c140aa315ec4a651f20432e9b7a509908a620190f506644b +size 641995416 diff --git a/special_tokens_map.json b/special_tokens_map.json new file mode 100644 index 0000000..c018676 --- /dev/null +++ b/special_tokens_map.json @@ -0,0 +1,48 @@ +{ + "additional_special_tokens": [ + "<|im_start|>", + "<|im_end|>", + "", + "source_1", + "source_2", + "source_3", + "source_4", + "source_5", + "source_6", + "source_7", + "source_8", + "source_9", + "source_10", + "", + "→", + "↺", + "※", + "?maybe?", + "●", + "◐", + "○", + "⚠", + "☐", + "☑", + "✓", + "⟨H≈0.1⟩", + "⟨H≈0.2⟩", + "⟨H≈0.3⟩", + "⟨H≈0.4⟩", + "⟨H≈0.5⟩", + "⟨H≈0.6⟩", + "⟨H≈0.7⟩", + "⟨H≈0.8⟩", + "⟨H≈0.9⟩", + "⟨H≈1.0⟩", + "⟨H≈1.1⟩", + "⟨H≈1.2⟩", + "⟨H≈1.3⟩", + "⟨H≈1.4⟩", + "⟨H≈1.5⟩", + "⟨H≈1.6⟩", + "⟨H≈1.7⟩", + "⟨H≈1.8⟩" + ] 
+} diff --git a/tokenizer.json b/tokenizer.json new file mode 100644 index 0000000..f15c10a --- /dev/null +++ b/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6bd28f1009d62677b4e550ab687efb89bec266535c783af4b4288bedf20f36c5 +size 4672723 diff --git a/tokenizer_config.json b/tokenizer_config.json new file mode 100644 index 0000000..9d75626 --- /dev/null +++ b/tokenizer_config.json @@ -0,0 +1,447 @@ +{ + "added_tokens_decoder": { + "0": { + "content": "[UNK]", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "3": { + "content": "[PAD]", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "65491": { + "content": "<|im_start|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "65492": { + "content": "<|im_end|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "65493": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "65494": { + "content": "", + "lstrip": false, + "normalized": true, + "rstrip": false, + "single_word": true, + "special": false + }, + "65495": { + "content": "source_1", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "65496": { + "content": "source_2", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "65497": { + "content": "source_3", + "lstrip": false, + "normalized": false, + 
"rstrip": false, + "single_word": false, + "special": true + }, + "65498": { + "content": "source_4", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "65499": { + "content": "source_5", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "65500": { + "content": "source_6", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "65501": { + "content": "source_7", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "65502": { + "content": "source_8", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "65503": { + "content": "source_9", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "65504": { + "content": "source_10", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "65505": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "65507": { + "content": "→", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "65508": { + "content": "↺", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "65509": { + "content": "※", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "65510": { + "content": "?maybe?", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "65511": { + "content": "●", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "65512": { + "content": "◐", + "lstrip": false, + "normalized": false, 
+ "rstrip": false, + "single_word": false, + "special": true + }, + "65513": { + "content": "○", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "65514": { + "content": "⚠", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "65515": { + "content": "☐", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "65516": { + "content": "☑", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "65517": { + "content": "✓", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "65518": { + "content": "⟨H≈0.1⟩", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "65519": { + "content": "⟨H≈0.2⟩", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "65520": { + "content": "⟨H≈0.3⟩", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "65521": { + "content": "⟨H≈0.4⟩", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "65522": { + "content": "⟨H≈0.5⟩", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "65523": { + "content": "⟨H≈0.6⟩", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "65524": { + "content": "⟨H≈0.7⟩", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "65525": { + "content": "⟨H≈0.8⟩", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "65526": { + "content": "⟨H≈0.9⟩", + "lstrip": false, + "normalized": false, 
+ "rstrip": false, + "single_word": false, + "special": true + }, + "65527": { + "content": "⟨H≈1.0⟩", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "65528": { + "content": "⟨H≈1.1⟩", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "65529": { + "content": "⟨H≈1.2⟩", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "65530": { + "content": "⟨H≈1.3⟩", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "65531": { + "content": "⟨H≈1.4⟩", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "65532": { + "content": "⟨H≈1.5⟩", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "65533": { + "content": "⟨H≈1.6⟩", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "65534": { + "content": "⟨H≈1.7⟩", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "65535": { + "content": "⟨H≈1.8⟩", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "additional_special_tokens": [ + "<|im_start|>", + "<|im_end|>", + "", + "source_1", + "source_2", + "source_3", + "source_4", + "source_5", + "source_6", + "source_7", + "source_8", + "source_9", + "source_10", + "", + "→", + "↺", + "※", + "?maybe?", + "●", + "◐", + "○", + "⚠", + "☐", + "☑", + "✓", + "⟨H≈0.1⟩", + "⟨H≈0.2⟩", + "⟨H≈0.3⟩", + "⟨H≈0.4⟩", + "⟨H≈0.5⟩", + "⟨H≈0.6⟩", + "⟨H≈0.7⟩", + "⟨H≈0.8⟩", + "⟨H≈0.9⟩", + "⟨H≈1.0⟩", + "⟨H≈1.1⟩", + "⟨H≈1.2⟩", + "⟨H≈1.3⟩", + "⟨H≈1.4⟩", + "⟨H≈1.5⟩", + "⟨H≈1.6⟩", + "⟨H≈1.7⟩", + "⟨H≈1.8⟩" + ], + "clean_up_tokenization_spaces": true, + "extra_special_tokens": {}, + 
"model_max_length": 1000000000000000019884624838656, + "tokenizer_class": "PreTrainedTokenizer", + "chat_template": "{% for m in messages %}<|im_start|>{{ m['role'] }}\n{{ m['content'] }}<|im_end|>\n{% endfor %}{% if add_generation_prompt %}<|im_start|>assistant\n\n{% endif %}" +}