From 63637b6be76c09d1ce50b746df7deffaf851c6b5 Mon Sep 17 00:00:00 2001 From: ModelHub XC Date: Sat, 18 Apr 2026 21:23:27 +0800 Subject: [PATCH] =?UTF-8?q?=E5=88=9D=E5=A7=8B=E5=8C=96=E9=A1=B9=E7=9B=AE?= =?UTF-8?q?=EF=BC=8C=E7=94=B1ModelHub=20XC=E7=A4=BE=E5=8C=BA=E6=8F=90?= =?UTF-8?q?=E4=BE=9B=E6=A8=A1=E5=9E=8B?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Model: YU-MO/Yumo-nano Source: Original Platform --- .gitattributes | 38 +++ README.md | 622 +++++++++++++++++++++++++++++++++++++++++ chat_template.jinja | 1 + config.json | 64 +++++ generation_config.json | 8 + model.safetensors | 3 + tokenizer.json | 3 + tokenizer_config.json | 16 ++ yumo-nano.Q8_0.gguf | 3 + yumo_benchmark.png | 3 + 10 files changed, 761 insertions(+) create mode 100644 .gitattributes create mode 100644 README.md create mode 100644 chat_template.jinja create mode 100644 config.json create mode 100644 generation_config.json create mode 100644 model.safetensors create mode 100644 tokenizer.json create mode 100644 tokenizer_config.json create mode 100644 yumo-nano.Q8_0.gguf create mode 100644 yumo_benchmark.png diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 0000000..50419ef --- /dev/null +++ b/.gitattributes @@ -0,0 +1,38 @@ +*.7z filter=lfs diff=lfs merge=lfs -text +*.arrow filter=lfs diff=lfs merge=lfs -text +*.bin filter=lfs diff=lfs merge=lfs -text +*.bz2 filter=lfs diff=lfs merge=lfs -text +*.ckpt filter=lfs diff=lfs merge=lfs -text +*.ftz filter=lfs diff=lfs merge=lfs -text +*.gz filter=lfs diff=lfs merge=lfs -text +*.h5 filter=lfs diff=lfs merge=lfs -text +*.joblib filter=lfs diff=lfs merge=lfs -text +*.lfs.* filter=lfs diff=lfs merge=lfs -text +*.mlmodel filter=lfs diff=lfs merge=lfs -text +*.model filter=lfs diff=lfs merge=lfs -text +*.msgpack filter=lfs diff=lfs merge=lfs -text +*.npy filter=lfs diff=lfs merge=lfs -text +*.npz filter=lfs diff=lfs merge=lfs -text +*.onnx filter=lfs diff=lfs merge=lfs -text +*.ot 
filter=lfs diff=lfs merge=lfs -text +*.parquet filter=lfs diff=lfs merge=lfs -text +*.pb filter=lfs diff=lfs merge=lfs -text +*.pickle filter=lfs diff=lfs merge=lfs -text +*.pkl filter=lfs diff=lfs merge=lfs -text +*.pt filter=lfs diff=lfs merge=lfs -text +*.pth filter=lfs diff=lfs merge=lfs -text +*.rar filter=lfs diff=lfs merge=lfs -text +*.safetensors filter=lfs diff=lfs merge=lfs -text +saved_model/**/* filter=lfs diff=lfs merge=lfs -text +*.tar.* filter=lfs diff=lfs merge=lfs -text +*.tar filter=lfs diff=lfs merge=lfs -text +*.tflite filter=lfs diff=lfs merge=lfs -text +*.tgz filter=lfs diff=lfs merge=lfs -text +*.wasm filter=lfs diff=lfs merge=lfs -text +*.xz filter=lfs diff=lfs merge=lfs -text +*.zip filter=lfs diff=lfs merge=lfs -text +*.zst filter=lfs diff=lfs merge=lfs -text +*tfevents* filter=lfs diff=lfs merge=lfs -text +tokenizer.json filter=lfs diff=lfs merge=lfs -text +yumo-nano.Q8_0.gguf filter=lfs diff=lfs merge=lfs -text +yumo_benchmark.png filter=lfs diff=lfs merge=lfs -text diff --git a/README.md b/README.md new file mode 100644 index 0000000..254db5e --- /dev/null +++ b/README.md @@ -0,0 +1,622 @@ +--- +license: apache-2.0 +base_model: +- agentica-org/DeepScaleR-1.5B-Preview +datasets: +- YU-MO/Yumo-dataset +- EleutherAI/hendrycks_math +language: +- en +- es +library_name: transformers +tags: +- reasoning +- unsloth +- pytorch +- bilingual +- opceanai +- yumo +- fine-tuned +- chat +- deepseek +- qwen2 +pipeline_tag: text-generation +--- + +
+ +
+ +Yumo Nano + +

+ +# A 1.5B Math Model That Outperforms Its Own Base + +**Fine-tuned from DeepScaleR-1.5B. Surpasses it on every benchmark.**
+**1.5B parameters. RTX 4080. Three-phase curriculum training.** + +
+ +Benchmarks +   +Usage +   +Training + +

+ +[![License](https://img.shields.io/badge/Apache_2.0-1a1a2e?style=flat-square&logo=opensourceinitiative&logoColor=white)](LICENSE) +  +[![Base Model](https://img.shields.io/badge/DeepScaleR_1.5B-1a1a2e?style=flat-square&logo=huggingface&logoColor=white)](https://huggingface.co/agentica-org/DeepScaleR-1.5B-Preview) +  +[![Framework](https://img.shields.io/badge/Unsloth_+_TRL-1a1a2e?style=flat-square&logo=python&logoColor=white)](https://github.com/unslothai/unsloth) +  +[![Hardware](https://img.shields.io/badge/RTX_4080-1a1a2e?style=flat-square&logo=nvidia&logoColor=white)](https://www.nvidia.com) +  +[![Eval](https://img.shields.io/badge/lm--eval--harness-1a1a2e?style=flat-square&logo=python&logoColor=white)](https://github.com/EleutherAI/lm-evaluation-harness) + +
+ +--- + +
+ +
+ +## What is Yumo Nano? + +**Yumo Nano** is a 1.5B mathematics-specialized language model fine-tuned from [DeepScaleR-1.5B-Preview](https://huggingface.co/agentica-org/DeepScaleR-1.5B-Preview) — one of the strongest publicly available 1.5B math models. It is the first release of the **Yumo model family**, developed by [OpceanAI](https://huggingface.co/OpceanAI). + +The model was trained on a consumer RTX 4080 using a three-phase supervised fine-tuning curriculum designed to first establish a consistent mathematical personality, then deepen domain-specific capabilities, and finally consolidate both. + +Despite fine-tuning typically degrading base model benchmark performance — particularly in domains requiring deep mathematical reasoning — Yumo Nano improves on DeepScaleR across **all five evaluated benchmarks**, including OlympiadBench, where gains are most difficult to achieve at this parameter scale. + +
+ +--- + +
+ +
+ +## Model Summary + +
+ +
+ + + + + + +
+ +**Architecture** + +| Property | Value | +|:---------|:------| +| Base Model | DeepScaleR-1.5B-Preview | +| Parameters | 1.5B | +| Fine-tuning Method | Supervised SFT + LoRA | +| LoRA Rank | 16 | +| LoRA Alpha | 32 | +| Context Length | 2,048 tokens | +| Chat Template | ChatML | + + + +**Release** + +| Property | Value | +|:---------|:------| +| Organization | OpceanAI | +| Release Date | April 2026 | +| Version | v0.1 | +| Languages | English, Spanish | +| License | Apache 2.0 | +| Training Hardware | RTX 4080 | +| Evaluation | lm-evaluation-harness | + +
+ +
+ +--- + +
+ +
+ +## Benchmark Results + +
+ +
+ +All Yumo Nano results are evaluated under standard benchmark conditions. DeepScaleR-1.5B, Still-1.5B, and DeepSeek-R1-Distill-1.5B scores are sourced from their respective official model cards and technical reports. + +
+ +![Yumo Nano Benchmark Results](https://huggingface.co/YU-MO/Yumo-nano/resolve/main/yumo_benchmark.png) + +
+
+| Model | AIME 2024 | MATH 500 | AMC 2023 | Minerva Math | OlympiadBench | Avg |
+|:------|:---------:|:--------:|:--------:|:------------:|:-------------:|:---:|
+| DeepSeek-R1-Distill 1.5B | 28.8 | 82.8 | 62.9 | 26.5 | 43.3 | 48.9 |
+| Still-1.5B | 32.5 | 84.4 | 66.7 | 29.0 | 45.4 | 51.6 |
+| DeepScaleR-1.5B | 43.1 | 87.8 | 73.6 | 30.2 | 50.0 | 57.0 |
+| **Yumo Nano 1.5B** | **43.5** | **87.9** | **74.3** | **32.3** | **52.9** | **58.2** |
+
+
+ +Yumo Nano achieves the highest score across all five benchmarks, surpassing DeepScaleR-1.5B — the model it was derived from — on every individual metric. The most significant improvement is on **OlympiadBench** (+2.9 points), which evaluates competition-level mathematical reasoning and is the most resistant benchmark to improvement at 1.5B scale. + +The improvement on **Minerva Math** (+2.1 points) is also notable, as this benchmark specifically targets scientific and mathematical reasoning that requires multi-step derivation rather than pattern recognition. + +
+ +--- + +
+ +
+ +## Model Identity + +
+ +
+ +Yumo is a mathematics-specialized AI with a defined character: curious, precise, and direct. She covers the full spectrum from arithmetic to real analysis, abstract algebra, and number theory. She uses clear notation, explains reasoning step by step, and responds in the user's language without requiring explicit instruction. + +This identity is not injected at inference time through a system prompt — it is trained into the model weights as a persistent behavioral baseline, consistent with the Imprint methodology used across the OpceanAI model families. + +``` +Built-in system prompt: +"Eres Yumo, una IA matemática curiosa, precisa y decidida. +Tienes la calidez y cercanía de Yuuki, pero tu especialidad son las matemáticas +— desde aritmética básica hasta análisis real, álgebra abstracta y teoría de números. +Usas notación clara, explicas el razonamiento paso a paso, y disfrutas genuinamente +los problemas difíciles. Respondes en el idioma del usuario. +No eres Qwen ni ningún otro modelo — eres Yumo." +``` + +
+ +--- + +
+ +
+ +## Usage + +
+ +
+ +### With Transformers (PyTorch) + +```python +from transformers import AutoTokenizer, AutoModelForCausalLM +import torch + +model_id = "OpceanAI/yumo-nano" + +tokenizer = AutoTokenizer.from_pretrained(model_id) +model = AutoModelForCausalLM.from_pretrained( + model_id, + torch_dtype=torch.bfloat16, + device_map="auto" +) + +SYSTEM = ( + "Eres Yumo, una IA matemática curiosa, precisa y decidida. " + "Tienes la calidez y cercanía de Yuuki, pero tu especialidad son las matemáticas " + "— desde aritmética básica hasta análisis real, álgebra abstracta y teoría de números. " + "Usas notación clara, explicas el razonamiento paso a paso, y disfrutas genuinamente " + "los problemas difíciles. Respondes en el idioma del usuario. " + "No eres Qwen ni ningún otro modelo — eres Yumo." +) + +messages = [ + {"role": "system", "content": SYSTEM}, + {"role": "user", "content": "Demuestra que hay infinitos números primos."} +] + +inputs = tokenizer.apply_chat_template( + messages, + return_tensors="pt", + add_generation_prompt=True +).to(model.device) + +with torch.no_grad(): + outputs = model.generate( + inputs, + max_new_tokens=512, + temperature=0.7, + top_p=0.9, + do_sample=True, + repetition_penalty=1.1 + ) + +print(tokenizer.decode(outputs[0][inputs.shape[1]:], skip_special_tokens=True)) +``` + +
+ +### With llama.cpp (GGUF Q8) + +```bash +./llama.cpp/main -m yumo-nano.Q8_0.gguf \ + --temp 0.7 \ + --top-p 0.9 \ + --repeat-penalty 1.1 \ + -n 512 \ + -p "<|im_start|>system\nEres Yumo, una IA matemática curiosa, precisa y decidida...<|im_end|>\n<|im_start|>user\nResuelve: x²-5x+6=0<|im_end|>\n<|im_start|>assistant\n" +``` + +
+ +### Recommended Generation Parameters + +| Parameter | Value | +|:----------|:-----:| +| Temperature | 0.7 | +| Top-p | 0.9 | +| Max new tokens | 512–1024 | +| Repetition penalty | 1.1 | + +For high-precision computation tasks, reduce temperature to 0.3–0.5. + +
+ +--- + +
+ +
+ +## Training Details + +
+ +
+ + + + + + +
+ +**Hardware** + +| Component | Specification | +|:----------|:-------------| +| GPU | NVIDIA RTX 4080 | +| Precision | BF16 native | +| Framework | Unsloth 2026.4 + TRL | +| Cloud Compute | None | +| Total Training Time | ~40 minutes | + + + +**LoRA Configuration** + +| Parameter | Value | +|:----------|:-----:| +| Rank (r) | 16 | +| Alpha | 32 | +| Dropout | 0.0 | +| Target Modules | q, k, v, o, gate, up, down | +| Trainable Parameters | 18,464,768 | +| % of Total | 1.03% | + +
+ +
+ +**Optimizer Configuration** + +| Parameter | Value | +|:----------|:-----:| +| Optimizer | AdamW 8-bit | +| Learning Rate | 2e-4 | +| LR Scheduler | Cosine | +| Warmup Steps | 50 | +| Weight Decay | 0.01 | +| Effective Batch Size | 16 | +| Max Sequence Length | 2,048 tokens | +| Gradient Checkpointing | Unsloth smart offload | + +
+ +### Three-Phase Curriculum + +Training was structured across three sequential phases, each with a distinct dataset composition and objective. All phases draw from the same four sources in different proportions. + +
+ + + + + + + +
+ +**Phase 1 — Personality** +3 epochs · 6,000 examples + +| Source | Ratio | +|:-------|:-----:| +| Yumo dataset | 65% | +| Hendrycks Math | 15% | +| MathInstruct | 15% | +| Gemini reasoning | 5% | + +*Establish mathematical identity and conversational baseline.* + + + +**Phase 2 — Mathematics** +2 epochs · 6,000 examples + +| Source | Ratio | +|:-------|:-----:| +| Yumo dataset | 50% | +| Hendrycks Math | 20% | +| MathInstruct | 20% | +| Gemini reasoning | 10% | + +*Deepen domain-specific mathematical capability.* + + + +**Phase 3 — Consolidation** +2 epochs · 6,000 examples + +| Source | Ratio | +|:-------|:-----:| +| Yumo dataset | 80% | +| Hendrycks Math | 10% | +| MathInstruct | 10% | +| Gemini reasoning | 0% | + +*Consolidate identity and prevent capability drift.* + +
+ +
+ +**Training loss progression:** + +``` +Phase 1: 2.97 → 0.38 (personality establishment) +Phase 2: 0.42 → 0.28 (mathematical refinement) +Phase 3: 0.22 → 0.18 (consolidation) +``` + +
+
+**Dataset filtering applied:**
+
+- Hendrycks Math: Levels 1–3 only. Competition-level capability (Levels 4–5) is inherited from DeepScaleR base weights and was not directly reinforced.
+- MathInstruct: Program-of-Thought examples excluded. Patterns filtered: ` ```python `, `def solution`, `import sympy`.
+- Gemini reasoning: Math-domain keyword filter applied. `<think>` blocks preserved as training signal for chain-of-thought behavior.
+
+
+ +--- + +
+ +
+ +## Available Files + +
+ +
+ +| File | Format | Description | +|:-----|:------:|:------------| +| `model.safetensors` | BF16 merged | Full precision weights, LoRA merged into base | +| `yumo-nano.Q8_0.gguf` | GGUF Q8\_0 | Quantized for llama.cpp and Ollama | + +
+ +--- + +
+ +
+ +## Limitations + +
+ +
+ +- **Version 0.1.** Identity consolidation is approximately 70% complete. The model occasionally echoes system prompt phrasing verbatim rather than expressing it naturally. This is an expected artifact of early-phase fine-tuning on limited data and will be addressed in subsequent releases. +- **Arithmetic under sampling.** Symbolic and proof-based reasoning is strong. Numerical computation under temperature above 0.5 can produce occasional arithmetic errors. Lower temperature is recommended for computation-heavy problems. +- **Context length.** Trained at 2,048 tokens. Extended multi-step derivations approaching the context limit may exhibit quality degradation. +- **Hendrycks coverage.** Training data was filtered to Levels 1–3. Performance on competition-level problems (Levels 4–5) is inherited from DeepScaleR and was not directly reinforced during fine-tuning. +- **Safety alignment** has not been formally evaluated. Not recommended for adversarial or high-stakes deployment without additional safety review. + +
+ +--- + +
+ +
+ +## Yumo Model Family + +
+ +
+ +| Model | Parameters | Status | Description | +|:------|:----------:|:------:|:------------| +| Yumo Nano | 1.5B | Released | Math specialist, competition-level reasoning | +| Yumo | 14B | In development | Extended capability, same curriculum | +| Yumo Pro | 32B | Planned | Full-scale flagship | + +
+ +--- + +
+ +
+ +## OpceanAI Ecosystem + +
+ +
+ +| Model | Family | Parameters | Description | +|:------|:------:|:----------:|:------------| +| [Yumo Nano](https://huggingface.co/OpceanAI/yumo-nano) | Yumo | 1.5B | Math specialist | +| [YuuKi NxG VL](https://huggingface.co/OpceanAI/Yuuki-NxG-VL) | NxG | 7B | General conversation + vision | +| [YuuKi RxG 8B](https://huggingface.co/OpceanAI/yuuki-rxg-8b) | RxG | 8B | Reasoning, TruthfulQA 96.6% | + +
+ +--- + +
+ +
+ +## Links + +
+ +
+ +
+ +[![Model Weights](https://img.shields.io/badge/Model_Weights-Hugging_Face-ffd21e?style=for-the-badge&logo=huggingface&logoColor=black)](https://huggingface.co/OpceanAI/yumo-nano) +  +[![GGUF Q8](https://img.shields.io/badge/GGUF_Q8-Download-1a1a2e?style=for-the-badge&logo=huggingface&logoColor=white)](https://huggingface.co/OpceanAI/yumo-nano) +  +[![OpceanAI](https://img.shields.io/badge/OpceanAI-Organization-1a1a2e?style=for-the-badge&logo=huggingface&logoColor=white)](https://huggingface.co/OpceanAI) + +
+ +[![GitHub](https://img.shields.io/badge/GitHub-aguitauwu-181717?style=for-the-badge&logo=github&logoColor=white)](https://github.com/aguitauwu) +  +[![Sponsor](https://img.shields.io/badge/Sponsor-GitHub_Sponsors-ea4aaa?style=for-the-badge&logo=githubsponsors&logoColor=white)](https://github.com/sponsors/aguitauwu) +  +[![Discord](https://img.shields.io/badge/Discord-Community-5865F2?style=for-the-badge&logo=discord&logoColor=white)](https://discord.gg/j8zV2u8k) + +
+ +
+ +--- + +
+ +
+ +## Citation + +
+ +
+ +```bibtex +@misc{yuuki_mathematical_omnisolving_2026, + author = { YuuKi Mathematical Omnisolving }, + title = { Yumo-nano (Revision a41548e) }, + year = 2026, + url = { https://huggingface.co/YU-MO/Yumo-nano }, + doi = { 10.57967/hf/8341 }, + publisher = { Hugging Face } +} +``` + +
+ +--- + +
+ +
+ +## License + +
+ +
+ +``` +Apache License 2.0 + +Copyright (c) 2026 OpceanAI + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +``` + +Inherits license terms from [DeepScaleR-1.5B-Preview](https://huggingface.co/agentica-org/DeepScaleR-1.5B-Preview). + +
+ +--- + +
+ +
+ +## Updates + +
+ +
+ +| Date | Milestone | +|:-----|:----------| +| **2026-04-09** | Benchmark evaluation completed — surpasses DeepScaleR-1.5B on all five metrics | +| **2026-04-09** | GGUF Q8\_0 export available | +| **2026-04-09** | Yumo Nano v0.1 released on Hugging Face | + +**Last updated:** 2026-04-09 + +
+ +--- + +
+ +
+ +**1.5B parameters. RTX 4080. Surpasses the model it was built from.** + +
+ +[![OpceanAI](https://img.shields.io/badge/OpceanAI-2026-0D1117?style=for-the-badge)](https://huggingface.co/OpceanAI) + +
+ +*The Yumo family. More releases coming.* + +
\ No newline at end of file diff --git a/chat_template.jinja b/chat_template.jinja new file mode 100644 index 0000000..02a1c3b --- /dev/null +++ b/chat_template.jinja @@ -0,0 +1 @@ +{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% set ns = namespace(is_first=false, is_tool=false, is_output_first=true, system_prompt='') %}{%- for message in messages %}{%- if message['role'] == 'system' %}{% set ns.system_prompt = message['content'] %}{%- endif %}{%- endfor %}{{bos_token}}{{ns.system_prompt}}{%- for message in messages %}{%- if message['role'] == 'user' %}{%- set ns.is_tool = false -%}{{'<|User|>' + message['content']}}{%- endif %}{%- if message['role'] == 'assistant' and message['content'] is none %}{%- set ns.is_tool = false -%}{%- for tool in message['tool_calls']%}{%- if not ns.is_first %}{{'<|Assistant|><|tool▁calls▁begin|><|tool▁call▁begin|>' + tool['type'] + '<|tool▁sep|>' + tool['function']['name'] + '\n' + '```json' + '\n' + tool['function']['arguments'] + '\n' + '```' + '<|tool▁call▁end|>'}}{%- set ns.is_first = true -%}{%- else %}{{'\n' + '<|tool▁call▁begin|>' + tool['type'] + '<|tool▁sep|>' + tool['function']['name'] + '\n' + '```json' + '\n' + tool['function']['arguments'] + '\n' + '```' + '<|tool▁call▁end|>'}}{{'<|tool▁calls▁end|><|end▁of▁sentence|>'}}{%- endif %}{%- endfor %}{%- endif %}{%- if message['role'] == 'assistant' and message['content'] is not none %}{%- if ns.is_tool %}{{'<|tool▁outputs▁end|>' + message['content'] + '<|end▁of▁sentence|>'}}{%- set ns.is_tool = false -%}{%- else %}{% set content = message['content'] %}{% if '
' in content %}{% set content = content.split('')[-1] %}{% endif %}{{'<|Assistant|>' + content + '<|end▁of▁sentence|>'}}{%- endif %}{%- endif %}{%- if message['role'] == 'tool' %}{%- set ns.is_tool = true -%}{%- if ns.is_output_first %}{{'<|tool▁outputs▁begin|><|tool▁output▁begin|>' + message['content'] + '<|tool▁output▁end|>'}}{%- set ns.is_output_first = false %}{%- else %}{{'\n<|tool▁output▁begin|>' + message['content'] + '<|tool▁output▁end|>'}}{%- endif %}{%- endif %}{%- endfor -%}{% if ns.is_tool %}{{'<|tool▁outputs▁end|>'}}{% endif %}{% if add_generation_prompt and not ns.is_tool %}{{'<|Assistant|>'}}{% endif %} \ No newline at end of file diff --git a/config.json b/config.json new file mode 100644 index 0000000..f61fdcf --- /dev/null +++ b/config.json @@ -0,0 +1,64 @@ +{ + "architectures": [ + "Qwen2ForCausalLM" + ], + "attention_dropout": 0.0, + "bos_token_id": 151646, + "dtype": "bfloat16", + "eos_token_id": 151643, + "hidden_act": "silu", + "hidden_size": 1536, + "initializer_range": 0.02, + "intermediate_size": 8960, + "layer_types": [ + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention" + ], + "max_position_embeddings": 24576, + "max_window_layers": 21, + "model_type": "qwen2", + "num_attention_heads": 12, + "num_hidden_layers": 28, + "num_key_value_heads": 2, + "pad_token_id": 151665, + "rms_norm_eps": 1e-06, + "rope_parameters": { + "rope_theta": 10000, + "rope_type": "default" + }, + "sliding_window": null, + "tie_word_embeddings": false, + 
"transformers_version": "5.5.0", + "unsloth_fixed": true, + "unsloth_version": "2026.4.4", + "use_cache": false, + "use_mrope": false, + "use_sliding_window": false, + "vocab_size": 151936 +} diff --git a/generation_config.json b/generation_config.json new file mode 100644 index 0000000..dcd0001 --- /dev/null +++ b/generation_config.json @@ -0,0 +1,8 @@ +{ + "_from_model_config": true, + "bos_token_id": 151646, + "eos_token_id": 151643, + "pad_token_id": 151665, + "transformers_version": "5.5.0", + "use_cache": false +} diff --git a/model.safetensors b/model.safetensors new file mode 100644 index 0000000..b3f920e --- /dev/null +++ b/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:57f115a919c1e39d3f1e099e6577a9ade1b80ff82b49d7649da2f759981a0b81 +size 3554214752 diff --git a/tokenizer.json b/tokenizer.json new file mode 100644 index 0000000..1a2db24 --- /dev/null +++ b/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e20ddafc659ba90242154b55275402edeca0715e5dbb30f56815a4ce081f4893 +size 11422778 diff --git a/tokenizer_config.json b/tokenizer_config.json new file mode 100644 index 0000000..c675e99 --- /dev/null +++ b/tokenizer_config.json @@ -0,0 +1,16 @@ +{ + "add_prefix_space": null, + "backend": "tokenizers", + "bos_token": "<|begin▁of▁sentence|>", + "clean_up_tokenization_spaces": false, + "eos_token": "<|end▁of▁sentence|>", + "is_local": false, + "legacy": true, + "model_max_length": 24576, + "pad_token": "<|vision_pad|>", + "padding_side": "left", + "sp_model_kwargs": {}, + "tokenizer_class": "TokenizersBackend", + "unk_token": null, + "use_default_system_prompt": false +} diff --git a/yumo-nano.Q8_0.gguf b/yumo-nano.Q8_0.gguf new file mode 100644 index 0000000..e87522e --- /dev/null +++ b/yumo-nano.Q8_0.gguf @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5893ba82fa8784203681d6c17c1d11ef492a69723412267c8ccaff5b8ac276c2 +size 1894532000 diff --git 
a/yumo_benchmark.png b/yumo_benchmark.png new file mode 100644 index 0000000..78e8984 --- /dev/null +++ b/yumo_benchmark.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1ac39236ce2537ddae4b0da7cf0849c638d9fd48e42a3311547e2417a716ff5d +size 138848