初始化项目，由ModelHub XC社区提供模型

Model: hypaai/Hypa-Llama3.1-8b-SFT Source: Original Platform
2026-05-23 02:25:16 +08:00
commit 26cb8eea5d
13 changed files with 2877 additions and 0 deletions
--- a/.gitattributes
+++ b/.gitattributes
@@ -0,0 +1,36 @@
+*.7z filter=lfs diff=lfs merge=lfs -text
+*.arrow filter=lfs diff=lfs merge=lfs -text
+*.bin filter=lfs diff=lfs merge=lfs -text
+*.bz2 filter=lfs diff=lfs merge=lfs -text
+*.ckpt filter=lfs diff=lfs merge=lfs -text
+*.ftz filter=lfs diff=lfs merge=lfs -text
+*.gz filter=lfs diff=lfs merge=lfs -text
+*.h5 filter=lfs diff=lfs merge=lfs -text
+*.joblib filter=lfs diff=lfs merge=lfs -text
+*.lfs.* filter=lfs diff=lfs merge=lfs -text
+*.mlmodel filter=lfs diff=lfs merge=lfs -text
+*.model filter=lfs diff=lfs merge=lfs -text
+*.msgpack filter=lfs diff=lfs merge=lfs -text
+*.npy filter=lfs diff=lfs merge=lfs -text
+*.npz filter=lfs diff=lfs merge=lfs -text
+*.onnx filter=lfs diff=lfs merge=lfs -text
+*.ot filter=lfs diff=lfs merge=lfs -text
+*.parquet filter=lfs diff=lfs merge=lfs -text
+*.pb filter=lfs diff=lfs merge=lfs -text
+*.pickle filter=lfs diff=lfs merge=lfs -text
+*.pkl filter=lfs diff=lfs merge=lfs -text
+*.pt filter=lfs diff=lfs merge=lfs -text
+*.pth filter=lfs diff=lfs merge=lfs -text
+*.rar filter=lfs diff=lfs merge=lfs -text
+*.safetensors filter=lfs diff=lfs merge=lfs -text
+saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+*.tar.* filter=lfs diff=lfs merge=lfs -text
+*.tar filter=lfs diff=lfs merge=lfs -text
+*.tflite filter=lfs diff=lfs merge=lfs -text
+*.tgz filter=lfs diff=lfs merge=lfs -text
+*.wasm filter=lfs diff=lfs merge=lfs -text
+*.xz filter=lfs diff=lfs merge=lfs -text
+*.zip filter=lfs diff=lfs merge=lfs -text
+*.zst filter=lfs diff=lfs merge=lfs -text
+*tfevents* filter=lfs diff=lfs merge=lfs -text
+tokenizer.json filter=lfs diff=lfs merge=lfs -text
--- a/Hypa_Llama3.jpg
+++ b/Hypa_Llama3.jpg
--- a/README.md
+++ b/README.md
@@ -0,0 +1,269 @@
+---
+library_name: transformers
+pipeline_tag: text-generation
+license: apache-2.0
+base_model:
+  - meta-llama/Llama-3.1-8B-Instruct
+datasets:
+  - hypaai/Hypa-Text-10k
+language:
+  - en
+  - ann
+  - efi
+  - ebi
+  - ego
+  - es
+  - fr
+  - ha
+  - ibb
+  - idm
+  - igl
+  - ig
+  - nup
+  - pcm
+  - tiv
+  - urh
+  - yo
+tags:
+  - llama
+  - llama-3.1
+  - multilingual
+  - low-resource
+  - underrepresented-languages
+  - translation
+  - language-detection
+  - dictionary
+  - tool-use
+  - function-calling
+  - unsloth
+  - lora
+  - qlora
+  - conversational
+  - hypa-ai
+---
+
+<div align="center">
+  
+![Hypa-Llama3.1 8B](https://huggingface.co/hypaai/Hypa-Llama3.1-8b-SFT/resolve/main/Hypa_Llama3.jpg)
+
+**A multilingual, tool-aware fine-tune of Meta's Llama 3.1 8B for low-resource and underrepresented languages.**
+
+[![License: Apache 2.0](https://img.shields.io/badge/License-Apache%202.0-blue.svg)](https://www.apache.org/licenses/LICENSE-2.0) [![Base: Llama 3.1 8B](https://img.shields.io/badge/Base-Llama%203.1%208B-blue)](https://huggingface.co/meta-llama/Llama-3.1-8B-Instruct) [![GitHub: Hypa-Llama](https://img.shields.io/badge/GitHub-Hypa--Llama-black?logo=github)](https://github.com/hypaai/Hypa-Llama) [![Blog Post](https://img.shields.io/badge/Read-Blog%20Post-purple)](https://hypa-intelligence.hashnode.dev/tuning-llama-3-1-for-multilingual-dictionary-translation-and-tool-aware-language-understanding) [![Trained with Unsloth](https://img.shields.io/badge/Trained%20with-Unsloth-orange)](https://github.com/unslothai/unsloth)
+
+</div>
+
+## Model Description
+
+Hypa-Llama3.1 8B (`hypaai/Hypa-Llama3.1-8b-SFT`) is a LoRA-merged supervised fine-tune from the Llama 3.1 8B family, produced by [Hypa Intelligence](https://hypaintelligence.com). It is the Llama-flavored sibling of our [Hypa-Gemma 4 family](https://huggingface.co/collections/hypaai/hypa-gemma4), trained on the same multilingual instruction corpus and shaped around the same product surface, so customers and the open-source community can pick the runtime that best fits their deployment without changing the underlying capability surface.
+
+This release covers **seventeen languages**: English, French, Spanish, and fourteen languages of Nigeria. Several of the smaller languages in this set (including Annang, Ebira, Eggon, Idoma, Igala, Nupe, and Urhobo) have not been formally represented in large-scale fine-tuning corpora before, or had no settled ISO-style language tag at the time we needed one.
+
+The model is intended for translation, language detection, dictionary-style explanation (Markdown and JSON output modes), multilingual instruction-following, and translation correction / breakdown via an explicit reasoning channel. Unlike many fine-tunes, this is an **iterative SFT continuation** from one of our prior Hypa-Llama checkpoints rather than a from-scratch run on Meta's base model — each successive Hypa-Llama release inherits the capabilities of its predecessor and layers new prompt families on top.
+
+| Property                 | Value                                                            |
+| ------------------------ | ---------------------------------------------------------------- |
+| **Base model**           | `meta-llama/Llama-3.1-8B-Instruct` (continued from prior Hypa-Llama checkpoint) |
+| **Method**               | LoRA (r=256, α=256) via Unsloth + QLoRA, then merged to 16-bit   |
+| **Trainable parameters** | 671M / 8.7B (7.71%)                                              |
+| **Training data**        | 17.0M examples across multilingual instruction sub-datasets      |
+| **Compute**              | 1× NVIDIA GPU (Runpod), 10.9 days                                |
+| **Languages**            | 17                                                               |
+| **Context window**       | 128K (config); 2,048 tokens during training                      |
+| **License**              | Apache 2.0 + Llama 3.1 Community License                         |
+
+## Quick Start
+
+```python
+from transformers import AutoTokenizer, AutoModelForCausalLM
+import torch
+
+model_id = "hypaai/Hypa-Llama3.1-8b-SFT"
+
+tokenizer = AutoTokenizer.from_pretrained(model_id)
+model = AutoModelForCausalLM.from_pretrained(
+    model_id,
+    torch_dtype=torch.bfloat16,
+    device_map="auto",
+)
+
+messages = [
+    {"role": "system", "content": "You are Hypa Translate. Translate from English to Igbo. Return only the exact translation."},
+    {"role": "user", "content": "Good morning, how are you today?"},
+]
+
+inputs = tokenizer.apply_chat_template(
+    messages,
+    tokenize=True,
+    return_tensors="pt",
+    add_generation_prompt=True,
+).to(model.device)
+
+outputs = model.generate(
+    inputs,
+    max_new_tokens=256,
+    temperature=1.0,
+    top_p=0.95,
+    top_k=30,
+    min_p=0.1,
+    do_sample=True,
+)
+
+print(tokenizer.decode(outputs[0][inputs.shape[-1]:], skip_special_tokens=True))
+```
+
+For **thinking mode**, prepend the literal marker `<|think>` to your system prompt content (e.g. `"<|think>\nYou are Hypa Translate. Correct the below translation to Igbo."`). The model will emit a `<think>...</think>` reasoning block before its visible answer.
+
+For **JSON dictionary mode**, use the JSON-schema system prompts documented in the [blog post](https://hypa-intelligence.hashnode.dev/tuning-llama-3-1-for-multilingual-dictionary-translation-and-tool-aware-language-understanding) and parse the assistant response directly.
+
+For **vLLM serving**, the standard `vllm serve hypaai/Hypa-Llama3.1-8b-SFT` command works out of the box. See the [blog post](https://hypa-intelligence.hashnode.dev/tuning-llama-3-1-for-multilingual-dictionary-translation-and-tool-aware-language-understanding) for the tokenizer-config compatibility steps if you hit deployment errors.
+
+## Languages Covered
+
+| Code  | Language | Code  | Language |
+| ----- | -------- | ----- | -------- |
+| `en`  | English  | `ibb` | Ibibio   |
+| `ann` | Annang   | `idm` | Idoma    |
+| `efi` | Efik     | `igl` | Igala    |
+| `ebi` | Ebira    | `ig`  | Igbo     |
+| `ego` | Eggon    | `nup` | Nupe     |
+| `es`  | Spanish  | `pcm` | Pidgin   |
+| `fr`  | French   | `tiv` | Tiv      |
+| `ha`  | Hausa    | `urh` | Urhobo   |
+| `yo`  | Yoruba   |       |          |
+
+Some of the smaller languages in this set required custom or non-standard tags because no widely-adopted machine-readable code existed at the time of training. Where ISO 639-3 codes were available, we used them; where they were not, we documented our internal codes in the data release so downstream users can reproduce splits.
+
+## Training Data
+
+Training data comprises **17.0 million examples** assembled from a large multilingual text mixture combining internal Hypa datasets and public instruction-style corpora. The mixture is identical to the one used for our [Hypa-Gemma 4 family](https://huggingface.co/collections/hypaai/hypa-gemma4), enabling clean capability parity across model families. The overall training mixture included dictionary-style data, translation data, language detection data, synthetic instruction data, structured-JSON output data, and chain-of-thought translation breakdown / correction data — each contributing a different signal.
+
+A **public 10k subset** of the training data is released as [`hypaai/Hypa-Text-10k`](https://huggingface.co/datasets/hypaai/Hypa-Text-10k). Additional sub-datasets are progressively being released under the [`hypaai`](https://huggingface.co/hypaai) organization.
+
+### Prompt Formatting
+
+Every example was formatted using Llama 3.1's native chat template, with explicit `system`, `user`, and `assistant` roles and the canonical Llama 3 control tokens (`<|begin_of_text|>`, `<|start_header_id|>`, `<|end_header_id|>`, `<|eot_id|>`, `<|end_of_text|>`). The reasoning channel was implemented via the literal markers `<|think>` (in the system prompt) and `<think>...</think>` (wrapping assistant reasoning) — these are byte-pair-tokenized regular strings rather than added special tokens, which keeps the tokenizer canonical and avoids vocabulary surgery during serving.
+
+Loss was computed only on assistant turns via `train_on_responses_only` with `instruction_part="<|start_header_id|>user<|end_header_id|>\n\n"` and `response_part="<|start_header_id|>assistant<|end_header_id|>\n\n"`.
+
+## Training Procedure
+
+| Hyperparameter           | Value                                                                |
+| ------------------------ | -------------------------------------------------------------------- |
+| LoRA rank (r)            | 256                                                                  |
+| LoRA alpha (α)           | 256                                                                  |
+| LoRA dropout             | 0                                                                    |
+| Target modules           | q_proj, k_proj, v_proj, o_proj, gate_proj, up_proj, down_proj        |
+| Quantization             | 4-bit base (NF4), bf16 compute                                       |
+| Optimizer                | AdamW 8-bit                                                          |
+| Learning rate            | 1e-4                                                                 |
+| LR schedule              | cosine, 500 warmup steps                                             |
+| Weight decay             | 0.01                                                                 |
+| Max grad norm            | 1.0                                                                  |
+| Per-device batch size    | 16                                                                   |
+| Gradient accumulation    | 2                                                                    |
+| **Effective batch size** | **32**                                                               |
+| Sequence length          | 2048                                                                 |
+| Packing                  | enabled                                                              |
+| Epochs                   | 1                                                                    |
+| **Total steps**          | **532,418**                                                          |
+| Precision                | bfloat16                                                             |
+| Gradient checkpointing   | enabled (Unsloth)                                                    |
+| Hardware                 | 1× NVIDIA GPU (Runpod)                                               |
+| Runtime                  | 10.9 days (261h 50m)                                                 |
+| Random seed              | 3407                                                                 |
+
+Training was performed using [Unsloth](https://github.com/unslothai/unsloth), which provides hand-tuned Triton kernels for the attention and MLP forward/backward passes and an "unsloth" gradient checkpointing variant that uses ~30% less VRAM than vanilla checkpointing.
+
+## Evaluation and Recommendations
+
+### Training metrics
+
+- **Final training loss**: 0.213 (smooth monotonic decay from 0.971)
+- **Best evaluation loss**: 0.330 (at end of training)
+- **Final evaluation loss**: 0.330
+
+### Honest note on training dynamics
+
+Unlike our [Hypa-Gemma 4 E2B run](https://huggingface.co/hypaai/Hypa-Gemma4-E2B-v1), this Llama 3.1 run showed **clean, well-behaved training dynamics**. Both training and validation loss decreased monotonically across the entire 532,418-step run. The train-eval gap widened mildly through step 240k (peaking at 0.152) and then *narrowed* back to 0.117 by end of training — the signature of a model still fitting the data distribution rather than memorizing it. A final eval:train loss ratio of 0.330:0.213 ≈ 1.55× is on the healthy side for instruction tuning at this scale.
+
+**For downstream use, we recommend the merged 16-bit weights in this repository.** The final checkpoint is the best checkpoint by evaluation loss; there is no separate "best" intermediate to recover.
+
+That said, the final ~50,000 steps of training (roughly the last ~10% of the run) produced only ~0.6% of the total eval-loss improvement. With `EarlyStoppingCallback(early_stopping_patience=2)` and a small `early_stopping_threshold` configured against eval loss (the threshold is needed because eval loss never actually stopped decreasing), training would have halted near step 480k–490k and saved approximately 25 hours of compute with negligible quality cost. We've queued this for the next run.
+
+### Qualitative observations
+
+Internal qualitative review on translation and dictionary tasks shows meaningful improvements over the base Llama 3.1 8B-Instruct for every language in the set, with the largest deltas on the smallest languages (Annang, Efik, Ibibio, Eggon, Idoma, Igala, Nupe, Urhobo), where the base model was effectively unusable. Quantitative chrF++, BLEU, and BLEURT results across language pairs will follow in a separate evaluation post.
+
+## Intended Use
+
+**Direct use cases:**
+
+- Translation between English / French / Spanish and the fourteen covered low-resource languages
+- Language detection across all seventeen languages
+- Dictionary-style lexical lookup and explanation (Markdown output)
+- Dictionary-style lexical lookup with strict JSON schema (programmatic use)
+- Translation correction and chain-of-thought translation breakdown (via the `<|think>` reasoning channel)
+- Multilingual instruction-following on dialogue tasks
+- Tool-aware / function-calling-style prompting (inheriting Llama 3.1's native tool-call structure)
+
+**Downstream use:**
+
+- Suitable as a starting point for further fine-tuning on more specialized tasks within the supported languages
+- Suitable for adapter stacking (e.g., domain-specific LoRA on top)
+- Drop-in replacement for `meta-llama/Llama-3.1-8B-Instruct` in any text-generation pipeline that needs improved low-resource language quality
+
+## Out-of-Scope and Limitations
+
+- **Not safety-tuned for sensitive domains.** This model has not undergone RLHF or DPO post-training beyond the SFT in this run. It should not be used unsupervised for medical, legal, financial, or psychological-counseling applications.
+- **Quality varies by language.** The smallest languages in the set are underrepresented even within our training mix and the resulting model output should be reviewed by native speakers before being used in production.
+- **Training context is 2,048 tokens.** The model's config advertises a 131,072-token context window (inherited from Llama 3.1), but quality past 2,048 tokens is bounded by the training distribution and has not been validated for the target languages.
+- **Tokenization quality.** Llama 3's 128k-vocabulary BPE tokenizer is broader than smaller-vocabulary tokenizers but the smallest languages in this release still tokenize at higher cost per character than English. This is a gap we expect future iterations to close, including potential vocabulary extension.
+- **JSON output reliability.** Although we trained extensively on the JSON output schema, rare prompts occasionally produce minor schema deviations (extra whitespace, optional-field ordering). Production use of JSON mode should wrap responses in a permissive parser with single-attempt repair.
+- **Coverage is finite.** The seventeen languages in this release are the start, not the end. Many other underrepresented languages are not yet supported and may produce unreliable output.
+
+## Bias, Risks, and Limitations
+
+This model inherits the biases and limitations of its base model ([Meta Llama 3.1 8B](https://huggingface.co/meta-llama/Llama-3.1-8B-Instruct)) and adds the biases of its fine-tuning corpus, which is weighted toward dictionary, religious-parallel, and CommonVoice text. Religious-parallel text in particular is a known cause of register and content bias in low-resource translation models. Users deploying this model in customer-facing applications should evaluate output for cultural appropriateness in their specific use case and language.
+
+The model is not intended to make decisions affecting people's rights, health, finances, or wellbeing. Like all language models, it can produce confident-sounding output that is incorrect, particularly on the smallest languages where training data was thinnest.
+
+## Released Artifacts
+
+- 🤗 **Merged 16-bit model (this repo)**: [`hypaai/Hypa-Llama3.1-8b-SFT`](https://huggingface.co/hypaai/Hypa-Llama3.1-8b-SFT)
+- 🤗 **LoRA adapter checkpoints**: [`hypaai/Hypa-Llama3.1-8b-SFT-LoRAs`](https://huggingface.co/hypaai/Hypa-Llama3.1-8b-SFT-LoRAs)
+- 📊 **TensorBoard metrics**: [view on HF](https://huggingface.co/hypaai/Hypa-Llama3.1-8b-SFT-LoRAs/tensorboard)
+- 📦 **Public training data subset**: [`hypaai/Hypa-Text-10k`](https://huggingface.co/datasets/hypaai/Hypa-Text-10k)
+- 💻 **GitHub repository**: [`hypaai/Hypa-Llama`](https://github.com/hypaai/Hypa-Llama)
+- 📝 **Blog post**: [Tuning Llama 3.1 for multilingual dictionary, translation, and tool-aware language understanding](https://hypa-intelligence.hashnode.dev/tuning-llama-3-1-for-multilingual-dictionary-translation-and-tool-aware-language-understanding)
+
+## Citation
+
+If you use Hypa-Llama3.1 8B or any of the related work, please cite:
+
+```bibtex
+@misc{hypaai2026hypallama318b,
+  title        = {Hypa-Llama3.1 8B: A Multilingual Fine-Tune of Llama 3.1 for Underrepresented Languages},
+  author       = {{Hypa Intelligence}},
+  year         = {2026},
+  publisher    = {Hugging Face},
+  howpublished = {\url{https://huggingface.co/hypaai/Hypa-Llama3.1-8b-SFT}},
+  note         = {Apache 2.0 + Llama 3.1 Community License. Blog: \url{https://hypa-intelligence.hashnode.dev/tuning-llama-3-1-for-multilingual-dictionary-translation-and-tool-aware-language-understanding}}
+}
+```
+
+## License
+
+Released under the **Apache License 2.0**. As a derivative of Meta's Llama 3.1, this model is additionally subject to the **Llama 3.1 Community License**. Free to use, modify, and redistribute for both research and commercial purposes under the combined terms of both licenses.
+
+## Acknowledgments
+
+- **Meta AI** for releasing Llama 3.1 openly and enabling this line of research.
+- **Unsloth** for the hand-tuned training kernels that made an 11-day, 17M-example single-GPU run practical.
+- **Runpod** for reliable GPU infrastructure.
+- The **language communities, speakers, and reviewers** whose texts, voices, and feedback grounded this work and keep it honest.
+
+---
+
+**Hypa Intelligence** • [Website](https://hypaintelligence.com) • [Hugging Face](https://huggingface.co/hypaai) • [GitHub](https://github.com/hypaai) • [Blog](https://hypaintelligence.com/updates)
+
+*Multilingualism is not a feature. It is a prerequisite for AI that represents all of us.*
--- a/chat_template.jinja
+++ b/chat_template.jinja
@@ -0,0 +1,139 @@
+{{- bos_token }}{#- Llama 3.1 chat template: renders system, user, assistant and tool turns with header / end-of-turn tokens. #}
+{%- if custom_tools is defined %}{#- custom_tools, when supplied, replaces tools. #}
+    {%- set tools = custom_tools %}
+{%- endif %}
+{%- if not tools_in_user_message is defined %}{#- Default: tool definitions are placed in the first user turn. #}
+    {%- set tools_in_user_message = true %}
+{%- endif %}
+{%- if not date_string is defined %}{#- Fallback date used when the caller does not pass date_string. #}
+    {%- set date_string = "26 July 2024" %}
+{%- endif %}
+{%- if not tools is defined %}{#- Normalize a missing tools variable to none. #}
+    {%- set tools = none %}
+{%- endif %}
+
+{#- This block extracts the system message, so we can slot it into the right place. #}
+{%- if messages[0]['role'] == 'system' %}
+    {%- set system_message = messages[0]['content'] %}
+    {%- set messages = messages[1:] %}
+{%- else %}
+    {%- set system_message = "" %}
+{%- endif %}
+
+{#- System message + builtin tools #}
+{{- "<|start_header_id|>system<|end_header_id|>
+
+" }}
+{%- if builtin_tools is defined or tools is not none %}{#- Emit the ipython environment line whenever any tools are in play. #}
+    {{- "Environment: ipython
+" }}
+{%- endif %}
+{%- if builtin_tools is defined %}{#- List builtin tools by name, leaving out code_interpreter. #}
+    {{- "Tools: " + builtin_tools | reject('equalto', 'code_interpreter') | join(", ") + "
+
+"}}
+{%- endif %}
+{{- "Cutting Knowledge Date: December 2023
+" }}
+{{- "Today Date: " + date_string + "
+
+" }}
+{%- if tools is not none and not tools_in_user_message %}{#- Tool schemas go into the system turn only when tools_in_user_message is false. #}
+    {{- "You have access to the following functions. To call a function, please respond with JSON for a function call." }}
+    {{- 'Respond in the format {"name": function name, "parameters": dictionary of argument name and its value}.' }}
+    {{- "Do not use variables.
+
+" }}
+    {%- for t in tools %}
+        {{- t | tojson(indent=4) }}
+        {{- "
+
+" }}
+    {%- endfor %}
+{%- endif %}
+{{- system_message }}
+{{- "<|eot_id|>" }}
+
+{#- Custom tools are passed in a user message with some extra guidance #}
+{%- if tools_in_user_message and not tools is none %}
+    {#- Extract the first user message so we can plug it in here #}
+    {%- if messages | length != 0 %}
+        {%- set first_user_message = messages[0]['content'] %}
+        {%- set messages = messages[1:] %}
+    {%- else %}
+        {{- raise_exception("Cannot put tools in the first user message when there's no first user message!") }}
+{%- endif %}{#- Closes the messages-length check above, despite the outdented position. #}
+    {{- '<|start_header_id|>user<|end_header_id|>
+
+' -}}
+    {{- "Given the following functions, please respond with a JSON for a function call " }}
+    {{- "with its proper arguments that best answers the given prompt.
+
+" }}
+    {{- 'Respond in the format {"name": function name, "parameters": dictionary of argument name and its value}.' }}
+    {{- "Do not use variables.
+
+" }}
+    {%- for t in tools %}
+        {{- t | tojson(indent=4) }}
+        {{- "
+
+" }}
+    {%- endfor %}
+    {{- first_user_message + "<|eot_id|>"}}
+{%- endif %}
+
+{%- for message in messages %}{#- Remaining turns: plain messages, assistant tool calls, and tool results. #}
+    {%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %}
+        {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>
+
+'+ message['content'] + '<|eot_id|>' }}
+    {%- elif 'tool_calls' in message %}
+        {%- if not message.tool_calls|length == 1 %}{#- Exactly one tool call per assistant turn is accepted. #}
+            {{- raise_exception("This model only supports single tool-calls at once!") }}
+        {%- endif %}
+        {%- set tool_call = message.tool_calls[0].function %}
+        {%- if builtin_tools is defined and tool_call.name in builtin_tools %}{#- Builtin tools are rendered as a python_tag call: name.call(arg="val", ...). #}
+            {{- '<|start_header_id|>assistant<|end_header_id|>
+
+' -}}
+            {{- "<|python_tag|>" + tool_call.name + ".call(" }}
+            {%- for arg_name, arg_val in tool_call.arguments | items %}
+                {{- arg_name + '="' + arg_val + '"' }}{#- NOTE(review): the concatenation presumably requires arg_val to be a string; confirm. #}
+                {%- if not loop.last %}
+                    {{- ", " }}
+                {%- endif %}
+                {%- endfor %}
+            {{- ")" }}
+        {%- else  %}{#- All other tools are rendered as a JSON object with name and parameters. #}
+            {{- '<|start_header_id|>assistant<|end_header_id|>
+
+' -}}
+            {{- '{"name": "' + tool_call.name + '", ' }}
+            {{- '"parameters": ' }}
+            {{- tool_call.arguments | tojson }}
+            {{- "}" }}
+        {%- endif %}
+        {%- if builtin_tools is defined %}
+            {#- This means we're in ipython mode #}
+            {{- "<|eom_id|>" }}
+        {%- else %}
+            {{- "<|eot_id|>" }}
+        {%- endif %}
+    {%- elif message.role == "tool" or message.role == "ipython" %}{#- Tool results are always emitted under the ipython header. #}
+        {{- "<|start_header_id|>ipython<|end_header_id|>
+
+" }}
+        {%- if message.content is mapping or message.content is iterable %}{#- NOTE(review): strings also pass the iterable test in Jinja, so string content likely takes the tojson branch as well; confirm intended. #}
+            {{- message.content | tojson }}
+        {%- else %}
+            {{- message.content }}
+        {%- endif %}
+        {{- "<|eot_id|>" }}
+    {%- endif %}
+{%- endfor %}
+{%- if add_generation_prompt %}{#- Open an assistant header so generation continues as the assistant reply. #}
+    {{- '<|start_header_id|>assistant<|end_header_id|>
+
+' }}
+{%- endif %}
--- a/config.json
+++ b/config.json
@@ -0,0 +1,37 @@
+{
+    "architectures": [
+        "LlamaForCausalLM"
+    ],
+    "attention_bias": false,
+    "attention_dropout": 0.0,
+    "bos_token_id": 128000,
+    "torch_dtype": "bfloat16",
+    "eos_token_id": 128001,
+    "head_dim": 128,
+    "hidden_act": "silu",
+    "hidden_size": 4096,
+    "initializer_range": 0.02,
+    "intermediate_size": 14336,
+    "max_position_embeddings": 131072,
+    "mlp_bias": false,
+    "model_type": "llama",
+    "num_attention_heads": 32,
+    "num_hidden_layers": 32,
+    "num_key_value_heads": 8,
+    "pad_token_id": 128001,
+    "pretraining_tp": 1,
+    "rms_norm_eps": 1e-05,
+    "rope_parameters": {
+        "factor": 8.0,
+        "high_freq_factor": 4.0,
+        "low_freq_factor": 1.0,
+        "original_max_position_embeddings": 8192,
+        "rope_theta": 500000.0,
+        "rope_type": "llama3"
+    },
+    "tie_word_embeddings": false,
+    "unsloth_fixed": true,
+    "unsloth_version": "2026.4.8",
+    "use_cache": false,
+    "vocab_size": 128256
+}
--- a/model-00001-of-00004.safetensors
+++ b/model-00001-of-00004.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3131c81f793d2f828f8802e60964ca66770e4f97fb4ac6bc43c3b615d144343f
+size 4976698672
--- a/model-00002-of-00004.safetensors
+++ b/model-00002-of-00004.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3987d6cee3239deb450dd7d801d9fe6178b3afe6d0c4c51429bd3a5b3bb50e95
+size 4999802720
--- a/model-00003-of-00004.safetensors
+++ b/model-00003-of-00004.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d34e6e759417644a9720455ec22612cc8eb2dc48c7fb18ddfeb30c8d77acc0cb
+size 4915916176
--- a/model-00004-of-00004.safetensors
+++ b/model-00004-of-00004.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:46c9bf7d32eeabbceaa9fe89d9030ab81a8c70f343d2992a7a952d42cfcdec29
+size 1168138808
--- a/model.safetensors.index.json
+++ b/model.safetensors.index.json
@@ -0,0 +1,298 @@
+{
+  "metadata": {
+    "total_size": 16060522496
+  },
+  "weight_map": {
+    "lm_head.weight": "model-00004-of-00004.safetensors",
+    "model.embed_tokens.weight": "model-00001-of-00004.safetensors",
+    "model.layers.0.input_layernorm.weight": "model-00001-of-00004.safetensors",
+    "model.layers.0.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.0.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
+    "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.1.input_layernorm.weight": "model-00001-of-00004.safetensors",
+    "model.layers.1.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.1.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
+    "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.10.input_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.10.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.10.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.10.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.10.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.10.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.10.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.10.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.10.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.11.input_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.11.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.11.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.11.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.11.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.11.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.11.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.11.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.11.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.12.input_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.12.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.12.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.12.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.12.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.12.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.12.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.12.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.12.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.13.input_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.13.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.13.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.13.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.13.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.13.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.13.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.13.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.13.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.14.input_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.14.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.14.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.14.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.14.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.14.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.14.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.14.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.14.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.15.input_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.15.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.15.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.15.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.15.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.15.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.15.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.15.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.15.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.16.input_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.16.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.16.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.16.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.16.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.16.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.16.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.16.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.16.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.17.input_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.17.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.17.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.17.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.17.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.17.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.17.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.17.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.17.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.18.input_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.18.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.18.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.18.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.18.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.18.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.18.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.18.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.18.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.19.input_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.19.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.19.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.19.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.19.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.19.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.19.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.19.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.19.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.2.input_layernorm.weight": "model-00001-of-00004.safetensors",
+    "model.layers.2.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.2.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
+    "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.20.input_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.20.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.20.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.20.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.20.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.20.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.20.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.20.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.20.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.21.input_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.21.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.21.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.21.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.21.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.21.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.21.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.21.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.21.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.22.input_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.22.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.22.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.22.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.22.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.22.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.22.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.22.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.22.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.23.input_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.23.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.23.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.23.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.23.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.23.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.23.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.23.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.23.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.24.input_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.24.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.24.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.24.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.24.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.24.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.24.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.24.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.24.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.25.input_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.25.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.25.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.25.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.25.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.25.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.25.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.25.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.25.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.26.input_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.26.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.26.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.26.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.26.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.26.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.26.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.26.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.26.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.27.input_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.27.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.27.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.27.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.27.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.27.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.27.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.27.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.27.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.28.input_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.28.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.28.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.28.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.28.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.28.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.28.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.28.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.28.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.29.input_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.29.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.29.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.29.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.29.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.29.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.29.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.29.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.29.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.3.input_layernorm.weight": "model-00001-of-00004.safetensors",
+    "model.layers.3.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.3.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.3.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.3.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
+    "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.30.input_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.30.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.30.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.30.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.30.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.30.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.30.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.30.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.30.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.31.input_layernorm.weight": "model-00004-of-00004.safetensors",
+    "model.layers.31.mlp.down_proj.weight": "model-00004-of-00004.safetensors",
+    "model.layers.31.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.31.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.31.post_attention_layernorm.weight": "model-00004-of-00004.safetensors",
+    "model.layers.31.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.31.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.31.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.31.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.4.input_layernorm.weight": "model-00001-of-00004.safetensors",
+    "model.layers.4.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.4.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.4.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.4.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
+    "model.layers.4.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.4.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.4.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.4.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.5.input_layernorm.weight": "model-00001-of-00004.safetensors",
+    "model.layers.5.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.5.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.5.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.5.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
+    "model.layers.5.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.5.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.5.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.5.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.6.input_layernorm.weight": "model-00001-of-00004.safetensors",
+    "model.layers.6.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.6.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.6.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.6.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
+    "model.layers.6.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.6.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.6.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.6.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.7.input_layernorm.weight": "model-00001-of-00004.safetensors",
+    "model.layers.7.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.7.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.7.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.7.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
+    "model.layers.7.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.7.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.7.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.7.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.8.input_layernorm.weight": "model-00001-of-00004.safetensors",
+    "model.layers.8.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.8.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.8.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.8.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
+    "model.layers.8.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.8.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.8.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.8.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.9.input_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.9.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.9.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.9.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.9.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.9.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.9.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.9.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.9.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
+    "model.norm.weight": "model-00004-of-00004.safetensors"
+  }
+}
--- a/special_tokens_map.json
+++ b/special_tokens_map.json
@@ -0,0 +1,17 @@
+{
+  "bos_token": {
+    "content": "<|begin_of_text|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "<|end_of_text|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": "<|finetune_right_pad_id|>"
+}
--- a/tokenizer.json
+++ b/tokenizer.json
--- a/tokenizer_config.json
+++ b/tokenizer_config.json