初始化项目，由ModelHub XC社区提供模型

Model: ilsp/Llama-Krikri-8B-Instruct Source: Original Platform
2026-05-11 03:30:32 +08:00
commit 8db905a538
16 changed files with 2758 additions and 0 deletions
--- a/.gitattributes
+++ b/.gitattributes
@@ -0,0 +1,38 @@
+*.7z filter=lfs diff=lfs merge=lfs -text
+*.arrow filter=lfs diff=lfs merge=lfs -text
+*.bin filter=lfs diff=lfs merge=lfs -text
+*.bz2 filter=lfs diff=lfs merge=lfs -text
+*.ckpt filter=lfs diff=lfs merge=lfs -text
+*.ftz filter=lfs diff=lfs merge=lfs -text
+*.gz filter=lfs diff=lfs merge=lfs -text
+*.h5 filter=lfs diff=lfs merge=lfs -text
+*.joblib filter=lfs diff=lfs merge=lfs -text
+*.lfs.* filter=lfs diff=lfs merge=lfs -text
+*.mlmodel filter=lfs diff=lfs merge=lfs -text
+*.model filter=lfs diff=lfs merge=lfs -text
+*.msgpack filter=lfs diff=lfs merge=lfs -text
+*.npy filter=lfs diff=lfs merge=lfs -text
+*.npz filter=lfs diff=lfs merge=lfs -text
+*.onnx filter=lfs diff=lfs merge=lfs -text
+*.ot filter=lfs diff=lfs merge=lfs -text
+*.parquet filter=lfs diff=lfs merge=lfs -text
+*.pb filter=lfs diff=lfs merge=lfs -text
+*.pickle filter=lfs diff=lfs merge=lfs -text
+*.pkl filter=lfs diff=lfs merge=lfs -text
+*.pt filter=lfs diff=lfs merge=lfs -text
+*.pth filter=lfs diff=lfs merge=lfs -text
+*.rar filter=lfs diff=lfs merge=lfs -text
+*.safetensors filter=lfs diff=lfs merge=lfs -text
+saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+*.tar.* filter=lfs diff=lfs merge=lfs -text
+*.tar filter=lfs diff=lfs merge=lfs -text
+*.tflite filter=lfs diff=lfs merge=lfs -text
+*.tgz filter=lfs diff=lfs merge=lfs -text
+*.wasm filter=lfs diff=lfs merge=lfs -text
+*.xz filter=lfs diff=lfs merge=lfs -text
+*.zip filter=lfs diff=lfs merge=lfs -text
+*.zst filter=lfs diff=lfs merge=lfs -text
+*tfevents* filter=lfs diff=lfs merge=lfs -text
+tokenizer.json filter=lfs diff=lfs merge=lfs -text
+arena_hard_el.jpg filter=lfs diff=lfs merge=lfs -text
+arena_hard_en.jpg filter=lfs diff=lfs merge=lfs -text
--- a/KriKri_Logo-eng_54307d80-ee25-49f9-9204-0ce774499fbc.svg
+++ b/KriKri_Logo-eng_54307d80-ee25-49f9-9204-0ce774499fbc.svg
@@ -0,0 +1,41 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<svg id="Layer_1" data-name="Layer 1" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" viewBox="0 0 205 70">
+  <defs>
+    <linearGradient id="linear-gradient" x1="32.94" y1="60.47" x2="51.13" y2="29.95" gradientUnits="userSpaceOnUse">
+      <stop offset="0" stop-color="#aff0ff"/>
+      <stop offset="1" stop-color="#c800c3"/>
+    </linearGradient>
+    <linearGradient id="linear-gradient-2" x1="10.71" y1="47.22" x2="28.9" y2="16.7" xlink:href="#linear-gradient"/>
+    <linearGradient id="linear-gradient-3" x1="19.28" y1="52.33" x2="37.47" y2="21.81" xlink:href="#linear-gradient"/>
+    <linearGradient id="linear-gradient-4" x1="34.17" y1="61.21" x2="52.36" y2="30.69" xlink:href="#linear-gradient"/>
+    <linearGradient id="linear-gradient-5" x1="21.76" y1="53.8" x2="39.94" y2="23.29" xlink:href="#linear-gradient"/>
+    <linearGradient id="linear-gradient-6" x1="2262.41" y1="22.1" x2="2262.41" y2="51.1" gradientTransform="translate(2317) rotate(-180) scale(1 -1) skewX(-1.53)" gradientUnits="userSpaceOnUse">
+      <stop offset="0" stop-color="#dcb9dc"/>
+      <stop offset=".53" stop-color="#9600c8"/>
+      <stop offset="1" stop-color="#530079"/>
+    </linearGradient>
+  </defs>
+  <g>
+    <path d="M113.85,55.68l-17.39-16.66v16.66h-2.58V22.46h2.58v16.56l15.93-16.56h3.21l-15.83,16.56,17.34,16.66h-3.26Z" fill="#32005a"/>
+    <path d="M132.9,38.15c-1.04-.94-1.88-1.59-2.51-1.95-.63-.36-1.26-.54-1.88-.54-1.07,0-2.21.54-3.41,1.61-1.2,1.07-2.4,2.63-3.6,4.68v13.74h-2.34v-21.87h2.34v5.75h.1c2.6-4.12,5.15-6.19,7.65-6.19.75,0,1.5.19,2.26.58s1.68,1.06,2.75,2l-1.36,2.19Z" fill="#32005a"/>
+    <path d="M136.85,26.46v-3.99h2.34v3.99h-2.34ZM136.85,55.68v-21.87h2.34v21.87h-2.34Z" fill="#32005a"/>
+    <path d="M166.36,55.68l-17.39-16.66v16.66h-2.58V22.46h2.58v16.56l15.93-16.56h3.21l-15.83,16.56,17.34,16.66h-3.26Z" fill="#32005a"/>
+    <path d="M185.41,38.15c-1.04-.94-1.88-1.59-2.51-1.95-.63-.36-1.26-.54-1.88-.54-1.07,0-2.21.54-3.41,1.61-1.2,1.07-2.4,2.63-3.6,4.68v13.74h-2.34v-21.87h2.34v5.75h.1c2.6-4.12,5.15-6.19,7.65-6.19.75,0,1.5.19,2.26.58s1.68,1.06,2.75,2l-1.36,2.19Z" fill="#32005a"/>
+    <path d="M189.36,26.46v-3.99h2.34v3.99h-2.34ZM189.36,55.68v-21.87h2.34v21.87h-2.34Z" fill="#32005a"/>
+  </g>
+  <g>
+    <path d="M33.26,68.4c.58-5.17,1.16-7.14,5.71-10.27.26-.35.6-.39.75-.39.89,0,1.6.64,3.9,9.57.11.44.19.75.23.89l.12.41c4.02-.13,5.4-.36,5.87-.49l-.04-6.92,1.15.79c.25.17.56.25.95.25,3.11,0,8.71-5.01,10.09-7.09.04-2.44-.78-5.07-1.57-7.62-.29-.93-.57-1.81-.79-2.67l-.56-2.08,4.62,3.67c.67-3.17.4-6.52-.78-9.35l-.44-1.04,1.15.07c1.49.1,2.53.14,3.25.14.07,0,.14,0,.21,0-.08-.06-.15-.11-.23-.17-1.19-.88-2.98-2.21-3.83-4.06-.38-.63-.6-1.02-.35-1.45.14-.24.4-.38.7-.38.13,0,.26.03.39.05.14.03.3.06.48.07.12,0,.23,0,.35,0,2.23,0,4.21-.75,5.44-2.07.99-1.06,1.42-2.38,1.28-3.93-.28-2.14-2.37-3.57-4.39-4.95-1.12-.77-2.29-1.57-3.13-2.49l-.27-.29.1-.37c.64-2.39.5-4.36-.43-6.01-.31.23-.62.59-.91.95-.48.58-.97,1.17-1.66,1.45l-.71.29-.27-.72c-2.2-5.85-8.32-10.1-14.55-10.1-.75,0-1.5.06-2.23.19-3.74.69-8.7,1.62-11.55,4.78,1.54-.24,2.91-.36,4.22-.36,4.31,0,7.82,1.34,10.44,3.98l1.85,1.86-2.54-.68c-2.56-.68-5.63-1.01-9.38-1.01-.12,0-.24,0-.35,0,1.14,4.81,4.97,8.25,9.55,8.48l.96.05-.27.89c-.72,2.38-2.69,4.86-4.94,6.34.26.05.53.09.77.12.71.09,1.44.19,1.66.85.22.67-.3,1.29-1.41,2.25-3.28,2.91-8.94,3.23-12.69,3.44-1.28.07-2.39.13-2.99.28-1.47.32-2.8.48-3.98.48-4.6,0-7.21-2.32-7.94-7.08-1.31,2-2.48,4.96-1.37,8.35l.13.4-.28.3c-.16.17-.39.27-.64.27-.44,0-.85-.29-1.36-.76-.04.57.09,1.18.39,1.83,1.01,2.17,3.59,4.06,5.41,4.56l.62.17-.08.62c-.1.75-.21,1.53-.33,2.28-.31,2.01-.63,4.09-.53,6.05.08-.03.15-.06.21-.09.54-.22.86-.35,1.16-.35.35,0,.56.18.65.29.16.18.22.42.18.65v.08s-.05.08-.05.08c-.97,2.2-1.72,2.77-2.33,3.13-.05.03-.13.07-.15.09-.15.27-.05,2.24,0,3.29.11,1.78.13,3.6.16,5.36.02,1.42.03,2.88.1,4.31.83.15,2.86.31,5.61.45l.03-.24c.55-3.97,1.02-7.4,5.19-9.42l1.04-.51.29,10.41c1.58.04,3.25.07,4.94.1l.03-.27Z" fill="#e6fff0"/>
+    <path d="M49.86,20.75l-1.6.19c-.5,5.8-.15,13.32-14.39,15.75-11.69,2.01-13.85,6.23-14.53,10.88,1.59-.86,1.85.43,1.85.43.21,2.22-1.32,8.02-2.96,9.95-1,1.15.39,3.65,5.18,1.43s6.89-3.15,6.89-3.15c1.85.43,9.56.57,9.9-.14-.79-2.22-5.32-11.64-4.22-11.71,4.25.24,8.8,24.38,10.58,24.32,1.17-.14,3.22-.36,3.22-.36l.53-12.96s9.28-1.36,9.83-2.86-.85-9.6-.85-9.6l4.33,2.79s-.18-9.24-1.46-10.24c1.63.64,3.17.5,3.17.5l-3.39-6.02c-3.83-.5-8.61-1-10.17-4.15-1.55-3.15-1.91-5.06-1.91-5.06Z" fill="url(#linear-gradient)"/>
+    <g>
+      <path d="M20.81,37.05c-3.24-.57-4.45-1.79-5.44-3.08.19,1.5,1.05,3.37,1.79,3.58-1.46.43-4.2-.64-4.77.07s2.07,3.37,3.56,3.87l1.84.36,3.02-4.8Z" fill="url(#linear-gradient-2)"/>
+      <path d="M54.45,13.85c-2.49-2.65-9.17-10.74-19.39-8.52,1.94,1,4.74,1.65,5.78,2.08,1.04.43,6.71,3.72,8.19,6.87.32,1.07,5.42-.43,5.42-.43Z" fill="url(#linear-gradient-3)"/>
+      <path d="M63.03,11.13l-2.85,3.29,2.76,1.65s1.55-2.22.08-4.94Z" fill="url(#linear-gradient-4)"/>
+      <path d="M49.81,18.65c-1.59-2.73-6.86-6.05-11.23-4.97.69,1.15,1.6,1.75,1.12,3.11,1.2.93,4.18,2.58,10.11,1.86Z" fill="url(#linear-gradient-5)"/>
+    </g>
+    <path d="M52.97,22.91c-2.57-3.85-3.43,7.08-2.11,11.29,1.32,4.21,6.38,12.14,5.93,14.83-2.34-3.37-3.24-4.72-4.31-5.9-.95,3.2-3.8,6.41-5.16,7.08.09,3.37-.77,18.49-.77,18.49l2.66.22,1.26-10.62,10.61-3.71-1.81-11.68,4.4,3.53s.64-9.2-1.52-10.98c3.17.5,3.24-.26,3.24-.26,0,0-3.43-5.23-3.45-6.24s-6.27-2.02-8.98-6.07Z" fill="url(#linear-gradient-6)"/>
+    <g>
+      <path d="M55.3,19.09c.24.3.2.82.49,1.25.48.73,1.34.93,2.16.8.35-.06,1.04-.56,1.11-.56.08,0,.64.51.81.33.16-.16-.34-1.54-.49-1.8-.52-.91-1.83-1.62-2.88-1.71-.39-.03-2.72-.04-2.78.31-.11.6,1.36,1.1,1.59,1.39Z" fill="#32005a"/>
+      <path d="M64.22,31.05c4.22.15,8.21-2.28,7.8-6.77-.49-3.76-5.41-5.35-7.71-7.86.64-2.38.67-4.92-.87-7.12-1.32.28-1.99,2.2-3.14,2.66C57.7,5.06,49.91.33,42.68,1.58c-4.7.87-10.46,2.06-13.07,6.52,4.78-.96,11.19-1.56,15.81,3.08-3.43-.91-7.3-1.09-10.8-1.03.87,5.17,4.86,9.63,10.42,9.91-.83,2.74-3.38,5.51-5.88,6.64,1.88,1.39,5.2-.03,2.18,2.6-4.23,3.76-12.91,2.95-15.38,3.56-8.94,1.96-10.93-2.64-11.17-7.85-2.49,2.9-3.88,6.68-2.62,10.51-.31.33-1.67-1.29-1.83-1.33-1.66,3.25,2.75,7.5,6,8.41-.39,3.07-1.15,6.32-.8,9.35.39.14,2.21-.94,2.14-.55-2.24,5.06-2.86.46-2.53,6.83.21,3.3.1,6.63.27,9.93H2c-.42,0-.76.34-.76.76s.34.76.76.76h188.94c.42,0,.76-.34.76-.76s-.34-.76-.76-.76H50.58l-.04-5.58c3.03,2.08,10.73-4.79,12.17-7.19.12-3.52-1.51-7.4-2.39-10.68l3.79,3.02c.99-3.47.87-7.48-.54-10.87,9.9.64,2-.83.09-5.11-.71-1.19-.33-.71.55-.68ZM62.39,12.36c1.02-1.08-.07,2.23-.16,2.38-.05.04-.89-.32-.97-.39-.5-.74.85-1.33,1.12-1.99ZM55.37,12.87c.1.31-.53.16-.74.18-.53.04-2.72.57-2.83.48.25-.78,1.38-1.19,2.35-1.5s1.05.44,1.22.84ZM53.94,10.95c0,.47-.95.18-1.21.28-.95.18-2.43,1.63-3.3,1.76-1.74-2.37-1.36-1.96.9-3.6,1.89-1.48,2.6.35,3.61,1.56ZM39.28,5.55c-.61-.42-3.12.14-3.19-.47.64-.36,3.02-1.3,3.75-1.22.13.11-.35,1.19-.56,1.69ZM42.96,7.37c-1.8-1.26-2.3-.9-1.67-2.57.63-1.67,3.17-.23,4.08-.03.52.37-.84.88-.92.95-.4.4-1.34,1.09-1.49,1.65ZM46.22,6.52c1.19-1.65,2.54-.05,3.8.68.1.54-.83.72-1.1.9-.71.53-1.73,1.34-2.29,2.05-.75-2.12-2.54-1.08-.41-3.63ZM13.55,38.66c-.58-.72.72-.28,1.23-.23,1.84.18,3.1-.51,3.61-1.58-.37-.31-1.53.72-3.36-.48-1.82-1.2-1.42-4.25-.73-6.16,1.23,2.53,3.43,4.31,6.14,5.14.28.09.74-.04.45.42-1.16.54-2.09,1.8-3.16,3.58-1.46,2.31-3.16.36-4.19-.68ZM34.03,68.16c.57-4.9,1.2-6.59,5.5-9.52.82-1.6,3.09,7.71,3.56,9.52h-9.06ZM22.51,68.16c.53-3.82.98-6.99,4.8-8.84l.24,8.84h-5.04ZM60.66,30.72c.85,1.48,1.76,2.93,2.99,4.14-8.4-3.26-.05.11-.87,9-.76-.19-2.51-2.08-2.98-2.54,0,0,1-.49.69-1.04s-3.9-2.63-4.06-2.46c2.06,3.94,2.8,10.08,1.98,14.8-1.07,1.58-5.1,1.47-7.09,2.11.31-3.7,2.06-6.64,1.69-10.45-
.23-1.29-.67-2.17-.94-.3-.92,2.71-3.27,4.75-5.46,6.34.42.49,2.52.35,2.96-.43-.77,5.88-1.29,12.27-1.11,18.26h-2.88c-.64-.29-.49-1.1-1.41-3.16-1.87-5.22-5.39-20.5-8.21-19.57l1.37,4.95,1.64,5.43c-2.38.1-5.17.46-7.43-.3.79-1.02,1.62-2.57,1.87-3.93,0,0,1.06-6.82-2.15-6.16.95,5.01-1.05,10.1-5.66,12.8-.54,0,.56-.77.24-1.06-1.03-.36-1.14,1.24-1.9,1.72-2.63,1.39-7.43,4.14-6.46-1.26-.17-3.41,2.05-6.82,2.82-10.15.27-1.57-1.94.62-2.58.48.79-21.94,23.43-6.48,28.58-23.07-.93.36-1.89,1.08-2.94.91,1.75-1.59,3.09-3.53,4.05-5.68.78-.16,4.24.04,4.31-.97-.13-1.23-1.44-1.38-2.46-1.26-2.94-1.75-6.6-4.61-10.2-3.99,1.52.59,1.53.77,1.6,1.19.26,1.46,7.38,2.94,4.65,3.04-2.44-.17-6.19-1.2-6.61-3.86-5.04-2.9,3.9-1.73,5.84-1.32,2.7.45,3.61,2.44,6,2.98,2.47-.58,5.44-1.79,8.11-1,3,1.52,8.49,5.96,10.98,8.48.23.79-2-.48-2.13-.3-1.41,1.17,1.08,2.77,2.34,2.52-.34,2.5-3.27,3.03-4.74.42-.47-.93-.77-1.22-1.34-1.14-2.34,1.01,1.67,3.64,2.95,3.86-6.32,1.07-14.57-1.22-16.19-8.09l-1.85.24c.76,5.73,6.37,9.41,12,9.82Z" fill="#32005a"/>
+    </g>
+  </g>
+</svg>
--- a/README.md
+++ b/README.md
@@ -0,0 +1,211 @@
+---
+license: llama3.1
+language:
+- el
+- en
+pipeline_tag: text-generation
+library_name: transformers
+tags:
+- text-generation-inference
+base_model:
+- ilsp/Llama-Krikri-8B-Base
+---
+
+🚨 **PLEASE USE THE OFFICIAL QUANTIZED VERSIONS: [GGUF](https://huggingface.co/ilsp/Llama-Krikri-8B-Instruct-GGUF) OR REQUEST A SPECIFIC ONE** 🚨
+
+🚨 *There is no guarantee that you are using the latest improved versions from 3rd party quantizations as the model's weights are getting reuploaded!* 🚨
+
+# Llama-Krikri-8B-Instruct: An Instruction-tuned Large Language Model for the Greek language
+
+<div align="center">
+  <img src="https://huggingface.co/ilsp/Llama-Krikri-8B-Instruct/resolve/main/KriKri_Logo-eng_54307d80-ee25-49f9-9204-0ce774499fbc.svg?raw=true" width="60%" alt="Krikri" />
+</div>
+
+Following the release of [Meltemi-7B](https://huggingface.co/ilsp/Meltemi-7B-v1) on the 26th March 2024, we are happy to welcome Krikri to the family of ILSP open Greek LLMs.
+Krikri is built on top of [Llama-3.1-8B](https://huggingface.co/meta-llama/Llama-3.1-8B), extending its capabilities for Greek through continual pretraining on a large corpus of high-quality and locally relevant Greek texts. We present **Llama-Krikri-8B-Instruct**, along with the base model, [Llama-Krikri-8B-Base](https://huggingface.co/ilsp/Llama-Krikri-8B-Base).
+
+<!-- ![image/png](llama-krikri-image.jpg) -->
+
+
+# Model Information
+
+## Base Model
+
+- Vocabulary extension of the Llama-3.1 tokenizer with Greek tokens
+- 128k context length (approximately 80,000 Greek words)
+- We extend the pretraining of Llama-3.1-8B with added proficiency for the Greek language, by utilizing a large training corpus. 
+  * This corpus includes 56.7 billion monolingual Greek tokens, constructed from publicly available resources.
+  * Additionally, to mitigate catastrophic forgetting and ensure that the model has bilingual capabilities, we use additional sub-corpora with monolingual English texts (21 billion tokens) and Greek-English parallel data (5.5 billion tokens).
+  * The training corpus also contains 7.8 billion math and code tokens.
+  * This corpus has been processed, filtered, and deduplicated to ensure data quality and is outlined below:
+
+
+| Sub-corpus   | # Tokens         | Percentage |
+|-----------|------------------|------------|
+| Greek     | 56.7 B   | 62.3 %      |
+| English   | 21.0 B   | 23.1 %      |
+| Parallel  |  5.5 B   | 6.0 %       |
+| Math/Code |  7.8 B   | 8.6 %       |
+| **Total** | **91 B**   |  **100%**       |
+
+
+Chosen subsets of the 91 billion corpus were upsampled resulting in a size of **110 billion tokens**.
+
+## Instruct Model
+
+Llama-Krikri-8B-Instruct is the result of post-training Llama-Krikri-8B-Base and features:
+- Enhanced chat capabilities and instruction-following in both Greek and English.
+- Document translation from Greek to English, French, German, Italian, Portuguese, Spanish and vice versa.
+- Great performance on generation, comprehension, and editing tasks, such as summarization, creative content creation, text modification, entity recognition, sentiment analysis, etc.
+- Domain-specific expertise for specialized legal, financial, medical, and scientific applications.
+- Retrieval-Augmented Generation (RAG) utilizing multiple documents with 128k context length. 
+- Improved coding and agentic capabilities with correct formatting and tool use.
+- Conversion or structured extraction (e.g., XML, JSON) in data-to-text & text-to-data settings.
+- Analytical thinking and Chain-of-Thought (CoT) reasoning for problem-solving.
+
+## Post-training Methodology
+
+We used a multi-stage process in order to build Llama-Krikri-8B-Instruct which includes:
+- 2-stage Supervised Fine-Tuning with a combination of Greek & English instruction-response pairs (& multi-turn conversations)
+  - **Stage 1**: **856,946** instruction-response pairs (371,379 Greek + 485,567 English)
+  - **Stage 2**: **638,408** instruction-response pairs (279,948 Greek + 358,460 English)
+- Alignment with a combination of Greek & English preference triplets (Instruction - Chosen Response - Rejected Response)
+  - **Length Normalized DPO**: **92,394** preference triplets (47,132 Greek + 45,262 English)
+
+## Post-training Data Construction
+
+To build the SFT & DPO data, we utilized various methodologies including:
+- Collecting existing high-quality datasets such as [Tulu 3](https://huggingface.co/datasets/allenai/tulu-3-sft-mixture), [SmolTalk](https://huggingface.co/datasets/HuggingFaceTB/smoltalk), [MAGPIE Ultra](https://huggingface.co/datasets/argilla/magpie-ultra-v1.0), [Orca Agent Instruct](https://huggingface.co/datasets/microsoft/orca-agentinstruct-1M-v1), [IFEval Like Data](https://huggingface.co/datasets/argilla/ifeval-like-data), [UltraFeedback](https://huggingface.co/datasets/HuggingFaceH4/ultrafeedback_binarized), [NVIDIA HelpSteer2](https://huggingface.co/datasets/nvidia/HelpSteer2), [Intel Orca](https://huggingface.co/datasets/argilla/distilabel-intel-orca-dpo-pairs), [UltraMedical](https://huggingface.co/datasets/TsinghuaC3I/UltraMedical-Preference), and other datasets focused on safety, truthfulness, and instruction-following.
+- Translating various data into Greek using an in-house translation tool.
+- Regenerating translated data and contrasting the translated with the regenerated responses (i.e., for creating preference triplets).
+- Distilling (with the MAGPIE methodology) models which exhibit strong performance in Greek, such as [Gemma 2 27B IT](https://huggingface.co/google/gemma-2-27b-it).
+- Scoring data with the [Skywork Reward Gemma 2 27B v0.2](https://huggingface.co/Skywork/Skywork-Reward-Gemma-2-27B-v0.2) Reward Model and filtering using rule-based filters.
+- Creating data for sentence and document translation using high-quality parallel corpora mainly from [ELRC-SHARE](https://elrc-share.eu/).
+- Synthetically extracting question-answer pairs and multi-turn dialogues from diverse sources such as Wikipedia, EUR-LEX, Greek School Books, and Kallipos.
+
+
+# How to use
+
+## With Transformers
+
+```python
+from transformers import AutoModelForCausalLM, AutoTokenizer
+
+device = "cuda"
+
+model = AutoModelForCausalLM.from_pretrained("ilsp/Llama-Krikri-8B-Instruct")
+tokenizer = AutoTokenizer.from_pretrained("ilsp/Llama-Krikri-8B-Instruct")
+
+model.to(device)
+
+system_prompt = "Είσαι το Κρικρί, ένα εξαιρετικά ανεπτυγμένο μοντέλο Τεχνητής Νοημοσύνης για τα ελληνικα και εκπαιδεύτηκες από το ΙΕΛ του Ε.Κ. \"Αθηνά\"."
+user_prompt = "Σε τι διαφέρει ένα κρικρί από ένα λάμα;"
+
+messages = [
+    {"role": "system", "content": system_prompt},
+    {"role": "user", "content": user_prompt},
+]
+prompt = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
+input_prompt = tokenizer(prompt, return_tensors='pt').to(device)
+outputs = model.generate(input_prompt['input_ids'], max_new_tokens=256, do_sample=True)
+
+print(tokenizer.batch_decode(outputs)[0])
+```
+
+## With OpenAI compatible server via vLLM
+
+```bash
+vllm serve ilsp/Llama-Krikri-8B-Instruct \
+  --enforce-eager \
+  --dtype 'bfloat16' \
+  --api-key token-abc123
+```
+
+Then, the model can be used through Python using:
+```python
+from openai import OpenAI
+
+api_key = "token-abc123"
+base_url = "http://localhost:8000/v1"
+
+client = OpenAI(
+    api_key=api_key,
+    base_url=base_url,
+)
+
+system_prompt = "Είσαι ένα ανεπτυγμένο μεταφραστικό σύστημα που απαντάει με λίστες Python. Δεν γράφεις τίποτα άλλο στις απαντήσεις σου πέρα από τις μεταφρασμένες λίστες."
+user_prompt = "Δώσε μου την παρακάτω λίστα με μεταφρασμένο κάθε string της στα ελληνικά: ['Ethics of duty', 'Postmodern ethics', 'Consequentialist ethics', 'Utilitarian ethics', 'Deontological ethics', 'Virtue ethics', 'Relativist ethics']"
+
+messages = [
+    {"role": "system", "content": system_prompt},
+    {"role": "user", "content": user_prompt},
+]
+
+response = client.chat.completions.create(model="ilsp/Llama-Krikri-8B-Instruct",
+                                          messages=messages,
+                                          temperature=0.0,
+                                          top_p=0.95,
+                                          max_tokens=8192,
+                                          stream=False)
+
+print(response.choices[0].message.content)
+# ['Ηθική καθήκοντος', 'Μεταμοντέρνα ηθική', 'Συνεπειοκρατική ηθική', 'Ωφελιμιστική ηθική', 'Δεοντολογική ηθική', 'Ηθική αρετών', 'Σχετικιστική ηθική']
+```
+
+# Evaluation
+
+In the table below, we report the scores for our chat evaluation suite which includes:
+- [Greek IFEval](https://huggingface.co/datasets/ilsp/ifeval_greek) (strict average)
+- [English IFEval](https://huggingface.co/datasets/google/IFEval) (strict average)
+- [Greek MT-Bench](https://huggingface.co/datasets/ilsp/mt-bench-greek) using gpt-4o-2024-08-06 as the judge model.
+- [English MT-Bench](https://huggingface.co/datasets/HuggingFaceH4/mt_bench_prompts) using gpt-4o-2024-08-06 as the judge model.
+
+We can observe that *Llama-Krikri-8B-Instruct exhibits the strongest performance* in instruction following for both Greek and English across all the models we tested. In particular, it surpasses Llama-3.1-8B-Instruct by **+21.7%** and **+7.3%** on the Greek and English IFEval respectively.
+It also exhibits **the strongest chat capabilities in the Greek MT-Bench benchmark** (+0.28 compared to Aya Expanse 8B), while also being very competitive in the English variant of the MT-Bench benchmark.
+
+|       | IFEval EL (strict avg) | IFEval EN (strict avg) | MT-Bench EL | MT-Bench EN |
+|---------------- |---------------- |----------------- |------------|------------|
+| Qwen 2.5 7B Instruct | 46.2% | 74.8% | 5.83 | **7.87** |
+| EuroLLM 9B Instruct | 51.3% | 64.5% | 5.98 | 6.27 |
+| Aya Expanse 8B | 50.4% | 62.2% | 7.68 | 6.92 |
+| Meltemi 7B v1.5 Instruct | 32.7% | 41.2% | 6.25 | 5.46 |
+| Llama-3.1-8B Instruct | 45.8% | 75.1% | 6.46 | 7.25 |
+| **Llama-Krikri-8B Instruct** | **67.5%** | **82.4%** | **7.96** | 7.21 |
+
+
+We also used the [Arena-Hard-Auto](https://huggingface.co/datasets/lmarena-ai/arena-hard-auto-v0.1) automatic evaluation tool, as well as the translated (and post-edited) version for Greek that is publicly available [here](https://huggingface.co/datasets/ilsp/m-ArenaHard_greek). We report 2 scores for Arena-Hard-Auto:
+- No Style Control: The original version of the benchmark.
+- With Style Control: The benchmark with style control methods for Markdown elements. You can read more about the methodology and technical background in this [blog post](https://lmsys.org/blog/2024-08-28-style-control/).
+
+Below, we show the scores for the Greek version of Arena-Hard-Auto for various open and closed chat models that were determined using **gpt-4o-2024-08-06 as the judge model** and **gpt-4o-mini-2024-07-18 as the baseline model** (i.e., by default 50% score).
+
+Llama-Krikri-8B Instruct exhibits very strong chat capabilities by scoring **higher than models over 8 times its size** (such as Llama-3.1-70B Instruct) and is also **competitive with closed-source** (e.g., GPT-4o-Mini) and **highly-performant open-source models** (e.g., Gemma 2 27B IT & Aya Expanse 32B). 
+![image/png](arena_hard_el.png)
+
+Below, we show the scores for the original Arena-Hard-Auto dataset for various open and closed chat models. We followed the original methodology by using **gpt-4-1106-preview as the judge model** and **gpt-4-0314 as the baseline model**.
+
+Llama-Krikri-8B Instruct performs very well in the English variant of Arena-Hard-Auto as well, since we can observe that it is **competitive with similarly sized LLMs** and that it **improves upon Llama-3.1-8B Instruct by +24.5% / +16%** (No style control / With style control). 
+![image/png](arena_hard_en.png)
+
+***Please note** that judge models are biased towards student models trained on distilled data from them. You can read more [here](https://arxiv.org/pdf/2502.01534?).
+
+**If you want to dig deeper, read more in our [paper](https://arxiv.org/abs/2505.13772).**
+
+
+# Acknowledgements
+
+The ILSP team utilized Amazon's cloud computing services, which were made available via GRNET under the [OCRE Cloud framework](https://www.ocre-project.eu/), providing Amazon Web Services for the Greek Academic and Research Community.
+
+# Citation
+
+```
+@misc{roussis2025krikriadvancingopenlarge,
+      title={Krikri: Advancing Open Large Language Models for Greek}, 
+      author={Dimitris Roussis and Leon Voukoutis and Georgios Paraskevopoulos and Sokratis Sofianopoulos and Prokopis Prokopidis and Vassilis Papavasileiou and Athanasios Katsamanis and Stelios Piperidis and Vassilis Katsouros},
+      year={2025},
+      eprint={2505.13772},
+      archivePrefix={arXiv},
+      primaryClass={cs.CL},
+      url={https://arxiv.org/abs/2505.13772}, 
+}
+```
--- a/arena_hard_el.png
+++ b/arena_hard_el.png
--- a/arena_hard_en.png
+++ b/arena_hard_en.png
--- a/config.json
+++ b/config.json
@@ -0,0 +1,40 @@
+{
+  "_name_or_path": "/home/ubuntu/models/krikri-annealing-dpo-max-length-norm/",
+  "architectures": [
+    "LlamaForCausalLM"
+  ],
+  "attention_bias": false,
+  "attention_dropout": 0.0,
+  "bos_token_id": 128000,
+  "eos_token_id": [
+    128001,
+    128008,
+    128009
+  ],
+  "head_dim": 128,
+  "hidden_act": "silu",
+  "hidden_size": 4096,
+  "initializer_range": 0.02,
+  "intermediate_size": 14336,
+  "max_position_embeddings": 131072,
+  "mlp_bias": false,
+  "model_type": "llama",
+  "num_attention_heads": 32,
+  "num_hidden_layers": 32,
+  "num_key_value_heads": 8,
+  "pretraining_tp": 1,
+  "rms_norm_eps": 1e-05,
+  "rope_scaling": {
+    "factor": 8.0,
+    "high_freq_factor": 4.0,
+    "low_freq_factor": 1.0,
+    "original_max_position_embeddings": 8192,
+    "rope_type": "llama3"
+  },
+  "rope_theta": 500000.0,
+  "tie_word_embeddings": false,
+  "torch_dtype": "bfloat16",
+  "transformers_version": "4.45.2",
+  "use_cache": false,
+  "vocab_size": 149248
+}
--- a/generation_config.json
+++ b/generation_config.json
@@ -0,0 +1,12 @@
+{
+  "bos_token_id": 128000,
+  "do_sample": true,
+  "eos_token_id": [
+    128001,
+    128008,
+    128009
+  ],
+  "temperature": 0.6,
+  "top_p": 0.9,
+  "transformers_version": "4.45.2"
+}
--- a/llama-krikri-image.jpg
+++ b/llama-krikri-image.jpg
--- a/model-00001-of-00004.safetensors
+++ b/model-00001-of-00004.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:089e1492a5a75fdd2f5b8b8f578eb874fb0173e6ac45744e66c390203e8f83fe
+size 4913767264
--- a/model-00002-of-00004.safetensors
+++ b/model-00002-of-00004.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f18cbfc939178c8247ddf2eb9f35966aa0f492644b40ba22223bfeb631175bfe
+size 4915916160
--- a/model-00003-of-00004.safetensors
+++ b/model-00003-of-00004.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f4dd717e13208f91ad7fd8064b2213c22d0040c1a57f98afe62dd6749e690db2
+size 4999819336
--- a/model-00004-of-00004.safetensors
+++ b/model-00004-of-00004.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:15044dbfb317ba46597b1962922f7440ab80000e55e150177288141be2422b04
+size 1574986528
--- a/model.safetensors.index.json
+++ b/model.safetensors.index.json
@@ -0,0 +1,298 @@
+{
+  "metadata": {
+    "total_size": 16404455424
+  },
+  "weight_map": {
+    "lm_head.weight": "model-00004-of-00004.safetensors",
+    "model.embed_tokens.weight": "model-00001-of-00004.safetensors",
+    "model.layers.0.input_layernorm.weight": "model-00001-of-00004.safetensors",
+    "model.layers.0.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.0.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
+    "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.1.input_layernorm.weight": "model-00001-of-00004.safetensors",
+    "model.layers.1.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.1.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
+    "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.10.input_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.10.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.10.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.10.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.10.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.10.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.10.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.10.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.10.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.11.input_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.11.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.11.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.11.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.11.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.11.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.11.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.11.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.11.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.12.input_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.12.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.12.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.12.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.12.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.12.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.12.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.12.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.12.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.13.input_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.13.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.13.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.13.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.13.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.13.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.13.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.13.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.13.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.14.input_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.14.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.14.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.14.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.14.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.14.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.14.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.14.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.14.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.15.input_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.15.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.15.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.15.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.15.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.15.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.15.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.15.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.15.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.16.input_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.16.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.16.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.16.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.16.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.16.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.16.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.16.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.16.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.17.input_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.17.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.17.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.17.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.17.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.17.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.17.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.17.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.17.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.18.input_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.18.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.18.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.18.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.18.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.18.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.18.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.18.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.18.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.19.input_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.19.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.19.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.19.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.19.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.19.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.19.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.19.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.19.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.2.input_layernorm.weight": "model-00001-of-00004.safetensors",
+    "model.layers.2.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.2.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
+    "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.20.input_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.20.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.20.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.20.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.20.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.20.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.20.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.20.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.20.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.21.input_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.21.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.21.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.21.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.21.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.21.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.21.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.21.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.21.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.22.input_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.22.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.22.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.22.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.22.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.22.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.22.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.22.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.22.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.23.input_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.23.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.23.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.23.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.23.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.23.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.23.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.23.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.23.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.24.input_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.24.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.24.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.24.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.24.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.24.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.24.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.24.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.24.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.25.input_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.25.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.25.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.25.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.25.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.25.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.25.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.25.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.25.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.26.input_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.26.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.26.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.26.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.26.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.26.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.26.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.26.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.26.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.27.input_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.27.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.27.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.27.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.27.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.27.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.27.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.27.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.27.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.28.input_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.28.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.28.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.28.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.28.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.28.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.28.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.28.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.28.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.29.input_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.29.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.29.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.29.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.29.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.29.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.29.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.29.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.29.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.3.input_layernorm.weight": "model-00001-of-00004.safetensors",
+    "model.layers.3.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.3.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.3.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.3.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
+    "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.30.input_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.30.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.30.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.30.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.30.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.30.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.30.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.30.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.30.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.31.input_layernorm.weight": "model-00004-of-00004.safetensors",
+    "model.layers.31.mlp.down_proj.weight": "model-00004-of-00004.safetensors",
+    "model.layers.31.mlp.gate_proj.weight": "model-00004-of-00004.safetensors",
+    "model.layers.31.mlp.up_proj.weight": "model-00004-of-00004.safetensors",
+    "model.layers.31.post_attention_layernorm.weight": "model-00004-of-00004.safetensors",
+    "model.layers.31.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.31.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.31.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.31.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.4.input_layernorm.weight": "model-00001-of-00004.safetensors",
+    "model.layers.4.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.4.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.4.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.4.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
+    "model.layers.4.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.4.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.4.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.4.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.5.input_layernorm.weight": "model-00001-of-00004.safetensors",
+    "model.layers.5.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.5.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.5.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.5.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
+    "model.layers.5.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.5.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.5.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.5.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.6.input_layernorm.weight": "model-00001-of-00004.safetensors",
+    "model.layers.6.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.6.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.6.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.6.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
+    "model.layers.6.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.6.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.6.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.6.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.7.input_layernorm.weight": "model-00001-of-00004.safetensors",
+    "model.layers.7.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.7.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.7.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.7.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
+    "model.layers.7.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.7.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.7.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.7.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.8.input_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.8.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.8.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.8.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.8.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.8.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.8.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.8.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.8.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.9.input_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.9.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.9.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.9.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.9.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.9.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.9.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.9.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.9.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
+    "model.norm.weight": "model-00004-of-00004.safetensors"
+  }
+}
--- a/special_tokens_map.json
+++ b/special_tokens_map.json
@@ -0,0 +1,37 @@
+{
+  "bos_token": {
+    "content": "<|begin_of_text|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "cls_token": {
+    "content": "<|begin_of_text|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "<|eot_id|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": {
+    "content": "<|reserved_special_token_247|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "sep_token": {
+    "content": "<|eot_id|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}
--- a/tokenizer.json
+++ b/tokenizer.json
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1b12728a215cde26d96f3add66a7a10f0e47147afbf2e677cc0a70ba49228ada
+size 19508014
--- a/tokenizer_config.json
+++ b/tokenizer_config.json