From 44da34d8e3ead70856e492a59e1251a9be3d2c3f Mon Sep 17 00:00:00 2001
From: ModelHub XC <noreply@modelhub.org.cn>
Date: Thu, 21 May 2026 22:44:19 +0800
Subject: [PATCH] =?UTF-8?q?=E5=88=9D=E5=A7=8B=E5=8C=96=E9=A1=B9=E7=9B=AE?=
 =?UTF-8?q?=EF=BC=8C=E7=94=B1ModelHub=20XC=E7=A4=BE=E5=8C=BA=E6=8F=90?=
 =?UTF-8?q?=E4=BE=9B=E6=A8=A1=E5=9E=8B?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Model: SuperAGI/mistral-7B-PoSE-32k
Source: Original Platform
---
 .gitattributes                   |   35 +
 README.md                        |   48 ++
 added_tokens.json                |    3 +
 config.json                      |   30 +
 generation_config.json           |    6 +
 model-00001-of-00003.safetensors |    3 +
 model-00002-of-00003.safetensors |    3 +
 model-00003-of-00003.safetensors |    3 +
 model.safetensors.index.json     |  298 +++++++
 my_configuration_mistral.py      |  183 ++++
 my_modeling_mistral.py           | 1187 +++++++++++++++++++++++++
 special_tokens_map.json          |   30 +
 tokenizer.model                  |    3 +
 tokenizer_config.json            |   52 ++
 trainer_state.json               | 1379 ++++++++++++++++++++++++++++++
 training_args.bin                |    3 +
 zero_to_fp32.py                  |  578 +++++++++++++
 17 files changed, 3844 insertions(+)
 create mode 100644 .gitattributes
 create mode 100644 README.md
 create mode 100644 added_tokens.json
 create mode 100644 config.json
 create mode 100644 generation_config.json
 create mode 100644 model-00001-of-00003.safetensors
 create mode 100644 model-00002-of-00003.safetensors
 create mode 100644 model-00003-of-00003.safetensors
 create mode 100644 model.safetensors.index.json
 create mode 100644 my_configuration_mistral.py
 create mode 100644 my_modeling_mistral.py
 create mode 100644 special_tokens_map.json
 create mode 100644 tokenizer.model
 create mode 100644 tokenizer_config.json
 create mode 100644 trainer_state.json
 create mode 100644 training_args.bin
 create mode 100644 zero_to_fp32.py

diff --git a/.gitattributes b/.gitattributes
new file mode 100644
index 0000000..a6344aa
--- /dev/null
+++ b/.gitattributes
@@ -0,0 +1,35 @@
+*.7z filter=lfs diff=lfs merge=lfs -text
+*.arrow filter=lfs diff=lfs merge=lfs -text
+*.bin filter=lfs diff=lfs merge=lfs -text
+*.bz2 filter=lfs diff=lfs merge=lfs -text
+*.ckpt filter=lfs diff=lfs merge=lfs -text
+*.ftz filter=lfs diff=lfs merge=lfs -text
+*.gz filter=lfs diff=lfs merge=lfs -text
+*.h5 filter=lfs diff=lfs merge=lfs -text
+*.joblib filter=lfs diff=lfs merge=lfs -text
+*.lfs.* filter=lfs diff=lfs merge=lfs -text
+*.mlmodel filter=lfs diff=lfs merge=lfs -text
+*.model filter=lfs diff=lfs merge=lfs -text
+*.msgpack filter=lfs diff=lfs merge=lfs -text
+*.npy filter=lfs diff=lfs merge=lfs -text
+*.npz filter=lfs diff=lfs merge=lfs -text
+*.onnx filter=lfs diff=lfs merge=lfs -text
+*.ot filter=lfs diff=lfs merge=lfs -text
+*.parquet filter=lfs diff=lfs merge=lfs -text
+*.pb filter=lfs diff=lfs merge=lfs -text
+*.pickle filter=lfs diff=lfs merge=lfs -text
+*.pkl filter=lfs diff=lfs merge=lfs -text
+*.pt filter=lfs diff=lfs merge=lfs -text
+*.pth filter=lfs diff=lfs merge=lfs -text
+*.rar filter=lfs diff=lfs merge=lfs -text
+*.safetensors filter=lfs diff=lfs merge=lfs -text
+saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+*.tar.* filter=lfs diff=lfs merge=lfs -text
+*.tar filter=lfs diff=lfs merge=lfs -text
+*.tflite filter=lfs diff=lfs merge=lfs -text
+*.tgz filter=lfs diff=lfs merge=lfs -text
+*.wasm filter=lfs diff=lfs merge=lfs -text
+*.xz filter=lfs diff=lfs merge=lfs -text
+*.zip filter=lfs diff=lfs merge=lfs -text
+*.zst filter=lfs diff=lfs merge=lfs -text
+*tfevents* filter=lfs diff=lfs merge=lfs -text
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..d54acda
--- /dev/null
+++ b/README.md
@@ -0,0 +1,48 @@
+---
+license: apache-2.0
+
+---
+# Model Card for Mistral-7B-32K-PoSE
+The Mistral-7B-32K-PoSE Large Language Model (LLM) is a PoSE trained Mistral 7B for 32k context length.
+The PoSE technique of extending the context length of Mistral-7B outperforms the passkey retrieval task with only a marginal impact on the standard benchmarks.
+
+For full details of this model please read our [release blog post](https://superagi.com/extending-context-window-of-a-7b-llm-from-8k-to-32k-using-pose-positional-skip-wise).
+
+# Results
+## PassKey retrieval
+
+<img src="https://cdn-uploads.huggingface.co/production/uploads/655b8d65a8ec3f330f2089c8/Ke8Ge8Xcw6A53PRnXjFE8.png" alt="Alt text" width="700" height="700">
+
+The evaluation focuses on their effectiveness in passkey retrieval, highlighting the impact of varying context lengths on the models ability to extract crucial information. Our model excels in information extraction, capable of handling context lengths up to 32k, surpassing the limitations of the original Mistral7B model which could pass the test cases only if the context window was under 8k.
+
+## Standard Benchmarking
+
+<img src="https://cdn-uploads.huggingface.co/production/uploads/655b8d65a8ec3f330f2089c8/9AAOxpyoCh0UOFHIO8YkV.png" alt="Alt text" width="700" height="200">
+Our model achieves an extension to 32k while only experiencing a marginal impact on the standard benchmark accuracy. This demonstrates a commendable ability to handle longer contexts without significantly compromising overall performance.
+
+## Run the model
+
+```python
+from transformers import AutoTokenizer
+from my_modeling_mistral import MistralForCausalLM
+
+model_id = "SuperAGI/mistral-7B-PoSE-32k"
+tokenizer = AutoTokenizer.from_pretrained(model_id)
+
+model = MistralForCausalLM.from_pretrained(model_id)
+
+text = "Hello my name is"
+inputs = tokenizer(text, return_tensors="pt")
+
+outputs = model.generate(**inputs, max_new_tokens=20)
+print(tokenizer.decode(outputs[0], skip_special_tokens=True))
+```
+
+
+## Limitations
+
+The Mistral-7B-32K-PoSE model is a demonstration that the context length can be extended without losing much on the performance.
+It does not have any moderation mechanisms. The model is not suitable for production usage as it doesn't have guardrails for toxicity, societal bias, and language limitations. We would love to collaborate with the community to build safer and better models.
+
+## The SuperAGI AI Team
+Ishaan Bhola, Mukunda NS, Rajat Chawla, Anmol Gautam, Arkajit Datta, Ayush Vatsal, Sukrit Chatterjee, Adarsh Jha, Adarsh Deep, Abhijeet Sinha, Rakesh Krishna.
\ No newline at end of file
diff --git a/added_tokens.json b/added_tokens.json
new file mode 100644
index 0000000..e41416d
--- /dev/null
+++ b/added_tokens.json
@@ -0,0 +1,3 @@
+{
+  "[PAD]": 32000
+}
diff --git a/config.json b/config.json
new file mode 100644
index 0000000..0fd7c9b
--- /dev/null
+++ b/config.json
@@ -0,0 +1,30 @@
+{
+  "_name_or_path": "mistralai/Mistral-7B-v0.1",
+  "architectures": [
+    "MistralForCausalLM"
+  ],
+  "bos_token_id": 1,
+  "eos_token_id": 2,
+  "hidden_act": "silu",
+  "hidden_size": 4096,
+  "initializer_range": 0.02,
+  "intermediate_size": 14336,
+  "max_position_embeddings": 32768,
+  "model_type": "mistral",
+  "num_attention_heads": 32,
+  "num_hidden_layers": 32,
+  "num_key_value_heads": 8,
+  "rms_norm_eps": 1e-05,
+  "rope_scaling": {
+    "factor": 8.0,
+    "original_max_position_embeddings": 4096,
+    "type": "yarn"
+  },
+  "rope_theta": 10000.0,
+  "sliding_window": 4096,
+  "tie_word_embeddings": false,
+  "torch_dtype": "float16",
+  "transformers_version": "4.35.0",
+  "use_cache": false,
+  "vocab_size": 32001
+}
diff --git a/generation_config.json b/generation_config.json
new file mode 100644
index 0000000..38dd6f7
--- /dev/null
+++ b/generation_config.json
@@ -0,0 +1,6 @@
+{
+  "_from_model_config": true,
+  "bos_token_id": 1,
+  "eos_token_id": 2,
+  "transformers_version": "4.35.0"
+}
diff --git a/model-00001-of-00003.safetensors b/model-00001-of-00003.safetensors
new file mode 100644
index 0000000..be68e02
--- /dev/null
+++ b/model-00001-of-00003.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:bb4bf787a14c97d4a8dad27f231313f6b28af9f7f24ebe5a38e52627d3fa7371
+size 4943170432
diff --git a/model-00002-of-00003.safetensors b/model-00002-of-00003.safetensors
new file mode 100644
index 0000000..3177f49
--- /dev/null
+++ b/model-00002-of-00003.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b3f4afbfbb4fbbc9a956513c32257cc883ca727dc6f0674fa4673e5db3e58095
+size 4999819232
diff --git a/model-00003-of-00003.safetensors b/model-00003-of-00003.safetensors
new file mode 100644
index 0000000..81aa3c0
--- /dev/null
+++ b/model-00003-of-00003.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a500aa07f4547e4886c7ec307c4ca508b881d1be6e2617677f2762d79152bead
+size 4540524448
diff --git a/model.safetensors.index.json b/model.safetensors.index.json
new file mode 100644
index 0000000..74703d2
--- /dev/null
+++ b/model.safetensors.index.json
@@ -0,0 +1,298 @@
+{
+  "metadata": {
+    "total_size": 14483480576
+  },
+  "weight_map": {
+    "lm_head.weight": "model-00003-of-00003.safetensors",
+    "model.embed_tokens.weight": "model-00001-of-00003.safetensors",
+    "model.layers.0.input_layernorm.weight": "model-00001-of-00003.safetensors",
+    "model.layers.0.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.0.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
+    "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.1.input_layernorm.weight": "model-00001-of-00003.safetensors",
+    "model.layers.1.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.1.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
+    "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.10.input_layernorm.weight": "model-00002-of-00003.safetensors",
+    "model.layers.10.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.10.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.10.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.10.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
+    "model.layers.10.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.10.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.10.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.10.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.11.input_layernorm.weight": "model-00002-of-00003.safetensors",
+    "model.layers.11.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.11.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.11.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.11.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
+    "model.layers.11.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.11.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.11.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.11.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.12.input_layernorm.weight": "model-00002-of-00003.safetensors",
+    "model.layers.12.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.12.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.12.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.12.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
+    "model.layers.12.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.12.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.12.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.12.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.13.input_layernorm.weight": "model-00002-of-00003.safetensors",
+    "model.layers.13.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.13.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.13.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.13.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
+    "model.layers.13.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.13.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.13.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.13.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.14.input_layernorm.weight": "model-00002-of-00003.safetensors",
+    "model.layers.14.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.14.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.14.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.14.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
+    "model.layers.14.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.14.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.14.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.14.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.15.input_layernorm.weight": "model-00002-of-00003.safetensors",
+    "model.layers.15.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.15.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.15.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.15.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
+    "model.layers.15.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.15.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.15.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.15.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.16.input_layernorm.weight": "model-00002-of-00003.safetensors",
+    "model.layers.16.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.16.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.16.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.16.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
+    "model.layers.16.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.16.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.16.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.16.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.17.input_layernorm.weight": "model-00002-of-00003.safetensors",
+    "model.layers.17.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.17.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.17.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.17.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
+    "model.layers.17.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.17.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.17.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.17.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.18.input_layernorm.weight": "model-00002-of-00003.safetensors",
+    "model.layers.18.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.18.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.18.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.18.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
+    "model.layers.18.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.18.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.18.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.18.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.19.input_layernorm.weight": "model-00002-of-00003.safetensors",
+    "model.layers.19.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.19.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.19.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.19.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
+    "model.layers.19.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.19.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.19.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.19.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.2.input_layernorm.weight": "model-00001-of-00003.safetensors",
+    "model.layers.2.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.2.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
+    "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.20.input_layernorm.weight": "model-00002-of-00003.safetensors",
+    "model.layers.20.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.20.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.20.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.20.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
+    "model.layers.20.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.20.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.20.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.20.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.21.input_layernorm.weight": "model-00002-of-00003.safetensors",
+    "model.layers.21.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.21.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.21.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.21.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
+    "model.layers.21.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.21.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.21.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.21.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.22.input_layernorm.weight": "model-00003-of-00003.safetensors",
+    "model.layers.22.mlp.down_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.22.mlp.gate_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.22.mlp.up_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.22.post_attention_layernorm.weight": "model-00003-of-00003.safetensors",
+    "model.layers.22.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.22.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.22.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.22.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.23.input_layernorm.weight": "model-00003-of-00003.safetensors",
+    "model.layers.23.mlp.down_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.23.mlp.gate_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.23.mlp.up_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.23.post_attention_layernorm.weight": "model-00003-of-00003.safetensors",
+    "model.layers.23.self_attn.k_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.23.self_attn.o_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.23.self_attn.q_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.23.self_attn.v_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.24.input_layernorm.weight": "model-00003-of-00003.safetensors",
+    "model.layers.24.mlp.down_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.24.mlp.gate_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.24.mlp.up_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.24.post_attention_layernorm.weight": "model-00003-of-00003.safetensors",
+    "model.layers.24.self_attn.k_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.24.self_attn.o_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.24.self_attn.q_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.24.self_attn.v_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.25.input_layernorm.weight": "model-00003-of-00003.safetensors",
+    "model.layers.25.mlp.down_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.25.mlp.gate_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.25.mlp.up_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.25.post_attention_layernorm.weight": "model-00003-of-00003.safetensors",
+    "model.layers.25.self_attn.k_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.25.self_attn.o_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.25.self_attn.q_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.25.self_attn.v_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.26.input_layernorm.weight": "model-00003-of-00003.safetensors",
+    "model.layers.26.mlp.down_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.26.mlp.gate_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.26.mlp.up_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.26.post_attention_layernorm.weight": "model-00003-of-00003.safetensors",
+    "model.layers.26.self_attn.k_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.26.self_attn.o_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.26.self_attn.q_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.26.self_attn.v_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.27.input_layernorm.weight": "model-00003-of-00003.safetensors",
+    "model.layers.27.mlp.down_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.27.mlp.gate_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.27.mlp.up_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.27.post_attention_layernorm.weight": "model-00003-of-00003.safetensors",
+    "model.layers.27.self_attn.k_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.27.self_attn.o_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.27.self_attn.q_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.27.self_attn.v_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.28.input_layernorm.weight": "model-00003-of-00003.safetensors",
+    "model.layers.28.mlp.down_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.28.mlp.gate_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.28.mlp.up_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.28.post_attention_layernorm.weight": "model-00003-of-00003.safetensors",
+    "model.layers.28.self_attn.k_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.28.self_attn.o_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.28.self_attn.q_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.28.self_attn.v_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.29.input_layernorm.weight": "model-00003-of-00003.safetensors",
+    "model.layers.29.mlp.down_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.29.mlp.gate_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.29.mlp.up_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.29.post_attention_layernorm.weight": "model-00003-of-00003.safetensors",
+    "model.layers.29.self_attn.k_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.29.self_attn.o_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.29.self_attn.q_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.29.self_attn.v_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.3.input_layernorm.weight": "model-00001-of-00003.safetensors",
+    "model.layers.3.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.3.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.3.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.3.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
+    "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.30.input_layernorm.weight": "model-00003-of-00003.safetensors",
+    "model.layers.30.mlp.down_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.30.mlp.gate_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.30.mlp.up_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.30.post_attention_layernorm.weight": "model-00003-of-00003.safetensors",
+    "model.layers.30.self_attn.k_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.30.self_attn.o_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.30.self_attn.q_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.30.self_attn.v_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.31.input_layernorm.weight": "model-00003-of-00003.safetensors",
+    "model.layers.31.mlp.down_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.31.mlp.gate_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.31.mlp.up_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.31.post_attention_layernorm.weight": "model-00003-of-00003.safetensors",
+    "model.layers.31.self_attn.k_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.31.self_attn.o_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.31.self_attn.q_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.31.self_attn.v_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.4.input_layernorm.weight": "model-00001-of-00003.safetensors",
+    "model.layers.4.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.4.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.4.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.4.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
+    "model.layers.4.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.4.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.4.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.4.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.5.input_layernorm.weight": "model-00001-of-00003.safetensors",
+    "model.layers.5.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.5.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.5.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.5.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
+    "model.layers.5.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.5.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.5.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.5.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.6.input_layernorm.weight": "model-00001-of-00003.safetensors",
+    "model.layers.6.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.6.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.6.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.6.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
+    "model.layers.6.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.6.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.6.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.6.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.7.input_layernorm.weight": "model-00001-of-00003.safetensors",
+    "model.layers.7.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.7.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.7.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.7.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
+    "model.layers.7.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.7.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.7.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.7.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.8.input_layernorm.weight": "model-00001-of-00003.safetensors",
+    "model.layers.8.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.8.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.8.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.8.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
+    "model.layers.8.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.8.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.8.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.8.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.9.input_layernorm.weight": "model-00001-of-00003.safetensors",
+    "model.layers.9.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.9.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.9.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.9.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
+    "model.layers.9.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.9.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.9.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.9.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
+    "model.norm.weight": "model-00003-of-00003.safetensors"
+  }
+}
diff --git a/my_configuration_mistral.py b/my_configuration_mistral.py
new file mode 100644
index 0000000..3749722
--- /dev/null
+++ b/my_configuration_mistral.py
@@ -0,0 +1,183 @@
+# coding=utf-8
+# Modification Copyright 2023 Dawei Zhu
+# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved.
+#
+# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
+# and OPT implementations in this library. It has been modified from its
+# original forms to accommodate minor architectural differences compared
+# to GPT-NeoX and OPT used by the Meta AI team that trained the model.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+""" Mistral model configuration"""
+
+# from ...configuration_utils import PretrainedConfig
+# from ...utils import logging
+from transformers.configuration_utils import PretrainedConfig
+from transformers.utils import logging
+
+logger = logging.get_logger(__name__)
+
+# MISTRAL_PRETRAINED_CONFIG_ARCHIVE_MAP = {
+#     "mistralai/Mistral-7B-v0.1": "https://huggingface.co/mistralai/Mistral-7B-v0.1/resolve/main/config.json",
+#     "mistralai/Mistral-7B-Instruct-v0.1": "https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.1/resolve/main/config.json",
+# }
+# MISTRAL_PRETRAINED_CONFIG_ARCHIVE_MAP = {}
+
+class MistralConfig(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`MistralModel`]. It is used to instantiate an Mistral
+    model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
+    defaults will yield a similar configuration to that of the Mistral-7B.
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+
+
+    Args:
+        vocab_size (`int`, *optional*, defaults to 32000):
+            Vocabulary size of the Mistral model. Defines the number of different tokens that can be represented by the
+            `inputs_ids` passed when calling [`MistralModel`]
+        hidden_size (`int`, *optional*, defaults to 4096):
+            Dimension of the hidden representations.
+        intermediate_size (`int`, *optional*, defaults to 11008):
+            Dimension of the MLP representations.
+        num_hidden_layers (`int`, *optional*, defaults to 32):
+            Number of hidden layers in the Transformer encoder.
+        num_attention_heads (`int`, *optional*, defaults to 32):
+            Number of attention heads for each attention layer in the Transformer encoder.
+        num_key_value_heads (`int`, *optional*):
+            This is the number of key_value heads that should be used to implement Grouped Query Attention. If
+            `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
+            `num_key_value_heads=1 the model will use Multi Query Attention (MQA) otherwise GQA is used. When
+            converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
+            by meanpooling all the original heads within that group. For more details checkout [this
+            paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to
+            `num_attention_heads`.
+        pretraining_tp (`int`, *optional*, defaults to `1`):
+            Experimental feature. Tensor parallelism rank used during pretraining. Please refer to [this
+            document](https://huggingface.co/docs/transformers/parallelism) to understand more about it. This value is
+            necessary to ensure exact reproducibility of the pretraining results. Please refer to [this
+            issue](https://github.com/pytorch/pytorch/issues/76232).
+        hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
+            The non-linear activation function (function or string) in the decoder.
+        max_position_embeddings (`int`, *optional*, defaults to 2048):
+            The maximum sequence length that this model might ever be used with. Typically set this to something large
+            just in case (e.g., 512 or 1024 or 2048).
+        initializer_range (`float`, *optional*, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+        rms_norm_eps (`float`, *optional*, defaults to 1e-12):
+            The epsilon used by the rms normalization layers.
+        use_cache (`bool`, *optional*, defaults to `True`):
+            Whether or not the model should return the last key/values attentions (not used by all models). Only
+            relevant if `config.is_decoder=True`.
+        tie_word_embeddings(`bool`, *optional*, defaults to `False`):
+            Whether to tie weight embeddings
+        rope_scaling (`Dict`, *optional*):
+            Dictionary containing the scaling configuration for the RoPE embeddings. Currently supports three scaling
+            strategies: linear and dynamic. Their scaling factor must be an float greater than 1. The expected format
+            is `{"type": strategy name, "factor": scaling factor}`. When using this flag, don't update
+            `max_position_embeddings` to the expected new maximum. See the following thread for more information on how
+            these scaling strategies behave:
+            https://www.reddit.com/r/LocalMistral/comments/14mrgpr/dynamically_scaled_rope_further_increases/. This is an
+            experimental feature, subject to breaking API changes in future versions.
+
+        Example:
+
+    ```python
+    >>> from transformers import MistralModel, MistralConfig
+
+    >>> # Initializing a Mistral Mistral-7b style configuration
+    >>> configuration = MistralConfig()
+
+    >>> # Initializing a model from the Mistral-7b style configuration
+    >>> model = MistralModel(configuration)
+
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
+    model_type = "mistral"
+    keys_to_ignore_at_inference = ["past_key_values"]
+
+    def __init__(
+        self,
+        vocab_size=32000,
+        hidden_size=4096,
+        intermediate_size=14336,
+        num_hidden_layers=32,
+        num_attention_heads=32,
+        num_key_value_heads=8,
+        hidden_act="silu",
+        max_position_embeddings=2048,
+        initializer_range=0.02,
+        rms_norm_eps=1e-6,
+        use_cache=True,
+        pad_token_id=None,
+        bos_token_id=1,
+        eos_token_id=2,
+        pretraining_tp=1,
+        tie_word_embeddings=False,
+        rope_scaling=None,
+        rope_theta=10000.0,
+        sliding_window=4096,
+        **kwargs,
+    ):
+        self.vocab_size = vocab_size
+        self.max_position_embeddings = max_position_embeddings
+        self.hidden_size = hidden_size
+        self.intermediate_size = intermediate_size
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+        self.sliding_window = sliding_window
+
+
+        # for backward compatibility
+        if num_key_value_heads is None:
+            num_key_value_heads = num_attention_heads
+
+        self.num_key_value_heads = num_key_value_heads
+        self.hidden_act = hidden_act
+        self.initializer_range = initializer_range
+        self.rms_norm_eps = rms_norm_eps
+        self.use_cache = use_cache
+        self.rope_scaling = rope_scaling
+        self._rope_scaling_validation()
+        self.rope_theta = rope_theta
+
+        super().__init__(
+            pad_token_id=pad_token_id,
+            bos_token_id=bos_token_id,
+            eos_token_id=eos_token_id,
+            tie_word_embeddings=tie_word_embeddings,
+            **kwargs,
+        )
+
+    def _rope_scaling_validation(self):
+        """
+        Validate the `rope_scaling` configuration.
+        """
+        if self.rope_scaling is None:
+            return
+
+        if not isinstance(self.rope_scaling, dict):
+            raise ValueError(
+                "`rope_scaling` must be a dictionary with with two fields, `name` and `factor`, "
+                f"got {self.rope_scaling}"
+            )
+        rope_scaling_type = self.rope_scaling.get("type", None)
+        rope_scaling_factor = self.rope_scaling.get("factor", None)
+        if rope_scaling_type is None or rope_scaling_type not in ["linear", "dynamic", "vanilla_ntk", "yarn"]:
+            raise ValueError(
+                f"`rope_scaling`'s name field must be one of ['linear', 'dynamic', 'vanilla_ntk', 'yarn'], got {rope_scaling_type}"
+            )
+        if rope_scaling_factor is None or not isinstance(rope_scaling_factor, float) or rope_scaling_factor <= 1.0:
+            raise ValueError(f"`rope_scaling`'s factor field must be an float > 1, got {rope_scaling_factor}")
diff --git a/my_modeling_mistral.py b/my_modeling_mistral.py
new file mode 100644
index 0000000..56fdd6c
--- /dev/null
+++ b/my_modeling_mistral.py
@@ -0,0 +1,1187 @@
+# coding=utf-8
+# Copyright 2022 MistralAI and the HuggingFace Inc. team. All rights reserved.
+#
+# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
+# and OPT implementations in this library. It has been modified from its
+# original forms to accommodate minor architectural differences compared
+# to GPT-NeoX and OPT used by the Meta AI team that trained the model.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+""" PyTorch Mistral model."""
+import math
+from typing import List, Optional, Tuple, Union
+from pathlib import Path
+
+import numpy as np
+import torch
+import torch.nn.functional as F
+import torch.utils.checkpoint
+from torch import nn
+from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
+
+from transformers.activations import ACT2FN
+from transformers.modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast, SequenceClassifierOutputWithPast
+from transformers.modeling_utils import PreTrainedModel
+from transformers.utils import add_start_docstrings, add_start_docstrings_to_model_forward, logging, replace_return_docstrings
+
+from transformers.models.mistral.configuration_mistral import MistralConfig
+
+
+
+logger = logging.get_logger(__name__)
+
+_CONFIG_FOR_DOC = "MistralConfig"
+
+
+# Copied from transformers.models.bart.modeling_bart._make_causal_mask
+def _make_causal_mask(
+    input_ids_shape: torch.Size, dtype: torch.dtype, device: torch.device, past_key_values_length: int = 0
+):
+    """
+    Make causal mask used for bi-directional self-attention.
+    """
+    bsz, tgt_len = input_ids_shape
+    mask = torch.full((tgt_len, tgt_len), torch.finfo(dtype).min, device=device)
+    mask_cond = torch.arange(mask.size(-1), device=device)
+    mask.masked_fill_(mask_cond < (mask_cond + 1).view(mask.size(-1), 1), 0)
+    mask = mask.to(dtype)
+
+    if past_key_values_length > 0:
+        mask = torch.cat([torch.zeros(tgt_len, past_key_values_length, dtype=dtype, device=device), mask], dim=-1)
+    return mask[None, None, :, :].expand(bsz, 1, tgt_len, tgt_len + past_key_values_length)
+
+
+# Copied from transformers.models.bart.modeling_bart._expand_mask
+def _expand_mask(mask: torch.Tensor, dtype: torch.dtype, tgt_len: Optional[int] = None):
+    """
+    Expands attention_mask from `[bsz, seq_len]` to `[bsz, 1, tgt_seq_len, src_seq_len]`.
+    """
+    bsz, src_len = mask.size()
+    tgt_len = tgt_len if tgt_len is not None else src_len
+
+    expanded_mask = mask[:, None, None, :].expand(bsz, 1, tgt_len, src_len).to(dtype)
+
+    inverted_mask = 1.0 - expanded_mask
+
+    return inverted_mask.masked_fill(inverted_mask.to(torch.bool), torch.finfo(dtype).min)
+
+def _make_sliding_window_causal_mask(
+    input_ids_shape: torch.Size,
+    dtype: torch.dtype,
+    device: torch.device,
+    past_key_values_length: int = 0,
+    sliding_window: int = 4096,
+):
+    """
+    Make causal mask used for sliding window attention
+    """
+    bsz, tgt_len = input_ids_shape
+    tensor = torch.full(
+        (tgt_len, tgt_len),
+        fill_value=1,
+        device=device,
+    )
+    mask = torch.tril(tensor, diagonal=0)
+    # make the mask banded to account for sliding window
+    mask = torch.triu(mask, diagonal=-sliding_window)
+    mask = torch.log(mask).to(dtype)
+    if past_key_values_length > 0:
+        mask = torch.cat([torch.zeros(tgt_len, past_key_values_length, dtype=dtype, device=device), mask], dim=-1)
+    return mask[None, None, :, :].expand(bsz, 1, tgt_len, tgt_len + past_key_values_length)
+    
+
+# Copied from transformers.models.llama.modeling_llama.LlamaRMSNorm with Llama->Mistral
+class MistralRMSNorm(torch.nn.Module):
+    def __init__(self, hidden_size, eps=1e-6):
+        """
+        MistralRMSNorm is equivalent to T5LayerNorm
+        """
+        super().__init__()
+        self.weight = nn.Parameter(torch.ones(hidden_size))
+        self.variance_epsilon = eps
+
+    def forward(self, hidden_states):
+        input_dtype = hidden_states.dtype
+        hidden_states = hidden_states.to(torch.float32)
+        variance = hidden_states.pow(2).mean(-1, keepdim=True)
+        hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)
+        return self.weight * hidden_states.to(input_dtype)
+
+
+# Copied from transformers.models.llama.modeling_llama.LlamaRotaryEmbedding with Llama->Mistral
+class MistralRotaryEmbedding(torch.nn.Module):
+    def __init__(self, dim, max_position_embeddings=4096, base=10000, device=None):
+        super().__init__()
+
+        self.dim = dim
+        self.max_position_embeddings = max_position_embeddings
+        self.base = base
+        inv_freq = 1.0 / (self.base ** (torch.arange(0, self.dim, 2).float().to(device) / self.dim))
+        self.register_buffer("inv_freq", inv_freq, persistent=False)
+
+        # Build here to make `torch.jit.trace` work.
+        self._set_cos_sin_cache(
+            seq_len=max_position_embeddings, device=self.inv_freq.device, dtype=torch.get_default_dtype()
+        )
+
+    def _set_cos_sin_cache(self, seq_len, device, dtype):
+        self.max_seq_len_cached = seq_len
+        # t = torch.arange(self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype)
+        t = np.arange(self.max_seq_len_cached, dtype=np.float64)
+        t = torch.tensor(t, device=self.inv_freq.device, dtype=torch.float64)
+
+        # freqs = torch.einsum("i,j->ij", t, self.inv_freq)
+        freqs = torch.outer(t, self.inv_freq.to(device=t.device).to(t.dtype))
+        # Different from paper, but it uses a different permutation in order to obtain the same calculation
+        emb = torch.cat((freqs, freqs), dim=-1)
+        self.register_buffer("cos_cached", emb.cos()[None, None, :, :].to(dtype), persistent=False)
+        self.register_buffer("sin_cached", emb.sin()[None, None, :, :].to(dtype), persistent=False)
+
+    def forward(self, x, seq_len=None):
+        # x: [bs, num_attention_heads, seq_len, head_size]
+        if seq_len > self.max_seq_len_cached:
+            self._set_cos_sin_cache(seq_len=seq_len, device=x.device, dtype=x.dtype)
+
+        return (
+            self.cos_cached[:, :, :, ...].to(dtype=x.dtype),
+            self.sin_cached[:, :, :, ...].to(dtype=x.dtype),
+        )
+
+class MistralLinearScalingRotaryEmbedding(MistralRotaryEmbedding):
+    """MistralRotaryEmbedding extended with linear scaling. Credits to the Reddit user /u/kaiokendev"""
+
+    def __init__(self, dim, max_position_embeddings=4096, base=10000, device=None, scaling_factor=1.0):
+        self.scaling_factor = scaling_factor
+        super().__init__(dim, max_position_embeddings, base, device)
+
+    def _set_cos_sin_cache(self, seq_len, device, dtype):
+        self.max_seq_len_cached = seq_len
+
+        t = np.arange(self.max_seq_len_cached, dtype=np.float64)
+        t = t / self.scaling_factor
+        t = torch.tensor(t, device=self.inv_freq.device, dtype=torch.float64)
+
+        freqs = torch.outer(t, self.inv_freq.to(device=t.device).to(t.dtype))
+        # Different from paper, but it uses a different permutation in order to obtain the same calculation
+        emb = torch.cat((freqs, freqs), dim=-1)
+        self.register_buffer("cos_cached", emb.cos()[None, None, :, :].to(dtype), persistent=False)
+        self.register_buffer("sin_cached", emb.sin()[None, None, :, :].to(dtype), persistent=False)
+
+class MistralVanillaNTKScalingRotaryEmbedding(MistralRotaryEmbedding):
+
+    def __init__(self, dim, max_position_embeddings=4096, base=10000, device=None, scaling_factor=1.0):
+        self.scaling_factor = scaling_factor
+        super().__init__(dim, max_position_embeddings, base, device)
+
+    def _set_cos_sin_cache(self, seq_len, device, dtype):
+        self.max_seq_len_cached = seq_len
+
+        base = self.base * self.scaling_factor ** (self.dim / (self.dim - 2))
+        inv_freq = 1.0 / (base ** (torch.arange(0, self.dim, 2).float().to(device) / self.dim))
+        self.register_buffer("inv_freq", inv_freq, persistent=False)
+
+        t = np.arange(self.max_seq_len_cached, dtype=np.float64)
+        t = torch.tensor(t, device=self.inv_freq.device, dtype=torch.float64)
+
+        # freqs = torch.einsum("i,j->ij", t, self.inv_freq)
+        freqs = torch.outer(t, self.inv_freq.to(device=t.device).to(t.dtype))
+        # Different from paper, but it uses a different permutation in order to obtain the same calculation
+        emb = torch.cat((freqs, freqs), dim=-1)
+        self.register_buffer("cos_cached", emb.cos()[None, None, :, :].to(dtype), persistent=False)
+        self.register_buffer("sin_cached", emb.sin()[None, None, :, :].to(dtype), persistent=False)
+
+class MistralDynamicNTKScalingRotaryEmbedding(MistralRotaryEmbedding):
+    """MistralRotaryEmbedding extended with Dynamic NTK scaling. Credits to the Reddit users /u/bloc97 and /u/emozilla"""
+
+    def __init__(self, dim, max_position_embeddings=4096, base=10000, device=None, scaling_factor=1.0):
+        self.scaling_factor = scaling_factor
+        super().__init__(dim, max_position_embeddings, base, device)
+
+    def _set_cos_sin_cache(self, seq_len, device, dtype):
+        self.max_seq_len_cached = seq_len
+
+        if seq_len > self.max_position_embeddings:
+            base = self.base * (
+                (self.scaling_factor * seq_len / self.max_position_embeddings) - (self.scaling_factor - 1)
+            ) ** (self.dim / (self.dim - 2))
+            inv_freq = 1.0 / (base ** (torch.arange(0, self.dim, 2).float().to(device) / self.dim))
+            self.register_buffer("inv_freq", inv_freq, persistent=False)
+
+        t = torch.arange(self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype)
+
+        freqs = torch.einsum("i,j->ij", t, self.inv_freq)
+        # Different from paper, but it uses a different permutation in order to obtain the same calculation
+        emb = torch.cat((freqs, freqs), dim=-1)
+        self.register_buffer("cos_cached", emb.cos()[None, None, :, :].to(dtype), persistent=False)
+        self.register_buffer("sin_cached", emb.sin()[None, None, :, :].to(dtype), persistent=False)
+
+
+# Inverse dim formula to find dim based on number of rotations
+def _yarn_find_correction_dim(num_rotations, dim, base=10000, max_position_embeddings=4096):
+    return (dim * math.log(max_position_embeddings/(num_rotations * 2 * math.pi)))/(2 * math.log(base))
+
+# Find dim range bounds based on rotations
+def _yarn_find_correction_range(low_rot, high_rot, dim, base=10000, max_position_embeddings=4096):
+    low = math.floor(_yarn_find_correction_dim(
+        low_rot, dim, base, max_position_embeddings))
+    high = math.ceil(_yarn_find_correction_dim(
+        high_rot, dim, base, max_position_embeddings))
+    return max(low, 0), min(high, dim-1)  # Clamp values just in case
+
+def _yarn_linear_ramp_mask(min, max, dim):
+    if min == max:
+        max += 0.001  # Prevent singularity
+
+    linear_func = (torch.arange(dim, dtype=torch.float32) - min) / (max - min)
+    ramp_func = torch.clamp(linear_func, 0, 1)
+    return ramp_func
+
+def _yarn_get_mscale(scale=1):
+    if scale <= 1:
+        return 1.0
+    return 0.07 * math.log(scale) + 1.0
+
+
+class MistralYaRNScaledRotaryEmbedding(torch.nn.Module):
+    def __init__(self, dim, max_position_embeddings=4096, base=10000, scale=1, original_max_position_embeddings=4096, extrapolation_factor=1, attn_factor=1, beta_fast=32, beta_slow=1, finetuned=False, device=None):
+        super().__init__()
+
+        self.dim = dim
+        self.max_position_embeddings = max_position_embeddings
+        self.base = base
+        self.scale = scale
+        self.original_max_position_embeddings = original_max_position_embeddings
+        self.extrapolation_factor = extrapolation_factor
+        self.attn_factor = attn_factor
+        self.beta_fast = beta_fast
+        self.beta_slow = beta_slow
+
+        # self.yarn(device)
+        self.revised_yarn(device)
+
+        # Build here to make `torch.jit.trace` work.
+        self.max_seq_len_cached = max_position_embeddings
+
+        t = np.arange(self.max_seq_len_cached, dtype=np.float64)
+        t = torch.tensor(t, device=self.inv_freq.device, dtype=torch.float64)
+        freqs = torch.outer(t, self.inv_freq.to(device=t.device).to(t.dtype))
+        # t = torch.arange(self.max_seq_len_cached, device=self.inv_freq.device, dtype=self.inv_freq.dtype)
+        # freqs = torch.einsum("i,j->ij", t, self.inv_freq)
+
+        # Different from paper, but it uses a different permutation in order to obtain the same calculation
+        emb = torch.cat((freqs, freqs), dim=-1)
+        dtype = torch.get_default_dtype()
+
+        self.register_buffer("cos_cached", (emb.cos() * self.mscale)[None, None, :, :].to(dtype), persistent=False)
+        self.register_buffer("sin_cached", (emb.sin() * self.mscale)[None, None, :, :].to(dtype), persistent=False)
+
+    def forward(self, x, seq_len=None):
+        # x: [bs, num_attention_heads, seq_len, head_size]
+        # This `if` block is unlikely to be run after we build sin/cos in `__init__`. Keep the logic here just in case.
+        if seq_len > self.max_seq_len_cached:
+            print("*****notice******")
+            self.max_seq_len_cached = seq_len
+
+            t = torch.arange(self.max_seq_len_cached, device=x.device, dtype=self.inv_freq.dtype)
+            freqs = torch.einsum("i,j->ij", t, self.inv_freq)
+            # Different from paper, but it uses a different permutation in order to obtain the same calculation
+            emb = torch.cat((freqs, freqs), dim=-1).to(x.device)
+
+            self.register_buffer("cos_cached", (emb.cos() * self.mscale)[None, None, :, :].to(x.dtype), persistent=False)
+            self.register_buffer("sin_cached", (emb.sin() * self.mscale)[None, None, :, :].to(x.dtype), persistent=False)
+        return (
+            self.cos_cached[:, :, :, ...].to(dtype=x.dtype),
+            self.sin_cached[:, :, :, ...].to(dtype=x.dtype),
+        )
+
+    def yarn(self, device):
+        pos_freqs = self.base ** (torch.arange(0, self.dim, 2).float().to(device) / self.dim)
+        inv_freq_extrapolation = 1.0 / pos_freqs
+        inv_freq_interpolation = 1.0 / (self.scale * pos_freqs)
+
+        low, high = _yarn_find_correction_range(self.beta_fast, self.beta_slow, self.dim, self.base, self.original_max_position_embeddings)
+        inv_freq_mask = (1 - _yarn_linear_ramp_mask(low, high, self.dim // 2).float().to(device)) * self.extrapolation_factor # Get n-d rotational scaling corrected for extrapolation
+        inv_freq = inv_freq_interpolation * (1 - inv_freq_mask) + inv_freq_extrapolation * inv_freq_mask
+
+        self.register_buffer("inv_freq", inv_freq, persistent=False)
+        self.mscale = float(_yarn_get_mscale(self.scale) * self.attn_factor) # Get n-d magnitude scaling corrected for interpolation
+
+    def revised_yarn(self, device):
+        inv_freq = 1.0 / (self.base ** (torch.arange(0, self.dim, 2).float().to(device) / self.dim))
+
+        low, high = _yarn_find_correction_range(self.beta_fast, self.beta_slow, self.dim, self.base, self.original_max_position_embeddings)
+        inv_freq_mask = (1 - _yarn_linear_ramp_mask(low, high, self.dim // 2).float().to(device)) * self.extrapolation_factor
+
+        inv_freq = inv_freq / ((1-inv_freq_mask)*self.scale + inv_freq_mask)
+        self.register_buffer("inv_freq", inv_freq, persistent=False)
+        self.mscale = float(_yarn_get_mscale(self.scale) * self.attn_factor)
+
+        
+
+
+def rotate_half(x):
+    """Rotates half the hidden dims of the input."""
+    x1 = x[..., : x.shape[-1] // 2]
+    x2 = x[..., x.shape[-1] // 2 :]
+    return torch.cat((-x2, x1), dim=-1)
+
+
+def apply_rotary_pos_emb(q, k, cos, sin, position_ids):
+    # The first two dimensions of cos and sin are always 1, so we can `squeeze` them.
+    cos = cos.squeeze(1).squeeze(0)  # [seq_len, dim]
+    sin = sin.squeeze(1).squeeze(0)  # [seq_len, dim]
+    cos = cos[position_ids].unsqueeze(1)  # [bs, 1, seq_len, dim]
+    sin = sin[position_ids].unsqueeze(1)  # [bs, 1, seq_len, dim]
+    q_embed = (q * cos) + (rotate_half(q) * sin)
+    k_embed = (k * cos) + (rotate_half(k) * sin)
+    return q_embed, k_embed
+
+
+class MistralMLP(nn.Module):
+    def __init__(self, config):
+        super().__init__()
+        self.config = config
+        self.hidden_size = config.hidden_size
+        self.intermediate_size = config.intermediate_size
+        self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
+        self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
+        self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=False)
+        self.act_fn = ACT2FN[config.hidden_act]
+
+    def forward(self, x):
+        
+        return self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x))
+
+
+def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
+    """
+    This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
+    num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
+    """
+    batch, num_key_value_heads, slen, head_dim = hidden_states.shape
+    if n_rep == 1:
+        return hidden_states
+    hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim)
+    return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim)
+
+
+class MistralAttention(nn.Module):
+    """Multi-headed attention from 'Attention Is All You Need' paper. Modified to use sliding window attention: Longformer
+    and "Generating Long Sequences with Sparse Transformers"."""
+
+    def __init__(self, config: MistralConfig):
+        super().__init__()
+        self.config = config
+        self.hidden_size = config.hidden_size
+        self.num_heads = config.num_attention_heads
+        self.head_dim = self.hidden_size // self.num_heads
+        self.num_key_value_heads = config.num_key_value_heads
+        self.num_key_value_groups = self.num_heads // self.num_key_value_heads
+        self.max_position_embeddings = config.max_position_embeddings
+
+        if (self.head_dim * self.num_heads) != self.hidden_size:
+            raise ValueError(
+                f"hidden_size must be divisible by num_heads (got `hidden_size`: {self.hidden_size}"
+                f" and `num_heads`: {self.num_heads})."
+            )
+        self.q_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=False)
+        self.k_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=False)
+        self.v_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=False)
+        self.o_proj = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=False)
+        self._init_rope()
+
+    def _init_rope(self):
+        if self.config.rope_scaling is None:
+            self.rotary_emb = MistralRotaryEmbedding(self.head_dim, max_position_embeddings=self.max_position_embeddings)
+        else:
+            scaling_type = self.config.rope_scaling["type"]
+            scaling_factor = self.config.rope_scaling["factor"]
+            if scaling_type == "linear":
+                self.rotary_emb = MistralLinearScalingRotaryEmbedding(
+                    self.head_dim, max_position_embeddings=self.max_position_embeddings, scaling_factor=scaling_factor
+                )
+            elif scaling_type == "dynamic":
+                self.rotary_emb = MistralDynamicNTKScalingRotaryEmbedding(
+                    self.head_dim, max_position_embeddings=self.max_position_embeddings, scaling_factor=scaling_factor
+                )
+            elif scaling_type == "vanilla_ntk":
+                self.rotary_emb = MistralVanillaNTKScalingRotaryEmbedding(
+                    self.head_dim, max_position_embeddings=self.max_position_embeddings, scaling_factor=scaling_factor
+                )
+            elif scaling_type == "yarn":
+                original_max_position_embeddings = self.config.rope_scaling["original_max_position_embeddings"]
+                self.rotary_emb = MistralYaRNScaledRotaryEmbedding(
+                    self.head_dim, max_position_embeddings=self.max_position_embeddings, scale=scaling_factor, original_max_position_embeddings=original_max_position_embeddings
+                )
+            else:
+                raise ValueError(f"Unknown RoPE scaling type {scaling_type}")
+
+    def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int):
+        return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous()
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_value: Optional[Tuple[torch.Tensor]] = None,
+        output_attentions: bool = False,
+        use_cache: bool = False,
+    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
+        bsz, q_len, _ = hidden_states.size()
+
+        query_states = self.q_proj(hidden_states)
+        key_states = self.k_proj(hidden_states)
+        value_states = self.v_proj(hidden_states)
+
+        query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
+        key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
+        value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
+
+        kv_seq_len = key_states.shape[-2]
+        if past_key_value is not None:
+            kv_seq_len += past_key_value[0].shape[-2]
+        cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len)
+        query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids)
+        have_past_key_value = past_key_value is not None
+        if past_key_value is not None:
+            # reuse k, v, self_attention
+            key_states = torch.cat([past_key_value[0], key_states], dim=2)
+            value_states = torch.cat([past_key_value[1], value_states], dim=2)
+
+        past_key_value = (key_states, value_states) if use_cache else None
+
+        # repeat k/v heads if n_kv_heads < n_heads
+        key_states = repeat_kv(key_states, self.num_key_value_groups)
+        value_states = repeat_kv(value_states, self.num_key_value_groups)
+
+        
+        use_xformer = True
+
+        if not use_xformer or have_past_key_value:
+            attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim)
+
+            if attn_weights.size() != (bsz, self.num_heads, q_len, kv_seq_len):
+                raise ValueError(
+                    f"Attention weights should be of size {(bsz, self.num_heads, q_len, kv_seq_len)}, but is"
+                    f" {attn_weights.size()}"
+                )
+
+            if attention_mask is not None:
+                if attention_mask.size() != (bsz, 1, q_len, kv_seq_len):
+                    raise ValueError(
+                        f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}"
+                    )
+                attn_weights = attn_weights + attention_mask
+
+            # upcast attention to fp32
+            attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype)
+            attn_output = torch.matmul(attn_weights, value_states)
+
+            if attn_output.size() != (bsz, self.num_heads, q_len, self.head_dim):
+                raise ValueError(
+                    f"`attn_output` should be of size {(bsz, self.num_heads, q_len, self.head_dim)}, but is"
+                    f" {attn_output.size()}"
+                )
+
+            attn_output = attn_output.transpose(1, 2).contiguous()
+            attn_output = attn_output.reshape(bsz, q_len, self.hidden_size)
+        else:
+            import xformers.ops as xops
+            attn_weights = None
+            #attn_bias = attention_mask.expand(-1, self.num_heads, -1, -1)
+            attn_bias=xops.LowerTriangularMask()
+            attn_output = xops.memory_efficient_attention(
+                query_states.transpose(1,2), key_states.transpose(1,2), value_states.transpose(1,2),
+                attn_bias=attn_bias,
+                ).reshape(bsz, q_len, self.hidden_size)
+
+
+        attn_output = self.o_proj(attn_output)
+
+        if not output_attentions:
+            attn_weights = None
+
+        return attn_output, attn_weights, past_key_value
+
+
+class MistralDecoderLayer(nn.Module):
+    def __init__(self, config: MistralConfig):
+        super().__init__()
+        self.hidden_size = config.hidden_size
+        self.self_attn = MistralAttention(config=config)
+        self.mlp = MistralMLP(config)
+        self.input_layernorm = MistralRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.post_attention_layernorm = MistralRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_value: Optional[Tuple[torch.Tensor]] = None,
+        output_attentions: Optional[bool] = False,
+        use_cache: Optional[bool] = False,
+    ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]:
+        """
+        Args:
+            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
+            attention_mask (`torch.FloatTensor`, *optional*): attention mask of size
+                `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
+            output_attentions (`bool`, *optional*):
+                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
+                returned tensors for more detail.
+            use_cache (`bool`, *optional*):
+                If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
+                (see `past_key_values`).
+            past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states
+        """
+
+        residual = hidden_states
+
+        hidden_states = self.input_layernorm(hidden_states)
+
+        # Self Attention
+        hidden_states, self_attn_weights, present_key_value = self.self_attn(
+            hidden_states=hidden_states,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            past_key_value=past_key_value,
+            output_attentions=output_attentions,
+            use_cache=use_cache,
+        )
+        hidden_states = residual + hidden_states
+
+        # Fully Connected
+        residual = hidden_states
+        hidden_states = self.post_attention_layernorm(hidden_states)
+        hidden_states = self.mlp(hidden_states)
+        hidden_states = residual + hidden_states
+
+        outputs = (hidden_states,)
+
+        if output_attentions:
+            outputs += (self_attn_weights,)
+
+        if use_cache:
+            outputs += (present_key_value,)
+
+        return outputs
+
+
+MISTRAL_START_DOCSTRING = r"""
+    This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
+    library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
+    etc.)
+
+    This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
+    Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
+    and behavior.
+
+    Parameters:
+        config ([`MistralConfig`]):
+            Model configuration class with all the parameters of the model. Initializing with a config file does not
+            load the weights associated with the model, only the configuration. Check out the
+            [`~PreTrainedModel.from_pretrained`] method to load the model weights.
+"""
+
+
+@add_start_docstrings(
+    "The bare Mistral Model outputting raw hidden-states without any specific head on top.",
+    MISTRAL_START_DOCSTRING,
+)
+class MistralPreTrainedModel(PreTrainedModel):
+    config_class = MistralConfig
+    base_model_prefix = "model"
+    supports_gradient_checkpointing = True
+    _no_split_modules = ["MistralDecoderLayer"]
+    _skip_keys_device_placement = "past_key_values"
+
+    def _init_weights(self, module):
+        std = self.config.initializer_range
+        if isinstance(module, nn.Linear):
+            module.weight.data.normal_(mean=0.0, std=std)
+            if module.bias is not None:
+                module.bias.data.zero_()
+        elif isinstance(module, nn.Embedding):
+            module.weight.data.normal_(mean=0.0, std=std)
+            if module.padding_idx is not None:
+                module.weight.data[module.padding_idx].zero_()
+
+    def _set_gradient_checkpointing(self, module, value=False):
+        if isinstance(module, MistralModel):
+            module.gradient_checkpointing = value
+
+
+MISTRAL_INPUTS_DOCSTRING = r"""
+    Args:
+        input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
+            Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
+            it.
+
+            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
+            [`PreTrainedTokenizer.__call__`] for details.
+
+            [What are input IDs?](../glossary#input-ids)
+        attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
+
+            - 1 for tokens that are **not masked**,
+            - 0 for tokens that are **masked**.
+
+            [What are attention masks?](../glossary#attention-mask)
+
+            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
+            [`PreTrainedTokenizer.__call__`] for details.
+
+            If `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see
+            `past_key_values`).
+
+            If you want to change padding behavior, you should read [`modeling_opt._prepare_decoder_attention_mask`]
+            and modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more
+            information on the default strategy.
+
+            - 1 indicates the head is **not masked**,
+            - 0 indicates the head is **masked**.
+        position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
+            config.n_positions - 1]`.
+
+            [What are position IDs?](../glossary#position-ids)
+        past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
+            Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
+            `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of shape
+            `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`.
+
+            Contains pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention
+            blocks) that can be used (see `past_key_values` input) to speed up sequential decoding.
+
+            If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that
+            don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all
+            `decoder_input_ids` of shape `(batch_size, sequence_length)`.
+        inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
+            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
+            is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
+            model's internal embedding lookup matrix.
+        use_cache (`bool`, *optional*):
+            If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
+            `past_key_values`).
+        output_attentions (`bool`, *optional*):
+            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
+            tensors for more detail.
+        output_hidden_states (`bool`, *optional*):
+            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
+            more detail.
+        return_dict (`bool`, *optional*):
+            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
+"""
+
+
+@add_start_docstrings(
+    "The bare Mistral Model outputting raw hidden-states without any specific head on top.",
+    MISTRAL_START_DOCSTRING,
+)
+class MistralModel(MistralPreTrainedModel):
+    """
+    Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`MistralDecoderLayer`]
+
+    Args:
+        config: MistralConfig
+    """
+
+    def __init__(self, config: MistralConfig):
+        super().__init__(config)
+        self.padding_idx = config.pad_token_id
+        self.vocab_size = config.vocab_size
+
+        self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx)
+        self.layers = nn.ModuleList([MistralDecoderLayer(config) for _ in range(config.num_hidden_layers)])
+        self.norm = MistralRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+
+        self.gradient_checkpointing = False
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    def get_input_embeddings(self):
+        return self.embed_tokens
+
+    def set_input_embeddings(self, value):
+        self.embed_tokens = value
+
+    # Copied from transformers.models.bart.modeling_bart.BartDecoder._prepare_decoder_attention_mask
+    def _prepare_decoder_attention_mask(self, attention_mask, input_shape, inputs_embeds, past_key_values_length, sliding_window):
+        # create causal mask
+        # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
+        combined_attention_mask = None
+        if input_shape[-1] > 1:
+
+            if sliding_window is not None:
+                combined_attention_mask = _make_sliding_window_causal_mask(
+                    input_shape,
+                    inputs_embeds.dtype,
+                    device=inputs_embeds.device,
+                    past_key_values_length=past_key_values_length,
+                    sliding_window=sliding_window,
+                )
+            else:
+                combined_attention_mask = _make_causal_mask(
+                    input_shape,
+                    inputs_embeds.dtype,
+                    device=inputs_embeds.device,
+                    past_key_values_length=past_key_values_length,
+                )
+
+        if attention_mask is not None:
+            # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
+            expanded_attn_mask = _expand_mask(attention_mask, inputs_embeds.dtype, tgt_len=input_shape[-1]).to(
+                inputs_embeds.device
+            )
+            combined_attention_mask = (
+                expanded_attn_mask if combined_attention_mask is None else expanded_attn_mask + combined_attention_mask
+            )
+
+        return combined_attention_mask
+
+    @add_start_docstrings_to_model_forward(MISTRAL_INPUTS_DOCSTRING)
+    def forward(
+        self,
+        input_ids: torch.LongTensor = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[List[torch.FloatTensor]] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[Tuple, BaseModelOutputWithPast]:
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        use_cache = use_cache if use_cache is not None else self.config.use_cache
+
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        # retrieve input_ids and inputs_embeds
+        if input_ids is not None and inputs_embeds is not None:
+            raise ValueError("You cannot specify both decoder_input_ids and decoder_inputs_embeds at the same time")
+        elif input_ids is not None:
+            batch_size, seq_length = input_ids.shape
+        elif inputs_embeds is not None:
+            batch_size, seq_length, _ = inputs_embeds.shape
+        else:
+            raise ValueError("You have to specify either decoder_input_ids or decoder_inputs_embeds")
+
+        seq_length_with_past = seq_length
+        past_key_values_length = 0
+
+        if past_key_values is not None:
+            past_key_values_length = past_key_values[0][0].shape[2]
+            seq_length_with_past = seq_length_with_past + past_key_values_length
+
+        if position_ids is None:
+            device = input_ids.device if input_ids is not None else inputs_embeds.device
+            position_ids = torch.arange(
+                past_key_values_length, seq_length + past_key_values_length, dtype=torch.long, device=device
+            )
+            position_ids = position_ids.unsqueeze(0).view(-1, seq_length)
+        else:
+            position_ids = position_ids.view(-1, seq_length).long()
+
+        if inputs_embeds is None:
+            inputs_embeds = self.embed_tokens(input_ids)
+        # embed positions
+        # if attention_mask is None:
+        #     attention_mask = torch.ones(
+        #         (batch_size, seq_length_with_past), dtype=torch.bool, device=inputs_embeds.device
+        #     )
+
+        if attention_mask is not None:
+            attention_mask = self._prepare_decoder_attention_mask(
+            attention_mask, (batch_size, seq_length), inputs_embeds, past_key_values_length,
+            sliding_window=self.config.sliding_window if hasattr(self.config, "sliding_window") else None,
+        )
+        # elif 0 in attention_mask:
+        #     padding_mask = attention_mask
+        # if (
+        #     padding_mask is not None
+        #     and hasattr(self.config, "_flash_attn_2_enabled")
+        #     and self.config._flash_attn_2_enabled
+        # ):
+        #     is_padding_right = padding_mask[:, -1].sum().item() != batch_size
+        #     if is_padding_right:
+        #         raise ValueError(
+        #             "You are attempting to perform batched generation with padding_side='right'"
+        #             " this may lead to unexpected behaviour for Flash Attention version of Mistral. Make sure to "
+        #             " call `tokenizer.padding_side  = 'left'` before tokenizing the input. "
+        #         )
+
+        # attention_mask = self._prepare_decoder_attention_mask(
+        #     attention_mask,
+        #     (batch_size, seq_length),
+        #     inputs_embeds,
+        #     past_key_values_length,
+        #     sliding_window=self.config.sliding_window if hasattr(self.config, "sliding_window") else None,
+        # )
+        hidden_states = inputs_embeds
+
+        if self.gradient_checkpointing and self.training:
+            if use_cache:
+                logger.warning_once(
+                    "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
+                )
+                use_cache = False
+
+        # decoder layers
+        all_hidden_states = () if output_hidden_states else None
+        all_self_attns = () if output_attentions else None
+        next_decoder_cache = () if use_cache else None
+
+        for idx, decoder_layer in enumerate(self.layers):
+            if output_hidden_states:
+                all_hidden_states += (hidden_states,)
+
+            past_key_value = past_key_values[idx] if past_key_values is not None else None
+
+            if self.gradient_checkpointing and self.training:
+
+                def create_custom_forward(module):
+                    def custom_forward(*inputs):
+                        # None for past_key_value
+                        return module(*inputs, output_attentions, None)
+
+                    return custom_forward
+
+                layer_outputs = torch.utils.checkpoint.checkpoint(
+                    create_custom_forward(decoder_layer),
+                    hidden_states,
+                    attention_mask,
+                    position_ids,
+                    None,
+                )
+            else:
+                layer_outputs = decoder_layer(
+                    hidden_states,
+                    attention_mask=attention_mask,
+                    position_ids=position_ids,
+                    past_key_value=past_key_value,
+                    output_attentions=output_attentions,
+                    use_cache=use_cache,
+                )
+
+            hidden_states = layer_outputs[0]
+
+            if use_cache:
+                next_decoder_cache += (layer_outputs[2 if output_attentions else 1],)
+
+            if output_attentions:
+                all_self_attns += (layer_outputs[1],)
+
+        hidden_states = self.norm(hidden_states)
+
+        # add hidden states from the last decoder layer
+        if output_hidden_states:
+            all_hidden_states += (hidden_states,)
+
+        next_cache = next_decoder_cache if use_cache else None
+        if not return_dict:
+            return tuple(v for v in [hidden_states, next_cache, all_hidden_states, all_self_attns] if v is not None)
+        return BaseModelOutputWithPast(
+            last_hidden_state=hidden_states,
+            past_key_values=next_cache,
+            hidden_states=all_hidden_states,
+            attentions=all_self_attns,
+        )
+
+
+class MistralForCausalLM(MistralPreTrainedModel):
+    _tied_weights_keys = ["lm_head.weight"]
+
+    def __init__(self, config):
+        super().__init__(config)
+        self.model = MistralModel(config)
+        self.vocab_size = config.vocab_size
+        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    def get_input_embeddings(self):
+        return self.model.embed_tokens
+
+    def set_input_embeddings(self, value):
+        self.model.embed_tokens = value
+
+    def get_output_embeddings(self):
+        return self.lm_head
+
+    def set_output_embeddings(self, new_embeddings):
+        self.lm_head = new_embeddings
+
+    def set_decoder(self, decoder):
+        self.model = decoder
+
+    def get_decoder(self):
+        return self.model
+
+    @add_start_docstrings_to_model_forward(MISTRAL_INPUTS_DOCSTRING)
+    @replace_return_docstrings(output_type=CausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC)
+    def forward(
+        self,
+        input_ids: torch.LongTensor = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[List[torch.FloatTensor]] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        labels: Optional[torch.LongTensor] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[Tuple, CausalLMOutputWithPast]:
+        r"""
+        Args:
+            labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+                Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
+                config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
+                (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
+
+        Returns:
+
+        Example:
+
+        ```python
+        >>> from transformers import AutoTokenizer, MistralForCausalLM
+
+        >>> model = MistralForCausalLM.from_pretrained(PATH_TO_CONVERTED_WEIGHTS)
+        >>> tokenizer = AutoTokenizer.from_pretrained(PATH_TO_CONVERTED_TOKENIZER)
+
+        >>> prompt = "Hey, are you conscious? Can you talk to me?"
+        >>> inputs = tokenizer(prompt, return_tensors="pt")
+
+        >>> # Generate
+        >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
+        >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
+        "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you."
+        ```"""
+
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
+        outputs = self.model(
+            input_ids=input_ids,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            past_key_values=past_key_values,
+            inputs_embeds=inputs_embeds,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+
+        hidden_states = outputs[0]
+        
+        logits = self.lm_head(hidden_states)
+        logits = logits.float()
+
+        loss = None
+        if labels is not None:
+            # Shift so that tokens < n predict n
+            shift_logits = logits[..., :-1, :].contiguous()
+            shift_labels = labels[..., 1:].contiguous()
+            # Flatten the tokens
+            loss_fct = CrossEntropyLoss()
+            shift_logits = shift_logits.view(-1, self.config.vocab_size)
+            shift_labels = shift_labels.view(-1)
+            # Enable model parallelism
+            shift_labels = shift_labels.to(shift_logits.device)
+            loss = loss_fct(shift_logits, shift_labels)
+
+        if not return_dict:
+            output = (logits,) + outputs[1:]
+            return (loss,) + output if loss is not None else output
+
+        return CausalLMOutputWithPast(
+            loss=loss,
+            logits=logits,
+            past_key_values=outputs.past_key_values,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+        )
+
+    def prepare_inputs_for_generation(
+        self, input_ids, past_key_values=None, attention_mask=None, inputs_embeds=None, **kwargs
+    ):
+        if past_key_values:
+            input_ids = input_ids[:, -1:]
+
+        position_ids = kwargs.get("position_ids", None)
+        if attention_mask is not None and position_ids is None:
+            # create position_ids on the fly for batch generation
+            position_ids = attention_mask.long().cumsum(-1) - 1
+            position_ids.masked_fill_(attention_mask == 0, 1)
+            if past_key_values:
+                position_ids = position_ids[:, -1].unsqueeze(-1)
+
+        # if `inputs_embeds` are passed, we only want to use them in the 1st generation step
+        if inputs_embeds is not None and past_key_values is None:
+            model_inputs = {"inputs_embeds": inputs_embeds}
+        else:
+            model_inputs = {"input_ids": input_ids}
+
+        model_inputs.update(
+            {
+                "position_ids": position_ids,
+                "past_key_values": past_key_values,
+                "use_cache": kwargs.get("use_cache"),
+                "attention_mask": attention_mask,
+            }
+        )
+        return model_inputs
+
+    @staticmethod
+    def _reorder_cache(past_key_values, beam_idx):
+        reordered_past = ()
+        for layer_past in past_key_values:
+            reordered_past += (
+                tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past),
+            )
+        return reordered_past
+
+
+@add_start_docstrings(
+    """
+    The Mistral Model transformer with a sequence classification head on top (linear layer).
+
+    [`MistralForSequenceClassification`] uses the last token in order to do the classification, as other causal models
+    (e.g. GPT-2) do.
+
+    Since it does classification on the last token, it requires to know the position of the last token. If a
+    `pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in each row. If
+    no `pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot guess the
+    padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take the last value in
+    each row of the batch).
+    """,
+    MISTRAL_START_DOCSTRING,
+)
+class MistralForSequenceClassification(MistralPreTrainedModel):
+    def __init__(self, config):
+        super().__init__(config)
+        self.num_labels = config.num_labels
+        self.model = MistralModel(config)
+        self.score = nn.Linear(config.hidden_size, self.num_labels, bias=False)
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    def get_input_embeddings(self):
+        return self.model.embed_tokens
+
+    def set_input_embeddings(self, value):
+        self.model.embed_tokens = value
+
+    @add_start_docstrings_to_model_forward(MISTRAL_INPUTS_DOCSTRING)
+    def forward(
+        self,
+        input_ids: torch.LongTensor = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[List[torch.FloatTensor]] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        labels: Optional[torch.LongTensor] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[Tuple, SequenceClassifierOutputWithPast]:
+        r"""
+        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
+            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
+            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
+            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
+        """
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        transformer_outputs = self.model(
+            input_ids,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            past_key_values=past_key_values,
+            inputs_embeds=inputs_embeds,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+        hidden_states = transformer_outputs[0]
+        logits = self.score(hidden_states)
+
+        if input_ids is not None:
+            batch_size = input_ids.shape[0]
+        else:
+            batch_size = inputs_embeds.shape[0]
+
+        if self.config.pad_token_id is None and batch_size != 1:
+            raise ValueError("Cannot handle batch sizes > 1 if no padding token is defined.")
+        if self.config.pad_token_id is None:
+            sequence_lengths = -1
+        else:
+            if input_ids is not None:
+                sequence_lengths = (torch.eq(input_ids, self.config.pad_token_id).long().argmax(-1) - 1).to(
+                    logits.device
+                )
+            else:
+                sequence_lengths = -1
+
+        pooled_logits = logits[torch.arange(batch_size, device=logits.device), sequence_lengths]
+
+        loss = None
+        if labels is not None:
+            labels = labels.to(logits.device)
+            if self.config.problem_type is None:
+                if self.num_labels == 1:
+                    self.config.problem_type = "regression"
+                elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
+                    self.config.problem_type = "single_label_classification"
+                else:
+                    self.config.problem_type = "multi_label_classification"
+
+            if self.config.problem_type == "regression":
+                loss_fct = MSELoss()
+                if self.num_labels == 1:
+                    loss = loss_fct(pooled_logits.squeeze(), labels.squeeze())
+                else:
+                    loss = loss_fct(pooled_logits, labels)
+            elif self.config.problem_type == "single_label_classification":
+                loss_fct = CrossEntropyLoss()
+                loss = loss_fct(pooled_logits.view(-1, self.num_labels), labels.view(-1))
+            elif self.config.problem_type == "multi_label_classification":
+                loss_fct = BCEWithLogitsLoss()
+                loss = loss_fct(pooled_logits, labels)
+        if not return_dict:
+            output = (pooled_logits,) + transformer_outputs[1:]
+            return ((loss,) + output) if loss is not None else output
+
+        return SequenceClassifierOutputWithPast(
+            loss=loss,
+            logits=pooled_logits,
+            past_key_values=transformer_outputs.past_key_values,
+            hidden_states=transformer_outputs.hidden_states,
+            attentions=transformer_outputs.attentions,
+        )
\ No newline at end of file
diff --git a/special_tokens_map.json b/special_tokens_map.json
new file mode 100644
index 0000000..33307a2
--- /dev/null
+++ b/special_tokens_map.json
@@ -0,0 +1,30 @@
+{
+  "bos_token": {
+    "content": "<s>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "</s>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": {
+    "content": "[PAD]",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "unk_token": {
+    "content": "</s>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}
diff --git a/tokenizer.model b/tokenizer.model
new file mode 100644
index 0000000..8b443ef
--- /dev/null
+++ b/tokenizer.model
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:dadfd56d766715c61d2ef780a525ab43b8e6da4de6865bda3d95fdef5e134055
+size 493443
diff --git a/tokenizer_config.json b/tokenizer_config.json
new file mode 100644
index 0000000..6f0cfcc
--- /dev/null
+++ b/tokenizer_config.json
@@ -0,0 +1,52 @@
+{
+  "add_bos_token": true,
+  "add_eos_token": false,
+  "added_tokens_decoder": {
+    "0": {
+      "content": "<unk>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "1": {
+      "content": "<s>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "2": {
+      "content": "</s>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "32000": {
+      "content": "[PAD]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "additional_special_tokens": [],
+  "bos_token": "<s>",
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "</s>",
+  "legacy": true,
+  "model_max_length": 1000000000000000019884624838656,
+  "pad_token": "[PAD]",
+  "padding_side": "left",
+  "sp_model_kwargs": {},
+  "spaces_between_special_tokens": false,
+  "tokenizer_class": "LlamaTokenizer",
+  "unk_token": "</s>",
+  "use_default_system_prompt": true,
+  "use_fast": true
+}
diff --git a/trainer_state.json b/trainer_state.json
new file mode 100644
index 0000000..8d1eb92
--- /dev/null
+++ b/trainer_state.json
@@ -0,0 +1,1379 @@
+{
+  "best_metric": 2.0878255367279053,
+  "best_model_checkpoint": "path_to_your_model_and_data/pose/results/2k-32k-yarn/checkpoint-1000",
+  "epoch": 0.6896551724137931,
+  "eval_steps": 50,
+  "global_step": 1000,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.0,
+      "learning_rate": 1.2041199826559246e-05,
+      "loss": 1.6869,
+      "step": 5
+    },
+    {
+      "epoch": 0.01,
+      "learning_rate": 1.9084850188786497e-05,
+      "loss": 1.6045,
+      "step": 10
+    },
+    {
+      "epoch": 0.01,
+      "learning_rate": 1.993939393939394e-05,
+      "loss": 1.5365,
+      "step": 15
+    },
+    {
+      "epoch": 0.01,
+      "learning_rate": 1.985858585858586e-05,
+      "loss": 1.6548,
+      "step": 20
+    },
+    {
+      "epoch": 0.02,
+      "learning_rate": 1.975757575757576e-05,
+      "loss": 1.7008,
+      "step": 25
+    },
+    {
+      "epoch": 0.02,
+      "learning_rate": 1.9656565656565658e-05,
+      "loss": 1.6702,
+      "step": 30
+    },
+    {
+      "epoch": 0.02,
+      "learning_rate": 1.9555555555555557e-05,
+      "loss": 1.6943,
+      "step": 35
+    },
+    {
+      "epoch": 0.03,
+      "learning_rate": 1.9454545454545457e-05,
+      "loss": 1.6775,
+      "step": 40
+    },
+    {
+      "epoch": 0.03,
+      "learning_rate": 1.9353535353535356e-05,
+      "loss": 1.7281,
+      "step": 45
+    },
+    {
+      "epoch": 0.03,
+      "learning_rate": 1.9252525252525252e-05,
+      "loss": 1.6746,
+      "step": 50
+    },
+    {
+      "epoch": 0.03,
+      "eval_loss": 2.0998010635375977,
+      "eval_runtime": 127.4969,
+      "eval_samples_per_second": 0.784,
+      "eval_steps_per_second": 0.102,
+      "step": 50
+    },
+    {
+      "epoch": 0.04,
+      "learning_rate": 1.9151515151515152e-05,
+      "loss": 1.7754,
+      "step": 55
+    },
+    {
+      "epoch": 0.04,
+      "learning_rate": 1.905050505050505e-05,
+      "loss": 1.7094,
+      "step": 60
+    },
+    {
+      "epoch": 0.04,
+      "learning_rate": 1.894949494949495e-05,
+      "loss": 1.6483,
+      "step": 65
+    },
+    {
+      "epoch": 0.05,
+      "learning_rate": 1.886868686868687e-05,
+      "loss": 1.7341,
+      "step": 70
+    },
+    {
+      "epoch": 0.05,
+      "learning_rate": 1.876767676767677e-05,
+      "loss": 1.6221,
+      "step": 75
+    },
+    {
+      "epoch": 0.06,
+      "learning_rate": 1.866666666666667e-05,
+      "loss": 1.5681,
+      "step": 80
+    },
+    {
+      "epoch": 0.06,
+      "learning_rate": 1.8565656565656568e-05,
+      "loss": 1.6668,
+      "step": 85
+    },
+    {
+      "epoch": 0.06,
+      "learning_rate": 1.8464646464646464e-05,
+      "loss": 1.6743,
+      "step": 90
+    },
+    {
+      "epoch": 0.07,
+      "learning_rate": 1.8363636363636367e-05,
+      "loss": 1.7875,
+      "step": 95
+    },
+    {
+      "epoch": 0.07,
+      "learning_rate": 1.8262626262626263e-05,
+      "loss": 1.7397,
+      "step": 100
+    },
+    {
+      "epoch": 0.07,
+      "eval_loss": 2.1947836875915527,
+      "eval_runtime": 127.4557,
+      "eval_samples_per_second": 0.785,
+      "eval_steps_per_second": 0.102,
+      "step": 100
+    },
+    {
+      "epoch": 0.07,
+      "learning_rate": 1.8161616161616163e-05,
+      "loss": 1.6912,
+      "step": 105
+    },
+    {
+      "epoch": 0.08,
+      "learning_rate": 1.8060606060606062e-05,
+      "loss": 1.7995,
+      "step": 110
+    },
+    {
+      "epoch": 0.08,
+      "learning_rate": 1.795959595959596e-05,
+      "loss": 1.6217,
+      "step": 115
+    },
+    {
+      "epoch": 0.08,
+      "learning_rate": 1.785858585858586e-05,
+      "loss": 1.7081,
+      "step": 120
+    },
+    {
+      "epoch": 0.09,
+      "learning_rate": 1.775757575757576e-05,
+      "loss": 1.7041,
+      "step": 125
+    },
+    {
+      "epoch": 0.09,
+      "learning_rate": 1.765656565656566e-05,
+      "loss": 1.6235,
+      "step": 130
+    },
+    {
+      "epoch": 0.09,
+      "learning_rate": 1.7555555555555556e-05,
+      "loss": 1.6583,
+      "step": 135
+    },
+    {
+      "epoch": 0.1,
+      "learning_rate": 1.7454545454545456e-05,
+      "loss": 1.7467,
+      "step": 140
+    },
+    {
+      "epoch": 0.1,
+      "learning_rate": 1.7353535353535355e-05,
+      "loss": 1.6531,
+      "step": 145
+    },
+    {
+      "epoch": 0.1,
+      "learning_rate": 1.7252525252525255e-05,
+      "loss": 1.7272,
+      "step": 150
+    },
+    {
+      "epoch": 0.1,
+      "eval_loss": 2.1845450401306152,
+      "eval_runtime": 127.0346,
+      "eval_samples_per_second": 0.787,
+      "eval_steps_per_second": 0.102,
+      "step": 150
+    },
+    {
+      "epoch": 0.11,
+      "learning_rate": 1.715151515151515e-05,
+      "loss": 1.6193,
+      "step": 155
+    },
+    {
+      "epoch": 0.11,
+      "learning_rate": 1.7050505050505054e-05,
+      "loss": 1.6174,
+      "step": 160
+    },
+    {
+      "epoch": 0.11,
+      "learning_rate": 1.694949494949495e-05,
+      "loss": 1.665,
+      "step": 165
+    },
+    {
+      "epoch": 0.12,
+      "learning_rate": 1.684848484848485e-05,
+      "loss": 1.7045,
+      "step": 170
+    },
+    {
+      "epoch": 0.12,
+      "learning_rate": 1.674747474747475e-05,
+      "loss": 1.7124,
+      "step": 175
+    },
+    {
+      "epoch": 0.12,
+      "learning_rate": 1.6646464646464648e-05,
+      "loss": 1.8402,
+      "step": 180
+    },
+    {
+      "epoch": 0.13,
+      "learning_rate": 1.6545454545454548e-05,
+      "loss": 1.6673,
+      "step": 185
+    },
+    {
+      "epoch": 0.13,
+      "learning_rate": 1.6444444444444444e-05,
+      "loss": 1.7193,
+      "step": 190
+    },
+    {
+      "epoch": 0.13,
+      "learning_rate": 1.6343434343434346e-05,
+      "loss": 1.6832,
+      "step": 195
+    },
+    {
+      "epoch": 0.14,
+      "learning_rate": 1.6242424242424243e-05,
+      "loss": 1.6866,
+      "step": 200
+    },
+    {
+      "epoch": 0.14,
+      "eval_loss": 2.1831867694854736,
+      "eval_runtime": 127.2543,
+      "eval_samples_per_second": 0.786,
+      "eval_steps_per_second": 0.102,
+      "step": 200
+    },
+    {
+      "epoch": 0.14,
+      "learning_rate": 1.6141414141414142e-05,
+      "loss": 1.6387,
+      "step": 205
+    },
+    {
+      "epoch": 0.14,
+      "learning_rate": 1.604040404040404e-05,
+      "loss": 1.7822,
+      "step": 210
+    },
+    {
+      "epoch": 0.15,
+      "learning_rate": 1.593939393939394e-05,
+      "loss": 1.6705,
+      "step": 215
+    },
+    {
+      "epoch": 0.15,
+      "learning_rate": 1.585858585858586e-05,
+      "loss": 1.6592,
+      "step": 220
+    },
+    {
+      "epoch": 0.16,
+      "learning_rate": 1.575757575757576e-05,
+      "loss": 1.6256,
+      "step": 225
+    },
+    {
+      "epoch": 0.16,
+      "learning_rate": 1.565656565656566e-05,
+      "loss": 1.6946,
+      "step": 230
+    },
+    {
+      "epoch": 0.16,
+      "learning_rate": 1.555555555555556e-05,
+      "loss": 1.6859,
+      "step": 235
+    },
+    {
+      "epoch": 0.17,
+      "learning_rate": 1.5454545454545454e-05,
+      "loss": 1.674,
+      "step": 240
+    },
+    {
+      "epoch": 0.17,
+      "learning_rate": 1.5353535353535354e-05,
+      "loss": 1.7687,
+      "step": 245
+    },
+    {
+      "epoch": 0.17,
+      "learning_rate": 1.5252525252525255e-05,
+      "loss": 1.6456,
+      "step": 250
+    },
+    {
+      "epoch": 0.17,
+      "eval_loss": 2.1776535511016846,
+      "eval_runtime": 127.5296,
+      "eval_samples_per_second": 0.784,
+      "eval_steps_per_second": 0.102,
+      "step": 250
+    },
+    {
+      "epoch": 0.18,
+      "learning_rate": 1.5151515151515153e-05,
+      "loss": 1.6896,
+      "step": 255
+    },
+    {
+      "epoch": 0.18,
+      "learning_rate": 1.505050505050505e-05,
+      "loss": 1.7209,
+      "step": 260
+    },
+    {
+      "epoch": 0.18,
+      "learning_rate": 1.4949494949494952e-05,
+      "loss": 1.66,
+      "step": 265
+    },
+    {
+      "epoch": 0.19,
+      "learning_rate": 1.484848484848485e-05,
+      "loss": 1.6708,
+      "step": 270
+    },
+    {
+      "epoch": 0.19,
+      "learning_rate": 1.4747474747474747e-05,
+      "loss": 1.6432,
+      "step": 275
+    },
+    {
+      "epoch": 0.19,
+      "learning_rate": 1.4646464646464649e-05,
+      "loss": 1.6676,
+      "step": 280
+    },
+    {
+      "epoch": 0.2,
+      "learning_rate": 1.4545454545454546e-05,
+      "loss": 1.6768,
+      "step": 285
+    },
+    {
+      "epoch": 0.2,
+      "learning_rate": 1.4444444444444446e-05,
+      "loss": 1.7592,
+      "step": 290
+    },
+    {
+      "epoch": 0.2,
+      "learning_rate": 1.4343434343434344e-05,
+      "loss": 1.6192,
+      "step": 295
+    },
+    {
+      "epoch": 0.21,
+      "learning_rate": 1.4242424242424245e-05,
+      "loss": 1.5604,
+      "step": 300
+    },
+    {
+      "epoch": 0.21,
+      "eval_loss": 2.1859655380249023,
+      "eval_runtime": 127.9048,
+      "eval_samples_per_second": 0.782,
+      "eval_steps_per_second": 0.102,
+      "step": 300
+    },
+    {
+      "epoch": 0.21,
+      "learning_rate": 1.4141414141414143e-05,
+      "loss": 1.6379,
+      "step": 305
+    },
+    {
+      "epoch": 0.21,
+      "learning_rate": 1.404040404040404e-05,
+      "loss": 1.6012,
+      "step": 310
+    },
+    {
+      "epoch": 0.22,
+      "learning_rate": 1.3939393939393942e-05,
+      "loss": 1.7011,
+      "step": 315
+    },
+    {
+      "epoch": 0.22,
+      "learning_rate": 1.383838383838384e-05,
+      "loss": 1.6168,
+      "step": 320
+    },
+    {
+      "epoch": 0.22,
+      "learning_rate": 1.3737373737373739e-05,
+      "loss": 1.6507,
+      "step": 325
+    },
+    {
+      "epoch": 0.23,
+      "learning_rate": 1.3636363636363637e-05,
+      "loss": 1.7676,
+      "step": 330
+    },
+    {
+      "epoch": 0.23,
+      "learning_rate": 1.3535353535353538e-05,
+      "loss": 1.6707,
+      "step": 335
+    },
+    {
+      "epoch": 0.23,
+      "learning_rate": 1.3434343434343436e-05,
+      "loss": 1.6062,
+      "step": 340
+    },
+    {
+      "epoch": 0.24,
+      "learning_rate": 1.3333333333333333e-05,
+      "loss": 1.7136,
+      "step": 345
+    },
+    {
+      "epoch": 0.24,
+      "learning_rate": 1.3232323232323234e-05,
+      "loss": 1.661,
+      "step": 350
+    },
+    {
+      "epoch": 0.24,
+      "eval_loss": 2.189072370529175,
+      "eval_runtime": 127.1833,
+      "eval_samples_per_second": 0.786,
+      "eval_steps_per_second": 0.102,
+      "step": 350
+    },
+    {
+      "epoch": 0.24,
+      "learning_rate": 1.3131313131313132e-05,
+      "loss": 1.6766,
+      "step": 355
+    },
+    {
+      "epoch": 0.25,
+      "learning_rate": 1.3030303030303032e-05,
+      "loss": 1.6497,
+      "step": 360
+    },
+    {
+      "epoch": 0.25,
+      "learning_rate": 1.2929292929292931e-05,
+      "loss": 1.6738,
+      "step": 365
+    },
+    {
+      "epoch": 0.26,
+      "learning_rate": 1.2828282828282829e-05,
+      "loss": 1.6673,
+      "step": 370
+    },
+    {
+      "epoch": 0.26,
+      "learning_rate": 1.2727272727272728e-05,
+      "loss": 1.6767,
+      "step": 375
+    },
+    {
+      "epoch": 0.26,
+      "learning_rate": 1.2626262626262626e-05,
+      "loss": 1.7335,
+      "step": 380
+    },
+    {
+      "epoch": 0.27,
+      "learning_rate": 1.2525252525252527e-05,
+      "loss": 1.5885,
+      "step": 385
+    },
+    {
+      "epoch": 0.27,
+      "learning_rate": 1.2424242424242425e-05,
+      "loss": 1.5992,
+      "step": 390
+    },
+    {
+      "epoch": 0.27,
+      "learning_rate": 1.2323232323232323e-05,
+      "loss": 1.6115,
+      "step": 395
+    },
+    {
+      "epoch": 0.28,
+      "learning_rate": 1.2222222222222224e-05,
+      "loss": 1.7397,
+      "step": 400
+    },
+    {
+      "epoch": 0.28,
+      "eval_loss": 2.1878244876861572,
+      "eval_runtime": 127.871,
+      "eval_samples_per_second": 0.782,
+      "eval_steps_per_second": 0.102,
+      "step": 400
+    },
+    {
+      "epoch": 0.28,
+      "learning_rate": 1.2121212121212122e-05,
+      "loss": 1.6486,
+      "step": 405
+    },
+    {
+      "epoch": 0.28,
+      "learning_rate": 1.2020202020202021e-05,
+      "loss": 1.6822,
+      "step": 410
+    },
+    {
+      "epoch": 0.29,
+      "learning_rate": 1.191919191919192e-05,
+      "loss": 1.6529,
+      "step": 415
+    },
+    {
+      "epoch": 0.29,
+      "learning_rate": 1.181818181818182e-05,
+      "loss": 1.6155,
+      "step": 420
+    },
+    {
+      "epoch": 0.29,
+      "learning_rate": 1.1717171717171718e-05,
+      "loss": 1.6216,
+      "step": 425
+    },
+    {
+      "epoch": 0.3,
+      "learning_rate": 1.1616161616161616e-05,
+      "loss": 1.5922,
+      "step": 430
+    },
+    {
+      "epoch": 0.3,
+      "learning_rate": 1.1515151515151517e-05,
+      "loss": 1.7198,
+      "step": 435
+    },
+    {
+      "epoch": 0.3,
+      "learning_rate": 1.1414141414141415e-05,
+      "loss": 1.5924,
+      "step": 440
+    },
+    {
+      "epoch": 0.31,
+      "learning_rate": 1.1313131313131314e-05,
+      "loss": 1.6722,
+      "step": 445
+    },
+    {
+      "epoch": 0.31,
+      "learning_rate": 1.1212121212121212e-05,
+      "loss": 1.5975,
+      "step": 450
+    },
+    {
+      "epoch": 0.31,
+      "eval_loss": 2.185619592666626,
+      "eval_runtime": 127.8889,
+      "eval_samples_per_second": 0.782,
+      "eval_steps_per_second": 0.102,
+      "step": 450
+    },
+    {
+      "epoch": 0.31,
+      "learning_rate": 1.1111111111111113e-05,
+      "loss": 1.6218,
+      "step": 455
+    },
+    {
+      "epoch": 0.32,
+      "learning_rate": 1.1010101010101011e-05,
+      "loss": 1.613,
+      "step": 460
+    },
+    {
+      "epoch": 0.32,
+      "learning_rate": 1.0909090909090909e-05,
+      "loss": 1.7023,
+      "step": 465
+    },
+    {
+      "epoch": 0.32,
+      "learning_rate": 1.080808080808081e-05,
+      "loss": 1.6097,
+      "step": 470
+    },
+    {
+      "epoch": 0.33,
+      "learning_rate": 1.0707070707070708e-05,
+      "loss": 1.5922,
+      "step": 475
+    },
+    {
+      "epoch": 0.33,
+      "learning_rate": 1.0606060606060606e-05,
+      "loss": 1.6736,
+      "step": 480
+    },
+    {
+      "epoch": 0.33,
+      "learning_rate": 1.0505050505050507e-05,
+      "loss": 1.6033,
+      "step": 485
+    },
+    {
+      "epoch": 0.34,
+      "learning_rate": 1.0404040404040405e-05,
+      "loss": 1.6552,
+      "step": 490
+    },
+    {
+      "epoch": 0.34,
+      "learning_rate": 1.0303030303030304e-05,
+      "loss": 1.6533,
+      "step": 495
+    },
+    {
+      "epoch": 0.34,
+      "learning_rate": 1.0202020202020202e-05,
+      "loss": 1.6899,
+      "step": 500
+    },
+    {
+      "epoch": 0.34,
+      "eval_loss": 2.1731319427490234,
+      "eval_runtime": 127.9869,
+      "eval_samples_per_second": 0.781,
+      "eval_steps_per_second": 0.102,
+      "step": 500
+    },
+    {
+      "epoch": 0.35,
+      "learning_rate": 1.0101010101010103e-05,
+      "loss": 1.6272,
+      "step": 505
+    },
+    {
+      "epoch": 0.35,
+      "learning_rate": 1e-05,
+      "loss": 1.6105,
+      "step": 510
+    },
+    {
+      "epoch": 0.36,
+      "learning_rate": 9.8989898989899e-06,
+      "loss": 1.522,
+      "step": 515
+    },
+    {
+      "epoch": 0.36,
+      "learning_rate": 9.797979797979798e-06,
+      "loss": 1.6022,
+      "step": 520
+    },
+    {
+      "epoch": 0.36,
+      "learning_rate": 9.696969696969698e-06,
+      "loss": 1.6157,
+      "step": 525
+    },
+    {
+      "epoch": 0.37,
+      "learning_rate": 9.595959595959597e-06,
+      "loss": 1.7323,
+      "step": 530
+    },
+    {
+      "epoch": 0.37,
+      "learning_rate": 9.494949494949497e-06,
+      "loss": 1.5535,
+      "step": 535
+    },
+    {
+      "epoch": 0.37,
+      "learning_rate": 9.393939393939396e-06,
+      "loss": 1.6378,
+      "step": 540
+    },
+    {
+      "epoch": 0.38,
+      "learning_rate": 9.292929292929294e-06,
+      "loss": 1.5394,
+      "step": 545
+    },
+    {
+      "epoch": 0.38,
+      "learning_rate": 9.191919191919193e-06,
+      "loss": 1.6862,
+      "step": 550
+    },
+    {
+      "epoch": 0.38,
+      "eval_loss": 2.1638994216918945,
+      "eval_runtime": 127.6725,
+      "eval_samples_per_second": 0.783,
+      "eval_steps_per_second": 0.102,
+      "step": 550
+    },
+    {
+      "epoch": 0.38,
+      "learning_rate": 9.090909090909091e-06,
+      "loss": 1.5748,
+      "step": 555
+    },
+    {
+      "epoch": 0.39,
+      "learning_rate": 8.98989898989899e-06,
+      "loss": 1.6246,
+      "step": 560
+    },
+    {
+      "epoch": 0.39,
+      "learning_rate": 8.888888888888888e-06,
+      "loss": 1.6118,
+      "step": 565
+    },
+    {
+      "epoch": 0.39,
+      "learning_rate": 8.787878787878788e-06,
+      "loss": 1.5606,
+      "step": 570
+    },
+    {
+      "epoch": 0.4,
+      "learning_rate": 8.686868686868687e-06,
+      "loss": 1.5704,
+      "step": 575
+    },
+    {
+      "epoch": 0.4,
+      "learning_rate": 8.585858585858587e-06,
+      "loss": 1.6683,
+      "step": 580
+    },
+    {
+      "epoch": 0.4,
+      "learning_rate": 8.484848484848486e-06,
+      "loss": 1.6215,
+      "step": 585
+    },
+    {
+      "epoch": 0.41,
+      "learning_rate": 8.383838383838384e-06,
+      "loss": 1.6727,
+      "step": 590
+    },
+    {
+      "epoch": 0.41,
+      "learning_rate": 8.282828282828283e-06,
+      "loss": 1.5615,
+      "step": 595
+    },
+    {
+      "epoch": 0.41,
+      "learning_rate": 8.181818181818183e-06,
+      "loss": 1.6431,
+      "step": 600
+    },
+    {
+      "epoch": 0.41,
+      "eval_loss": 2.1520442962646484,
+      "eval_runtime": 127.6384,
+      "eval_samples_per_second": 0.783,
+      "eval_steps_per_second": 0.102,
+      "step": 600
+    },
+    {
+      "epoch": 0.42,
+      "learning_rate": 8.08080808080808e-06,
+      "loss": 1.5967,
+      "step": 605
+    },
+    {
+      "epoch": 0.42,
+      "learning_rate": 7.97979797979798e-06,
+      "loss": 1.6016,
+      "step": 610
+    },
+    {
+      "epoch": 0.42,
+      "learning_rate": 7.87878787878788e-06,
+      "loss": 1.5905,
+      "step": 615
+    },
+    {
+      "epoch": 0.43,
+      "learning_rate": 7.77777777777778e-06,
+      "loss": 1.6387,
+      "step": 620
+    },
+    {
+      "epoch": 0.43,
+      "learning_rate": 7.676767676767677e-06,
+      "loss": 1.6255,
+      "step": 625
+    },
+    {
+      "epoch": 0.43,
+      "learning_rate": 7.5757575757575764e-06,
+      "loss": 1.642,
+      "step": 630
+    },
+    {
+      "epoch": 0.44,
+      "learning_rate": 7.474747474747476e-06,
+      "loss": 1.6004,
+      "step": 635
+    },
+    {
+      "epoch": 0.44,
+      "learning_rate": 7.373737373737374e-06,
+      "loss": 1.5635,
+      "step": 640
+    },
+    {
+      "epoch": 0.44,
+      "learning_rate": 7.272727272727273e-06,
+      "loss": 1.6266,
+      "step": 645
+    },
+    {
+      "epoch": 0.45,
+      "learning_rate": 7.171717171717172e-06,
+      "loss": 1.5906,
+      "step": 650
+    },
+    {
+      "epoch": 0.45,
+      "eval_loss": 2.1461074352264404,
+      "eval_runtime": 126.7907,
+      "eval_samples_per_second": 0.789,
+      "eval_steps_per_second": 0.103,
+      "step": 650
+    },
+    {
+      "epoch": 0.45,
+      "learning_rate": 7.070707070707071e-06,
+      "loss": 1.5335,
+      "step": 655
+    },
+    {
+      "epoch": 0.46,
+      "learning_rate": 6.969696969696971e-06,
+      "loss": 1.5319,
+      "step": 660
+    },
+    {
+      "epoch": 0.46,
+      "learning_rate": 6.868686868686869e-06,
+      "loss": 1.5643,
+      "step": 665
+    },
+    {
+      "epoch": 0.46,
+      "learning_rate": 6.767676767676769e-06,
+      "loss": 1.635,
+      "step": 670
+    },
+    {
+      "epoch": 0.47,
+      "learning_rate": 6.666666666666667e-06,
+      "loss": 1.5881,
+      "step": 675
+    },
+    {
+      "epoch": 0.47,
+      "learning_rate": 6.565656565656566e-06,
+      "loss": 1.5855,
+      "step": 680
+    },
+    {
+      "epoch": 0.47,
+      "learning_rate": 6.464646464646466e-06,
+      "loss": 1.5281,
+      "step": 685
+    },
+    {
+      "epoch": 0.48,
+      "learning_rate": 6.363636363636364e-06,
+      "loss": 1.5686,
+      "step": 690
+    },
+    {
+      "epoch": 0.48,
+      "learning_rate": 6.262626262626264e-06,
+      "loss": 1.5815,
+      "step": 695
+    },
+    {
+      "epoch": 0.48,
+      "learning_rate": 6.1616161616161615e-06,
+      "loss": 1.6277,
+      "step": 700
+    },
+    {
+      "epoch": 0.48,
+      "eval_loss": 2.1339261531829834,
+      "eval_runtime": 127.7156,
+      "eval_samples_per_second": 0.783,
+      "eval_steps_per_second": 0.102,
+      "step": 700
+    },
+    {
+      "epoch": 0.49,
+      "learning_rate": 6.060606060606061e-06,
+      "loss": 1.6068,
+      "step": 705
+    },
+    {
+      "epoch": 0.49,
+      "learning_rate": 5.95959595959596e-06,
+      "loss": 1.6653,
+      "step": 710
+    },
+    {
+      "epoch": 0.49,
+      "learning_rate": 5.858585858585859e-06,
+      "loss": 1.5389,
+      "step": 715
+    },
+    {
+      "epoch": 0.5,
+      "learning_rate": 5.7575757575757586e-06,
+      "loss": 1.5535,
+      "step": 720
+    },
+    {
+      "epoch": 0.5,
+      "learning_rate": 5.656565656565657e-06,
+      "loss": 1.621,
+      "step": 725
+    },
+    {
+      "epoch": 0.5,
+      "learning_rate": 5.555555555555557e-06,
+      "loss": 1.56,
+      "step": 730
+    },
+    {
+      "epoch": 0.51,
+      "learning_rate": 5.4545454545454545e-06,
+      "loss": 1.5491,
+      "step": 735
+    },
+    {
+      "epoch": 0.51,
+      "learning_rate": 5.353535353535354e-06,
+      "loss": 1.5151,
+      "step": 740
+    },
+    {
+      "epoch": 0.51,
+      "learning_rate": 5.252525252525253e-06,
+      "loss": 1.5135,
+      "step": 745
+    },
+    {
+      "epoch": 0.52,
+      "learning_rate": 5.151515151515152e-06,
+      "loss": 1.4665,
+      "step": 750
+    },
+    {
+      "epoch": 0.52,
+      "eval_loss": 2.127960681915283,
+      "eval_runtime": 126.9672,
+      "eval_samples_per_second": 0.788,
+      "eval_steps_per_second": 0.102,
+      "step": 750
+    },
+    {
+      "epoch": 0.52,
+      "learning_rate": 5.0505050505050515e-06,
+      "loss": 1.5305,
+      "step": 755
+    },
+    {
+      "epoch": 0.52,
+      "learning_rate": 4.94949494949495e-06,
+      "loss": 1.531,
+      "step": 760
+    },
+    {
+      "epoch": 0.53,
+      "learning_rate": 4.848484848484849e-06,
+      "loss": 1.6151,
+      "step": 765
+    },
+    {
+      "epoch": 0.53,
+      "learning_rate": 4.747474747474748e-06,
+      "loss": 1.5451,
+      "step": 770
+    },
+    {
+      "epoch": 0.53,
+      "learning_rate": 4.646464646464647e-06,
+      "loss": 1.5876,
+      "step": 775
+    },
+    {
+      "epoch": 0.54,
+      "learning_rate": 4.5454545454545455e-06,
+      "loss": 1.4907,
+      "step": 780
+    },
+    {
+      "epoch": 0.54,
+      "learning_rate": 4.444444444444444e-06,
+      "loss": 1.5307,
+      "step": 785
+    },
+    {
+      "epoch": 0.54,
+      "learning_rate": 4.343434343434344e-06,
+      "loss": 1.4979,
+      "step": 790
+    },
+    {
+      "epoch": 0.55,
+      "learning_rate": 4.242424242424243e-06,
+      "loss": 1.5098,
+      "step": 795
+    },
+    {
+      "epoch": 0.55,
+      "learning_rate": 4.141414141414142e-06,
+      "loss": 1.6528,
+      "step": 800
+    },
+    {
+      "epoch": 0.55,
+      "eval_loss": 2.1138248443603516,
+      "eval_runtime": 127.0916,
+      "eval_samples_per_second": 0.787,
+      "eval_steps_per_second": 0.102,
+      "step": 800
+    },
+    {
+      "epoch": 0.56,
+      "learning_rate": 4.04040404040404e-06,
+      "loss": 1.5037,
+      "step": 805
+    },
+    {
+      "epoch": 0.56,
+      "learning_rate": 3.93939393939394e-06,
+      "loss": 1.5584,
+      "step": 810
+    },
+    {
+      "epoch": 0.56,
+      "learning_rate": 3.8383838383838385e-06,
+      "loss": 1.5794,
+      "step": 815
+    },
+    {
+      "epoch": 0.57,
+      "learning_rate": 3.737373737373738e-06,
+      "loss": 1.4932,
+      "step": 820
+    },
+    {
+      "epoch": 0.57,
+      "learning_rate": 3.6363636363636366e-06,
+      "loss": 1.5709,
+      "step": 825
+    },
+    {
+      "epoch": 0.57,
+      "learning_rate": 3.5353535353535356e-06,
+      "loss": 1.5937,
+      "step": 830
+    },
+    {
+      "epoch": 0.58,
+      "learning_rate": 3.4343434343434347e-06,
+      "loss": 1.5733,
+      "step": 835
+    },
+    {
+      "epoch": 0.58,
+      "learning_rate": 3.3333333333333333e-06,
+      "loss": 1.5532,
+      "step": 840
+    },
+    {
+      "epoch": 0.58,
+      "learning_rate": 3.232323232323233e-06,
+      "loss": 1.531,
+      "step": 845
+    },
+    {
+      "epoch": 0.59,
+      "learning_rate": 3.131313131313132e-06,
+      "loss": 1.4618,
+      "step": 850
+    },
+    {
+      "epoch": 0.59,
+      "eval_loss": 2.104820489883423,
+      "eval_runtime": 126.9789,
+      "eval_samples_per_second": 0.788,
+      "eval_steps_per_second": 0.102,
+      "step": 850
+    },
+    {
+      "epoch": 0.59,
+      "learning_rate": 3.0303030303030305e-06,
+      "loss": 1.5802,
+      "step": 855
+    },
+    {
+      "epoch": 0.59,
+      "learning_rate": 2.9292929292929295e-06,
+      "loss": 1.4664,
+      "step": 860
+    },
+    {
+      "epoch": 0.6,
+      "learning_rate": 2.8282828282828286e-06,
+      "loss": 1.6319,
+      "step": 865
+    },
+    {
+      "epoch": 0.6,
+      "learning_rate": 2.7272727272727272e-06,
+      "loss": 1.5629,
+      "step": 870
+    },
+    {
+      "epoch": 0.6,
+      "learning_rate": 2.6262626262626267e-06,
+      "loss": 1.5942,
+      "step": 875
+    },
+    {
+      "epoch": 0.61,
+      "learning_rate": 2.5252525252525258e-06,
+      "loss": 1.5522,
+      "step": 880
+    },
+    {
+      "epoch": 0.61,
+      "learning_rate": 2.4242424242424244e-06,
+      "loss": 1.5326,
+      "step": 885
+    },
+    {
+      "epoch": 0.61,
+      "learning_rate": 2.3232323232323234e-06,
+      "loss": 1.5492,
+      "step": 890
+    },
+    {
+      "epoch": 0.62,
+      "learning_rate": 2.222222222222222e-06,
+      "loss": 1.483,
+      "step": 895
+    },
+    {
+      "epoch": 0.62,
+      "learning_rate": 2.1212121212121216e-06,
+      "loss": 1.4871,
+      "step": 900
+    },
+    {
+      "epoch": 0.62,
+      "eval_loss": 2.096195936203003,
+      "eval_runtime": 127.2807,
+      "eval_samples_per_second": 0.786,
+      "eval_steps_per_second": 0.102,
+      "step": 900
+    },
+    {
+      "epoch": 0.62,
+      "learning_rate": 2.02020202020202e-06,
+      "loss": 1.4819,
+      "step": 905
+    },
+    {
+      "epoch": 0.63,
+      "learning_rate": 1.9191919191919192e-06,
+      "loss": 1.5326,
+      "step": 910
+    },
+    {
+      "epoch": 0.63,
+      "learning_rate": 1.8181818181818183e-06,
+      "loss": 1.5267,
+      "step": 915
+    },
+    {
+      "epoch": 0.63,
+      "learning_rate": 1.7171717171717173e-06,
+      "loss": 1.422,
+      "step": 920
+    },
+    {
+      "epoch": 0.64,
+      "learning_rate": 1.6161616161616164e-06,
+      "loss": 1.5205,
+      "step": 925
+    },
+    {
+      "epoch": 0.64,
+      "learning_rate": 1.5151515151515152e-06,
+      "loss": 1.5186,
+      "step": 930
+    },
+    {
+      "epoch": 0.64,
+      "learning_rate": 1.4141414141414143e-06,
+      "loss": 1.5456,
+      "step": 935
+    },
+    {
+      "epoch": 0.65,
+      "learning_rate": 1.3131313131313134e-06,
+      "loss": 1.5378,
+      "step": 940
+    },
+    {
+      "epoch": 0.65,
+      "learning_rate": 1.2121212121212122e-06,
+      "loss": 1.4828,
+      "step": 945
+    },
+    {
+      "epoch": 0.66,
+      "learning_rate": 1.111111111111111e-06,
+      "loss": 1.6019,
+      "step": 950
+    },
+    {
+      "epoch": 0.66,
+      "eval_loss": 2.090965509414673,
+      "eval_runtime": 126.05,
+      "eval_samples_per_second": 0.793,
+      "eval_steps_per_second": 0.103,
+      "step": 950
+    },
+    {
+      "epoch": 0.66,
+      "learning_rate": 1.01010101010101e-06,
+      "loss": 1.4365,
+      "step": 955
+    },
+    {
+      "epoch": 0.66,
+      "learning_rate": 9.090909090909091e-07,
+      "loss": 1.4373,
+      "step": 960
+    },
+    {
+      "epoch": 0.67,
+      "learning_rate": 8.080808080808082e-07,
+      "loss": 1.5253,
+      "step": 965
+    },
+    {
+      "epoch": 0.67,
+      "learning_rate": 7.070707070707071e-07,
+      "loss": 1.6014,
+      "step": 970
+    },
+    {
+      "epoch": 0.67,
+      "learning_rate": 6.060606060606061e-07,
+      "loss": 1.549,
+      "step": 975
+    },
+    {
+      "epoch": 0.68,
+      "learning_rate": 5.05050505050505e-07,
+      "loss": 1.5057,
+      "step": 980
+    },
+    {
+      "epoch": 0.68,
+      "learning_rate": 4.040404040404041e-07,
+      "loss": 1.4688,
+      "step": 985
+    },
+    {
+      "epoch": 0.68,
+      "learning_rate": 3.0303030303030305e-07,
+      "loss": 1.4982,
+      "step": 990
+    },
+    {
+      "epoch": 0.69,
+      "learning_rate": 2.0202020202020205e-07,
+      "loss": 1.5032,
+      "step": 995
+    },
+    {
+      "epoch": 0.69,
+      "learning_rate": 1.0101010101010103e-07,
+      "loss": 1.5359,
+      "step": 1000
+    },
+    {
+      "epoch": 0.69,
+      "eval_loss": 2.0878255367279053,
+      "eval_runtime": 126.9736,
+      "eval_samples_per_second": 0.788,
+      "eval_steps_per_second": 0.102,
+      "step": 1000
+    }
+  ],
+  "logging_steps": 5,
+  "max_steps": 1000,
+  "num_train_epochs": 1,
+  "save_steps": 100,
+  "total_flos": 50751528222720.0,
+  "trial_name": null,
+  "trial_params": null
+}
diff --git a/training_args.bin b/training_args.bin
new file mode 100644
index 0000000..5fcfc19
--- /dev/null
+++ b/training_args.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:800fd3f2be062971c880ead3b73ddbd3d3d5ee56bef4666211ccf35bf9c3c4bb
+size 6651
diff --git a/zero_to_fp32.py b/zero_to_fp32.py
new file mode 100644
index 0000000..6b595ce
--- /dev/null
+++ b/zero_to_fp32.py
@@ -0,0 +1,578 @@
+#!/usr/bin/env python
+
+# Copyright (c) Microsoft Corporation.
+# SPDX-License-Identifier: Apache-2.0
+
+# DeepSpeed Team
+
+# This script extracts fp32 consolidated weights from a zero 1, 2 and 3 DeepSpeed checkpoints. It gets
+# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in
+# the future. Once extracted, the weights don't require DeepSpeed and can be used in any
+# application.
+#
+# example: python zero_to_fp32.py . pytorch_model.bin
+
+import argparse
+import torch
+import glob
+import math
+import os
+import re
+from collections import OrderedDict
+from dataclasses import dataclass
+
+# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with
+# DeepSpeed data structures it has to be available in the current python environment.
+from deepspeed.utils import logger
+from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS,
+                                            FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES,
+                                            FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS)
+
+
+@dataclass
+class zero_model_state:
+    buffers: dict()
+    param_shapes: dict()
+    shared_params: list
+    ds_version: int
+    frozen_param_shapes: dict()
+    frozen_param_fragments: dict()
+
+
+debug = 0
+
+# load to cpu
+device = torch.device('cpu')
+
+
+def atoi(text):
+    return int(text) if text.isdigit() else text
+
+
+def natural_keys(text):
+    '''
+    alist.sort(key=natural_keys) sorts in human order
+    http://nedbatchelder.com/blog/200712/human_sorting.html
+    (See Toothy's implementation in the comments)
+    '''
+    return [atoi(c) for c in re.split(r'(\d+)', text)]
+
+
+def get_model_state_file(checkpoint_dir, zero_stage):
+    if not os.path.isdir(checkpoint_dir):
+        raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist")
+
+    # there should be only one file
+    if zero_stage <= 2:
+        file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt")
+    elif zero_stage == 3:
+        file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt")
+
+    if not os.path.exists(file):
+        raise FileNotFoundError(f"can't find model states file at '{file}'")
+
+    return file
+
+
+def get_checkpoint_files(checkpoint_dir, glob_pattern):
+    # XXX: need to test that this simple glob rule works for multi-node setup too
+    ckpt_files = sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys)
+
+    if len(ckpt_files) == 0:
+        raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'")
+
+    return ckpt_files
+
+
+def get_optim_files(checkpoint_dir):
+    return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt")
+
+
+def get_model_state_files(checkpoint_dir):
+    return get_checkpoint_files(checkpoint_dir, "*_model_states.pt")
+
+
+def parse_model_states(files):
+    zero_model_states = []
+    for file in files:
+        state_dict = torch.load(file, map_location=device)
+
+        if BUFFER_NAMES not in state_dict:
+            raise ValueError(f"{file} is not a model state checkpoint")
+        buffer_names = state_dict[BUFFER_NAMES]
+        if debug:
+            print("Found buffers:", buffer_names)
+
+        # recover just the buffers while restoring them to fp32 if they were saved in fp16
+        buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names}
+        param_shapes = state_dict[PARAM_SHAPES]
+
+        # collect parameters that are included in param_shapes
+        param_names = []
+        for s in param_shapes:
+            for name in s.keys():
+                param_names.append(name)
+
+        # update with frozen parameters
+        frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None)
+        if frozen_param_shapes is not None:
+            if debug:
+                print(f"Found frozen_param_shapes: {frozen_param_shapes}")
+            param_names += list(frozen_param_shapes.keys())
+
+        # handle shared params
+        shared_params = [[k, v] for k, v in state_dict["shared_params"].items()]
+
+        ds_version = state_dict.get(DS_VERSION, None)
+
+        frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None)
+
+        z_model_state = zero_model_state(buffers=buffers,
+                                         param_shapes=param_shapes,
+                                         shared_params=shared_params,
+                                         ds_version=ds_version,
+                                         frozen_param_shapes=frozen_param_shapes,
+                                         frozen_param_fragments=frozen_param_fragments)
+        zero_model_states.append(z_model_state)
+
+    return zero_model_states
+
+
+def parse_optim_states(files, ds_checkpoint_dir):
+
+    total_files = len(files)
+    state_dicts = []
+    for f in files:
+        state_dicts.append(torch.load(f, map_location=device))
+
+    if not ZERO_STAGE in state_dicts[0][OPTIMIZER_STATE_DICT]:
+        raise ValueError(f"{files[0]} is not a zero checkpoint")
+    zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE]
+    world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT]
+
+    # For ZeRO-2 each param group can have different partition_count as data parallelism for expert
+    # parameters can be different from data parallelism for non-expert parameters. So we can just
+    # use the max of the partition_count to get the dp world_size.
+
+    if type(world_size) is list:
+        world_size = max(world_size)
+
+    if world_size != total_files:
+        raise ValueError(
+            f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. "
+            "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes."
+        )
+
+    # the groups are named differently in each stage
+    if zero_stage <= 2:
+        fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS
+    elif zero_stage == 3:
+        fp32_groups_key = FP32_FLAT_GROUPS
+    else:
+        raise ValueError(f"unknown zero stage {zero_stage}")
+
+    if zero_stage <= 2:
+        fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))]
+    elif zero_stage == 3:
+        # if there is more than one param group, there will be multiple flattened tensors - one
+        # flattened tensor per group - for simplicity merge them into a single tensor
+        #
+        # XXX: could make the script more memory efficient for when there are multiple groups - it
+        # will require matching the sub-lists of param_shapes for each param group flattened tensor
+
+        fp32_flat_groups = [
+            torch.cat(state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key], 0) for i in range(len(state_dicts))
+        ]
+
+    return zero_stage, world_size, fp32_flat_groups
+
+
+def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir):
+    """
+    Returns fp32 state_dict reconstructed from ds checkpoint
+
+    Args:
+        - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are)
+
+    """
+    print(f"Processing zero checkpoint '{ds_checkpoint_dir}'")
+
+    optim_files = get_optim_files(ds_checkpoint_dir)
+    zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir)
+    print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}")
+
+    model_files = get_model_state_files(ds_checkpoint_dir)
+
+    zero_model_states = parse_model_states(model_files)
+    print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}')
+
+    if zero_stage <= 2:
+        return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states)
+    elif zero_stage == 3:
+        return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states)
+
+
+def _zero2_merge_frozen_params(state_dict, zero_model_states):
+    if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0:
+        return
+
+    frozen_param_shapes = zero_model_states[0].frozen_param_shapes
+    frozen_param_fragments = zero_model_states[0].frozen_param_fragments
+
+    if debug:
+        num_elem = sum(s.numel() for s in frozen_param_shapes.values())
+        print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}')
+
+        wanted_params = len(frozen_param_shapes)
+        wanted_numel = sum(s.numel() for s in frozen_param_shapes.values())
+        avail_numel = sum([p.numel() for p in frozen_param_fragments.values()])
+        print(f'Frozen params: Have {avail_numel} numels to process.')
+        print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params')
+
+    total_params = 0
+    total_numel = 0
+    for name, shape in frozen_param_shapes.items():
+        total_params += 1
+        unpartitioned_numel = shape.numel()
+        total_numel += unpartitioned_numel
+
+        state_dict[name] = frozen_param_fragments[name]
+
+        if debug:
+            print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ")
+
+    print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements")
+
+
+def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states):
+    param_shapes = zero_model_states[0].param_shapes
+
+    # Reconstruction protocol:
+    #
+    # XXX: document this
+
+    if debug:
+        for i in range(world_size):
+            for j in range(len(fp32_flat_groups[0])):
+                print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}")
+
+    # XXX: memory usage doubles here (zero2)
+    num_param_groups = len(fp32_flat_groups[0])
+    merged_single_partition_of_fp32_groups = []
+    for i in range(num_param_groups):
+        merged_partitions = [sd[i] for sd in fp32_flat_groups]
+        full_single_fp32_vector = torch.cat(merged_partitions, 0)
+        merged_single_partition_of_fp32_groups.append(full_single_fp32_vector)
+    avail_numel = sum(
+        [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups])
+
+    if debug:
+        wanted_params = sum([len(shapes) for shapes in param_shapes])
+        wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes])
+        # not asserting if there is a mismatch due to possible padding
+        print(f"Have {avail_numel} numels to process.")
+        print(f"Need {wanted_numel} numels in {wanted_params} params.")
+
+    # params
+    # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support
+    # out-of-core computing solution
+    total_numel = 0
+    total_params = 0
+    for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups):
+        offset = 0
+        avail_numel = full_single_fp32_vector.numel()
+        for name, shape in shapes.items():
+
+            unpartitioned_numel = shape.numel()
+            total_numel += unpartitioned_numel
+            total_params += 1
+
+            if debug:
+                print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ")
+            state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape)
+            offset += unpartitioned_numel
+
+        # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and
+        # avail_numel can differ by anywhere between 0..2*world_size. Due to two unrelated complex
+        # paddings performed in the code it's almost impossible to predict the exact numbers w/o the
+        # live optimizer object, so we are checking that the numbers are within the right range
+        align_to = 2 * world_size
+
+        def zero2_align(x):
+            return align_to * math.ceil(x / align_to)
+
+        if debug:
+            print(f"original offset={offset}, avail_numel={avail_numel}")
+
+        offset = zero2_align(offset)
+        avail_numel = zero2_align(avail_numel)
+
+        if debug:
+            print(f"aligned  offset={offset}, avail_numel={avail_numel}")
+
+        # Sanity check
+        if offset != avail_numel:
+            raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong")
+
+    print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements")
+
+
+def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states):
+    state_dict = OrderedDict()
+
+    # buffers
+    buffers = zero_model_states[0].buffers
+    state_dict.update(buffers)
+    if debug:
+        print(f"added {len(buffers)} buffers")
+
+    _zero2_merge_frozen_params(state_dict, zero_model_states)
+
+    _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states)
+
+    # recover shared parameters
+    for pair in zero_model_states[0].shared_params:
+        if pair[1] in state_dict:
+            state_dict[pair[0]] = state_dict[pair[1]]
+
+    return state_dict
+
+
+def zero3_partitioned_param_info(unpartitioned_numel, world_size):
+    remainder = unpartitioned_numel % world_size
+    padding_numel = (world_size - remainder) if remainder else 0
+    partitioned_numel = math.ceil(unpartitioned_numel / world_size)
+    return partitioned_numel, padding_numel
+
+
+def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states):
+    if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0:
+        return
+
+    if debug:
+        for i in range(world_size):
+            num_elem = sum(s.numel() for s in zero_model_states[i].frozen_param_fragments.values())
+            print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}')
+
+        frozen_param_shapes = zero_model_states[0].frozen_param_shapes
+        wanted_params = len(frozen_param_shapes)
+        wanted_numel = sum(s.numel() for s in frozen_param_shapes.values())
+        avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size
+        print(f'Frozen params: Have {avail_numel} numels to process.')
+        print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params')
+
+    total_params = 0
+    total_numel = 0
+    for name, shape in zero_model_states[0].frozen_param_shapes.items():
+        total_params += 1
+        unpartitioned_numel = shape.numel()
+        total_numel += unpartitioned_numel
+
+        param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states)
+        state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape)
+
+        partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size)
+
+        if debug:
+            print(
+                f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}"
+            )
+
+    print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements")
+
+
+def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states):
+    param_shapes = zero_model_states[0].param_shapes
+    avail_numel = fp32_flat_groups[0].numel() * world_size
+    # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each
+    # param, re-consolidating each param, while dealing with padding if any
+
+    # merge list of dicts, preserving order
+    param_shapes = {k: v for d in param_shapes for k, v in d.items()}
+
+    if debug:
+        for i in range(world_size):
+            print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}")
+
+        wanted_params = len(param_shapes)
+        wanted_numel = sum(shape.numel() for shape in param_shapes.values())
+        # not asserting if there is a mismatch due to possible padding
+        avail_numel = fp32_flat_groups[0].numel() * world_size
+        print(f"Trainable params: Have {avail_numel} numels to process.")
+        print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.")
+
+    # params
+    # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support
+    # out-of-core computing solution
+    offset = 0
+    total_numel = 0
+    total_params = 0
+    for name, shape in param_shapes.items():
+
+        unpartitioned_numel = shape.numel()
+        total_numel += unpartitioned_numel
+        total_params += 1
+
+        partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size)
+
+        if debug:
+            print(
+                f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}"
+            )
+
+        # XXX: memory usage doubles here
+        state_dict[name] = torch.cat(
+            tuple(fp32_flat_groups[i].narrow(0, offset, partitioned_numel) for i in range(world_size)),
+            0).narrow(0, 0, unpartitioned_numel).view(shape)
+        offset += partitioned_numel
+
+    offset *= world_size
+
+    # Sanity check
+    if offset != avail_numel:
+        raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong")
+
+    print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements")
+
+
+def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states):
+    state_dict = OrderedDict()
+
+    # buffers
+    buffers = zero_model_states[0].buffers
+    state_dict.update(buffers)
+    if debug:
+        print(f"added {len(buffers)} buffers")
+
+    _zero3_merge_frozen_params(state_dict, world_size, zero_model_states)
+
+    _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states)
+
+    # recover shared parameters
+    for pair in zero_model_states[0].shared_params:
+        if pair[1] in state_dict:
+            state_dict[pair[0]] = state_dict[pair[1]]
+
+    return state_dict
+
+
+def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag=None):
+    """
+    Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with
+    ``load_state_dict()`` and used for training without DeepSpeed or shared with others, for example
+    via a model hub.
+
+    Args:
+        - ``checkpoint_dir``: path to the desired checkpoint folder
+        - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14``
+
+    Returns:
+        - pytorch ``state_dict``
+
+    Note: this approach may not work if your application doesn't have sufficient free CPU memory and
+    you may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with
+    the checkpoint.
+
+    A typical usage might be ::
+
+        from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint
+        # do the training and checkpoint saving
+        state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu
+        model = model.cpu() # move to cpu
+        model.load_state_dict(state_dict)
+        # submit to model hub or save the model to share with others
+
+    In this example the ``model`` will no longer be usable in the deepspeed context of the same
+    application. i.e. you will need to re-initialize the deepspeed engine, since
+    ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it.
+
+    If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead.
+
+    """
+    if tag is None:
+        latest_path = os.path.join(checkpoint_dir, 'latest')
+        if os.path.isfile(latest_path):
+            with open(latest_path, 'r') as fd:
+                tag = fd.read().strip()
+        else:
+            raise ValueError(f"Unable to find 'latest' file at {latest_path}")
+
+    ds_checkpoint_dir = os.path.join(checkpoint_dir, tag)
+
+    if not os.path.isdir(ds_checkpoint_dir):
+        raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist")
+
+    return _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir)
+
+
+def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, output_file, tag=None):
+    """
+    Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be
+    loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed.
+
+    Args:
+        - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``)
+        - ``output_file``: path to the pytorch fp32 state_dict output file (e.g. path/pytorch_model.bin)
+        - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14``
+    """
+
+    state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag)
+    print(f"Saving fp32 state dict to {output_file}")
+    torch.save(state_dict, output_file)
+
+
+def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None):
+    """
+    1. Put the provided model to cpu
+    2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict``
+    3. Load it into the provided model
+
+    Args:
+        - ``model``: the model object to update
+        - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``)
+        - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14``
+
+    Returns:
+        - ``model`: modified model
+
+    Make sure you have plenty of CPU memory available before you call this function. If you don't
+    have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it
+    conveniently placed for you in the checkpoint folder.
+
+    A typical usage might be ::
+
+        from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint
+        model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir)
+        # submit to model hub or save the model to share with others
+
+    Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context
+    of the same application. i.e. you will need to re-initialize the deepspeed engine, since
+    ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it.
+
+    """
+    logger.info(f"Extracting fp32 weights")
+    state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag)
+
+    logger.info(f"Overwriting model with fp32 weights")
+    model = model.cpu()
+    model.load_state_dict(state_dict, strict=False)
+
+    return model
+
+
+if __name__ == "__main__":
+
+    parser = argparse.ArgumentParser()
+    parser.add_argument("checkpoint_dir",
+                        type=str,
+                        help="path to the desired checkpoint folder, e.g., path/checkpoint-12")
+    parser.add_argument(
+        "output_file",
+        type=str,
+        help="path to the pytorch fp32 state_dict output file (e.g. path/checkpoint-12/pytorch_model.bin)")
+    parser.add_argument("-d", "--debug", action='store_true', help="enable debug")
+    args = parser.parse_args()
+
+    debug = args.debug
+
+    convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir, args.output_file)