commit 9a7835e1f25cc62d76850505bcf960867751d008 Author: ModelHub XC Date: Wed Jun 24 17:08:18 2026 +0800 初始化项目,由ModelHub XC社区提供模型 Model: lightblue/DeepSeek-R1-Distill-Qwen-7B-Japanese Source: Original Platform diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 0000000..52373fe --- /dev/null +++ b/.gitattributes @@ -0,0 +1,36 @@ +*.7z filter=lfs diff=lfs merge=lfs -text +*.arrow filter=lfs diff=lfs merge=lfs -text +*.bin filter=lfs diff=lfs merge=lfs -text +*.bz2 filter=lfs diff=lfs merge=lfs -text +*.ckpt filter=lfs diff=lfs merge=lfs -text +*.ftz filter=lfs diff=lfs merge=lfs -text +*.gz filter=lfs diff=lfs merge=lfs -text +*.h5 filter=lfs diff=lfs merge=lfs -text +*.joblib filter=lfs diff=lfs merge=lfs -text +*.lfs.* filter=lfs diff=lfs merge=lfs -text +*.mlmodel filter=lfs diff=lfs merge=lfs -text +*.model filter=lfs diff=lfs merge=lfs -text +*.msgpack filter=lfs diff=lfs merge=lfs -text +*.npy filter=lfs diff=lfs merge=lfs -text +*.npz filter=lfs diff=lfs merge=lfs -text +*.onnx filter=lfs diff=lfs merge=lfs -text +*.ot filter=lfs diff=lfs merge=lfs -text +*.parquet filter=lfs diff=lfs merge=lfs -text +*.pb filter=lfs diff=lfs merge=lfs -text +*.pickle filter=lfs diff=lfs merge=lfs -text +*.pkl filter=lfs diff=lfs merge=lfs -text +*.pt filter=lfs diff=lfs merge=lfs -text +*.pth filter=lfs diff=lfs merge=lfs -text +*.rar filter=lfs diff=lfs merge=lfs -text +*.safetensors filter=lfs diff=lfs merge=lfs -text +saved_model/**/* filter=lfs diff=lfs merge=lfs -text +*.tar.* filter=lfs diff=lfs merge=lfs -text +*.tar filter=lfs diff=lfs merge=lfs -text +*.tflite filter=lfs diff=lfs merge=lfs -text +*.tgz filter=lfs diff=lfs merge=lfs -text +*.wasm filter=lfs diff=lfs merge=lfs -text +*.xz filter=lfs diff=lfs merge=lfs -text +*.zip filter=lfs diff=lfs merge=lfs -text +*.zst filter=lfs diff=lfs merge=lfs -text +*tfevents* filter=lfs diff=lfs merge=lfs -text +tokenizer.json filter=lfs diff=lfs merge=lfs -text diff --git a/README.md b/README.md new 
file mode 100644 index 0000000..dda921d --- /dev/null +++ b/README.md @@ -0,0 +1,373 @@ +--- +library_name: transformers +license: apache-2.0 +base_model: deepseek-ai/DeepSeek-R1-Distill-Qwen-7B +tags: +- llama-factory +- full +- generated_from_trainer +model-index: +- name: distilabel-reasoning-R1-Llama-70B-ja-train + results: [] +datasets: +- lightblue/distilabel-reasoning-R1-Llama-70B +language: +- ja +--- + +[日本語はこちら](#japanese) + +# lightblue/DeepSeek-R1-Distill-Qwen-7B-Japanese + +[Deepseek's R1 models](https://huggingface.co/collections/deepseek-ai/deepseek-r1-678e1e131c0169c0bc89728d) are excellent, state-of-the-art reasoning models which have been trained to work bilingually, with English and Chinese. +However, these models are inconsistent in the language that they produce - often outputting Chinese or English when prompted in Japanese. +For this reason, we developed [lightblue/DeepSeek-R1-Distill-Qwen-7B-Japanese](https://huggingface.co/lightblue/DeepSeek-R1-Distill-Qwen-7B-Japanese) as a Japanese version of R1. + +This model is a Japanese fine-tuned version of [deepseek-ai/DeepSeek-R1-Distill-Qwen-7B](https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Qwen-7B) on our [lightblue/distilabel-reasoning-R1-Llama-70B](https://huggingface.co/datasets/lightblue/distilabel-reasoning-R1-Llama-70B) dataset which reliably and accurately outputs Japanese in response to prompts. + +This model was trained for \<10 minutes on the 8 x L20 instance ([ecs.gn8is-8x.32xlarge](https://www.alibabacloud.com/help/en/ecs/user-guide/gpu-accelerated-compute-optimized-and-vgpu-accelerated-instance-families-1)) on [Alibaba Cloud](https://www.alibabacloud.com/). + +# How to use + +When using these models, we recommend using a sampling temperature between 0.5 and 0.7, [as per the original distilled R1 models](https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Qwen-7B#usage-recommendations). 
+ +Additionally, we have observed that the model sometimes tends to repeat itself more than the original R1 model, so we also recommend setting `repetition_penalty` to 1.1, or higher if the model repeats itself when processing your prompts. + +We include scripts to use this model in vLLM: + + + +# Evaluation + +We evaluated this model for output accuracy and the percentage of valid Japanese `<think>` sections using the first 50 rows of the [SakanaAI/gsm8k-ja-test_250-1319](https://huggingface.co/datasets/SakanaAI/gsm8k-ja-test_250-1319) dataset. + +We compare this to the original R1 model and test in both regimes where repetition penalty is 1.0 and 1.1: + +| | Repetition Penalty | Answer accuracy (%) | Valid Japanese `<think>` (%) | +|------------------------------------------------|--------------------|---------------------|----------------------------| +| deepseek-ai/DeepSeek-R1-Distill-Qwen-7B | 1.0 | 60 | 94 | +| deepseek-ai/DeepSeek-R1-Distill-Qwen-7B | 1.1 | 62 | 96 | +| lightblue/DeepSeek-R1-Distill-Qwen-7B-Japanese | 1.0 | 66 | 92 | +| lightblue/DeepSeek-R1-Distill-Qwen-7B-Japanese | 1.1 | **70** | **98** | + +Code for the SakanaAI/gsm8k-ja-test_250-1319 evaluation can be found [here](https://drive.google.com/file/d/1gCzCJv5vasw8R3KVQimfoIDFyfxwxNvC/view?usp=sharing). + + +We further use the first 50 prompts from [DeL-TaiseiOzaki/Tengentoppa-sft-reasoning-ja](https://huggingface.co/datasets/DeL-TaiseiOzaki/Tengentoppa-sft-reasoning-ja) to evaluate the percentage of valid Japanese `<think>` sections in model responses. +This benchmark contains more varied and complex prompts, meaning this is a more realistic evaluation of how reliably this model can output Japanese. 
+ +| | Repetition Penalty | Valid Japanese `<think>` (%) | +|------------------------------------------------|--------------------|----------------------------| +| deepseek-ai/DeepSeek-R1-Distill-Qwen-7B | 1.0 | 48 | +| deepseek-ai/DeepSeek-R1-Distill-Qwen-7B | 1.1 | 48 | +| lightblue/DeepSeek-R1-Distill-Qwen-7B-Japanese | 1.0 | 84 | +| lightblue/DeepSeek-R1-Distill-Qwen-7B-Japanese | 1.1 | **94** | + +Code for the DeL-TaiseiOzaki/Tengentoppa-sft-reasoning-ja evaluation can be found [here](https://drive.google.com/file/d/1f75IM5x1SZrb300odkEsLMfKsfibrxvR/view?usp=sharing). + +# How this model was made + +We made the data for this model using the following steps: + +1. Sample English reasoning-style prompts from [argilla/distilabel-reasoning-prompts](https://huggingface.co/datasets/argilla/distilabel-reasoning-prompts). +2. Remove similar prompts using text similarity based on [BAAI/bge-m3](https://huggingface.co/BAAI/bge-m3) embeddings. +3. Translate English prompts to Japanese using [gpt-4o-mini-2024-07-18](https://openai.com/index/gpt-4o-mini-advancing-cost-efficient-intelligence/). +4. Generate answers to prompts using [deepseek-ai/DeepSeek-R1-Distill-Llama-70B](https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Llama-70B). +5. Filter out responses which did not: + * Finish within 2048 tokens + * Contain a valid `<think>` section + * Have the `<think>` section written in Japanese + +We used this data to train our model using supervised fine tuning on [LLaMA-Factory](https://github.com/hiyouga/LLaMA-Factory) with the [ecs.gn8is-8x.32xlarge](https://www.alibabacloud.com/help/en/ecs/user-guide/gpu-accelerated-compute-optimized-and-vgpu-accelerated-instance-families-1) instance. + + +
+
+

日本語

+ +[DeepseekのR1モデル](https://huggingface.co/collections/deepseek-ai/deepseek-r1-678e1e131c0169c0bc89728d)は優れた、最先端の推論モデルであり、英語と中国語のバイリンガルで動作するように訓練されています。しかし、これらのモデルは出力される言語が一貫していないことがあり、日本語でプロンプトを与えると中国語や英語を出力することがあります。そこで、我々はR1の日本語版として[lightblue/DeepSeek-R1-Distill-Qwen-7B-Japanese](https://huggingface.co/lightblue/DeepSeek-R1-Distill-Qwen-7B-Japanese)を開発しました。 + +このモデルは、我々の[lightblue/distilabel-reasoning-R1-Llama-70B](https://huggingface.co/datasets/lightblue/distilabel-reasoning-R1-Llama-70B) データセットを使用して、[deepseek-ai/DeepSeek-R1-Distill-Qwen-7B](https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Qwen-7B)の日本語版として微調整されています。 + +このモデルは、[Alibaba Cloud](https://www.alibabacloud.com/)の8 x L20インスタンス([ecs.gn8is-8x.32xlarge](https://www.alibabacloud.com/help/en/ecs/user-guide/gpu-accelerated-compute-optimized-and-vgpu-accelerated-instance-families-1))で\<10分間訓練されました。 + +# 使用方法 + +これらのモデルを使用する際は、元の蒸留R1モデルで推奨されているように、サンプリング温度を0.5から0.7の間で使用することをお勧めします。 + +また、モデルが元のR1モデルよりも繰り返しがちな傾向があるため、プロンプトを処理する際にモデルが自分を繰り返す場合は、`repetition_penalty`を1.1またはそれ以上に設定することをお勧めします。 + +このモデルをvLLMで使用するためのスクリプトを含めています: + +
    +
  • vLLM + +[vLLM](https://github.com/vllm-project/vllm/)をインストールするには、 `pip install vllm`を使用します。 + +
    + vLLMコードを表示 + +```python +from vllm import LLM, SamplingParams + +llm = LLM( + model="lightblue/DeepSeek-R1-Distill-Qwen-7B-Japanese", + max_model_len=8_000 +) + +sampling_params = SamplingParams( + temperature=0.5, + max_tokens=8_000, + repetition_penalty=1.1 +) + +prompts = [ + """学校には1クラスにつき20人の生徒がおり、クラスは合計3つあります。 +学校全体では男子と女子がそれぞれ50%ずついます。 +1つ目のクラスには女子が15人、2つ目のクラスには女子が12人います。 +3つ目のクラスには何人の男子がいますか?""" +] + +conversations = [ + [{"role": "user", "content": x}] for x in prompts +] + +outputs = llm.chat(conversations, sampling_params=sampling_params) + +for output in outputs: + print(output.outputs[0].text) + + +# まず、学校の総生徒数を算出します。各クラスに20人の生徒があり、クラスは3つあるため、総生徒数は60人です。 + +# 次に、学校全体で男子と女子は同じ人数で分布しています。したがって、男子と女子各有30人。 +... +# したがって、3つ目のクラスの男子数は20 - 3 = 17人です。 +# + +# **解答:** + +# 学校の総生徒数を算出します。 +... +# **最終的な答え:** +# \[ +# \boxed{17} +# \] +``` + +
  • +
+ +# 評価 + +このモデルは、[SakanaAI/gsm8k-ja-test_250-1319](https://huggingface.co/datasets/SakanaAI/gsm8k-ja-test_250-1319)データセットの最初の50行を使用して、出力の正確性と有効な日本語の`<think>`セクションの割合を評価しました。 + +これは元のR1モデルと比較し、繰り返しペナルティが1.0と1.1の両方の条件でテストを行いました: + +| | Repetition Penalty | Answer accuracy (%) | Valid Japanese `<think>` (%) | +|------------------------------------------------|--------------------|---------------------|----------------------------| +| deepseek-ai/DeepSeek-R1-Distill-Qwen-7B | 1.0 | 60 | 94 | +| deepseek-ai/DeepSeek-R1-Distill-Qwen-7B | 1.1 | 62 | 96 | +| lightblue/DeepSeek-R1-Distill-Qwen-7B-Japanese | 1.0 | 66 | 92 | +| lightblue/DeepSeek-R1-Distill-Qwen-7B-Japanese | 1.1 | 70 | 98 | + +SakanaAI/gsm8k-ja-test_250-1319の評価コードは[こちら](https://drive.google.com/file/d/1gCzCJv5vasw8R3KVQimfoIDFyfxwxNvC/view?usp=sharing)にあります。 + +さらに、[DeL-TaiseiOzaki/Tengentoppa-sft-reasoning-ja](https://huggingface.co/datasets/DeL-TaiseiOzaki/Tengentoppa-sft-reasoning-ja)の最初の50プロンプトを使用して、モデル応答における有効な日本語の`<think>`セクションの割合を評価します。このベンチマークにはより多様で複雑なプロンプトが含まれており、モデルが日本語を信頼性高く出力できるかどうかを、より現実的に評価します。 + +| | Repetition Penalty | Valid Japanese `<think>` (%) | +|------------------------------------------------|--------------------|----------------------------| +| deepseek-ai/DeepSeek-R1-Distill-Qwen-7B | 1.0 | 48 | +| deepseek-ai/DeepSeek-R1-Distill-Qwen-7B | 1.1 | 48 | +| lightblue/DeepSeek-R1-Distill-Qwen-7B-Japanese | 1.0 | 84 | +| lightblue/DeepSeek-R1-Distill-Qwen-7B-Japanese | 1.1 | 94 | + +DeL-TaiseiOzaki/Tengentoppa-sft-reasoning-ja評価コードは[こちら](https://drive.google.com/file/d/1f75IM5x1SZrb300odkEsLMfKsfibrxvR/view?usp=sharing)にあります。 + +# 作成方法 + +このモデルのデータは以下の手順で作成されました: + +1. [argilla/distilabel-reasoning-prompts](https://huggingface.co/datasets/argilla/distilabel-reasoning-prompts)から英語の推論スタイルのプロンプトをサンプルします。 +2. [BAAI/bge-m3](https://huggingface.co/BAAI/bge-m3)埋め込みに基づくテキスト類似度を使用して、類似したプロンプトを削除します。 +3. 
[gpt-4o-mini-2024-07-18](https://openai.com/index/gpt-4o-mini-advancing-cost-efficient-intelligence/)を使用して、英語のプロンプトを日本語に翻訳します。 +4. [deepseek-ai/DeepSeek-R1-Distill-Llama-70B](https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Llama-70B)を使用して、プロンプトに対する回答を生成します。 +5. 以下の条件を満たさない応答をフィルタリングします: + * 2048トークン以内に終了すること + * 有効な`<think>`セクションを含んでいること + * `<think>`セクションが日本語で書かれていること + + +# Training details +
+ Full training config + + ### Training config yaml + + ```yaml +### model +model_name_or_path: deepseek-ai/DeepSeek-R1-Distill-Qwen-7B + +### method +stage: sft +do_train: true +finetuning_type: full +deepspeed: /root/LLaMA-Factory/examples/deepspeed/ds_z2_config.json + +### dataset +dataset: distilabel-reasoning-R1-Llama-70B-ja-train +template: qwen +cutoff_len: 4500 +overwrite_cache: true +preprocessing_num_workers: 16 +packing: true + +### output +output_dir: /root/train_outputs/DeepSeek-R1-Distill-Qwen-7B/distilabel-reasoning-R1-Llama-70B-ja-train +logging_steps: 1 +save_steps: 0.99999 +plot_loss: true +overwrite_output_dir: true + +### train +per_device_train_batch_size: 1 +gradient_accumulation_steps: 1 +learning_rate: 1.0e-5 +num_train_epochs: 1.0 +lr_scheduler_type: cosine +warmup_ratio: 0.01 +bf16: true +ddp_timeout: 180000000 + +### eval +val_size: 0.01 +per_device_eval_batch_size: 1 +eval_strategy: steps +eval_steps: 0.1 +``` + +### Training run script + +```shell +echo '{ + "distilabel-reasoning-R1-Llama-70B-ja-train": { + "hf_hub_url": "lightblue/distilabel-reasoning-R1-Llama-70B-ja-train", + "formatting": "sharegpt" + } +}' > /root/LLaMA-Factory/data/dataset_info.json + +cd /root/LLaMA-Factory && llamafactory-cli train /root/reasoning_train.yaml + +rm -r /root/train_outputs/DeepSeek-R1-Distill-Qwen-7B/distilabel-reasoning-R1-Llama-70B-ja-train/checkpoint* +huggingface-cli upload lightblue/DeepSeek-R1-Distill-Qwen-7B-Japanese /root/train_outputs/DeepSeek-R1-Distill-Qwen-7B/distilabel-reasoning-R1-Llama-70B-ja-train +``` + +### Training hyperparameters + +The following hyperparameters were used during training: +- learning_rate: 1e-05 +- train_batch_size: 1 +- eval_batch_size: 1 +- seed: 42 +- distributed_type: multi-GPU +- num_devices: 8 +- total_train_batch_size: 8 +- total_eval_batch_size: 8 +- optimizer: Use adamw_torch with betas=(0.9,0.999) and epsilon=1e-08 and optimizer_args=No additional optimizer arguments +- lr_scheduler_type: cosine +- 
lr_scheduler_warmup_ratio: 0.01 +- num_epochs: 1.0 + +### Training results + +| Training Loss | Epoch | Step | Validation Loss | +|:-------------:|:------:|:----:|:---------------:| +| 0.766 | 0.1087 | 5 | 0.5912 | +| 0.5873 | 0.2174 | 10 | 0.5282 | +| 0.3868 | 0.3261 | 15 | 0.4958 | +| 0.5101 | 0.4348 | 20 | 0.4761 | +| 0.4085 | 0.5435 | 25 | 0.4644 | +| 0.5561 | 0.6522 | 30 | 0.4578 | +| 0.4683 | 0.7609 | 35 | 0.4542 | +| 0.5055 | 0.8696 | 40 | 0.4526 | +| 0.5359 | 0.9783 | 45 | 0.4519 | + + +### Framework versions + +- Transformers 4.46.1 +- Pytorch 2.5.1+cu124 +- Datasets 3.1.0 +- Tokenizers 0.20.3 +
+ +
+ +# License + +We share this model under an Apache 2.0 license. + +# Developed by + + +Lightblue technology logo + + +This model was trained by Peter Devine ([ptrdvn](https://huggingface.co/ptrdvn)) for Lightblue diff --git a/all_results.json b/all_results.json new file mode 100644 index 0000000..854cda0 --- /dev/null +++ b/all_results.json @@ -0,0 +1,12 @@ +{ + "epoch": 1.0, + "eval_loss": 0.4519386291503906, + "eval_runtime": 0.8439, + "eval_samples_per_second": 4.74, + "eval_steps_per_second": 1.185, + "total_flos": 7.03161217014825e+16, + "train_loss": 0.50457603516786, + "train_runtime": 362.0803, + "train_samples_per_second": 1.016, + "train_steps_per_second": 0.127 +} \ No newline at end of file diff --git a/config.json b/config.json new file mode 100644 index 0000000..8d7c21d --- /dev/null +++ b/config.json @@ -0,0 +1,30 @@ +{ + "_name_or_path": "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B", + "architectures": [ + "Qwen2ForCausalLM" + ], + "attention_dropout": 0.0, + "bos_token_id": 151643, + "eos_token_id": 151643, + "hidden_act": "silu", + "hidden_size": 3584, + "initializer_range": 0.02, + "intermediate_size": 18944, + "max_position_embeddings": 131072, + "max_window_layers": 28, + "model_type": "qwen2", + "num_attention_heads": 28, + "num_hidden_layers": 28, + "num_key_value_heads": 4, + "rms_norm_eps": 1e-06, + "rope_scaling": null, + "rope_theta": 10000, + "sliding_window": null, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.46.1", + "use_cache": false, + "use_mrope": false, + "use_sliding_window": false, + "vocab_size": 152064 +} diff --git a/eval_results.json b/eval_results.json new file mode 100644 index 0000000..20f2ee7 --- /dev/null +++ b/eval_results.json @@ -0,0 +1,7 @@ +{ + "epoch": 1.0, + "eval_loss": 0.4519386291503906, + "eval_runtime": 0.8439, + "eval_samples_per_second": 4.74, + "eval_steps_per_second": 1.185 +} \ No newline at end of file diff --git a/generation_config.json 
b/generation_config.json new file mode 100644 index 0000000..682e597 --- /dev/null +++ b/generation_config.json @@ -0,0 +1,9 @@ +{ + "_from_model_config": true, + "bos_token_id": 151646, + "do_sample": true, + "eos_token_id": 151643, + "temperature": 0.6, + "top_p": 0.95, + "transformers_version": "4.46.1" +} diff --git a/model-00001-of-00004.safetensors b/model-00001-of-00004.safetensors new file mode 100644 index 0000000..9016f30 --- /dev/null +++ b/model-00001-of-00004.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e58d9c77e07dfdbe8cfd0597028134f10fbd1d5a530b7fddcb16c0d2917cc631 +size 4877660776 diff --git a/model-00002-of-00004.safetensors b/model-00002-of-00004.safetensors new file mode 100644 index 0000000..cecf883 --- /dev/null +++ b/model-00002-of-00004.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0c32a8421c2ce3d455a55c744697d8aeb80e7454910a34825bab37c0c57abdd3 +size 4932751008 diff --git a/model-00003-of-00004.safetensors b/model-00003-of-00004.safetensors new file mode 100644 index 0000000..50e715b --- /dev/null +++ b/model-00003-of-00004.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5925fa8c3c2c35e349b7e0f61bdf82813b028c4094ecc7b7b42965d51b98d78e +size 4330865200 diff --git a/model-00004-of-00004.safetensors b/model-00004-of-00004.safetensors new file mode 100644 index 0000000..7f18bcc --- /dev/null +++ b/model-00004-of-00004.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6aa7af91a9bdcda2622a26a53532a02cdf2ca98374fd503737562a1a8af8f4e9 +size 1089994880 diff --git a/model.safetensors.index.json b/model.safetensors.index.json new file mode 100644 index 0000000..6ca5084 --- /dev/null +++ b/model.safetensors.index.json @@ -0,0 +1,346 @@ +{ + "metadata": { + "total_size": 15231233024 + }, + "weight_map": { + "lm_head.weight": "model-00004-of-00004.safetensors", + "model.embed_tokens.weight": 
"model-00001-of-00004.safetensors", + "model.layers.0.input_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.0.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.0.self_attn.k_proj.bias": "model-00001-of-00004.safetensors", + "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.self_attn.q_proj.bias": "model-00001-of-00004.safetensors", + "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.self_attn.v_proj.bias": "model-00001-of-00004.safetensors", + "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.input_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.1.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.1.self_attn.k_proj.bias": "model-00001-of-00004.safetensors", + "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.self_attn.q_proj.bias": "model-00001-of-00004.safetensors", + "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.self_attn.v_proj.bias": "model-00001-of-00004.safetensors", + "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.10.input_layernorm.weight": 
"model-00002-of-00004.safetensors", + "model.layers.10.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.10.self_attn.k_proj.bias": "model-00002-of-00004.safetensors", + "model.layers.10.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.self_attn.q_proj.bias": "model-00002-of-00004.safetensors", + "model.layers.10.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.self_attn.v_proj.bias": "model-00002-of-00004.safetensors", + "model.layers.10.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.11.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.11.self_attn.k_proj.bias": "model-00002-of-00004.safetensors", + "model.layers.11.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.self_attn.q_proj.bias": "model-00002-of-00004.safetensors", + "model.layers.11.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.self_attn.v_proj.bias": "model-00002-of-00004.safetensors", + "model.layers.11.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.input_layernorm.weight": "model-00002-of-00004.safetensors", + 
"model.layers.12.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.12.self_attn.k_proj.bias": "model-00002-of-00004.safetensors", + "model.layers.12.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.self_attn.q_proj.bias": "model-00002-of-00004.safetensors", + "model.layers.12.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.self_attn.v_proj.bias": "model-00002-of-00004.safetensors", + "model.layers.12.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.13.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.13.self_attn.k_proj.bias": "model-00002-of-00004.safetensors", + "model.layers.13.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.self_attn.q_proj.bias": "model-00002-of-00004.safetensors", + "model.layers.13.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.self_attn.v_proj.bias": "model-00002-of-00004.safetensors", + "model.layers.13.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.14.mlp.down_proj.weight": "model-00002-of-00004.safetensors", 
+ "model.layers.14.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.14.self_attn.k_proj.bias": "model-00002-of-00004.safetensors", + "model.layers.14.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.self_attn.q_proj.bias": "model-00002-of-00004.safetensors", + "model.layers.14.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.self_attn.v_proj.bias": "model-00002-of-00004.safetensors", + "model.layers.14.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.15.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.15.self_attn.k_proj.bias": "model-00002-of-00004.safetensors", + "model.layers.15.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.self_attn.q_proj.bias": "model-00002-of-00004.safetensors", + "model.layers.15.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.self_attn.v_proj.bias": "model-00002-of-00004.safetensors", + "model.layers.15.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.16.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.mlp.gate_proj.weight": 
"model-00002-of-00004.safetensors", + "model.layers.16.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.16.self_attn.k_proj.bias": "model-00002-of-00004.safetensors", + "model.layers.16.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.self_attn.q_proj.bias": "model-00002-of-00004.safetensors", + "model.layers.16.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.self_attn.v_proj.bias": "model-00002-of-00004.safetensors", + "model.layers.16.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.17.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.17.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.17.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.17.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.17.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.17.self_attn.k_proj.bias": "model-00002-of-00004.safetensors", + "model.layers.17.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.17.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.17.self_attn.q_proj.bias": "model-00002-of-00004.safetensors", + "model.layers.17.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.17.self_attn.v_proj.bias": "model-00002-of-00004.safetensors", + "model.layers.17.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.18.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.18.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.18.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.18.mlp.up_proj.weight": 
"model-00002-of-00004.safetensors", + "model.layers.18.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.18.self_attn.k_proj.bias": "model-00002-of-00004.safetensors", + "model.layers.18.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.18.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.18.self_attn.q_proj.bias": "model-00002-of-00004.safetensors", + "model.layers.18.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.18.self_attn.v_proj.bias": "model-00002-of-00004.safetensors", + "model.layers.18.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.19.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.19.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.19.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.19.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.19.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.19.self_attn.k_proj.bias": "model-00003-of-00004.safetensors", + "model.layers.19.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.19.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.19.self_attn.q_proj.bias": "model-00003-of-00004.safetensors", + "model.layers.19.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.19.self_attn.v_proj.bias": "model-00003-of-00004.safetensors", + "model.layers.19.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.2.input_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.2.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + 
"model.layers.2.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.2.self_attn.k_proj.bias": "model-00001-of-00004.safetensors", + "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.self_attn.q_proj.bias": "model-00001-of-00004.safetensors", + "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.self_attn.v_proj.bias": "model-00001-of-00004.safetensors", + "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.20.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.20.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.20.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.20.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.20.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.20.self_attn.k_proj.bias": "model-00003-of-00004.safetensors", + "model.layers.20.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.20.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.20.self_attn.q_proj.bias": "model-00003-of-00004.safetensors", + "model.layers.20.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.20.self_attn.v_proj.bias": "model-00003-of-00004.safetensors", + "model.layers.20.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.21.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.post_attention_layernorm.weight": 
"model-00003-of-00004.safetensors", + "model.layers.21.self_attn.k_proj.bias": "model-00003-of-00004.safetensors", + "model.layers.21.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.self_attn.q_proj.bias": "model-00003-of-00004.safetensors", + "model.layers.21.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.self_attn.v_proj.bias": "model-00003-of-00004.safetensors", + "model.layers.21.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.22.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.22.self_attn.k_proj.bias": "model-00003-of-00004.safetensors", + "model.layers.22.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.self_attn.q_proj.bias": "model-00003-of-00004.safetensors", + "model.layers.22.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.self_attn.v_proj.bias": "model-00003-of-00004.safetensors", + "model.layers.22.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.23.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + 
"model.layers.23.self_attn.k_proj.bias": "model-00003-of-00004.safetensors", + "model.layers.23.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.self_attn.q_proj.bias": "model-00003-of-00004.safetensors", + "model.layers.23.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.self_attn.v_proj.bias": "model-00003-of-00004.safetensors", + "model.layers.23.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.24.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.24.self_attn.k_proj.bias": "model-00003-of-00004.safetensors", + "model.layers.24.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.self_attn.q_proj.bias": "model-00003-of-00004.safetensors", + "model.layers.24.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.self_attn.v_proj.bias": "model-00003-of-00004.safetensors", + "model.layers.24.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.25.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.25.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.25.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.25.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.25.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.25.self_attn.k_proj.bias": 
"model-00003-of-00004.safetensors", + "model.layers.25.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.25.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.25.self_attn.q_proj.bias": "model-00003-of-00004.safetensors", + "model.layers.25.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.25.self_attn.v_proj.bias": "model-00003-of-00004.safetensors", + "model.layers.25.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.26.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.26.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.26.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.26.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.26.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.26.self_attn.k_proj.bias": "model-00003-of-00004.safetensors", + "model.layers.26.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.26.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.26.self_attn.q_proj.bias": "model-00003-of-00004.safetensors", + "model.layers.26.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.26.self_attn.v_proj.bias": "model-00003-of-00004.safetensors", + "model.layers.26.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.27.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.27.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.27.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.27.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.27.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.27.self_attn.k_proj.bias": "model-00003-of-00004.safetensors", + 
"model.layers.27.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.27.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.27.self_attn.q_proj.bias": "model-00003-of-00004.safetensors", + "model.layers.27.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.27.self_attn.v_proj.bias": "model-00003-of-00004.safetensors", + "model.layers.27.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.3.input_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.3.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.3.self_attn.k_proj.bias": "model-00001-of-00004.safetensors", + "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.self_attn.q_proj.bias": "model-00001-of-00004.safetensors", + "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.self_attn.v_proj.bias": "model-00001-of-00004.safetensors", + "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.input_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.4.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.4.self_attn.k_proj.bias": "model-00001-of-00004.safetensors", + "model.layers.4.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + 
"model.layers.4.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.self_attn.q_proj.bias": "model-00001-of-00004.safetensors", + "model.layers.4.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.self_attn.v_proj.bias": "model-00001-of-00004.safetensors", + "model.layers.4.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.input_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.5.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.5.self_attn.k_proj.bias": "model-00001-of-00004.safetensors", + "model.layers.5.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.self_attn.q_proj.bias": "model-00001-of-00004.safetensors", + "model.layers.5.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.self_attn.v_proj.bias": "model-00001-of-00004.safetensors", + "model.layers.5.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.input_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.6.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.6.self_attn.k_proj.bias": "model-00001-of-00004.safetensors", + "model.layers.6.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + 
"model.layers.6.self_attn.q_proj.bias": "model-00001-of-00004.safetensors", + "model.layers.6.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.self_attn.v_proj.bias": "model-00001-of-00004.safetensors", + "model.layers.6.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.input_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.7.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.7.self_attn.k_proj.bias": "model-00001-of-00004.safetensors", + "model.layers.7.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.self_attn.q_proj.bias": "model-00001-of-00004.safetensors", + "model.layers.7.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.self_attn.v_proj.bias": "model-00001-of-00004.safetensors", + "model.layers.7.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.8.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.8.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.8.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.8.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.8.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.8.self_attn.k_proj.bias": "model-00001-of-00004.safetensors", + "model.layers.8.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.8.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.8.self_attn.q_proj.bias": "model-00001-of-00004.safetensors", + 
"model.layers.8.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.8.self_attn.v_proj.bias": "model-00001-of-00004.safetensors", + "model.layers.8.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.9.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.9.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.9.self_attn.k_proj.bias": "model-00002-of-00004.safetensors", + "model.layers.9.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.self_attn.q_proj.bias": "model-00002-of-00004.safetensors", + "model.layers.9.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.self_attn.v_proj.bias": "model-00002-of-00004.safetensors", + "model.layers.9.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.norm.weight": "model-00003-of-00004.safetensors" + } +} diff --git a/special_tokens_map.json b/special_tokens_map.json new file mode 100644 index 0000000..71e14b3 --- /dev/null +++ b/special_tokens_map.json @@ -0,0 +1,23 @@ +{ + "bos_token": { + "content": "<|begin▁of▁sentence|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|im_end|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": { + "content": "<|end▁of▁sentence|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/tokenizer.json b/tokenizer.json new file mode 100644 index 0000000..b6bc6a8 --- /dev/null +++ b/tokenizer.json @@ -0,0 +1,3 @@ +version 
https://git-lfs.github.com/spec/v1 +oid sha256:02643f00207dfc5ed248992486bde04314c21dca556bf65ce520690962b8db63 +size 11422965 diff --git a/tokenizer_config.json b/tokenizer_config.json new file mode 100644 index 0000000..528ce85 --- /dev/null +++ b/tokenizer_config.json @@ -0,0 +1,204 @@ +{ + "add_bos_token": true, + "add_eos_token": false, + "add_prefix_space": null, + "added_tokens_decoder": { + "151643": { + "content": "<|end▁of▁sentence|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151644": { + "content": "<|User|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151645": { + "content": "<|Assistant|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151646": { + "content": "<|begin▁of▁sentence|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151647": { + "content": "<|EOT|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151648": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151649": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151650": { + "content": "<|quad_start|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151651": { + "content": "<|quad_end|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151652": { + "content": "<|vision_start|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151653": { + "content": "<|vision_end|>", + "lstrip": false, + "normalized": false, + 
"rstrip": false, + "single_word": false, + "special": true + }, + "151654": { + "content": "<|vision_pad|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151655": { + "content": "<|image_pad|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151656": { + "content": "<|video_pad|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151657": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151658": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151659": { + "content": "<|fim_prefix|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151660": { + "content": "<|fim_middle|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151661": { + "content": "<|fim_suffix|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151662": { + "content": "<|fim_pad|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151663": { + "content": "<|repo_name|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151664": { + "content": "<|file_sep|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151665": { + "content": "<|im_end|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "<|begin▁of▁sentence|>", + "chat_template": "{% if not add_generation_prompt is defined %}{% set 
add_generation_prompt = false %}{% endif %}{% set ns = namespace(is_first=false, is_tool=false, is_output_first=true, system_prompt='') %}{%- for message in messages %}{%- if message['role'] == 'system' %}{% set ns.system_prompt = message['content'] %}{%- endif %}{%- endfor %}{{bos_token}}{{ns.system_prompt}}{%- for message in messages %}{%- if message['role'] == 'user' %}{%- set ns.is_tool = false -%}{{'<|User|>' + message['content']}}{%- endif %}{%- if message['role'] == 'assistant' and message['content'] is none %}{%- set ns.is_tool = false -%}{%- for tool in message['tool_calls']%}{%- if not ns.is_first %}{{'<|Assistant|><|tool▁calls▁begin|><|tool▁call▁begin|>' + tool['type'] + '<|tool▁sep|>' + tool['function']['name'] + '\\n' + '```json' + '\\n' + tool['function']['arguments'] + '\\n' + '```' + '<|tool▁call▁end|>'}}{%- set ns.is_first = true -%}{%- else %}{{'\\n' + '<|tool▁call▁begin|>' + tool['type'] + '<|tool▁sep|>' + tool['function']['name'] + '\\n' + '```json' + '\\n' + tool['function']['arguments'] + '\\n' + '```' + '<|tool▁call▁end|>'}}{{'<|tool▁calls▁end|><|end▁of▁sentence|>'}}{%- endif %}{%- endfor %}{%- endif %}{%- if message['role'] == 'assistant' and message['content'] is not none %}{%- if ns.is_tool %}{{'<|tool▁outputs▁end|>' + message['content'] + '<|end▁of▁sentence|>'}}{%- set ns.is_tool = false -%}{%- else %}{% set content = message['content'] %}{% if '
' in content %}{% set content = content.split('
')[-1] %}{% endif %}{{'<|Assistant|>' + content + '<|end▁of▁sentence|>'}}{%- endif %}{%- endif %}{%- if message['role'] == 'tool' %}{%- set ns.is_tool = true -%}{%- if ns.is_output_first %}{{'<|tool▁outputs▁begin|><|tool▁output▁begin|>' + message['content'] + '<|tool▁output▁end|>'}}{%- set ns.is_output_first = false %}{%- else %}{{'\\n<|tool▁output▁begin|>' + message['content'] + '<|tool▁output▁end|>'}}{%- endif %}{%- endif %}{%- endfor -%}{% if ns.is_tool %}{{'<|tool▁outputs▁end|>'}}{% endif %}{% if add_generation_prompt and not ns.is_tool %}{{'<|Assistant|>'}}{% endif %}", + "clean_up_tokenization_spaces": false, + "eos_token": "<|im_end|>", + "legacy": true, + "model_max_length": 16384, + "pad_token": "<|end▁of▁sentence|>", + "padding_side": "right", + "sp_model_kwargs": {}, + "split_special_tokens": false, + "tokenizer_class": "LlamaTokenizer", + "unk_token": null, + "use_default_system_prompt": false +} diff --git a/train_results.json b/train_results.json new file mode 100644 index 0000000..55efa28 --- /dev/null +++ b/train_results.json @@ -0,0 +1,8 @@ +{ + "epoch": 1.0, + "total_flos": 7.03161217014825e+16, + "train_loss": 0.50457603516786, + "train_runtime": 362.0803, + "train_samples_per_second": 1.016, + "train_steps_per_second": 0.127 +} \ No newline at end of file diff --git a/trainer_log.jsonl b/trainer_log.jsonl new file mode 100644 index 0000000..a694f12 --- /dev/null +++ b/trainer_log.jsonl @@ -0,0 +1,56 @@ +{"current_steps": 1, "total_steps": 46, "loss": 0.706, "lr": 1e-05, "epoch": 0.021739130434782608, "percentage": 2.17, "elapsed_time": "0:00:07", "remaining_time": "0:05:39"} +{"current_steps": 2, "total_steps": 46, "loss": 0.643, "lr": 9.987820251299121e-06, "epoch": 0.043478260869565216, "percentage": 4.35, "elapsed_time": "0:00:14", "remaining_time": "0:05:09"} +{"current_steps": 3, "total_steps": 46, "loss": 0.722, "lr": 9.951340343707852e-06, "epoch": 0.06521739130434782, "percentage": 6.52, "elapsed_time": "0:00:20", "remaining_time": 
"0:04:53"} +{"current_steps": 4, "total_steps": 46, "loss": 0.6543, "lr": 9.890738003669029e-06, "epoch": 0.08695652173913043, "percentage": 8.7, "elapsed_time": "0:00:27", "remaining_time": "0:04:45"} +{"current_steps": 5, "total_steps": 46, "loss": 0.766, "lr": 9.806308479691595e-06, "epoch": 0.10869565217391304, "percentage": 10.87, "elapsed_time": "0:00:33", "remaining_time": "0:04:36"} +{"current_steps": 5, "total_steps": 46, "eval_loss": 0.5912319421768188, "epoch": 0.10869565217391304, "percentage": 10.87, "elapsed_time": "0:00:34", "remaining_time": "0:04:43"} +{"current_steps": 6, "total_steps": 46, "loss": 0.5495, "lr": 9.698463103929542e-06, "epoch": 0.13043478260869565, "percentage": 13.04, "elapsed_time": "0:00:41", "remaining_time": "0:04:34"} +{"current_steps": 7, "total_steps": 46, "loss": 0.5193, "lr": 9.567727288213005e-06, "epoch": 0.15217391304347827, "percentage": 15.22, "elapsed_time": "0:00:47", "remaining_time": "0:04:24"} +{"current_steps": 8, "total_steps": 46, "loss": 0.5578, "lr": 9.414737964294636e-06, "epoch": 0.17391304347826086, "percentage": 17.39, "elapsed_time": "0:00:54", "remaining_time": "0:04:17"} +{"current_steps": 9, "total_steps": 46, "loss": 0.3643, "lr": 9.24024048078213e-06, "epoch": 0.1956521739130435, "percentage": 19.57, "elapsed_time": "0:01:00", "remaining_time": "0:04:09"} +{"current_steps": 10, "total_steps": 46, "loss": 0.5873, "lr": 9.045084971874738e-06, "epoch": 0.21739130434782608, "percentage": 21.74, "elapsed_time": "0:01:07", "remaining_time": "0:04:02"} +{"current_steps": 10, "total_steps": 46, "eval_loss": 0.5282274484634399, "epoch": 0.21739130434782608, "percentage": 21.74, "elapsed_time": "0:01:08", "remaining_time": "0:04:05"} +{"current_steps": 11, "total_steps": 46, "loss": 0.6398, "lr": 8.83022221559489e-06, "epoch": 0.2391304347826087, "percentage": 23.91, "elapsed_time": "0:01:14", "remaining_time": "0:03:57"} +{"current_steps": 12, "total_steps": 46, "loss": 0.4296, "lr": 8.596699001693257e-06, 
"epoch": 0.2608695652173913, "percentage": 26.09, "elapsed_time": "0:01:21", "remaining_time": "0:03:49"} +{"current_steps": 13, "total_steps": 46, "loss": 0.5244, "lr": 8.345653031794292e-06, "epoch": 0.2826086956521739, "percentage": 28.26, "elapsed_time": "0:01:27", "remaining_time": "0:03:42"} +{"current_steps": 14, "total_steps": 46, "loss": 0.4739, "lr": 8.078307376628292e-06, "epoch": 0.30434782608695654, "percentage": 30.43, "elapsed_time": "0:01:34", "remaining_time": "0:03:34"} +{"current_steps": 15, "total_steps": 46, "loss": 0.3868, "lr": 7.795964517353734e-06, "epoch": 0.32608695652173914, "percentage": 32.61, "elapsed_time": "0:01:40", "remaining_time": "0:03:27"} +{"current_steps": 15, "total_steps": 46, "eval_loss": 0.49576932191848755, "epoch": 0.32608695652173914, "percentage": 32.61, "elapsed_time": "0:01:41", "remaining_time": "0:03:29"} +{"current_steps": 16, "total_steps": 46, "loss": 0.5849, "lr": 7.500000000000001e-06, "epoch": 0.34782608695652173, "percentage": 34.78, "elapsed_time": "0:01:47", "remaining_time": "0:03:22"} +{"current_steps": 17, "total_steps": 46, "loss": 0.4854, "lr": 7.191855733945388e-06, "epoch": 0.3695652173913043, "percentage": 36.96, "elapsed_time": "0:01:54", "remaining_time": "0:03:15"} +{"current_steps": 18, "total_steps": 46, "loss": 0.3887, "lr": 6.873032967079562e-06, "epoch": 0.391304347826087, "percentage": 39.13, "elapsed_time": "0:02:00", "remaining_time": "0:03:08"} +{"current_steps": 19, "total_steps": 46, "loss": 0.6612, "lr": 6.545084971874738e-06, "epoch": 0.41304347826086957, "percentage": 41.3, "elapsed_time": "0:02:07", "remaining_time": "0:03:01"} +{"current_steps": 20, "total_steps": 46, "loss": 0.5101, "lr": 6.209609477998339e-06, "epoch": 0.43478260869565216, "percentage": 43.48, "elapsed_time": "0:02:14", "remaining_time": "0:02:54"} +{"current_steps": 20, "total_steps": 46, "eval_loss": 0.4761270582675934, "epoch": 0.43478260869565216, "percentage": 43.48, "elapsed_time": "0:02:14", 
"remaining_time": "0:02:55"} +{"current_steps": 21, "total_steps": 46, "loss": 0.4696, "lr": 5.8682408883346535e-06, "epoch": 0.45652173913043476, "percentage": 45.65, "elapsed_time": "0:02:21", "remaining_time": "0:02:48"} +{"current_steps": 22, "total_steps": 46, "loss": 0.4555, "lr": 5.522642316338268e-06, "epoch": 0.4782608695652174, "percentage": 47.83, "elapsed_time": "0:02:27", "remaining_time": "0:02:41"} +{"current_steps": 23, "total_steps": 46, "loss": 0.4064, "lr": 5.174497483512506e-06, "epoch": 0.5, "percentage": 50.0, "elapsed_time": "0:02:34", "remaining_time": "0:02:34"} +{"current_steps": 24, "total_steps": 46, "loss": 0.3378, "lr": 4.825502516487497e-06, "epoch": 0.5217391304347826, "percentage": 52.17, "elapsed_time": "0:02:41", "remaining_time": "0:02:27"} +{"current_steps": 25, "total_steps": 46, "loss": 0.4085, "lr": 4.477357683661734e-06, "epoch": 0.5434782608695652, "percentage": 54.35, "elapsed_time": "0:02:47", "remaining_time": "0:02:20"} +{"current_steps": 25, "total_steps": 46, "eval_loss": 0.46437764167785645, "epoch": 0.5434782608695652, "percentage": 54.35, "elapsed_time": "0:02:48", "remaining_time": "0:02:21"} +{"current_steps": 26, "total_steps": 46, "loss": 0.4565, "lr": 4.131759111665349e-06, "epoch": 0.5652173913043478, "percentage": 56.52, "elapsed_time": "0:02:55", "remaining_time": "0:02:14"} +{"current_steps": 27, "total_steps": 46, "loss": 0.613, "lr": 3.790390522001662e-06, "epoch": 0.5869565217391305, "percentage": 58.7, "elapsed_time": "0:03:01", "remaining_time": "0:02:07"} +{"current_steps": 28, "total_steps": 46, "loss": 0.4919, "lr": 3.4549150281252635e-06, "epoch": 0.6086956521739131, "percentage": 60.87, "elapsed_time": "0:03:08", "remaining_time": "0:02:00"} +{"current_steps": 29, "total_steps": 46, "loss": 0.4456, "lr": 3.12696703292044e-06, "epoch": 0.6304347826086957, "percentage": 63.04, "elapsed_time": "0:03:14", "remaining_time": "0:01:54"} +{"current_steps": 30, "total_steps": 46, "loss": 0.5561, "lr": 
2.8081442660546126e-06, "epoch": 0.6521739130434783, "percentage": 65.22, "elapsed_time": "0:03:21", "remaining_time": "0:01:47"} +{"current_steps": 30, "total_steps": 46, "eval_loss": 0.45777273178100586, "epoch": 0.6521739130434783, "percentage": 65.22, "elapsed_time": "0:03:21", "remaining_time": "0:01:47"} +{"current_steps": 31, "total_steps": 46, "loss": 0.4395, "lr": 2.5000000000000015e-06, "epoch": 0.6739130434782609, "percentage": 67.39, "elapsed_time": "0:03:28", "remaining_time": "0:01:40"} +{"current_steps": 32, "total_steps": 46, "loss": 0.4492, "lr": 2.204035482646267e-06, "epoch": 0.6956521739130435, "percentage": 69.57, "elapsed_time": "0:03:35", "remaining_time": "0:01:34"} +{"current_steps": 33, "total_steps": 46, "loss": 0.389, "lr": 1.9216926233717087e-06, "epoch": 0.717391304347826, "percentage": 71.74, "elapsed_time": "0:03:41", "remaining_time": "0:01:27"} +{"current_steps": 34, "total_steps": 46, "loss": 0.4336, "lr": 1.6543469682057105e-06, "epoch": 0.7391304347826086, "percentage": 73.91, "elapsed_time": "0:03:48", "remaining_time": "0:01:20"} +{"current_steps": 35, "total_steps": 46, "loss": 0.4683, "lr": 1.4033009983067454e-06, "epoch": 0.7608695652173914, "percentage": 76.09, "elapsed_time": "0:03:54", "remaining_time": "0:01:13"} +{"current_steps": 35, "total_steps": 46, "eval_loss": 0.45417019724845886, "epoch": 0.7608695652173914, "percentage": 76.09, "elapsed_time": "0:03:55", "remaining_time": "0:01:14"} +{"current_steps": 36, "total_steps": 46, "loss": 0.4277, "lr": 1.1697777844051105e-06, "epoch": 0.782608695652174, "percentage": 78.26, "elapsed_time": "0:04:01", "remaining_time": "0:01:07"} +{"current_steps": 37, "total_steps": 46, "loss": 0.4057, "lr": 9.549150281252633e-07, "epoch": 0.8043478260869565, "percentage": 80.43, "elapsed_time": "0:04:08", "remaining_time": "0:01:00"} +{"current_steps": 38, "total_steps": 46, "loss": 0.5928, "lr": 7.597595192178702e-07, "epoch": 0.8260869565217391, "percentage": 82.61, "elapsed_time": 
"0:04:14", "remaining_time": "0:00:53"} +{"current_steps": 39, "total_steps": 46, "loss": 0.5955, "lr": 5.852620357053651e-07, "epoch": 0.8478260869565217, "percentage": 84.78, "elapsed_time": "0:04:21", "remaining_time": "0:00:46"} +{"current_steps": 40, "total_steps": 46, "loss": 0.5055, "lr": 4.322727117869951e-07, "epoch": 0.8695652173913043, "percentage": 86.96, "elapsed_time": "0:04:28", "remaining_time": "0:00:40"} +{"current_steps": 40, "total_steps": 46, "eval_loss": 0.452594518661499, "epoch": 0.8695652173913043, "percentage": 86.96, "elapsed_time": "0:04:28", "remaining_time": "0:00:40"} +{"current_steps": 41, "total_steps": 46, "loss": 0.4106, "lr": 3.015368960704584e-07, "epoch": 0.8913043478260869, "percentage": 89.13, "elapsed_time": "0:04:35", "remaining_time": "0:00:33"} +{"current_steps": 42, "total_steps": 46, "loss": 0.4183, "lr": 1.9369152030840553e-07, "epoch": 0.9130434782608695, "percentage": 91.3, "elapsed_time": "0:04:41", "remaining_time": "0:00:26"} +{"current_steps": 43, "total_steps": 46, "loss": 0.4416, "lr": 1.0926199633097156e-07, "epoch": 0.9347826086956522, "percentage": 93.48, "elapsed_time": "0:04:48", "remaining_time": "0:00:20"} +{"current_steps": 44, "total_steps": 46, "loss": 0.4899, "lr": 4.865965629214819e-08, "epoch": 0.9565217391304348, "percentage": 95.65, "elapsed_time": "0:04:54", "remaining_time": "0:00:13"} +{"current_steps": 45, "total_steps": 46, "loss": 0.5359, "lr": 1.2179748700879013e-08, "epoch": 0.9782608695652174, "percentage": 97.83, "elapsed_time": "0:05:01", "remaining_time": "0:00:06"} +{"current_steps": 45, "total_steps": 46, "eval_loss": 0.45189881324768066, "epoch": 0.9782608695652174, "percentage": 97.83, "elapsed_time": "0:05:02", "remaining_time": "0:00:06"} +{"current_steps": 46, "total_steps": 46, "loss": 0.4124, "lr": 0.0, "epoch": 1.0, "percentage": 100.0, "elapsed_time": "0:05:08", "remaining_time": "0:00:00"} +{"current_steps": 46, "total_steps": 46, "epoch": 1.0, "percentage": 100.0, 
"elapsed_time": "0:06:00", "remaining_time": "0:00:00"} diff --git a/trainer_state.json b/trainer_state.json new file mode 100644 index 0000000..391232e --- /dev/null +++ b/trainer_state.json @@ -0,0 +1,436 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 5, + "global_step": 46, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.021739130434782608, + "grad_norm": 2.5425055027008057, + "learning_rate": 1e-05, + "loss": 0.706, + "step": 1 + }, + { + "epoch": 0.043478260869565216, + "grad_norm": 2.3038032054901123, + "learning_rate": 9.987820251299121e-06, + "loss": 0.643, + "step": 2 + }, + { + "epoch": 0.06521739130434782, + "grad_norm": 2.222012996673584, + "learning_rate": 9.951340343707852e-06, + "loss": 0.722, + "step": 3 + }, + { + "epoch": 0.08695652173913043, + "grad_norm": 1.4986803531646729, + "learning_rate": 9.890738003669029e-06, + "loss": 0.6543, + "step": 4 + }, + { + "epoch": 0.10869565217391304, + "grad_norm": 1.5336999893188477, + "learning_rate": 9.806308479691595e-06, + "loss": 0.766, + "step": 5 + }, + { + "epoch": 0.10869565217391304, + "eval_loss": 0.5912319421768188, + "eval_runtime": 0.8443, + "eval_samples_per_second": 4.737, + "eval_steps_per_second": 1.184, + "step": 5 + }, + { + "epoch": 0.13043478260869565, + "grad_norm": 1.4662328958511353, + "learning_rate": 9.698463103929542e-06, + "loss": 0.5495, + "step": 6 + }, + { + "epoch": 0.15217391304347827, + "grad_norm": 1.3746005296707153, + "learning_rate": 9.567727288213005e-06, + "loss": 0.5193, + "step": 7 + }, + { + "epoch": 0.17391304347826086, + "grad_norm": 1.4526853561401367, + "learning_rate": 9.414737964294636e-06, + "loss": 0.5578, + "step": 8 + }, + { + "epoch": 0.1956521739130435, + "grad_norm": 0.8964347243309021, + "learning_rate": 9.24024048078213e-06, + "loss": 0.3643, + "step": 9 + }, + { + "epoch": 0.21739130434782608, + "grad_norm": 
1.2126158475875854, + "learning_rate": 9.045084971874738e-06, + "loss": 0.5873, + "step": 10 + }, + { + "epoch": 0.21739130434782608, + "eval_loss": 0.5282274484634399, + "eval_runtime": 0.843, + "eval_samples_per_second": 4.745, + "eval_steps_per_second": 1.186, + "step": 10 + }, + { + "epoch": 0.2391304347826087, + "grad_norm": 1.2183283567428589, + "learning_rate": 8.83022221559489e-06, + "loss": 0.6398, + "step": 11 + }, + { + "epoch": 0.2608695652173913, + "grad_norm": 0.9250560402870178, + "learning_rate": 8.596699001693257e-06, + "loss": 0.4296, + "step": 12 + }, + { + "epoch": 0.2826086956521739, + "grad_norm": 1.0050208568572998, + "learning_rate": 8.345653031794292e-06, + "loss": 0.5244, + "step": 13 + }, + { + "epoch": 0.30434782608695654, + "grad_norm": 1.0231624841690063, + "learning_rate": 8.078307376628292e-06, + "loss": 0.4739, + "step": 14 + }, + { + "epoch": 0.32608695652173914, + "grad_norm": 0.8328154683113098, + "learning_rate": 7.795964517353734e-06, + "loss": 0.3868, + "step": 15 + }, + { + "epoch": 0.32608695652173914, + "eval_loss": 0.49576932191848755, + "eval_runtime": 0.8436, + "eval_samples_per_second": 4.742, + "eval_steps_per_second": 1.185, + "step": 15 + }, + { + "epoch": 0.34782608695652173, + "grad_norm": 1.0239394903182983, + "learning_rate": 7.500000000000001e-06, + "loss": 0.5849, + "step": 16 + }, + { + "epoch": 0.3695652173913043, + "grad_norm": 0.9168555736541748, + "learning_rate": 7.191855733945388e-06, + "loss": 0.4854, + "step": 17 + }, + { + "epoch": 0.391304347826087, + "grad_norm": 0.9247157573699951, + "learning_rate": 6.873032967079562e-06, + "loss": 0.3887, + "step": 18 + }, + { + "epoch": 0.41304347826086957, + "grad_norm": 1.1805756092071533, + "learning_rate": 6.545084971874738e-06, + "loss": 0.6612, + "step": 19 + }, + { + "epoch": 0.43478260869565216, + "grad_norm": 0.9535987377166748, + "learning_rate": 6.209609477998339e-06, + "loss": 0.5101, + "step": 20 + }, + { + "epoch": 0.43478260869565216, + 
"eval_loss": 0.4761270582675934, + "eval_runtime": 0.8437, + "eval_samples_per_second": 4.741, + "eval_steps_per_second": 1.185, + "step": 20 + }, + { + "epoch": 0.45652173913043476, + "grad_norm": 0.821264922618866, + "learning_rate": 5.8682408883346535e-06, + "loss": 0.4696, + "step": 21 + }, + { + "epoch": 0.4782608695652174, + "grad_norm": 0.9166697263717651, + "learning_rate": 5.522642316338268e-06, + "loss": 0.4555, + "step": 22 + }, + { + "epoch": 0.5, + "grad_norm": 0.7674450278282166, + "learning_rate": 5.174497483512506e-06, + "loss": 0.4064, + "step": 23 + }, + { + "epoch": 0.5217391304347826, + "grad_norm": 0.7370434999465942, + "learning_rate": 4.825502516487497e-06, + "loss": 0.3378, + "step": 24 + }, + { + "epoch": 0.5434782608695652, + "grad_norm": 0.8126187324523926, + "learning_rate": 4.477357683661734e-06, + "loss": 0.4085, + "step": 25 + }, + { + "epoch": 0.5434782608695652, + "eval_loss": 0.46437764167785645, + "eval_runtime": 0.8435, + "eval_samples_per_second": 4.742, + "eval_steps_per_second": 1.186, + "step": 25 + }, + { + "epoch": 0.5652173913043478, + "grad_norm": 0.8102895021438599, + "learning_rate": 4.131759111665349e-06, + "loss": 0.4565, + "step": 26 + }, + { + "epoch": 0.5869565217391305, + "grad_norm": 1.0004063844680786, + "learning_rate": 3.790390522001662e-06, + "loss": 0.613, + "step": 27 + }, + { + "epoch": 0.6086956521739131, + "grad_norm": 0.8794491291046143, + "learning_rate": 3.4549150281252635e-06, + "loss": 0.4919, + "step": 28 + }, + { + "epoch": 0.6304347826086957, + "grad_norm": 0.7904605865478516, + "learning_rate": 3.12696703292044e-06, + "loss": 0.4456, + "step": 29 + }, + { + "epoch": 0.6521739130434783, + "grad_norm": 0.8982527256011963, + "learning_rate": 2.8081442660546126e-06, + "loss": 0.5561, + "step": 30 + }, + { + "epoch": 0.6521739130434783, + "eval_loss": 0.45777273178100586, + "eval_runtime": 0.8438, + "eval_samples_per_second": 4.741, + "eval_steps_per_second": 1.185, + "step": 30 + }, + { + "epoch": 
0.6739130434782609, + "grad_norm": 0.7891985774040222, + "learning_rate": 2.5000000000000015e-06, + "loss": 0.4395, + "step": 31 + }, + { + "epoch": 0.6956521739130435, + "grad_norm": 0.7818904519081116, + "learning_rate": 2.204035482646267e-06, + "loss": 0.4492, + "step": 32 + }, + { + "epoch": 0.717391304347826, + "grad_norm": 0.8261966705322266, + "learning_rate": 1.9216926233717087e-06, + "loss": 0.389, + "step": 33 + }, + { + "epoch": 0.7391304347826086, + "grad_norm": 0.8626196384429932, + "learning_rate": 1.6543469682057105e-06, + "loss": 0.4336, + "step": 34 + }, + { + "epoch": 0.7608695652173914, + "grad_norm": 0.8142871856689453, + "learning_rate": 1.4033009983067454e-06, + "loss": 0.4683, + "step": 35 + }, + { + "epoch": 0.7608695652173914, + "eval_loss": 0.45417019724845886, + "eval_runtime": 0.8436, + "eval_samples_per_second": 4.742, + "eval_steps_per_second": 1.185, + "step": 35 + }, + { + "epoch": 0.782608695652174, + "grad_norm": 0.8224750757217407, + "learning_rate": 1.1697777844051105e-06, + "loss": 0.4277, + "step": 36 + }, + { + "epoch": 0.8043478260869565, + "grad_norm": 1.101127028465271, + "learning_rate": 9.549150281252633e-07, + "loss": 0.4057, + "step": 37 + }, + { + "epoch": 0.8260869565217391, + "grad_norm": 0.9483347535133362, + "learning_rate": 7.597595192178702e-07, + "loss": 0.5928, + "step": 38 + }, + { + "epoch": 0.8478260869565217, + "grad_norm": 0.9000388383865356, + "learning_rate": 5.852620357053651e-07, + "loss": 0.5955, + "step": 39 + }, + { + "epoch": 0.8695652173913043, + "grad_norm": 0.8246148228645325, + "learning_rate": 4.322727117869951e-07, + "loss": 0.5055, + "step": 40 + }, + { + "epoch": 0.8695652173913043, + "eval_loss": 0.452594518661499, + "eval_runtime": 0.8425, + "eval_samples_per_second": 4.748, + "eval_steps_per_second": 1.187, + "step": 40 + }, + { + "epoch": 0.8913043478260869, + "grad_norm": 0.7705091834068298, + "learning_rate": 3.015368960704584e-07, + "loss": 0.4106, + "step": 41 + }, + { + "epoch": 
0.9130434782608695, + "grad_norm": 0.7705535888671875, + "learning_rate": 1.9369152030840553e-07, + "loss": 0.4183, + "step": 42 + }, + { + "epoch": 0.9347826086956522, + "grad_norm": 0.78244948387146, + "learning_rate": 1.0926199633097156e-07, + "loss": 0.4416, + "step": 43 + }, + { + "epoch": 0.9565217391304348, + "grad_norm": 0.8442041277885437, + "learning_rate": 4.865965629214819e-08, + "loss": 0.4899, + "step": 44 + }, + { + "epoch": 0.9782608695652174, + "grad_norm": 0.8914652466773987, + "learning_rate": 1.2179748700879013e-08, + "loss": 0.5359, + "step": 45 + }, + { + "epoch": 0.9782608695652174, + "eval_loss": 0.45189881324768066, + "eval_runtime": 0.843, + "eval_samples_per_second": 4.745, + "eval_steps_per_second": 1.186, + "step": 45 + }, + { + "epoch": 1.0, + "grad_norm": 0.797126829624176, + "learning_rate": 0.0, + "loss": 0.4124, + "step": 46 + }, + { + "epoch": 1.0, + "step": 46, + "total_flos": 7.03161217014825e+16, + "train_loss": 0.50457603516786, + "train_runtime": 362.0803, + "train_samples_per_second": 1.016, + "train_steps_per_second": 0.127 + } + ], + "logging_steps": 1, + "max_steps": 46, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 46, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 7.03161217014825e+16, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/training_args.bin b/training_args.bin new file mode 100644 index 0000000..3e46226 --- /dev/null +++ b/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6eae43ea012c8287b733903bf0f33856cc5c6ca5cda0bb5ef3c74282b6e3316e +size 7160 diff --git a/training_eval_loss.png b/training_eval_loss.png new file mode 100644 index 0000000..0490388 Binary files /dev/null and b/training_eval_loss.png differ diff 
--git a/training_loss.png b/training_loss.png new file mode 100644 index 0000000..e3667b6 Binary files /dev/null and b/training_loss.png differ