commit 185a6c73efca0524ecc204d95310021200fda20b Author: ModelHub XC Date: Tue May 12 21:17:24 2026 +0800 初始化项目,由ModelHub XC社区提供模型 Model: kevinpro/Vicuna-13B-CoT Source: Original Platform diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 0000000..c7d9f33 --- /dev/null +++ b/.gitattributes @@ -0,0 +1,34 @@ +*.7z filter=lfs diff=lfs merge=lfs -text +*.arrow filter=lfs diff=lfs merge=lfs -text +*.bin filter=lfs diff=lfs merge=lfs -text +*.bz2 filter=lfs diff=lfs merge=lfs -text +*.ckpt filter=lfs diff=lfs merge=lfs -text +*.ftz filter=lfs diff=lfs merge=lfs -text +*.gz filter=lfs diff=lfs merge=lfs -text +*.h5 filter=lfs diff=lfs merge=lfs -text +*.joblib filter=lfs diff=lfs merge=lfs -text +*.lfs.* filter=lfs diff=lfs merge=lfs -text +*.mlmodel filter=lfs diff=lfs merge=lfs -text +*.model filter=lfs diff=lfs merge=lfs -text +*.msgpack filter=lfs diff=lfs merge=lfs -text +*.npy filter=lfs diff=lfs merge=lfs -text +*.npz filter=lfs diff=lfs merge=lfs -text +*.onnx filter=lfs diff=lfs merge=lfs -text +*.ot filter=lfs diff=lfs merge=lfs -text +*.parquet filter=lfs diff=lfs merge=lfs -text +*.pb filter=lfs diff=lfs merge=lfs -text +*.pickle filter=lfs diff=lfs merge=lfs -text +*.pkl filter=lfs diff=lfs merge=lfs -text +*.pt filter=lfs diff=lfs merge=lfs -text +*.pth filter=lfs diff=lfs merge=lfs -text +*.rar filter=lfs diff=lfs merge=lfs -text +*.safetensors filter=lfs diff=lfs merge=lfs -text +saved_model/**/* filter=lfs diff=lfs merge=lfs -text +*.tar.* filter=lfs diff=lfs merge=lfs -text +*.tflite filter=lfs diff=lfs merge=lfs -text +*.tgz filter=lfs diff=lfs merge=lfs -text +*.wasm filter=lfs diff=lfs merge=lfs -text +*.xz filter=lfs diff=lfs merge=lfs -text +*.zip filter=lfs diff=lfs merge=lfs -text +*.zst filter=lfs diff=lfs merge=lfs -text +*tfevents* filter=lfs diff=lfs merge=lfs -text diff --git a/README.md b/README.md new file mode 100644 index 0000000..b5d90f6 --- /dev/null +++ b/README.md @@ -0,0 +1,208 @@ +--- +datasets: +- 
QingyiSi/Alpaca-CoT +language: +- en +library_name: transformers +pipeline_tag: text-generation +tags: +- code +--- +# Model Card for Model ID +SFT to enhance the CoT capability of Vicuna + +If you find the model helpful, please click "like" to support us. +We also welcome feedback on your usage experience and any issues you encounter in the issues section. + +Another 7B version: https://huggingface.co/kevinpro/Vicuna-7B-CoT + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. 
+ +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] + + diff --git a/config.json b/config.json new file mode 100644 index 0000000..68942ac --- /dev/null +++ b/config.json @@ -0,0 +1,24 @@ +{ + "_name_or_path": 
"/mnt/data1/sheshuaijie/Code/Alpaca-CoT/mnt/data1/sheshuaijie/Output/CoT/Trained/vicuna-13b_english-cot+auto-cot_0.0002/merged", + "architectures": [ + "LlamaForCausalLM" + ], + "bos_token_id": 0, + "eos_token_id": 1, + "hidden_act": "silu", + "hidden_size": 5120, + "initializer_range": 0.02, + "intermediate_size": 13824, + "max_position_embeddings": 2048, + "max_sequence_length": 2048, + "model_type": "llama", + "num_attention_heads": 40, + "num_hidden_layers": 40, + "pad_token_id": -1, + "rms_norm_eps": 1e-06, + "tie_word_embeddings": false, + "torch_dtype": "float32", + "transformers_version": "4.29.2", + "use_cache": true, + "vocab_size": 32000 +} diff --git a/generation_config.json b/generation_config.json new file mode 100644 index 0000000..517f415 --- /dev/null +++ b/generation_config.json @@ -0,0 +1,7 @@ +{ + "_from_model_config": true, + "bos_token_id": 0, + "eos_token_id": 1, + "pad_token_id": 0, + "transformers_version": "4.29.2" +} diff --git a/pytorch_model-00001-of-00006.bin b/pytorch_model-00001-of-00006.bin new file mode 100644 index 0000000..bf01295 --- /dev/null +++ b/pytorch_model-00001-of-00006.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5362a059f344e14022276b27e7bc0bc1a6bb14f6bf26afdabf526bffce54cfd0 +size 9956543883 diff --git a/pytorch_model-00002-of-00006.bin b/pytorch_model-00002-of-00006.bin new file mode 100644 index 0000000..f91e502 --- /dev/null +++ b/pytorch_model-00002-of-00006.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:acea5d185b57e323f3b0d97ce81ec78fbc602b945a76230e80260c4fa8413d9b +size 9940856385 diff --git a/pytorch_model-00003-of-00006.bin b/pytorch_model-00003-of-00006.bin new file mode 100644 index 0000000..a3018b8 --- /dev/null +++ b/pytorch_model-00003-of-00006.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7f6a894e848279b57f9d7a14fcdca8e02c9f14bd39324a9a6ab6c708ddf8312a +size 9940856943 diff --git a/pytorch_model-00004-of-00006.bin 
b/pytorch_model-00004-of-00006.bin new file mode 100644 index 0000000..8d9f3ab --- /dev/null +++ b/pytorch_model-00004-of-00006.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:12aade76bb9bb9e2c98e3ccdc1dcfd4571b57a15afc077a5bdfb13aea1c0598b +size 9867415289 diff --git a/pytorch_model-00005-of-00006.bin b/pytorch_model-00005-of-00006.bin new file mode 100644 index 0000000..18bde09 --- /dev/null +++ b/pytorch_model-00005-of-00006.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1f984f23a73ebb6f10fc7b35afc9f87925d0a821dcf9f14eddf386738635a07e +size 9867456961 diff --git a/pytorch_model-00006-of-00006.bin b/pytorch_model-00006-of-00006.bin new file mode 100644 index 0000000..2af3fbc --- /dev/null +++ b/pytorch_model-00006-of-00006.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:226eee221f360c205ed4918d5ba0705048c635aea978f056d5acaf6123dec0e3 +size 2490476207 diff --git a/pytorch_model.bin.index.json b/pytorch_model.bin.index.json new file mode 100644 index 0000000..4e32c74 --- /dev/null +++ b/pytorch_model.bin.index.json @@ -0,0 +1,410 @@ +{ + "metadata": { + "total_size": 52063467520 + }, + "weight_map": { + "lm_head.weight": "pytorch_model-00006-of-00006.bin", + "model.embed_tokens.weight": "pytorch_model-00001-of-00006.bin", + "model.layers.0.input_layernorm.weight": "pytorch_model-00001-of-00006.bin", + "model.layers.0.mlp.down_proj.weight": "pytorch_model-00001-of-00006.bin", + "model.layers.0.mlp.gate_proj.weight": "pytorch_model-00001-of-00006.bin", + "model.layers.0.mlp.up_proj.weight": "pytorch_model-00001-of-00006.bin", + "model.layers.0.post_attention_layernorm.weight": "pytorch_model-00001-of-00006.bin", + "model.layers.0.self_attn.k_proj.weight": "pytorch_model-00001-of-00006.bin", + "model.layers.0.self_attn.o_proj.weight": "pytorch_model-00001-of-00006.bin", + "model.layers.0.self_attn.q_proj.weight": "pytorch_model-00001-of-00006.bin", + 
"model.layers.0.self_attn.rotary_emb.inv_freq": "pytorch_model-00001-of-00006.bin", + "model.layers.0.self_attn.v_proj.weight": "pytorch_model-00001-of-00006.bin", + "model.layers.1.input_layernorm.weight": "pytorch_model-00001-of-00006.bin", + "model.layers.1.mlp.down_proj.weight": "pytorch_model-00001-of-00006.bin", + "model.layers.1.mlp.gate_proj.weight": "pytorch_model-00001-of-00006.bin", + "model.layers.1.mlp.up_proj.weight": "pytorch_model-00001-of-00006.bin", + "model.layers.1.post_attention_layernorm.weight": "pytorch_model-00001-of-00006.bin", + "model.layers.1.self_attn.k_proj.weight": "pytorch_model-00001-of-00006.bin", + "model.layers.1.self_attn.o_proj.weight": "pytorch_model-00001-of-00006.bin", + "model.layers.1.self_attn.q_proj.weight": "pytorch_model-00001-of-00006.bin", + "model.layers.1.self_attn.rotary_emb.inv_freq": "pytorch_model-00001-of-00006.bin", + "model.layers.1.self_attn.v_proj.weight": "pytorch_model-00001-of-00006.bin", + "model.layers.10.input_layernorm.weight": "pytorch_model-00002-of-00006.bin", + "model.layers.10.mlp.down_proj.weight": "pytorch_model-00002-of-00006.bin", + "model.layers.10.mlp.gate_proj.weight": "pytorch_model-00002-of-00006.bin", + "model.layers.10.mlp.up_proj.weight": "pytorch_model-00002-of-00006.bin", + "model.layers.10.post_attention_layernorm.weight": "pytorch_model-00002-of-00006.bin", + "model.layers.10.self_attn.k_proj.weight": "pytorch_model-00002-of-00006.bin", + "model.layers.10.self_attn.o_proj.weight": "pytorch_model-00002-of-00006.bin", + "model.layers.10.self_attn.q_proj.weight": "pytorch_model-00002-of-00006.bin", + "model.layers.10.self_attn.rotary_emb.inv_freq": "pytorch_model-00002-of-00006.bin", + "model.layers.10.self_attn.v_proj.weight": "pytorch_model-00002-of-00006.bin", + "model.layers.11.input_layernorm.weight": "pytorch_model-00002-of-00006.bin", + "model.layers.11.mlp.down_proj.weight": "pytorch_model-00002-of-00006.bin", + "model.layers.11.mlp.gate_proj.weight": 
"pytorch_model-00002-of-00006.bin", + "model.layers.11.mlp.up_proj.weight": "pytorch_model-00002-of-00006.bin", + "model.layers.11.post_attention_layernorm.weight": "pytorch_model-00002-of-00006.bin", + "model.layers.11.self_attn.k_proj.weight": "pytorch_model-00002-of-00006.bin", + "model.layers.11.self_attn.o_proj.weight": "pytorch_model-00002-of-00006.bin", + "model.layers.11.self_attn.q_proj.weight": "pytorch_model-00002-of-00006.bin", + "model.layers.11.self_attn.rotary_emb.inv_freq": "pytorch_model-00002-of-00006.bin", + "model.layers.11.self_attn.v_proj.weight": "pytorch_model-00002-of-00006.bin", + "model.layers.12.input_layernorm.weight": "pytorch_model-00002-of-00006.bin", + "model.layers.12.mlp.down_proj.weight": "pytorch_model-00002-of-00006.bin", + "model.layers.12.mlp.gate_proj.weight": "pytorch_model-00002-of-00006.bin", + "model.layers.12.mlp.up_proj.weight": "pytorch_model-00002-of-00006.bin", + "model.layers.12.post_attention_layernorm.weight": "pytorch_model-00002-of-00006.bin", + "model.layers.12.self_attn.k_proj.weight": "pytorch_model-00002-of-00006.bin", + "model.layers.12.self_attn.o_proj.weight": "pytorch_model-00002-of-00006.bin", + "model.layers.12.self_attn.q_proj.weight": "pytorch_model-00002-of-00006.bin", + "model.layers.12.self_attn.rotary_emb.inv_freq": "pytorch_model-00002-of-00006.bin", + "model.layers.12.self_attn.v_proj.weight": "pytorch_model-00002-of-00006.bin", + "model.layers.13.input_layernorm.weight": "pytorch_model-00002-of-00006.bin", + "model.layers.13.mlp.down_proj.weight": "pytorch_model-00002-of-00006.bin", + "model.layers.13.mlp.gate_proj.weight": "pytorch_model-00002-of-00006.bin", + "model.layers.13.mlp.up_proj.weight": "pytorch_model-00002-of-00006.bin", + "model.layers.13.post_attention_layernorm.weight": "pytorch_model-00002-of-00006.bin", + "model.layers.13.self_attn.k_proj.weight": "pytorch_model-00002-of-00006.bin", + "model.layers.13.self_attn.o_proj.weight": "pytorch_model-00002-of-00006.bin", + 
"model.layers.13.self_attn.q_proj.weight": "pytorch_model-00002-of-00006.bin", + "model.layers.13.self_attn.rotary_emb.inv_freq": "pytorch_model-00002-of-00006.bin", + "model.layers.13.self_attn.v_proj.weight": "pytorch_model-00002-of-00006.bin", + "model.layers.14.input_layernorm.weight": "pytorch_model-00002-of-00006.bin", + "model.layers.14.mlp.down_proj.weight": "pytorch_model-00002-of-00006.bin", + "model.layers.14.mlp.gate_proj.weight": "pytorch_model-00002-of-00006.bin", + "model.layers.14.mlp.up_proj.weight": "pytorch_model-00002-of-00006.bin", + "model.layers.14.post_attention_layernorm.weight": "pytorch_model-00002-of-00006.bin", + "model.layers.14.self_attn.k_proj.weight": "pytorch_model-00002-of-00006.bin", + "model.layers.14.self_attn.o_proj.weight": "pytorch_model-00002-of-00006.bin", + "model.layers.14.self_attn.q_proj.weight": "pytorch_model-00002-of-00006.bin", + "model.layers.14.self_attn.rotary_emb.inv_freq": "pytorch_model-00002-of-00006.bin", + "model.layers.14.self_attn.v_proj.weight": "pytorch_model-00002-of-00006.bin", + "model.layers.15.input_layernorm.weight": "pytorch_model-00003-of-00006.bin", + "model.layers.15.mlp.down_proj.weight": "pytorch_model-00003-of-00006.bin", + "model.layers.15.mlp.gate_proj.weight": "pytorch_model-00003-of-00006.bin", + "model.layers.15.mlp.up_proj.weight": "pytorch_model-00003-of-00006.bin", + "model.layers.15.post_attention_layernorm.weight": "pytorch_model-00003-of-00006.bin", + "model.layers.15.self_attn.k_proj.weight": "pytorch_model-00002-of-00006.bin", + "model.layers.15.self_attn.o_proj.weight": "pytorch_model-00003-of-00006.bin", + "model.layers.15.self_attn.q_proj.weight": "pytorch_model-00002-of-00006.bin", + "model.layers.15.self_attn.rotary_emb.inv_freq": "pytorch_model-00003-of-00006.bin", + "model.layers.15.self_attn.v_proj.weight": "pytorch_model-00003-of-00006.bin", + "model.layers.16.input_layernorm.weight": "pytorch_model-00003-of-00006.bin", + "model.layers.16.mlp.down_proj.weight": 
"pytorch_model-00003-of-00006.bin", + "model.layers.16.mlp.gate_proj.weight": "pytorch_model-00003-of-00006.bin", + "model.layers.16.mlp.up_proj.weight": "pytorch_model-00003-of-00006.bin", + "model.layers.16.post_attention_layernorm.weight": "pytorch_model-00003-of-00006.bin", + "model.layers.16.self_attn.k_proj.weight": "pytorch_model-00003-of-00006.bin", + "model.layers.16.self_attn.o_proj.weight": "pytorch_model-00003-of-00006.bin", + "model.layers.16.self_attn.q_proj.weight": "pytorch_model-00003-of-00006.bin", + "model.layers.16.self_attn.rotary_emb.inv_freq": "pytorch_model-00003-of-00006.bin", + "model.layers.16.self_attn.v_proj.weight": "pytorch_model-00003-of-00006.bin", + "model.layers.17.input_layernorm.weight": "pytorch_model-00003-of-00006.bin", + "model.layers.17.mlp.down_proj.weight": "pytorch_model-00003-of-00006.bin", + "model.layers.17.mlp.gate_proj.weight": "pytorch_model-00003-of-00006.bin", + "model.layers.17.mlp.up_proj.weight": "pytorch_model-00003-of-00006.bin", + "model.layers.17.post_attention_layernorm.weight": "pytorch_model-00003-of-00006.bin", + "model.layers.17.self_attn.k_proj.weight": "pytorch_model-00003-of-00006.bin", + "model.layers.17.self_attn.o_proj.weight": "pytorch_model-00003-of-00006.bin", + "model.layers.17.self_attn.q_proj.weight": "pytorch_model-00003-of-00006.bin", + "model.layers.17.self_attn.rotary_emb.inv_freq": "pytorch_model-00003-of-00006.bin", + "model.layers.17.self_attn.v_proj.weight": "pytorch_model-00003-of-00006.bin", + "model.layers.18.input_layernorm.weight": "pytorch_model-00003-of-00006.bin", + "model.layers.18.mlp.down_proj.weight": "pytorch_model-00003-of-00006.bin", + "model.layers.18.mlp.gate_proj.weight": "pytorch_model-00003-of-00006.bin", + "model.layers.18.mlp.up_proj.weight": "pytorch_model-00003-of-00006.bin", + "model.layers.18.post_attention_layernorm.weight": "pytorch_model-00003-of-00006.bin", + "model.layers.18.self_attn.k_proj.weight": "pytorch_model-00003-of-00006.bin", + 
"model.layers.18.self_attn.o_proj.weight": "pytorch_model-00003-of-00006.bin", + "model.layers.18.self_attn.q_proj.weight": "pytorch_model-00003-of-00006.bin", + "model.layers.18.self_attn.rotary_emb.inv_freq": "pytorch_model-00003-of-00006.bin", + "model.layers.18.self_attn.v_proj.weight": "pytorch_model-00003-of-00006.bin", + "model.layers.19.input_layernorm.weight": "pytorch_model-00003-of-00006.bin", + "model.layers.19.mlp.down_proj.weight": "pytorch_model-00003-of-00006.bin", + "model.layers.19.mlp.gate_proj.weight": "pytorch_model-00003-of-00006.bin", + "model.layers.19.mlp.up_proj.weight": "pytorch_model-00003-of-00006.bin", + "model.layers.19.post_attention_layernorm.weight": "pytorch_model-00003-of-00006.bin", + "model.layers.19.self_attn.k_proj.weight": "pytorch_model-00003-of-00006.bin", + "model.layers.19.self_attn.o_proj.weight": "pytorch_model-00003-of-00006.bin", + "model.layers.19.self_attn.q_proj.weight": "pytorch_model-00003-of-00006.bin", + "model.layers.19.self_attn.rotary_emb.inv_freq": "pytorch_model-00003-of-00006.bin", + "model.layers.19.self_attn.v_proj.weight": "pytorch_model-00003-of-00006.bin", + "model.layers.2.input_layernorm.weight": "pytorch_model-00001-of-00006.bin", + "model.layers.2.mlp.down_proj.weight": "pytorch_model-00001-of-00006.bin", + "model.layers.2.mlp.gate_proj.weight": "pytorch_model-00001-of-00006.bin", + "model.layers.2.mlp.up_proj.weight": "pytorch_model-00001-of-00006.bin", + "model.layers.2.post_attention_layernorm.weight": "pytorch_model-00001-of-00006.bin", + "model.layers.2.self_attn.k_proj.weight": "pytorch_model-00001-of-00006.bin", + "model.layers.2.self_attn.o_proj.weight": "pytorch_model-00001-of-00006.bin", + "model.layers.2.self_attn.q_proj.weight": "pytorch_model-00001-of-00006.bin", + "model.layers.2.self_attn.rotary_emb.inv_freq": "pytorch_model-00001-of-00006.bin", + "model.layers.2.self_attn.v_proj.weight": "pytorch_model-00001-of-00006.bin", + "model.layers.20.input_layernorm.weight": 
"pytorch_model-00003-of-00006.bin", + "model.layers.20.mlp.down_proj.weight": "pytorch_model-00003-of-00006.bin", + "model.layers.20.mlp.gate_proj.weight": "pytorch_model-00003-of-00006.bin", + "model.layers.20.mlp.up_proj.weight": "pytorch_model-00003-of-00006.bin", + "model.layers.20.post_attention_layernorm.weight": "pytorch_model-00003-of-00006.bin", + "model.layers.20.self_attn.k_proj.weight": "pytorch_model-00003-of-00006.bin", + "model.layers.20.self_attn.o_proj.weight": "pytorch_model-00003-of-00006.bin", + "model.layers.20.self_attn.q_proj.weight": "pytorch_model-00003-of-00006.bin", + "model.layers.20.self_attn.rotary_emb.inv_freq": "pytorch_model-00003-of-00006.bin", + "model.layers.20.self_attn.v_proj.weight": "pytorch_model-00003-of-00006.bin", + "model.layers.21.input_layernorm.weight": "pytorch_model-00003-of-00006.bin", + "model.layers.21.mlp.down_proj.weight": "pytorch_model-00003-of-00006.bin", + "model.layers.21.mlp.gate_proj.weight": "pytorch_model-00003-of-00006.bin", + "model.layers.21.mlp.up_proj.weight": "pytorch_model-00003-of-00006.bin", + "model.layers.21.post_attention_layernorm.weight": "pytorch_model-00003-of-00006.bin", + "model.layers.21.self_attn.k_proj.weight": "pytorch_model-00003-of-00006.bin", + "model.layers.21.self_attn.o_proj.weight": "pytorch_model-00003-of-00006.bin", + "model.layers.21.self_attn.q_proj.weight": "pytorch_model-00003-of-00006.bin", + "model.layers.21.self_attn.rotary_emb.inv_freq": "pytorch_model-00003-of-00006.bin", + "model.layers.21.self_attn.v_proj.weight": "pytorch_model-00003-of-00006.bin", + "model.layers.22.input_layernorm.weight": "pytorch_model-00003-of-00006.bin", + "model.layers.22.mlp.down_proj.weight": "pytorch_model-00003-of-00006.bin", + "model.layers.22.mlp.gate_proj.weight": "pytorch_model-00003-of-00006.bin", + "model.layers.22.mlp.up_proj.weight": "pytorch_model-00003-of-00006.bin", + "model.layers.22.post_attention_layernorm.weight": "pytorch_model-00003-of-00006.bin", + 
"model.layers.22.self_attn.k_proj.weight": "pytorch_model-00003-of-00006.bin", + "model.layers.22.self_attn.o_proj.weight": "pytorch_model-00003-of-00006.bin", + "model.layers.22.self_attn.q_proj.weight": "pytorch_model-00003-of-00006.bin", + "model.layers.22.self_attn.rotary_emb.inv_freq": "pytorch_model-00003-of-00006.bin", + "model.layers.22.self_attn.v_proj.weight": "pytorch_model-00003-of-00006.bin", + "model.layers.23.input_layernorm.weight": "pytorch_model-00004-of-00006.bin", + "model.layers.23.mlp.down_proj.weight": "pytorch_model-00004-of-00006.bin", + "model.layers.23.mlp.gate_proj.weight": "pytorch_model-00004-of-00006.bin", + "model.layers.23.mlp.up_proj.weight": "pytorch_model-00004-of-00006.bin", + "model.layers.23.post_attention_layernorm.weight": "pytorch_model-00004-of-00006.bin", + "model.layers.23.self_attn.k_proj.weight": "pytorch_model-00004-of-00006.bin", + "model.layers.23.self_attn.o_proj.weight": "pytorch_model-00004-of-00006.bin", + "model.layers.23.self_attn.q_proj.weight": "pytorch_model-00004-of-00006.bin", + "model.layers.23.self_attn.rotary_emb.inv_freq": "pytorch_model-00004-of-00006.bin", + "model.layers.23.self_attn.v_proj.weight": "pytorch_model-00004-of-00006.bin", + "model.layers.24.input_layernorm.weight": "pytorch_model-00004-of-00006.bin", + "model.layers.24.mlp.down_proj.weight": "pytorch_model-00004-of-00006.bin", + "model.layers.24.mlp.gate_proj.weight": "pytorch_model-00004-of-00006.bin", + "model.layers.24.mlp.up_proj.weight": "pytorch_model-00004-of-00006.bin", + "model.layers.24.post_attention_layernorm.weight": "pytorch_model-00004-of-00006.bin", + "model.layers.24.self_attn.k_proj.weight": "pytorch_model-00004-of-00006.bin", + "model.layers.24.self_attn.o_proj.weight": "pytorch_model-00004-of-00006.bin", + "model.layers.24.self_attn.q_proj.weight": "pytorch_model-00004-of-00006.bin", + "model.layers.24.self_attn.rotary_emb.inv_freq": "pytorch_model-00004-of-00006.bin", + "model.layers.24.self_attn.v_proj.weight": 
"pytorch_model-00004-of-00006.bin", + "model.layers.25.input_layernorm.weight": "pytorch_model-00004-of-00006.bin", + "model.layers.25.mlp.down_proj.weight": "pytorch_model-00004-of-00006.bin", + "model.layers.25.mlp.gate_proj.weight": "pytorch_model-00004-of-00006.bin", + "model.layers.25.mlp.up_proj.weight": "pytorch_model-00004-of-00006.bin", + "model.layers.25.post_attention_layernorm.weight": "pytorch_model-00004-of-00006.bin", + "model.layers.25.self_attn.k_proj.weight": "pytorch_model-00004-of-00006.bin", + "model.layers.25.self_attn.o_proj.weight": "pytorch_model-00004-of-00006.bin", + "model.layers.25.self_attn.q_proj.weight": "pytorch_model-00004-of-00006.bin", + "model.layers.25.self_attn.rotary_emb.inv_freq": "pytorch_model-00004-of-00006.bin", + "model.layers.25.self_attn.v_proj.weight": "pytorch_model-00004-of-00006.bin", + "model.layers.26.input_layernorm.weight": "pytorch_model-00004-of-00006.bin", + "model.layers.26.mlp.down_proj.weight": "pytorch_model-00004-of-00006.bin", + "model.layers.26.mlp.gate_proj.weight": "pytorch_model-00004-of-00006.bin", + "model.layers.26.mlp.up_proj.weight": "pytorch_model-00004-of-00006.bin", + "model.layers.26.post_attention_layernorm.weight": "pytorch_model-00004-of-00006.bin", + "model.layers.26.self_attn.k_proj.weight": "pytorch_model-00004-of-00006.bin", + "model.layers.26.self_attn.o_proj.weight": "pytorch_model-00004-of-00006.bin", + "model.layers.26.self_attn.q_proj.weight": "pytorch_model-00004-of-00006.bin", + "model.layers.26.self_attn.rotary_emb.inv_freq": "pytorch_model-00004-of-00006.bin", + "model.layers.26.self_attn.v_proj.weight": "pytorch_model-00004-of-00006.bin", + "model.layers.27.input_layernorm.weight": "pytorch_model-00004-of-00006.bin", + "model.layers.27.mlp.down_proj.weight": "pytorch_model-00004-of-00006.bin", + "model.layers.27.mlp.gate_proj.weight": "pytorch_model-00004-of-00006.bin", + "model.layers.27.mlp.up_proj.weight": "pytorch_model-00004-of-00006.bin", + 
"model.layers.27.post_attention_layernorm.weight": "pytorch_model-00004-of-00006.bin", + "model.layers.27.self_attn.k_proj.weight": "pytorch_model-00004-of-00006.bin", + "model.layers.27.self_attn.o_proj.weight": "pytorch_model-00004-of-00006.bin", + "model.layers.27.self_attn.q_proj.weight": "pytorch_model-00004-of-00006.bin", + "model.layers.27.self_attn.rotary_emb.inv_freq": "pytorch_model-00004-of-00006.bin", + "model.layers.27.self_attn.v_proj.weight": "pytorch_model-00004-of-00006.bin", + "model.layers.28.input_layernorm.weight": "pytorch_model-00004-of-00006.bin", + "model.layers.28.mlp.down_proj.weight": "pytorch_model-00004-of-00006.bin", + "model.layers.28.mlp.gate_proj.weight": "pytorch_model-00004-of-00006.bin", + "model.layers.28.mlp.up_proj.weight": "pytorch_model-00004-of-00006.bin", + "model.layers.28.post_attention_layernorm.weight": "pytorch_model-00004-of-00006.bin", + "model.layers.28.self_attn.k_proj.weight": "pytorch_model-00004-of-00006.bin", + "model.layers.28.self_attn.o_proj.weight": "pytorch_model-00004-of-00006.bin", + "model.layers.28.self_attn.q_proj.weight": "pytorch_model-00004-of-00006.bin", + "model.layers.28.self_attn.rotary_emb.inv_freq": "pytorch_model-00004-of-00006.bin", + "model.layers.28.self_attn.v_proj.weight": "pytorch_model-00004-of-00006.bin", + "model.layers.29.input_layernorm.weight": "pytorch_model-00004-of-00006.bin", + "model.layers.29.mlp.down_proj.weight": "pytorch_model-00004-of-00006.bin", + "model.layers.29.mlp.gate_proj.weight": "pytorch_model-00004-of-00006.bin", + "model.layers.29.mlp.up_proj.weight": "pytorch_model-00004-of-00006.bin", + "model.layers.29.post_attention_layernorm.weight": "pytorch_model-00004-of-00006.bin", + "model.layers.29.self_attn.k_proj.weight": "pytorch_model-00004-of-00006.bin", + "model.layers.29.self_attn.o_proj.weight": "pytorch_model-00004-of-00006.bin", + "model.layers.29.self_attn.q_proj.weight": "pytorch_model-00004-of-00006.bin", + 
"model.layers.29.self_attn.rotary_emb.inv_freq": "pytorch_model-00004-of-00006.bin", + "model.layers.29.self_attn.v_proj.weight": "pytorch_model-00004-of-00006.bin", + "model.layers.3.input_layernorm.weight": "pytorch_model-00001-of-00006.bin", + "model.layers.3.mlp.down_proj.weight": "pytorch_model-00001-of-00006.bin", + "model.layers.3.mlp.gate_proj.weight": "pytorch_model-00001-of-00006.bin", + "model.layers.3.mlp.up_proj.weight": "pytorch_model-00001-of-00006.bin", + "model.layers.3.post_attention_layernorm.weight": "pytorch_model-00001-of-00006.bin", + "model.layers.3.self_attn.k_proj.weight": "pytorch_model-00001-of-00006.bin", + "model.layers.3.self_attn.o_proj.weight": "pytorch_model-00001-of-00006.bin", + "model.layers.3.self_attn.q_proj.weight": "pytorch_model-00001-of-00006.bin", + "model.layers.3.self_attn.rotary_emb.inv_freq": "pytorch_model-00001-of-00006.bin", + "model.layers.3.self_attn.v_proj.weight": "pytorch_model-00001-of-00006.bin", + "model.layers.30.input_layernorm.weight": "pytorch_model-00005-of-00006.bin", + "model.layers.30.mlp.down_proj.weight": "pytorch_model-00004-of-00006.bin", + "model.layers.30.mlp.gate_proj.weight": "pytorch_model-00004-of-00006.bin", + "model.layers.30.mlp.up_proj.weight": "pytorch_model-00005-of-00006.bin", + "model.layers.30.post_attention_layernorm.weight": "pytorch_model-00005-of-00006.bin", + "model.layers.30.self_attn.k_proj.weight": "pytorch_model-00004-of-00006.bin", + "model.layers.30.self_attn.o_proj.weight": "pytorch_model-00004-of-00006.bin", + "model.layers.30.self_attn.q_proj.weight": "pytorch_model-00004-of-00006.bin", + "model.layers.30.self_attn.rotary_emb.inv_freq": "pytorch_model-00004-of-00006.bin", + "model.layers.30.self_attn.v_proj.weight": "pytorch_model-00004-of-00006.bin", + "model.layers.31.input_layernorm.weight": "pytorch_model-00005-of-00006.bin", + "model.layers.31.mlp.down_proj.weight": "pytorch_model-00005-of-00006.bin", + "model.layers.31.mlp.gate_proj.weight": 
"pytorch_model-00005-of-00006.bin", + "model.layers.31.mlp.up_proj.weight": "pytorch_model-00005-of-00006.bin", + "model.layers.31.post_attention_layernorm.weight": "pytorch_model-00005-of-00006.bin", + "model.layers.31.self_attn.k_proj.weight": "pytorch_model-00005-of-00006.bin", + "model.layers.31.self_attn.o_proj.weight": "pytorch_model-00005-of-00006.bin", + "model.layers.31.self_attn.q_proj.weight": "pytorch_model-00005-of-00006.bin", + "model.layers.31.self_attn.rotary_emb.inv_freq": "pytorch_model-00005-of-00006.bin", + "model.layers.31.self_attn.v_proj.weight": "pytorch_model-00005-of-00006.bin", + "model.layers.32.input_layernorm.weight": "pytorch_model-00005-of-00006.bin", + "model.layers.32.mlp.down_proj.weight": "pytorch_model-00005-of-00006.bin", + "model.layers.32.mlp.gate_proj.weight": "pytorch_model-00005-of-00006.bin", + "model.layers.32.mlp.up_proj.weight": "pytorch_model-00005-of-00006.bin", + "model.layers.32.post_attention_layernorm.weight": "pytorch_model-00005-of-00006.bin", + "model.layers.32.self_attn.k_proj.weight": "pytorch_model-00005-of-00006.bin", + "model.layers.32.self_attn.o_proj.weight": "pytorch_model-00005-of-00006.bin", + "model.layers.32.self_attn.q_proj.weight": "pytorch_model-00005-of-00006.bin", + "model.layers.32.self_attn.rotary_emb.inv_freq": "pytorch_model-00005-of-00006.bin", + "model.layers.32.self_attn.v_proj.weight": "pytorch_model-00005-of-00006.bin", + "model.layers.33.input_layernorm.weight": "pytorch_model-00005-of-00006.bin", + "model.layers.33.mlp.down_proj.weight": "pytorch_model-00005-of-00006.bin", + "model.layers.33.mlp.gate_proj.weight": "pytorch_model-00005-of-00006.bin", + "model.layers.33.mlp.up_proj.weight": "pytorch_model-00005-of-00006.bin", + "model.layers.33.post_attention_layernorm.weight": "pytorch_model-00005-of-00006.bin", + "model.layers.33.self_attn.k_proj.weight": "pytorch_model-00005-of-00006.bin", + "model.layers.33.self_attn.o_proj.weight": "pytorch_model-00005-of-00006.bin", + 
"model.layers.33.self_attn.q_proj.weight": "pytorch_model-00005-of-00006.bin", + "model.layers.33.self_attn.rotary_emb.inv_freq": "pytorch_model-00005-of-00006.bin", + "model.layers.33.self_attn.v_proj.weight": "pytorch_model-00005-of-00006.bin", + "model.layers.34.input_layernorm.weight": "pytorch_model-00005-of-00006.bin", + "model.layers.34.mlp.down_proj.weight": "pytorch_model-00005-of-00006.bin", + "model.layers.34.mlp.gate_proj.weight": "pytorch_model-00005-of-00006.bin", + "model.layers.34.mlp.up_proj.weight": "pytorch_model-00005-of-00006.bin", + "model.layers.34.post_attention_layernorm.weight": "pytorch_model-00005-of-00006.bin", + "model.layers.34.self_attn.k_proj.weight": "pytorch_model-00005-of-00006.bin", + "model.layers.34.self_attn.o_proj.weight": "pytorch_model-00005-of-00006.bin", + "model.layers.34.self_attn.q_proj.weight": "pytorch_model-00005-of-00006.bin", + "model.layers.34.self_attn.rotary_emb.inv_freq": "pytorch_model-00005-of-00006.bin", + "model.layers.34.self_attn.v_proj.weight": "pytorch_model-00005-of-00006.bin", + "model.layers.35.input_layernorm.weight": "pytorch_model-00005-of-00006.bin", + "model.layers.35.mlp.down_proj.weight": "pytorch_model-00005-of-00006.bin", + "model.layers.35.mlp.gate_proj.weight": "pytorch_model-00005-of-00006.bin", + "model.layers.35.mlp.up_proj.weight": "pytorch_model-00005-of-00006.bin", + "model.layers.35.post_attention_layernorm.weight": "pytorch_model-00005-of-00006.bin", + "model.layers.35.self_attn.k_proj.weight": "pytorch_model-00005-of-00006.bin", + "model.layers.35.self_attn.o_proj.weight": "pytorch_model-00005-of-00006.bin", + "model.layers.35.self_attn.q_proj.weight": "pytorch_model-00005-of-00006.bin", + "model.layers.35.self_attn.rotary_emb.inv_freq": "pytorch_model-00005-of-00006.bin", + "model.layers.35.self_attn.v_proj.weight": "pytorch_model-00005-of-00006.bin", + "model.layers.36.input_layernorm.weight": "pytorch_model-00005-of-00006.bin", + "model.layers.36.mlp.down_proj.weight": 
"pytorch_model-00005-of-00006.bin", + "model.layers.36.mlp.gate_proj.weight": "pytorch_model-00005-of-00006.bin", + "model.layers.36.mlp.up_proj.weight": "pytorch_model-00005-of-00006.bin", + "model.layers.36.post_attention_layernorm.weight": "pytorch_model-00005-of-00006.bin", + "model.layers.36.self_attn.k_proj.weight": "pytorch_model-00005-of-00006.bin", + "model.layers.36.self_attn.o_proj.weight": "pytorch_model-00005-of-00006.bin", + "model.layers.36.self_attn.q_proj.weight": "pytorch_model-00005-of-00006.bin", + "model.layers.36.self_attn.rotary_emb.inv_freq": "pytorch_model-00005-of-00006.bin", + "model.layers.36.self_attn.v_proj.weight": "pytorch_model-00005-of-00006.bin", + "model.layers.37.input_layernorm.weight": "pytorch_model-00005-of-00006.bin", + "model.layers.37.mlp.down_proj.weight": "pytorch_model-00005-of-00006.bin", + "model.layers.37.mlp.gate_proj.weight": "pytorch_model-00005-of-00006.bin", + "model.layers.37.mlp.up_proj.weight": "pytorch_model-00005-of-00006.bin", + "model.layers.37.post_attention_layernorm.weight": "pytorch_model-00005-of-00006.bin", + "model.layers.37.self_attn.k_proj.weight": "pytorch_model-00005-of-00006.bin", + "model.layers.37.self_attn.o_proj.weight": "pytorch_model-00005-of-00006.bin", + "model.layers.37.self_attn.q_proj.weight": "pytorch_model-00005-of-00006.bin", + "model.layers.37.self_attn.rotary_emb.inv_freq": "pytorch_model-00005-of-00006.bin", + "model.layers.37.self_attn.v_proj.weight": "pytorch_model-00005-of-00006.bin", + "model.layers.38.input_layernorm.weight": "pytorch_model-00006-of-00006.bin", + "model.layers.38.mlp.down_proj.weight": "pytorch_model-00006-of-00006.bin", + "model.layers.38.mlp.gate_proj.weight": "pytorch_model-00005-of-00006.bin", + "model.layers.38.mlp.up_proj.weight": "pytorch_model-00006-of-00006.bin", + "model.layers.38.post_attention_layernorm.weight": "pytorch_model-00006-of-00006.bin", + "model.layers.38.self_attn.k_proj.weight": "pytorch_model-00005-of-00006.bin", + 
"model.layers.38.self_attn.o_proj.weight": "pytorch_model-00005-of-00006.bin", + "model.layers.38.self_attn.q_proj.weight": "pytorch_model-00005-of-00006.bin", + "model.layers.38.self_attn.rotary_emb.inv_freq": "pytorch_model-00005-of-00006.bin", + "model.layers.38.self_attn.v_proj.weight": "pytorch_model-00005-of-00006.bin", + "model.layers.39.input_layernorm.weight": "pytorch_model-00006-of-00006.bin", + "model.layers.39.mlp.down_proj.weight": "pytorch_model-00006-of-00006.bin", + "model.layers.39.mlp.gate_proj.weight": "pytorch_model-00006-of-00006.bin", + "model.layers.39.mlp.up_proj.weight": "pytorch_model-00006-of-00006.bin", + "model.layers.39.post_attention_layernorm.weight": "pytorch_model-00006-of-00006.bin", + "model.layers.39.self_attn.k_proj.weight": "pytorch_model-00006-of-00006.bin", + "model.layers.39.self_attn.o_proj.weight": "pytorch_model-00006-of-00006.bin", + "model.layers.39.self_attn.q_proj.weight": "pytorch_model-00006-of-00006.bin", + "model.layers.39.self_attn.rotary_emb.inv_freq": "pytorch_model-00006-of-00006.bin", + "model.layers.39.self_attn.v_proj.weight": "pytorch_model-00006-of-00006.bin", + "model.layers.4.input_layernorm.weight": "pytorch_model-00001-of-00006.bin", + "model.layers.4.mlp.down_proj.weight": "pytorch_model-00001-of-00006.bin", + "model.layers.4.mlp.gate_proj.weight": "pytorch_model-00001-of-00006.bin", + "model.layers.4.mlp.up_proj.weight": "pytorch_model-00001-of-00006.bin", + "model.layers.4.post_attention_layernorm.weight": "pytorch_model-00001-of-00006.bin", + "model.layers.4.self_attn.k_proj.weight": "pytorch_model-00001-of-00006.bin", + "model.layers.4.self_attn.o_proj.weight": "pytorch_model-00001-of-00006.bin", + "model.layers.4.self_attn.q_proj.weight": "pytorch_model-00001-of-00006.bin", + "model.layers.4.self_attn.rotary_emb.inv_freq": "pytorch_model-00001-of-00006.bin", + "model.layers.4.self_attn.v_proj.weight": "pytorch_model-00001-of-00006.bin", + "model.layers.5.input_layernorm.weight": 
"pytorch_model-00001-of-00006.bin", + "model.layers.5.mlp.down_proj.weight": "pytorch_model-00001-of-00006.bin", + "model.layers.5.mlp.gate_proj.weight": "pytorch_model-00001-of-00006.bin", + "model.layers.5.mlp.up_proj.weight": "pytorch_model-00001-of-00006.bin", + "model.layers.5.post_attention_layernorm.weight": "pytorch_model-00001-of-00006.bin", + "model.layers.5.self_attn.k_proj.weight": "pytorch_model-00001-of-00006.bin", + "model.layers.5.self_attn.o_proj.weight": "pytorch_model-00001-of-00006.bin", + "model.layers.5.self_attn.q_proj.weight": "pytorch_model-00001-of-00006.bin", + "model.layers.5.self_attn.rotary_emb.inv_freq": "pytorch_model-00001-of-00006.bin", + "model.layers.5.self_attn.v_proj.weight": "pytorch_model-00001-of-00006.bin", + "model.layers.6.input_layernorm.weight": "pytorch_model-00001-of-00006.bin", + "model.layers.6.mlp.down_proj.weight": "pytorch_model-00001-of-00006.bin", + "model.layers.6.mlp.gate_proj.weight": "pytorch_model-00001-of-00006.bin", + "model.layers.6.mlp.up_proj.weight": "pytorch_model-00001-of-00006.bin", + "model.layers.6.post_attention_layernorm.weight": "pytorch_model-00001-of-00006.bin", + "model.layers.6.self_attn.k_proj.weight": "pytorch_model-00001-of-00006.bin", + "model.layers.6.self_attn.o_proj.weight": "pytorch_model-00001-of-00006.bin", + "model.layers.6.self_attn.q_proj.weight": "pytorch_model-00001-of-00006.bin", + "model.layers.6.self_attn.rotary_emb.inv_freq": "pytorch_model-00001-of-00006.bin", + "model.layers.6.self_attn.v_proj.weight": "pytorch_model-00001-of-00006.bin", + "model.layers.7.input_layernorm.weight": "pytorch_model-00002-of-00006.bin", + "model.layers.7.mlp.down_proj.weight": "pytorch_model-00002-of-00006.bin", + "model.layers.7.mlp.gate_proj.weight": "pytorch_model-00002-of-00006.bin", + "model.layers.7.mlp.up_proj.weight": "pytorch_model-00002-of-00006.bin", + "model.layers.7.post_attention_layernorm.weight": "pytorch_model-00002-of-00006.bin", + 
"model.layers.7.self_attn.k_proj.weight": "pytorch_model-00001-of-00006.bin", + "model.layers.7.self_attn.o_proj.weight": "pytorch_model-00001-of-00006.bin", + "model.layers.7.self_attn.q_proj.weight": "pytorch_model-00001-of-00006.bin", + "model.layers.7.self_attn.rotary_emb.inv_freq": "pytorch_model-00001-of-00006.bin", + "model.layers.7.self_attn.v_proj.weight": "pytorch_model-00001-of-00006.bin", + "model.layers.8.input_layernorm.weight": "pytorch_model-00002-of-00006.bin", + "model.layers.8.mlp.down_proj.weight": "pytorch_model-00002-of-00006.bin", + "model.layers.8.mlp.gate_proj.weight": "pytorch_model-00002-of-00006.bin", + "model.layers.8.mlp.up_proj.weight": "pytorch_model-00002-of-00006.bin", + "model.layers.8.post_attention_layernorm.weight": "pytorch_model-00002-of-00006.bin", + "model.layers.8.self_attn.k_proj.weight": "pytorch_model-00002-of-00006.bin", + "model.layers.8.self_attn.o_proj.weight": "pytorch_model-00002-of-00006.bin", + "model.layers.8.self_attn.q_proj.weight": "pytorch_model-00002-of-00006.bin", + "model.layers.8.self_attn.rotary_emb.inv_freq": "pytorch_model-00002-of-00006.bin", + "model.layers.8.self_attn.v_proj.weight": "pytorch_model-00002-of-00006.bin", + "model.layers.9.input_layernorm.weight": "pytorch_model-00002-of-00006.bin", + "model.layers.9.mlp.down_proj.weight": "pytorch_model-00002-of-00006.bin", + "model.layers.9.mlp.gate_proj.weight": "pytorch_model-00002-of-00006.bin", + "model.layers.9.mlp.up_proj.weight": "pytorch_model-00002-of-00006.bin", + "model.layers.9.post_attention_layernorm.weight": "pytorch_model-00002-of-00006.bin", + "model.layers.9.self_attn.k_proj.weight": "pytorch_model-00002-of-00006.bin", + "model.layers.9.self_attn.o_proj.weight": "pytorch_model-00002-of-00006.bin", + "model.layers.9.self_attn.q_proj.weight": "pytorch_model-00002-of-00006.bin", + "model.layers.9.self_attn.rotary_emb.inv_freq": "pytorch_model-00002-of-00006.bin", + "model.layers.9.self_attn.v_proj.weight": 
"pytorch_model-00002-of-00006.bin", + "model.norm.weight": "pytorch_model-00006-of-00006.bin" + } +} diff --git a/tokenizer.model b/tokenizer.model new file mode 100644 index 0000000..6c00c74 --- /dev/null +++ b/tokenizer.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347 +size 499723 diff --git a/tokenizer_config.json b/tokenizer_config.json new file mode 100644 index 0000000..400e3de --- /dev/null +++ b/tokenizer_config.json @@ -0,0 +1,33 @@ +{ + "add_bos_token": true, + "add_eos_token": false, + "bos_token": { + "__type": "AddedToken", + "content": "", + "lstrip": false, + "normalized": true, + "rstrip": false, + "single_word": false + }, + "clean_up_tokenization_spaces": false, + "eos_token": { + "__type": "AddedToken", + "content": "", + "lstrip": false, + "normalized": true, + "rstrip": false, + "single_word": false + }, + "model_max_length": 1000000000000000019884624838656, + "pad_token": null, + "sp_model_kwargs": {}, + "tokenizer_class": "LlamaTokenizer", + "unk_token": { + "__type": "AddedToken", + "content": "", + "lstrip": false, + "normalized": true, + "rstrip": false, + "single_word": false + } +} diff --git a/vicuna-13b_english-cot+auto-cot_0.0002/lora/adapter_config.json b/vicuna-13b_english-cot+auto-cot_0.0002/lora/adapter_config.json new file mode 100644 index 0000000..c5607df --- /dev/null +++ b/vicuna-13b_english-cot+auto-cot_0.0002/lora/adapter_config.json @@ -0,0 +1,17 @@ +{ + "base_model_name_or_path": "/mnt/data1/sheshuaijie/Data/PLM/vicuna-13b", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "lora_alpha": 32, + "lora_dropout": 0.1, + "modules_to_save": null, + "peft_type": "LORA", + "r": 32, + "target_modules": [ + "q_proj", + "v_proj" + ], + "task_type": "CAUSAL_LM" +} \ No newline at end of file diff --git a/vicuna-13b_english-cot+auto-cot_0.0002/lora/adapter_model.bin 
b/vicuna-13b_english-cot+auto-cot_0.0002/lora/adapter_model.bin new file mode 100644 index 0000000..8710093 --- /dev/null +++ b/vicuna-13b_english-cot+auto-cot_0.0002/lora/adapter_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e5e1621f48d9ad8feb1d6d31050275f0aafd080c5c07153301fe2f48411f4406 +size 443 diff --git a/vicuna-13b_english-cot+auto-cot_0.0002/lora/checkpoint-1036/adapter_config.json b/vicuna-13b_english-cot+auto-cot_0.0002/lora/checkpoint-1036/adapter_config.json new file mode 100644 index 0000000..c5607df --- /dev/null +++ b/vicuna-13b_english-cot+auto-cot_0.0002/lora/checkpoint-1036/adapter_config.json @@ -0,0 +1,17 @@ +{ + "base_model_name_or_path": "/mnt/data1/sheshuaijie/Data/PLM/vicuna-13b", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "lora_alpha": 32, + "lora_dropout": 0.1, + "modules_to_save": null, + "peft_type": "LORA", + "r": 32, + "target_modules": [ + "q_proj", + "v_proj" + ], + "task_type": "CAUSAL_LM" +} \ No newline at end of file diff --git a/vicuna-13b_english-cot+auto-cot_0.0002/lora/checkpoint-1036/adapter_model.bin b/vicuna-13b_english-cot+auto-cot_0.0002/lora/checkpoint-1036/adapter_model.bin new file mode 100644 index 0000000..8710093 --- /dev/null +++ b/vicuna-13b_english-cot+auto-cot_0.0002/lora/checkpoint-1036/adapter_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e5e1621f48d9ad8feb1d6d31050275f0aafd080c5c07153301fe2f48411f4406 +size 443 diff --git a/vicuna-13b_english-cot+auto-cot_0.0002/lora/checkpoint-1036/optimizer.pt b/vicuna-13b_english-cot+auto-cot_0.0002/lora/checkpoint-1036/optimizer.pt new file mode 100644 index 0000000..ce97fb8 --- /dev/null +++ b/vicuna-13b_english-cot+auto-cot_0.0002/lora/checkpoint-1036/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7c1bd9db551f5053d250cdfad0dba63ab63f4d8770467f05a1c7eaa8ed882949 +size 209810181 diff --git 
a/vicuna-13b_english-cot+auto-cot_0.0002/lora/checkpoint-1036/pytorch_model.bin b/vicuna-13b_english-cot+auto-cot_0.0002/lora/checkpoint-1036/pytorch_model.bin new file mode 100644 index 0000000..5e6ffb8 --- /dev/null +++ b/vicuna-13b_english-cot+auto-cot_0.0002/lora/checkpoint-1036/pytorch_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:69b558ca414e2e4c64708ff75297722b21126c012ed94cb456e60ba088812b43 +size 104915277 diff --git a/vicuna-13b_english-cot+auto-cot_0.0002/lora/checkpoint-1036/rng_state_0.pth b/vicuna-13b_english-cot+auto-cot_0.0002/lora/checkpoint-1036/rng_state_0.pth new file mode 100644 index 0000000..0754936 --- /dev/null +++ b/vicuna-13b_english-cot+auto-cot_0.0002/lora/checkpoint-1036/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4ef7df9d45b0ccca9c69f77e005511b666063ab1a038311314a04cfc65d660f7 +size 17655 diff --git a/vicuna-13b_english-cot+auto-cot_0.0002/lora/checkpoint-1036/rng_state_1.pth b/vicuna-13b_english-cot+auto-cot_0.0002/lora/checkpoint-1036/rng_state_1.pth new file mode 100644 index 0000000..0cc3cc6 --- /dev/null +++ b/vicuna-13b_english-cot+auto-cot_0.0002/lora/checkpoint-1036/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6148b0540f53b460a67b06d568497039eab81579af3b1dcf86260cde4dfa7d0f +size 17655 diff --git a/vicuna-13b_english-cot+auto-cot_0.0002/lora/checkpoint-1036/rng_state_2.pth b/vicuna-13b_english-cot+auto-cot_0.0002/lora/checkpoint-1036/rng_state_2.pth new file mode 100644 index 0000000..53f8a05 --- /dev/null +++ b/vicuna-13b_english-cot+auto-cot_0.0002/lora/checkpoint-1036/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9169c3e4560897db5cad83367bb48e655eb9c5072a53b2307ccd3fd4d5b3e6e1 +size 17655 diff --git a/vicuna-13b_english-cot+auto-cot_0.0002/lora/checkpoint-1036/rng_state_3.pth b/vicuna-13b_english-cot+auto-cot_0.0002/lora/checkpoint-1036/rng_state_3.pth new file 
mode 100644 index 0000000..6a427e2 --- /dev/null +++ b/vicuna-13b_english-cot+auto-cot_0.0002/lora/checkpoint-1036/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9825b237af1085d3086638ea705364d338e4efd8605bccbcd424c38224ad9556 +size 17655 diff --git a/vicuna-13b_english-cot+auto-cot_0.0002/lora/checkpoint-1036/scaler.pt b/vicuna-13b_english-cot+auto-cot_0.0002/lora/checkpoint-1036/scaler.pt new file mode 100644 index 0000000..6c9761d --- /dev/null +++ b/vicuna-13b_english-cot+auto-cot_0.0002/lora/checkpoint-1036/scaler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4ceb2c0dcfaa71275aedf3b6024b198fa7f5f75980e818d6f48b3ec93fb208e4 +size 557 diff --git a/vicuna-13b_english-cot+auto-cot_0.0002/lora/checkpoint-1036/scheduler.pt b/vicuna-13b_english-cot+auto-cot_0.0002/lora/checkpoint-1036/scheduler.pt new file mode 100644 index 0000000..6ec9c5c --- /dev/null +++ b/vicuna-13b_english-cot+auto-cot_0.0002/lora/checkpoint-1036/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:013d6a8410e7e8bcd20ee7e13d0ade381e59c981e59cf7840d165535290bf571 +size 627 diff --git a/vicuna-13b_english-cot+auto-cot_0.0002/lora/checkpoint-1036/trainer_state.json b/vicuna-13b_english-cot+auto-cot_0.0002/lora/checkpoint-1036/trainer_state.json new file mode 100644 index 0000000..528e969 --- /dev/null +++ b/vicuna-13b_english-cot+auto-cot_0.0002/lora/checkpoint-1036/trainer_state.json @@ -0,0 +1,1506 @@ +{ + "best_metric": 0.5939880609512329, + "best_model_checkpoint": "/mnt/data1/sheshuaijie/Output/CoT/Trained/vicuna-13b_english-cot+auto-cot_0.0002/lora/checkpoint-1036", + "epoch": 6.835463917525773, + "global_step": 1036, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.05, + "eval_loss": 1.7208857536315918, + "eval_runtime": 39.046, + "eval_samples_per_second": 76.832, + "eval_steps_per_second": 2.407, + 
"step": 7 + }, + { + "epoch": 0.09, + "eval_loss": 1.3302656412124634, + "eval_runtime": 39.1446, + "eval_samples_per_second": 76.639, + "eval_steps_per_second": 2.401, + "step": 14 + }, + { + "epoch": 0.13, + "learning_rate": 0.00019933110367892977, + "loss": 1.607, + "step": 20 + }, + { + "epoch": 0.14, + "eval_loss": 1.0993696451187134, + "eval_runtime": 39.2624, + "eval_samples_per_second": 76.409, + "eval_steps_per_second": 2.394, + "step": 21 + }, + { + "epoch": 0.18, + "eval_loss": 0.9883869886398315, + "eval_runtime": 39.2607, + "eval_samples_per_second": 76.412, + "eval_steps_per_second": 2.394, + "step": 28 + }, + { + "epoch": 0.23, + "eval_loss": 0.9121341109275818, + "eval_runtime": 39.2818, + "eval_samples_per_second": 76.371, + "eval_steps_per_second": 2.393, + "step": 35 + }, + { + "epoch": 0.26, + "learning_rate": 0.00019665551839464883, + "loss": 1.0077, + "step": 40 + }, + { + "epoch": 0.28, + "eval_loss": 0.8665392398834229, + "eval_runtime": 39.261, + "eval_samples_per_second": 76.412, + "eval_steps_per_second": 2.394, + "step": 42 + }, + { + "epoch": 0.32, + "eval_loss": 0.8299428820610046, + "eval_runtime": 39.2723, + "eval_samples_per_second": 76.39, + "eval_steps_per_second": 2.394, + "step": 49 + }, + { + "epoch": 0.37, + "eval_loss": 0.7965301275253296, + "eval_runtime": 39.2718, + "eval_samples_per_second": 76.391, + "eval_steps_per_second": 2.394, + "step": 56 + }, + { + "epoch": 0.4, + "learning_rate": 0.0001939799331103679, + "loss": 0.8626, + "step": 60 + }, + { + "epoch": 0.42, + "eval_loss": 0.7661889791488647, + "eval_runtime": 39.2752, + "eval_samples_per_second": 76.384, + "eval_steps_per_second": 2.393, + "step": 63 + }, + { + "epoch": 0.46, + "eval_loss": 0.744417130947113, + "eval_runtime": 39.2899, + "eval_samples_per_second": 76.355, + "eval_steps_per_second": 2.392, + "step": 70 + }, + { + "epoch": 0.51, + "eval_loss": 0.728394627571106, + "eval_runtime": 39.298, + "eval_samples_per_second": 76.34, + 
"eval_steps_per_second": 2.392, + "step": 77 + }, + { + "epoch": 0.53, + "learning_rate": 0.00019130434782608697, + "loss": 0.7683, + "step": 80 + }, + { + "epoch": 0.55, + "eval_loss": 0.7151542901992798, + "eval_runtime": 39.272, + "eval_samples_per_second": 76.39, + "eval_steps_per_second": 2.394, + "step": 84 + }, + { + "epoch": 0.6, + "eval_loss": 0.7049417495727539, + "eval_runtime": 39.2657, + "eval_samples_per_second": 76.403, + "eval_steps_per_second": 2.394, + "step": 91 + }, + { + "epoch": 0.65, + "eval_loss": 0.6961150765419006, + "eval_runtime": 39.2274, + "eval_samples_per_second": 76.477, + "eval_steps_per_second": 2.396, + "step": 98 + }, + { + "epoch": 0.66, + "learning_rate": 0.00018862876254180605, + "loss": 0.7346, + "step": 100 + }, + { + "epoch": 0.69, + "eval_loss": 0.6891586780548096, + "eval_runtime": 39.2698, + "eval_samples_per_second": 76.395, + "eval_steps_per_second": 2.394, + "step": 105 + }, + { + "epoch": 0.74, + "eval_loss": 0.6833620667457581, + "eval_runtime": 39.2474, + "eval_samples_per_second": 76.438, + "eval_steps_per_second": 2.395, + "step": 112 + }, + { + "epoch": 0.79, + "eval_loss": 0.678981602191925, + "eval_runtime": 39.2363, + "eval_samples_per_second": 76.46, + "eval_steps_per_second": 2.396, + "step": 119 + }, + { + "epoch": 0.79, + "learning_rate": 0.0001859531772575251, + "loss": 0.7095, + "step": 120 + }, + { + "epoch": 0.83, + "eval_loss": 0.6739740967750549, + "eval_runtime": 39.2467, + "eval_samples_per_second": 76.439, + "eval_steps_per_second": 2.395, + "step": 126 + }, + { + "epoch": 0.88, + "eval_loss": 0.6704814434051514, + "eval_runtime": 39.2828, + "eval_samples_per_second": 76.369, + "eval_steps_per_second": 2.393, + "step": 133 + }, + { + "epoch": 0.92, + "learning_rate": 0.00018327759197324413, + "loss": 0.6989, + "step": 140 + }, + { + "epoch": 0.92, + "eval_loss": 0.6668062210083008, + "eval_runtime": 39.1861, + "eval_samples_per_second": 76.558, + "eval_steps_per_second": 2.399, + "step": 140 + 
}, + { + "epoch": 0.97, + "eval_loss": 0.6635003089904785, + "eval_runtime": 39.2627, + "eval_samples_per_second": 76.408, + "eval_steps_per_second": 2.394, + "step": 147 + }, + { + "epoch": 1.02, + "eval_loss": 0.6594184637069702, + "eval_runtime": 39.2634, + "eval_samples_per_second": 76.407, + "eval_steps_per_second": 2.394, + "step": 154 + }, + { + "epoch": 1.06, + "learning_rate": 0.00018060200668896322, + "loss": 0.6753, + "step": 160 + }, + { + "epoch": 1.06, + "eval_loss": 0.656818151473999, + "eval_runtime": 39.2093, + "eval_samples_per_second": 76.513, + "eval_steps_per_second": 2.397, + "step": 161 + }, + { + "epoch": 1.11, + "eval_loss": 0.6542237401008606, + "eval_runtime": 39.2619, + "eval_samples_per_second": 76.41, + "eval_steps_per_second": 2.394, + "step": 168 + }, + { + "epoch": 1.15, + "eval_loss": 0.6509793400764465, + "eval_runtime": 39.2795, + "eval_samples_per_second": 76.376, + "eval_steps_per_second": 2.393, + "step": 175 + }, + { + "epoch": 1.19, + "learning_rate": 0.00017792642140468227, + "loss": 0.6742, + "step": 180 + }, + { + "epoch": 1.2, + "eval_loss": 0.6501123905181885, + "eval_runtime": 39.2889, + "eval_samples_per_second": 76.357, + "eval_steps_per_second": 2.393, + "step": 182 + }, + { + "epoch": 1.25, + "eval_loss": 0.6488311290740967, + "eval_runtime": 39.2821, + "eval_samples_per_second": 76.371, + "eval_steps_per_second": 2.393, + "step": 189 + }, + { + "epoch": 1.29, + "eval_loss": 0.6458473205566406, + "eval_runtime": 39.2749, + "eval_samples_per_second": 76.385, + "eval_steps_per_second": 2.393, + "step": 196 + }, + { + "epoch": 1.32, + "learning_rate": 0.00017525083612040135, + "loss": 0.6727, + "step": 200 + }, + { + "epoch": 1.34, + "eval_loss": 0.6445983648300171, + "eval_runtime": 39.2655, + "eval_samples_per_second": 76.403, + "eval_steps_per_second": 2.394, + "step": 203 + }, + { + "epoch": 1.39, + "eval_loss": 0.6414983868598938, + "eval_runtime": 39.2575, + "eval_samples_per_second": 76.418, + 
"eval_steps_per_second": 2.394, + "step": 210 + }, + { + "epoch": 1.43, + "eval_loss": 0.6403743624687195, + "eval_runtime": 39.2601, + "eval_samples_per_second": 76.413, + "eval_steps_per_second": 2.394, + "step": 217 + }, + { + "epoch": 1.45, + "learning_rate": 0.0001725752508361204, + "loss": 0.6651, + "step": 220 + }, + { + "epoch": 1.48, + "eval_loss": 0.6375772953033447, + "eval_runtime": 39.2616, + "eval_samples_per_second": 76.411, + "eval_steps_per_second": 2.394, + "step": 224 + }, + { + "epoch": 1.52, + "eval_loss": 0.6363030076026917, + "eval_runtime": 39.2685, + "eval_samples_per_second": 76.397, + "eval_steps_per_second": 2.394, + "step": 231 + }, + { + "epoch": 1.57, + "eval_loss": 0.6365154981613159, + "eval_runtime": 39.2615, + "eval_samples_per_second": 76.411, + "eval_steps_per_second": 2.394, + "step": 238 + }, + { + "epoch": 1.58, + "learning_rate": 0.00016989966555183946, + "loss": 0.6569, + "step": 240 + }, + { + "epoch": 1.62, + "eval_loss": 0.6351213455200195, + "eval_runtime": 39.2374, + "eval_samples_per_second": 76.458, + "eval_steps_per_second": 2.396, + "step": 245 + }, + { + "epoch": 1.66, + "eval_loss": 0.633696436882019, + "eval_runtime": 39.2576, + "eval_samples_per_second": 76.418, + "eval_steps_per_second": 2.394, + "step": 252 + }, + { + "epoch": 1.71, + "eval_loss": 0.6320024132728577, + "eval_runtime": 39.2456, + "eval_samples_per_second": 76.442, + "eval_steps_per_second": 2.395, + "step": 259 + }, + { + "epoch": 1.72, + "learning_rate": 0.00016722408026755855, + "loss": 0.6535, + "step": 260 + }, + { + "epoch": 1.76, + "eval_loss": 0.6302981972694397, + "eval_runtime": 39.2723, + "eval_samples_per_second": 76.39, + "eval_steps_per_second": 2.394, + "step": 266 + }, + { + "epoch": 1.8, + "eval_loss": 0.6285908818244934, + "eval_runtime": 39.2745, + "eval_samples_per_second": 76.385, + "eval_steps_per_second": 2.393, + "step": 273 + }, + { + "epoch": 1.85, + "learning_rate": 0.0001645484949832776, + "loss": 0.6504, + "step": 
280 + }, + { + "epoch": 1.85, + "eval_loss": 0.6279519200325012, + "eval_runtime": 39.1978, + "eval_samples_per_second": 76.535, + "eval_steps_per_second": 2.398, + "step": 280 + }, + { + "epoch": 1.89, + "eval_loss": 0.6275761723518372, + "eval_runtime": 39.2574, + "eval_samples_per_second": 76.419, + "eval_steps_per_second": 2.394, + "step": 287 + }, + { + "epoch": 1.94, + "eval_loss": 0.6262693405151367, + "eval_runtime": 39.2587, + "eval_samples_per_second": 76.416, + "eval_steps_per_second": 2.394, + "step": 294 + }, + { + "epoch": 1.98, + "learning_rate": 0.00016187290969899666, + "loss": 0.6447, + "step": 300 + }, + { + "epoch": 1.99, + "eval_loss": 0.6255723237991333, + "eval_runtime": 39.2593, + "eval_samples_per_second": 76.415, + "eval_steps_per_second": 2.394, + "step": 301 + }, + { + "epoch": 2.03, + "eval_loss": 0.624893307685852, + "eval_runtime": 39.2732, + "eval_samples_per_second": 76.388, + "eval_steps_per_second": 2.393, + "step": 308 + }, + { + "epoch": 2.08, + "eval_loss": 0.6238787174224854, + "eval_runtime": 39.2648, + "eval_samples_per_second": 76.404, + "eval_steps_per_second": 2.394, + "step": 315 + }, + { + "epoch": 2.11, + "learning_rate": 0.00015919732441471574, + "loss": 0.6418, + "step": 320 + }, + { + "epoch": 2.12, + "eval_loss": 0.6227560043334961, + "eval_runtime": 39.2517, + "eval_samples_per_second": 76.43, + "eval_steps_per_second": 2.395, + "step": 322 + }, + { + "epoch": 2.17, + "eval_loss": 0.621408998966217, + "eval_runtime": 39.2673, + "eval_samples_per_second": 76.4, + "eval_steps_per_second": 2.394, + "step": 329 + }, + { + "epoch": 2.22, + "eval_loss": 0.6207154989242554, + "eval_runtime": 39.2707, + "eval_samples_per_second": 76.393, + "eval_steps_per_second": 2.394, + "step": 336 + }, + { + "epoch": 2.24, + "learning_rate": 0.0001565217391304348, + "loss": 0.6294, + "step": 340 + }, + { + "epoch": 2.26, + "eval_loss": 0.6207785606384277, + "eval_runtime": 39.2687, + "eval_samples_per_second": 76.397, + 
"eval_steps_per_second": 2.394, + "step": 343 + }, + { + "epoch": 2.31, + "eval_loss": 0.619699239730835, + "eval_runtime": 39.2528, + "eval_samples_per_second": 76.428, + "eval_steps_per_second": 2.395, + "step": 350 + }, + { + "epoch": 2.36, + "eval_loss": 0.6189907789230347, + "eval_runtime": 39.2383, + "eval_samples_per_second": 76.456, + "eval_steps_per_second": 2.396, + "step": 357 + }, + { + "epoch": 2.38, + "learning_rate": 0.00015384615384615385, + "loss": 0.6323, + "step": 360 + }, + { + "epoch": 2.4, + "eval_loss": 0.6188793182373047, + "eval_runtime": 39.2507, + "eval_samples_per_second": 76.432, + "eval_steps_per_second": 2.395, + "step": 364 + }, + { + "epoch": 2.45, + "eval_loss": 0.6180170774459839, + "eval_runtime": 39.2506, + "eval_samples_per_second": 76.432, + "eval_steps_per_second": 2.395, + "step": 371 + }, + { + "epoch": 2.49, + "eval_loss": 0.6175986528396606, + "eval_runtime": 39.2493, + "eval_samples_per_second": 76.434, + "eval_steps_per_second": 2.395, + "step": 378 + }, + { + "epoch": 2.51, + "learning_rate": 0.00015117056856187293, + "loss": 0.6194, + "step": 380 + }, + { + "epoch": 2.54, + "eval_loss": 0.6155608296394348, + "eval_runtime": 39.2549, + "eval_samples_per_second": 76.424, + "eval_steps_per_second": 2.395, + "step": 385 + }, + { + "epoch": 2.59, + "eval_loss": 0.6149768829345703, + "eval_runtime": 39.2507, + "eval_samples_per_second": 76.432, + "eval_steps_per_second": 2.395, + "step": 392 + }, + { + "epoch": 2.63, + "eval_loss": 0.614321768283844, + "eval_runtime": 39.2607, + "eval_samples_per_second": 76.412, + "eval_steps_per_second": 2.394, + "step": 399 + }, + { + "epoch": 2.64, + "learning_rate": 0.00014849498327759196, + "loss": 0.6165, + "step": 400 + }, + { + "epoch": 2.68, + "eval_loss": 0.6136913299560547, + "eval_runtime": 39.256, + "eval_samples_per_second": 76.422, + "eval_steps_per_second": 2.395, + "step": 406 + }, + { + "epoch": 2.72, + "eval_loss": 0.6127980351448059, + "eval_runtime": 39.2695, + 
"eval_samples_per_second": 76.395, + "eval_steps_per_second": 2.394, + "step": 413 + }, + { + "epoch": 2.77, + "learning_rate": 0.00014581939799331104, + "loss": 0.6202, + "step": 420 + }, + { + "epoch": 2.77, + "eval_loss": 0.6126558780670166, + "eval_runtime": 39.2344, + "eval_samples_per_second": 76.464, + "eval_steps_per_second": 2.396, + "step": 420 + }, + { + "epoch": 2.82, + "eval_loss": 0.6126319766044617, + "eval_runtime": 39.2692, + "eval_samples_per_second": 76.396, + "eval_steps_per_second": 2.394, + "step": 427 + }, + { + "epoch": 2.86, + "eval_loss": 0.6124591827392578, + "eval_runtime": 39.2771, + "eval_samples_per_second": 76.38, + "eval_steps_per_second": 2.393, + "step": 434 + }, + { + "epoch": 2.9, + "learning_rate": 0.0001431438127090301, + "loss": 0.6186, + "step": 440 + }, + { + "epoch": 2.91, + "eval_loss": 0.6117784976959229, + "eval_runtime": 39.2494, + "eval_samples_per_second": 76.434, + "eval_steps_per_second": 2.395, + "step": 441 + }, + { + "epoch": 2.96, + "eval_loss": 0.6105948090553284, + "eval_runtime": 39.2716, + "eval_samples_per_second": 76.391, + "eval_steps_per_second": 2.394, + "step": 448 + }, + { + "epoch": 3.0, + "eval_loss": 0.6107361912727356, + "eval_runtime": 39.2828, + "eval_samples_per_second": 76.369, + "eval_steps_per_second": 2.393, + "step": 455 + }, + { + "epoch": 3.04, + "learning_rate": 0.00014046822742474916, + "loss": 0.6165, + "step": 460 + }, + { + "epoch": 3.05, + "eval_loss": 0.6106633543968201, + "eval_runtime": 39.2701, + "eval_samples_per_second": 76.394, + "eval_steps_per_second": 2.394, + "step": 462 + }, + { + "epoch": 3.09, + "eval_loss": 0.6104211807250977, + "eval_runtime": 39.2794, + "eval_samples_per_second": 76.376, + "eval_steps_per_second": 2.393, + "step": 469 + }, + { + "epoch": 3.14, + "eval_loss": 0.611173152923584, + "eval_runtime": 39.2596, + "eval_samples_per_second": 76.415, + "eval_steps_per_second": 2.394, + "step": 476 + }, + { + "epoch": 3.17, + "learning_rate": 
0.00013779264214046824, + "loss": 0.6021, + "step": 480 + }, + { + "epoch": 3.19, + "eval_loss": 0.6094884276390076, + "eval_runtime": 39.2429, + "eval_samples_per_second": 76.447, + "eval_steps_per_second": 2.395, + "step": 483 + }, + { + "epoch": 3.23, + "eval_loss": 0.6093204617500305, + "eval_runtime": 39.278, + "eval_samples_per_second": 76.379, + "eval_steps_per_second": 2.393, + "step": 490 + }, + { + "epoch": 3.28, + "eval_loss": 0.60869961977005, + "eval_runtime": 39.269, + "eval_samples_per_second": 76.396, + "eval_steps_per_second": 2.394, + "step": 497 + }, + { + "epoch": 3.3, + "learning_rate": 0.0001351170568561873, + "loss": 0.6057, + "step": 500 + }, + { + "epoch": 3.33, + "eval_loss": 0.6093556880950928, + "eval_runtime": 39.2597, + "eval_samples_per_second": 76.414, + "eval_steps_per_second": 2.394, + "step": 504 + }, + { + "epoch": 3.37, + "eval_loss": 0.6078519821166992, + "eval_runtime": 39.2561, + "eval_samples_per_second": 76.421, + "eval_steps_per_second": 2.395, + "step": 511 + }, + { + "epoch": 3.42, + "eval_loss": 0.6079010367393494, + "eval_runtime": 39.2536, + "eval_samples_per_second": 76.426, + "eval_steps_per_second": 2.395, + "step": 518 + }, + { + "epoch": 3.43, + "learning_rate": 0.00013244147157190635, + "loss": 0.598, + "step": 520 + }, + { + "epoch": 3.46, + "eval_loss": 0.6074483394622803, + "eval_runtime": 39.2832, + "eval_samples_per_second": 76.369, + "eval_steps_per_second": 2.393, + "step": 525 + }, + { + "epoch": 3.51, + "eval_loss": 0.6073596477508545, + "eval_runtime": 39.2701, + "eval_samples_per_second": 76.394, + "eval_steps_per_second": 2.394, + "step": 532 + }, + { + "epoch": 3.56, + "eval_loss": 0.606430172920227, + "eval_runtime": 39.2546, + "eval_samples_per_second": 76.424, + "eval_steps_per_second": 2.395, + "step": 539 + }, + { + "epoch": 3.56, + "learning_rate": 0.00012976588628762543, + "loss": 0.5948, + "step": 540 + }, + { + "epoch": 3.6, + "eval_loss": 0.6060574650764465, + "eval_runtime": 39.2503, + 
"eval_samples_per_second": 76.433, + "eval_steps_per_second": 2.395, + "step": 546 + }, + { + "epoch": 3.65, + "eval_loss": 0.6067923307418823, + "eval_runtime": 39.2654, + "eval_samples_per_second": 76.403, + "eval_steps_per_second": 2.394, + "step": 553 + }, + { + "epoch": 3.69, + "learning_rate": 0.0001270903010033445, + "loss": 0.5962, + "step": 560 + }, + { + "epoch": 3.69, + "eval_loss": 0.6042212843894958, + "eval_runtime": 39.2032, + "eval_samples_per_second": 76.524, + "eval_steps_per_second": 2.398, + "step": 560 + }, + { + "epoch": 3.74, + "eval_loss": 0.6041299700737, + "eval_runtime": 39.2396, + "eval_samples_per_second": 76.453, + "eval_steps_per_second": 2.396, + "step": 567 + }, + { + "epoch": 3.79, + "eval_loss": 0.6047356128692627, + "eval_runtime": 39.274, + "eval_samples_per_second": 76.386, + "eval_steps_per_second": 2.393, + "step": 574 + }, + { + "epoch": 3.83, + "learning_rate": 0.00012441471571906357, + "loss": 0.5977, + "step": 580 + }, + { + "epoch": 3.83, + "eval_loss": 0.6040154099464417, + "eval_runtime": 39.2677, + "eval_samples_per_second": 76.399, + "eval_steps_per_second": 2.394, + "step": 581 + }, + { + "epoch": 3.88, + "eval_loss": 0.603416383266449, + "eval_runtime": 39.2621, + "eval_samples_per_second": 76.41, + "eval_steps_per_second": 2.394, + "step": 588 + }, + { + "epoch": 3.93, + "eval_loss": 0.6036480069160461, + "eval_runtime": 39.2609, + "eval_samples_per_second": 76.412, + "eval_steps_per_second": 2.394, + "step": 595 + }, + { + "epoch": 3.96, + "learning_rate": 0.00012173913043478263, + "loss": 0.5903, + "step": 600 + }, + { + "epoch": 3.97, + "eval_loss": 0.6035267114639282, + "eval_runtime": 39.2828, + "eval_samples_per_second": 76.369, + "eval_steps_per_second": 2.393, + "step": 602 + }, + { + "epoch": 4.02, + "eval_loss": 0.6025964617729187, + "eval_runtime": 39.2634, + "eval_samples_per_second": 76.407, + "eval_steps_per_second": 2.394, + "step": 609 + }, + { + "epoch": 4.06, + "eval_loss": 0.6028868556022644, + 
"eval_runtime": 39.2591, + "eval_samples_per_second": 76.415, + "eval_steps_per_second": 2.394, + "step": 616 + }, + { + "epoch": 4.09, + "learning_rate": 0.0001190635451505017, + "loss": 0.5927, + "step": 620 + }, + { + "epoch": 4.11, + "eval_loss": 0.6027114391326904, + "eval_runtime": 39.2648, + "eval_samples_per_second": 76.404, + "eval_steps_per_second": 2.394, + "step": 623 + }, + { + "epoch": 4.16, + "eval_loss": 0.6030986905097961, + "eval_runtime": 39.2746, + "eval_samples_per_second": 76.385, + "eval_steps_per_second": 2.393, + "step": 630 + }, + { + "epoch": 4.2, + "eval_loss": 0.6026434898376465, + "eval_runtime": 39.2646, + "eval_samples_per_second": 76.405, + "eval_steps_per_second": 2.394, + "step": 637 + }, + { + "epoch": 4.22, + "learning_rate": 0.00011638795986622074, + "loss": 0.581, + "step": 640 + }, + { + "epoch": 4.25, + "eval_loss": 0.6008206009864807, + "eval_runtime": 39.2718, + "eval_samples_per_second": 76.391, + "eval_steps_per_second": 2.394, + "step": 644 + }, + { + "epoch": 4.3, + "eval_loss": 0.6018855571746826, + "eval_runtime": 39.2587, + "eval_samples_per_second": 76.416, + "eval_steps_per_second": 2.394, + "step": 651 + }, + { + "epoch": 4.34, + "eval_loss": 0.6018174886703491, + "eval_runtime": 39.2445, + "eval_samples_per_second": 76.444, + "eval_steps_per_second": 2.395, + "step": 658 + }, + { + "epoch": 4.35, + "learning_rate": 0.00011371237458193979, + "loss": 0.5965, + "step": 660 + }, + { + "epoch": 4.39, + "eval_loss": 0.6006762981414795, + "eval_runtime": 39.2498, + "eval_samples_per_second": 76.433, + "eval_steps_per_second": 2.395, + "step": 665 + }, + { + "epoch": 4.43, + "eval_loss": 0.6006374359130859, + "eval_runtime": 39.2758, + "eval_samples_per_second": 76.383, + "eval_steps_per_second": 2.393, + "step": 672 + }, + { + "epoch": 4.48, + "eval_loss": 0.5997828245162964, + "eval_runtime": 39.2794, + "eval_samples_per_second": 76.376, + "eval_steps_per_second": 2.393, + "step": 679 + }, + { + "epoch": 4.49, + 
"learning_rate": 0.00011103678929765886, + "loss": 0.5896, + "step": 680 + }, + { + "epoch": 4.53, + "eval_loss": 0.6000981330871582, + "eval_runtime": 39.2629, + "eval_samples_per_second": 76.408, + "eval_steps_per_second": 2.394, + "step": 686 + }, + { + "epoch": 4.57, + "eval_loss": 0.5991115570068359, + "eval_runtime": 39.2774, + "eval_samples_per_second": 76.38, + "eval_steps_per_second": 2.393, + "step": 693 + }, + { + "epoch": 4.62, + "learning_rate": 0.00010836120401337793, + "loss": 0.5854, + "step": 700 + }, + { + "epoch": 4.62, + "eval_loss": 0.6001954674720764, + "eval_runtime": 39.2333, + "eval_samples_per_second": 76.466, + "eval_steps_per_second": 2.396, + "step": 700 + }, + { + "epoch": 4.66, + "eval_loss": 0.6007575988769531, + "eval_runtime": 39.2801, + "eval_samples_per_second": 76.374, + "eval_steps_per_second": 2.393, + "step": 707 + }, + { + "epoch": 4.71, + "eval_loss": 0.5983864068984985, + "eval_runtime": 39.2469, + "eval_samples_per_second": 76.439, + "eval_steps_per_second": 2.395, + "step": 714 + }, + { + "epoch": 4.75, + "learning_rate": 0.00010568561872909698, + "loss": 0.5844, + "step": 720 + }, + { + "epoch": 4.76, + "eval_loss": 0.5985506772994995, + "eval_runtime": 39.2426, + "eval_samples_per_second": 76.448, + "eval_steps_per_second": 2.395, + "step": 721 + }, + { + "epoch": 4.8, + "eval_loss": 0.5978309512138367, + "eval_runtime": 39.2604, + "eval_samples_per_second": 76.413, + "eval_steps_per_second": 2.394, + "step": 728 + }, + { + "epoch": 4.85, + "eval_loss": 0.5981310606002808, + "eval_runtime": 39.2686, + "eval_samples_per_second": 76.397, + "eval_steps_per_second": 2.394, + "step": 735 + }, + { + "epoch": 4.88, + "learning_rate": 0.00010301003344481605, + "loss": 0.5784, + "step": 740 + }, + { + "epoch": 4.9, + "eval_loss": 0.5985335111618042, + "eval_runtime": 39.2557, + "eval_samples_per_second": 76.422, + "eval_steps_per_second": 2.395, + "step": 742 + }, + { + "epoch": 4.94, + "eval_loss": 0.5975944995880127, + 
"eval_runtime": 39.2644, + "eval_samples_per_second": 76.405, + "eval_steps_per_second": 2.394, + "step": 749 + }, + { + "epoch": 4.99, + "eval_loss": 0.596754252910614, + "eval_runtime": 39.2365, + "eval_samples_per_second": 76.459, + "eval_steps_per_second": 2.396, + "step": 756 + }, + { + "epoch": 5.01, + "learning_rate": 0.00010033444816053512, + "loss": 0.5825, + "step": 760 + }, + { + "epoch": 5.03, + "eval_loss": 0.5977214574813843, + "eval_runtime": 39.235, + "eval_samples_per_second": 76.462, + "eval_steps_per_second": 2.396, + "step": 763 + }, + { + "epoch": 5.08, + "eval_loss": 0.5982287526130676, + "eval_runtime": 39.2483, + "eval_samples_per_second": 76.436, + "eval_steps_per_second": 2.395, + "step": 770 + }, + { + "epoch": 5.13, + "eval_loss": 0.5973477959632874, + "eval_runtime": 39.2692, + "eval_samples_per_second": 76.396, + "eval_steps_per_second": 2.394, + "step": 777 + }, + { + "epoch": 5.15, + "learning_rate": 9.765886287625419e-05, + "loss": 0.5724, + "step": 780 + }, + { + "epoch": 5.17, + "eval_loss": 0.598833441734314, + "eval_runtime": 39.2608, + "eval_samples_per_second": 76.412, + "eval_steps_per_second": 2.394, + "step": 784 + }, + { + "epoch": 5.22, + "eval_loss": 0.5973609089851379, + "eval_runtime": 39.2557, + "eval_samples_per_second": 76.422, + "eval_steps_per_second": 2.395, + "step": 791 + }, + { + "epoch": 5.27, + "eval_loss": 0.5983055233955383, + "eval_runtime": 39.2613, + "eval_samples_per_second": 76.411, + "eval_steps_per_second": 2.394, + "step": 798 + }, + { + "epoch": 5.28, + "learning_rate": 9.498327759197325e-05, + "loss": 0.5765, + "step": 800 + }, + { + "epoch": 5.31, + "eval_loss": 0.597219705581665, + "eval_runtime": 39.2532, + "eval_samples_per_second": 76.427, + "eval_steps_per_second": 2.395, + "step": 805 + }, + { + "epoch": 5.36, + "eval_loss": 0.5974920392036438, + "eval_runtime": 39.2428, + "eval_samples_per_second": 76.447, + "eval_steps_per_second": 2.395, + "step": 812 + }, + { + "epoch": 5.4, + 
"eval_loss": 0.5970295667648315, + "eval_runtime": 39.2255, + "eval_samples_per_second": 76.481, + "eval_steps_per_second": 2.396, + "step": 819 + }, + { + "epoch": 5.41, + "learning_rate": 9.230769230769232e-05, + "loss": 0.5662, + "step": 820 + }, + { + "epoch": 5.45, + "eval_loss": 0.5995200872421265, + "eval_runtime": 39.2763, + "eval_samples_per_second": 76.382, + "eval_steps_per_second": 2.393, + "step": 826 + }, + { + "epoch": 5.5, + "eval_loss": 0.5961365699768066, + "eval_runtime": 39.2442, + "eval_samples_per_second": 76.444, + "eval_steps_per_second": 2.395, + "step": 833 + }, + { + "epoch": 5.54, + "learning_rate": 8.963210702341137e-05, + "loss": 0.5594, + "step": 840 + }, + { + "epoch": 5.54, + "eval_loss": 0.5958811640739441, + "eval_runtime": 39.224, + "eval_samples_per_second": 76.484, + "eval_steps_per_second": 2.396, + "step": 840 + }, + { + "epoch": 5.59, + "eval_loss": 0.5974062085151672, + "eval_runtime": 39.2479, + "eval_samples_per_second": 76.437, + "eval_steps_per_second": 2.395, + "step": 847 + }, + { + "epoch": 5.63, + "eval_loss": 0.5959305167198181, + "eval_runtime": 39.1122, + "eval_samples_per_second": 76.702, + "eval_steps_per_second": 2.403, + "step": 854 + }, + { + "epoch": 5.67, + "learning_rate": 8.695652173913044e-05, + "loss": 0.5569, + "step": 860 + }, + { + "epoch": 5.68, + "eval_loss": 0.597082257270813, + "eval_runtime": 39.2419, + "eval_samples_per_second": 76.449, + "eval_steps_per_second": 2.395, + "step": 861 + }, + { + "epoch": 5.73, + "eval_loss": 0.5964935421943665, + "eval_runtime": 39.2482, + "eval_samples_per_second": 76.437, + "eval_steps_per_second": 2.395, + "step": 868 + }, + { + "epoch": 5.77, + "eval_loss": 0.596628725528717, + "eval_runtime": 39.2684, + "eval_samples_per_second": 76.397, + "eval_steps_per_second": 2.394, + "step": 875 + }, + { + "epoch": 5.81, + "learning_rate": 8.42809364548495e-05, + "loss": 0.5711, + "step": 880 + }, + { + "epoch": 5.82, + "eval_loss": 0.596688449382782, + 
"eval_runtime": 39.262, + "eval_samples_per_second": 76.41, + "eval_steps_per_second": 2.394, + "step": 882 + }, + { + "epoch": 5.87, + "eval_loss": 0.5974501967430115, + "eval_runtime": 39.2621, + "eval_samples_per_second": 76.409, + "eval_steps_per_second": 2.394, + "step": 889 + }, + { + "epoch": 5.91, + "eval_loss": 0.5951861143112183, + "eval_runtime": 39.2622, + "eval_samples_per_second": 76.409, + "eval_steps_per_second": 2.394, + "step": 896 + }, + { + "epoch": 5.94, + "learning_rate": 8.160535117056857e-05, + "loss": 0.5703, + "step": 900 + }, + { + "epoch": 5.96, + "eval_loss": 0.5963322520256042, + "eval_runtime": 39.2656, + "eval_samples_per_second": 76.403, + "eval_steps_per_second": 2.394, + "step": 903 + }, + { + "epoch": 6.0, + "eval_loss": 0.5958115458488464, + "eval_runtime": 39.2804, + "eval_samples_per_second": 76.374, + "eval_steps_per_second": 2.393, + "step": 910 + }, + { + "epoch": 6.05, + "eval_loss": 0.5968443155288696, + "eval_runtime": 39.2618, + "eval_samples_per_second": 76.41, + "eval_steps_per_second": 2.394, + "step": 917 + }, + { + "epoch": 6.07, + "learning_rate": 7.892976588628763e-05, + "loss": 0.5551, + "step": 920 + }, + { + "epoch": 6.1, + "eval_loss": 0.5958288311958313, + "eval_runtime": 39.2648, + "eval_samples_per_second": 76.404, + "eval_steps_per_second": 2.394, + "step": 924 + }, + { + "epoch": 6.14, + "eval_loss": 0.5968209505081177, + "eval_runtime": 39.2563, + "eval_samples_per_second": 76.421, + "eval_steps_per_second": 2.395, + "step": 931 + }, + { + "epoch": 6.19, + "eval_loss": 0.5957658886909485, + "eval_runtime": 39.2499, + "eval_samples_per_second": 76.433, + "eval_steps_per_second": 2.395, + "step": 938 + }, + { + "epoch": 6.2, + "learning_rate": 7.62541806020067e-05, + "loss": 0.5636, + "step": 940 + }, + { + "epoch": 6.24, + "eval_loss": 0.5955784916877747, + "eval_runtime": 39.279, + "eval_samples_per_second": 76.377, + "eval_steps_per_second": 2.393, + "step": 945 + }, + { + "epoch": 6.28, + "eval_loss": 
0.5963084101676941, + "eval_runtime": 39.2656, + "eval_samples_per_second": 76.403, + "eval_steps_per_second": 2.394, + "step": 952 + }, + { + "epoch": 6.33, + "eval_loss": 0.595792829990387, + "eval_runtime": 39.2577, + "eval_samples_per_second": 76.418, + "eval_steps_per_second": 2.394, + "step": 959 + }, + { + "epoch": 6.33, + "learning_rate": 7.357859531772575e-05, + "loss": 0.5676, + "step": 960 + }, + { + "epoch": 6.37, + "eval_loss": 0.5953949093818665, + "eval_runtime": 39.2554, + "eval_samples_per_second": 76.423, + "eval_steps_per_second": 2.395, + "step": 966 + }, + { + "epoch": 6.42, + "eval_loss": 0.595146894454956, + "eval_runtime": 39.2386, + "eval_samples_per_second": 76.455, + "eval_steps_per_second": 2.396, + "step": 973 + }, + { + "epoch": 6.47, + "learning_rate": 7.090301003344481e-05, + "loss": 0.5551, + "step": 980 + }, + { + "epoch": 6.47, + "eval_loss": 0.5957517027854919, + "eval_runtime": 39.197, + "eval_samples_per_second": 76.536, + "eval_steps_per_second": 2.398, + "step": 980 + }, + { + "epoch": 6.51, + "eval_loss": 0.596603512763977, + "eval_runtime": 39.2315, + "eval_samples_per_second": 76.469, + "eval_steps_per_second": 2.396, + "step": 987 + }, + { + "epoch": 6.56, + "eval_loss": 0.5952173471450806, + "eval_runtime": 39.2393, + "eval_samples_per_second": 76.454, + "eval_steps_per_second": 2.396, + "step": 994 + }, + { + "epoch": 6.6, + "learning_rate": 6.822742474916388e-05, + "loss": 0.5539, + "step": 1000 + }, + { + "epoch": 6.6, + "eval_loss": 0.5954132676124573, + "eval_runtime": 39.2213, + "eval_samples_per_second": 76.489, + "eval_steps_per_second": 2.397, + "step": 1001 + }, + { + "epoch": 6.65, + "eval_loss": 0.5956953167915344, + "eval_runtime": 39.2503, + "eval_samples_per_second": 76.432, + "eval_steps_per_second": 2.395, + "step": 1008 + }, + { + "epoch": 6.7, + "eval_loss": 0.5959665775299072, + "eval_runtime": 39.2657, + "eval_samples_per_second": 76.403, + "eval_steps_per_second": 2.394, + "step": 1015 + }, + { + 
"epoch": 6.73, + "learning_rate": 6.555183946488295e-05, + "loss": 0.5607, + "step": 1020 + }, + { + "epoch": 6.74, + "eval_loss": 0.5952425003051758, + "eval_runtime": 39.2705, + "eval_samples_per_second": 76.393, + "eval_steps_per_second": 2.394, + "step": 1022 + }, + { + "epoch": 6.79, + "eval_loss": 0.5953785181045532, + "eval_runtime": 39.2403, + "eval_samples_per_second": 76.452, + "eval_steps_per_second": 2.395, + "step": 1029 + }, + { + "epoch": 6.84, + "eval_loss": 0.5939880609512329, + "eval_runtime": 39.2586, + "eval_samples_per_second": 76.416, + "eval_steps_per_second": 2.394, + "step": 1036 + } + ], + "max_steps": 1510, + "num_train_epochs": 10, + "total_flos": 9.026762224110141e+18, + "trial_name": null, + "trial_params": null +} diff --git a/vicuna-13b_english-cot+auto-cot_0.0002/lora/checkpoint-1036/training_args.bin b/vicuna-13b_english-cot+auto-cot_0.0002/lora/checkpoint-1036/training_args.bin new file mode 100644 index 0000000..ecaa6ea --- /dev/null +++ b/vicuna-13b_english-cot+auto-cot_0.0002/lora/checkpoint-1036/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bc36676b8e75725a5f1d435df06d955098ecfd2dcf5fc632f668dfc3b7a43333 +size 4027 diff --git a/vicuna-13b_english-cot+auto-cot_0.0002/lora/checkpoint-1498/adapter_config.json b/vicuna-13b_english-cot+auto-cot_0.0002/lora/checkpoint-1498/adapter_config.json new file mode 100644 index 0000000..c5607df --- /dev/null +++ b/vicuna-13b_english-cot+auto-cot_0.0002/lora/checkpoint-1498/adapter_config.json @@ -0,0 +1,17 @@ +{ + "base_model_name_or_path": "/mnt/data1/sheshuaijie/Data/PLM/vicuna-13b", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "lora_alpha": 32, + "lora_dropout": 0.1, + "modules_to_save": null, + "peft_type": "LORA", + "r": 32, + "target_modules": [ + "q_proj", + "v_proj" + ], + "task_type": "CAUSAL_LM" +} \ No newline at end of file diff --git 
a/vicuna-13b_english-cot+auto-cot_0.0002/lora/checkpoint-1498/adapter_model.bin b/vicuna-13b_english-cot+auto-cot_0.0002/lora/checkpoint-1498/adapter_model.bin new file mode 100644 index 0000000..8710093 --- /dev/null +++ b/vicuna-13b_english-cot+auto-cot_0.0002/lora/checkpoint-1498/adapter_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e5e1621f48d9ad8feb1d6d31050275f0aafd080c5c07153301fe2f48411f4406 +size 443 diff --git a/vicuna-13b_english-cot+auto-cot_0.0002/lora/checkpoint-1498/optimizer.pt b/vicuna-13b_english-cot+auto-cot_0.0002/lora/checkpoint-1498/optimizer.pt new file mode 100644 index 0000000..9379c74 --- /dev/null +++ b/vicuna-13b_english-cot+auto-cot_0.0002/lora/checkpoint-1498/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f004630f45896ece26280742f15b28aa38383360da760ef92b5c911f2c3f9aa3 +size 209810181 diff --git a/vicuna-13b_english-cot+auto-cot_0.0002/lora/checkpoint-1498/pytorch_model.bin b/vicuna-13b_english-cot+auto-cot_0.0002/lora/checkpoint-1498/pytorch_model.bin new file mode 100644 index 0000000..c7a99ad --- /dev/null +++ b/vicuna-13b_english-cot+auto-cot_0.0002/lora/checkpoint-1498/pytorch_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bde1cee0b6d13acb7f6e4f0570a720103f02bfc85709715a1476e8ab898d34de +size 104915277 diff --git a/vicuna-13b_english-cot+auto-cot_0.0002/lora/checkpoint-1498/rng_state_0.pth b/vicuna-13b_english-cot+auto-cot_0.0002/lora/checkpoint-1498/rng_state_0.pth new file mode 100644 index 0000000..c02ef18 --- /dev/null +++ b/vicuna-13b_english-cot+auto-cot_0.0002/lora/checkpoint-1498/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:afe1da91613c11605b6b4152085eca4017e91869c6313d6573a590da7d74380c +size 17655 diff --git a/vicuna-13b_english-cot+auto-cot_0.0002/lora/checkpoint-1498/rng_state_1.pth b/vicuna-13b_english-cot+auto-cot_0.0002/lora/checkpoint-1498/rng_state_1.pth new file 
mode 100644 index 0000000..33f23b6 --- /dev/null +++ b/vicuna-13b_english-cot+auto-cot_0.0002/lora/checkpoint-1498/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d484d85179931750298e99a718675725e5543bae46ad569434867192e34748d5 +size 17655 diff --git a/vicuna-13b_english-cot+auto-cot_0.0002/lora/checkpoint-1498/rng_state_2.pth b/vicuna-13b_english-cot+auto-cot_0.0002/lora/checkpoint-1498/rng_state_2.pth new file mode 100644 index 0000000..f793390 --- /dev/null +++ b/vicuna-13b_english-cot+auto-cot_0.0002/lora/checkpoint-1498/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:676f6e638b772a449173fd3339abbd5bf726e99dc2bd16fcdc317cf65a8c2fd7 +size 17655 diff --git a/vicuna-13b_english-cot+auto-cot_0.0002/lora/checkpoint-1498/rng_state_3.pth b/vicuna-13b_english-cot+auto-cot_0.0002/lora/checkpoint-1498/rng_state_3.pth new file mode 100644 index 0000000..5b78422 --- /dev/null +++ b/vicuna-13b_english-cot+auto-cot_0.0002/lora/checkpoint-1498/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:962120d774cd2704683c32143d6c74751313598654f6ce29854b9bd48267b5bd +size 17655 diff --git a/vicuna-13b_english-cot+auto-cot_0.0002/lora/checkpoint-1498/scaler.pt b/vicuna-13b_english-cot+auto-cot_0.0002/lora/checkpoint-1498/scaler.pt new file mode 100644 index 0000000..187436b --- /dev/null +++ b/vicuna-13b_english-cot+auto-cot_0.0002/lora/checkpoint-1498/scaler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8ef2b11a4a54f62b6adba79797208808147dd3f66e3935d1a14649ef17b698f2 +size 557 diff --git a/vicuna-13b_english-cot+auto-cot_0.0002/lora/checkpoint-1498/scheduler.pt b/vicuna-13b_english-cot+auto-cot_0.0002/lora/checkpoint-1498/scheduler.pt new file mode 100644 index 0000000..4c3d11a --- /dev/null +++ b/vicuna-13b_english-cot+auto-cot_0.0002/lora/checkpoint-1498/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:5e76f15f7c4cc04aa6f245833e60b6a09a99f2e2af4927ea728c672ab303892d +size 627 diff --git a/vicuna-13b_english-cot+auto-cot_0.0002/lora/checkpoint-1498/trainer_state.json b/vicuna-13b_english-cot+auto-cot_0.0002/lora/checkpoint-1498/trainer_state.json new file mode 100644 index 0000000..e22f377 --- /dev/null +++ b/vicuna-13b_english-cot+auto-cot_0.0002/lora/checkpoint-1498/trainer_state.json @@ -0,0 +1,2172 @@ +{ + "best_metric": 0.5939880609512329, + "best_model_checkpoint": "/mnt/data1/sheshuaijie/Output/CoT/Trained/vicuna-13b_english-cot+auto-cot_0.0002/lora/checkpoint-1036", + "epoch": 9.883711340206185, + "global_step": 1498, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.05, + "eval_loss": 1.7208857536315918, + "eval_runtime": 39.046, + "eval_samples_per_second": 76.832, + "eval_steps_per_second": 2.407, + "step": 7 + }, + { + "epoch": 0.09, + "eval_loss": 1.3302656412124634, + "eval_runtime": 39.1446, + "eval_samples_per_second": 76.639, + "eval_steps_per_second": 2.401, + "step": 14 + }, + { + "epoch": 0.13, + "learning_rate": 0.00019933110367892977, + "loss": 1.607, + "step": 20 + }, + { + "epoch": 0.14, + "eval_loss": 1.0993696451187134, + "eval_runtime": 39.2624, + "eval_samples_per_second": 76.409, + "eval_steps_per_second": 2.394, + "step": 21 + }, + { + "epoch": 0.18, + "eval_loss": 0.9883869886398315, + "eval_runtime": 39.2607, + "eval_samples_per_second": 76.412, + "eval_steps_per_second": 2.394, + "step": 28 + }, + { + "epoch": 0.23, + "eval_loss": 0.9121341109275818, + "eval_runtime": 39.2818, + "eval_samples_per_second": 76.371, + "eval_steps_per_second": 2.393, + "step": 35 + }, + { + "epoch": 0.26, + "learning_rate": 0.00019665551839464883, + "loss": 1.0077, + "step": 40 + }, + { + "epoch": 0.28, + "eval_loss": 0.8665392398834229, + "eval_runtime": 39.261, + "eval_samples_per_second": 76.412, + "eval_steps_per_second": 2.394, + "step": 42 + }, + { + 
"epoch": 0.32, + "eval_loss": 0.8299428820610046, + "eval_runtime": 39.2723, + "eval_samples_per_second": 76.39, + "eval_steps_per_second": 2.394, + "step": 49 + }, + { + "epoch": 0.37, + "eval_loss": 0.7965301275253296, + "eval_runtime": 39.2718, + "eval_samples_per_second": 76.391, + "eval_steps_per_second": 2.394, + "step": 56 + }, + { + "epoch": 0.4, + "learning_rate": 0.0001939799331103679, + "loss": 0.8626, + "step": 60 + }, + { + "epoch": 0.42, + "eval_loss": 0.7661889791488647, + "eval_runtime": 39.2752, + "eval_samples_per_second": 76.384, + "eval_steps_per_second": 2.393, + "step": 63 + }, + { + "epoch": 0.46, + "eval_loss": 0.744417130947113, + "eval_runtime": 39.2899, + "eval_samples_per_second": 76.355, + "eval_steps_per_second": 2.392, + "step": 70 + }, + { + "epoch": 0.51, + "eval_loss": 0.728394627571106, + "eval_runtime": 39.298, + "eval_samples_per_second": 76.34, + "eval_steps_per_second": 2.392, + "step": 77 + }, + { + "epoch": 0.53, + "learning_rate": 0.00019130434782608697, + "loss": 0.7683, + "step": 80 + }, + { + "epoch": 0.55, + "eval_loss": 0.7151542901992798, + "eval_runtime": 39.272, + "eval_samples_per_second": 76.39, + "eval_steps_per_second": 2.394, + "step": 84 + }, + { + "epoch": 0.6, + "eval_loss": 0.7049417495727539, + "eval_runtime": 39.2657, + "eval_samples_per_second": 76.403, + "eval_steps_per_second": 2.394, + "step": 91 + }, + { + "epoch": 0.65, + "eval_loss": 0.6961150765419006, + "eval_runtime": 39.2274, + "eval_samples_per_second": 76.477, + "eval_steps_per_second": 2.396, + "step": 98 + }, + { + "epoch": 0.66, + "learning_rate": 0.00018862876254180605, + "loss": 0.7346, + "step": 100 + }, + { + "epoch": 0.69, + "eval_loss": 0.6891586780548096, + "eval_runtime": 39.2698, + "eval_samples_per_second": 76.395, + "eval_steps_per_second": 2.394, + "step": 105 + }, + { + "epoch": 0.74, + "eval_loss": 0.6833620667457581, + "eval_runtime": 39.2474, + "eval_samples_per_second": 76.438, + "eval_steps_per_second": 2.395, + "step": 
112 + }, + { + "epoch": 0.79, + "eval_loss": 0.678981602191925, + "eval_runtime": 39.2363, + "eval_samples_per_second": 76.46, + "eval_steps_per_second": 2.396, + "step": 119 + }, + { + "epoch": 0.79, + "learning_rate": 0.0001859531772575251, + "loss": 0.7095, + "step": 120 + }, + { + "epoch": 0.83, + "eval_loss": 0.6739740967750549, + "eval_runtime": 39.2467, + "eval_samples_per_second": 76.439, + "eval_steps_per_second": 2.395, + "step": 126 + }, + { + "epoch": 0.88, + "eval_loss": 0.6704814434051514, + "eval_runtime": 39.2828, + "eval_samples_per_second": 76.369, + "eval_steps_per_second": 2.393, + "step": 133 + }, + { + "epoch": 0.92, + "learning_rate": 0.00018327759197324413, + "loss": 0.6989, + "step": 140 + }, + { + "epoch": 0.92, + "eval_loss": 0.6668062210083008, + "eval_runtime": 39.1861, + "eval_samples_per_second": 76.558, + "eval_steps_per_second": 2.399, + "step": 140 + }, + { + "epoch": 0.97, + "eval_loss": 0.6635003089904785, + "eval_runtime": 39.2627, + "eval_samples_per_second": 76.408, + "eval_steps_per_second": 2.394, + "step": 147 + }, + { + "epoch": 1.02, + "eval_loss": 0.6594184637069702, + "eval_runtime": 39.2634, + "eval_samples_per_second": 76.407, + "eval_steps_per_second": 2.394, + "step": 154 + }, + { + "epoch": 1.06, + "learning_rate": 0.00018060200668896322, + "loss": 0.6753, + "step": 160 + }, + { + "epoch": 1.06, + "eval_loss": 0.656818151473999, + "eval_runtime": 39.2093, + "eval_samples_per_second": 76.513, + "eval_steps_per_second": 2.397, + "step": 161 + }, + { + "epoch": 1.11, + "eval_loss": 0.6542237401008606, + "eval_runtime": 39.2619, + "eval_samples_per_second": 76.41, + "eval_steps_per_second": 2.394, + "step": 168 + }, + { + "epoch": 1.15, + "eval_loss": 0.6509793400764465, + "eval_runtime": 39.2795, + "eval_samples_per_second": 76.376, + "eval_steps_per_second": 2.393, + "step": 175 + }, + { + "epoch": 1.19, + "learning_rate": 0.00017792642140468227, + "loss": 0.6742, + "step": 180 + }, + { + "epoch": 1.2, + "eval_loss": 
0.6501123905181885, + "eval_runtime": 39.2889, + "eval_samples_per_second": 76.357, + "eval_steps_per_second": 2.393, + "step": 182 + }, + { + "epoch": 1.25, + "eval_loss": 0.6488311290740967, + "eval_runtime": 39.2821, + "eval_samples_per_second": 76.371, + "eval_steps_per_second": 2.393, + "step": 189 + }, + { + "epoch": 1.29, + "eval_loss": 0.6458473205566406, + "eval_runtime": 39.2749, + "eval_samples_per_second": 76.385, + "eval_steps_per_second": 2.393, + "step": 196 + }, + { + "epoch": 1.32, + "learning_rate": 0.00017525083612040135, + "loss": 0.6727, + "step": 200 + }, + { + "epoch": 1.34, + "eval_loss": 0.6445983648300171, + "eval_runtime": 39.2655, + "eval_samples_per_second": 76.403, + "eval_steps_per_second": 2.394, + "step": 203 + }, + { + "epoch": 1.39, + "eval_loss": 0.6414983868598938, + "eval_runtime": 39.2575, + "eval_samples_per_second": 76.418, + "eval_steps_per_second": 2.394, + "step": 210 + }, + { + "epoch": 1.43, + "eval_loss": 0.6403743624687195, + "eval_runtime": 39.2601, + "eval_samples_per_second": 76.413, + "eval_steps_per_second": 2.394, + "step": 217 + }, + { + "epoch": 1.45, + "learning_rate": 0.0001725752508361204, + "loss": 0.6651, + "step": 220 + }, + { + "epoch": 1.48, + "eval_loss": 0.6375772953033447, + "eval_runtime": 39.2616, + "eval_samples_per_second": 76.411, + "eval_steps_per_second": 2.394, + "step": 224 + }, + { + "epoch": 1.52, + "eval_loss": 0.6363030076026917, + "eval_runtime": 39.2685, + "eval_samples_per_second": 76.397, + "eval_steps_per_second": 2.394, + "step": 231 + }, + { + "epoch": 1.57, + "eval_loss": 0.6365154981613159, + "eval_runtime": 39.2615, + "eval_samples_per_second": 76.411, + "eval_steps_per_second": 2.394, + "step": 238 + }, + { + "epoch": 1.58, + "learning_rate": 0.00016989966555183946, + "loss": 0.6569, + "step": 240 + }, + { + "epoch": 1.62, + "eval_loss": 0.6351213455200195, + "eval_runtime": 39.2374, + "eval_samples_per_second": 76.458, + "eval_steps_per_second": 2.396, + "step": 245 + }, + { 
+ "epoch": 1.66, + "eval_loss": 0.633696436882019, + "eval_runtime": 39.2576, + "eval_samples_per_second": 76.418, + "eval_steps_per_second": 2.394, + "step": 252 + }, + { + "epoch": 1.71, + "eval_loss": 0.6320024132728577, + "eval_runtime": 39.2456, + "eval_samples_per_second": 76.442, + "eval_steps_per_second": 2.395, + "step": 259 + }, + { + "epoch": 1.72, + "learning_rate": 0.00016722408026755855, + "loss": 0.6535, + "step": 260 + }, + { + "epoch": 1.76, + "eval_loss": 0.6302981972694397, + "eval_runtime": 39.2723, + "eval_samples_per_second": 76.39, + "eval_steps_per_second": 2.394, + "step": 266 + }, + { + "epoch": 1.8, + "eval_loss": 0.6285908818244934, + "eval_runtime": 39.2745, + "eval_samples_per_second": 76.385, + "eval_steps_per_second": 2.393, + "step": 273 + }, + { + "epoch": 1.85, + "learning_rate": 0.0001645484949832776, + "loss": 0.6504, + "step": 280 + }, + { + "epoch": 1.85, + "eval_loss": 0.6279519200325012, + "eval_runtime": 39.1978, + "eval_samples_per_second": 76.535, + "eval_steps_per_second": 2.398, + "step": 280 + }, + { + "epoch": 1.89, + "eval_loss": 0.6275761723518372, + "eval_runtime": 39.2574, + "eval_samples_per_second": 76.419, + "eval_steps_per_second": 2.394, + "step": 287 + }, + { + "epoch": 1.94, + "eval_loss": 0.6262693405151367, + "eval_runtime": 39.2587, + "eval_samples_per_second": 76.416, + "eval_steps_per_second": 2.394, + "step": 294 + }, + { + "epoch": 1.98, + "learning_rate": 0.00016187290969899666, + "loss": 0.6447, + "step": 300 + }, + { + "epoch": 1.99, + "eval_loss": 0.6255723237991333, + "eval_runtime": 39.2593, + "eval_samples_per_second": 76.415, + "eval_steps_per_second": 2.394, + "step": 301 + }, + { + "epoch": 2.03, + "eval_loss": 0.624893307685852, + "eval_runtime": 39.2732, + "eval_samples_per_second": 76.388, + "eval_steps_per_second": 2.393, + "step": 308 + }, + { + "epoch": 2.08, + "eval_loss": 0.6238787174224854, + "eval_runtime": 39.2648, + "eval_samples_per_second": 76.404, + "eval_steps_per_second": 
2.394, + "step": 315 + }, + { + "epoch": 2.11, + "learning_rate": 0.00015919732441471574, + "loss": 0.6418, + "step": 320 + }, + { + "epoch": 2.12, + "eval_loss": 0.6227560043334961, + "eval_runtime": 39.2517, + "eval_samples_per_second": 76.43, + "eval_steps_per_second": 2.395, + "step": 322 + }, + { + "epoch": 2.17, + "eval_loss": 0.621408998966217, + "eval_runtime": 39.2673, + "eval_samples_per_second": 76.4, + "eval_steps_per_second": 2.394, + "step": 329 + }, + { + "epoch": 2.22, + "eval_loss": 0.6207154989242554, + "eval_runtime": 39.2707, + "eval_samples_per_second": 76.393, + "eval_steps_per_second": 2.394, + "step": 336 + }, + { + "epoch": 2.24, + "learning_rate": 0.0001565217391304348, + "loss": 0.6294, + "step": 340 + }, + { + "epoch": 2.26, + "eval_loss": 0.6207785606384277, + "eval_runtime": 39.2687, + "eval_samples_per_second": 76.397, + "eval_steps_per_second": 2.394, + "step": 343 + }, + { + "epoch": 2.31, + "eval_loss": 0.619699239730835, + "eval_runtime": 39.2528, + "eval_samples_per_second": 76.428, + "eval_steps_per_second": 2.395, + "step": 350 + }, + { + "epoch": 2.36, + "eval_loss": 0.6189907789230347, + "eval_runtime": 39.2383, + "eval_samples_per_second": 76.456, + "eval_steps_per_second": 2.396, + "step": 357 + }, + { + "epoch": 2.38, + "learning_rate": 0.00015384615384615385, + "loss": 0.6323, + "step": 360 + }, + { + "epoch": 2.4, + "eval_loss": 0.6188793182373047, + "eval_runtime": 39.2507, + "eval_samples_per_second": 76.432, + "eval_steps_per_second": 2.395, + "step": 364 + }, + { + "epoch": 2.45, + "eval_loss": 0.6180170774459839, + "eval_runtime": 39.2506, + "eval_samples_per_second": 76.432, + "eval_steps_per_second": 2.395, + "step": 371 + }, + { + "epoch": 2.49, + "eval_loss": 0.6175986528396606, + "eval_runtime": 39.2493, + "eval_samples_per_second": 76.434, + "eval_steps_per_second": 2.395, + "step": 378 + }, + { + "epoch": 2.51, + "learning_rate": 0.00015117056856187293, + "loss": 0.6194, + "step": 380 + }, + { + "epoch": 
2.54, + "eval_loss": 0.6155608296394348, + "eval_runtime": 39.2549, + "eval_samples_per_second": 76.424, + "eval_steps_per_second": 2.395, + "step": 385 + }, + { + "epoch": 2.59, + "eval_loss": 0.6149768829345703, + "eval_runtime": 39.2507, + "eval_samples_per_second": 76.432, + "eval_steps_per_second": 2.395, + "step": 392 + }, + { + "epoch": 2.63, + "eval_loss": 0.614321768283844, + "eval_runtime": 39.2607, + "eval_samples_per_second": 76.412, + "eval_steps_per_second": 2.394, + "step": 399 + }, + { + "epoch": 2.64, + "learning_rate": 0.00014849498327759196, + "loss": 0.6165, + "step": 400 + }, + { + "epoch": 2.68, + "eval_loss": 0.6136913299560547, + "eval_runtime": 39.256, + "eval_samples_per_second": 76.422, + "eval_steps_per_second": 2.395, + "step": 406 + }, + { + "epoch": 2.72, + "eval_loss": 0.6127980351448059, + "eval_runtime": 39.2695, + "eval_samples_per_second": 76.395, + "eval_steps_per_second": 2.394, + "step": 413 + }, + { + "epoch": 2.77, + "learning_rate": 0.00014581939799331104, + "loss": 0.6202, + "step": 420 + }, + { + "epoch": 2.77, + "eval_loss": 0.6126558780670166, + "eval_runtime": 39.2344, + "eval_samples_per_second": 76.464, + "eval_steps_per_second": 2.396, + "step": 420 + }, + { + "epoch": 2.82, + "eval_loss": 0.6126319766044617, + "eval_runtime": 39.2692, + "eval_samples_per_second": 76.396, + "eval_steps_per_second": 2.394, + "step": 427 + }, + { + "epoch": 2.86, + "eval_loss": 0.6124591827392578, + "eval_runtime": 39.2771, + "eval_samples_per_second": 76.38, + "eval_steps_per_second": 2.393, + "step": 434 + }, + { + "epoch": 2.9, + "learning_rate": 0.0001431438127090301, + "loss": 0.6186, + "step": 440 + }, + { + "epoch": 2.91, + "eval_loss": 0.6117784976959229, + "eval_runtime": 39.2494, + "eval_samples_per_second": 76.434, + "eval_steps_per_second": 2.395, + "step": 441 + }, + { + "epoch": 2.96, + "eval_loss": 0.6105948090553284, + "eval_runtime": 39.2716, + "eval_samples_per_second": 76.391, + "eval_steps_per_second": 2.394, + 
"step": 448 + }, + { + "epoch": 3.0, + "eval_loss": 0.6107361912727356, + "eval_runtime": 39.2828, + "eval_samples_per_second": 76.369, + "eval_steps_per_second": 2.393, + "step": 455 + }, + { + "epoch": 3.04, + "learning_rate": 0.00014046822742474916, + "loss": 0.6165, + "step": 460 + }, + { + "epoch": 3.05, + "eval_loss": 0.6106633543968201, + "eval_runtime": 39.2701, + "eval_samples_per_second": 76.394, + "eval_steps_per_second": 2.394, + "step": 462 + }, + { + "epoch": 3.09, + "eval_loss": 0.6104211807250977, + "eval_runtime": 39.2794, + "eval_samples_per_second": 76.376, + "eval_steps_per_second": 2.393, + "step": 469 + }, + { + "epoch": 3.14, + "eval_loss": 0.611173152923584, + "eval_runtime": 39.2596, + "eval_samples_per_second": 76.415, + "eval_steps_per_second": 2.394, + "step": 476 + }, + { + "epoch": 3.17, + "learning_rate": 0.00013779264214046824, + "loss": 0.6021, + "step": 480 + }, + { + "epoch": 3.19, + "eval_loss": 0.6094884276390076, + "eval_runtime": 39.2429, + "eval_samples_per_second": 76.447, + "eval_steps_per_second": 2.395, + "step": 483 + }, + { + "epoch": 3.23, + "eval_loss": 0.6093204617500305, + "eval_runtime": 39.278, + "eval_samples_per_second": 76.379, + "eval_steps_per_second": 2.393, + "step": 490 + }, + { + "epoch": 3.28, + "eval_loss": 0.60869961977005, + "eval_runtime": 39.269, + "eval_samples_per_second": 76.396, + "eval_steps_per_second": 2.394, + "step": 497 + }, + { + "epoch": 3.3, + "learning_rate": 0.0001351170568561873, + "loss": 0.6057, + "step": 500 + }, + { + "epoch": 3.33, + "eval_loss": 0.6093556880950928, + "eval_runtime": 39.2597, + "eval_samples_per_second": 76.414, + "eval_steps_per_second": 2.394, + "step": 504 + }, + { + "epoch": 3.37, + "eval_loss": 0.6078519821166992, + "eval_runtime": 39.2561, + "eval_samples_per_second": 76.421, + "eval_steps_per_second": 2.395, + "step": 511 + }, + { + "epoch": 3.42, + "eval_loss": 0.6079010367393494, + "eval_runtime": 39.2536, + "eval_samples_per_second": 76.426, + 
"eval_steps_per_second": 2.395, + "step": 518 + }, + { + "epoch": 3.43, + "learning_rate": 0.00013244147157190635, + "loss": 0.598, + "step": 520 + }, + { + "epoch": 3.46, + "eval_loss": 0.6074483394622803, + "eval_runtime": 39.2832, + "eval_samples_per_second": 76.369, + "eval_steps_per_second": 2.393, + "step": 525 + }, + { + "epoch": 3.51, + "eval_loss": 0.6073596477508545, + "eval_runtime": 39.2701, + "eval_samples_per_second": 76.394, + "eval_steps_per_second": 2.394, + "step": 532 + }, + { + "epoch": 3.56, + "eval_loss": 0.606430172920227, + "eval_runtime": 39.2546, + "eval_samples_per_second": 76.424, + "eval_steps_per_second": 2.395, + "step": 539 + }, + { + "epoch": 3.56, + "learning_rate": 0.00012976588628762543, + "loss": 0.5948, + "step": 540 + }, + { + "epoch": 3.6, + "eval_loss": 0.6060574650764465, + "eval_runtime": 39.2503, + "eval_samples_per_second": 76.433, + "eval_steps_per_second": 2.395, + "step": 546 + }, + { + "epoch": 3.65, + "eval_loss": 0.6067923307418823, + "eval_runtime": 39.2654, + "eval_samples_per_second": 76.403, + "eval_steps_per_second": 2.394, + "step": 553 + }, + { + "epoch": 3.69, + "learning_rate": 0.0001270903010033445, + "loss": 0.5962, + "step": 560 + }, + { + "epoch": 3.69, + "eval_loss": 0.6042212843894958, + "eval_runtime": 39.2032, + "eval_samples_per_second": 76.524, + "eval_steps_per_second": 2.398, + "step": 560 + }, + { + "epoch": 3.74, + "eval_loss": 0.6041299700737, + "eval_runtime": 39.2396, + "eval_samples_per_second": 76.453, + "eval_steps_per_second": 2.396, + "step": 567 + }, + { + "epoch": 3.79, + "eval_loss": 0.6047356128692627, + "eval_runtime": 39.274, + "eval_samples_per_second": 76.386, + "eval_steps_per_second": 2.393, + "step": 574 + }, + { + "epoch": 3.83, + "learning_rate": 0.00012441471571906357, + "loss": 0.5977, + "step": 580 + }, + { + "epoch": 3.83, + "eval_loss": 0.6040154099464417, + "eval_runtime": 39.2677, + "eval_samples_per_second": 76.399, + "eval_steps_per_second": 2.394, + "step": 581 
+ }, + { + "epoch": 3.88, + "eval_loss": 0.603416383266449, + "eval_runtime": 39.2621, + "eval_samples_per_second": 76.41, + "eval_steps_per_second": 2.394, + "step": 588 + }, + { + "epoch": 3.93, + "eval_loss": 0.6036480069160461, + "eval_runtime": 39.2609, + "eval_samples_per_second": 76.412, + "eval_steps_per_second": 2.394, + "step": 595 + }, + { + "epoch": 3.96, + "learning_rate": 0.00012173913043478263, + "loss": 0.5903, + "step": 600 + }, + { + "epoch": 3.97, + "eval_loss": 0.6035267114639282, + "eval_runtime": 39.2828, + "eval_samples_per_second": 76.369, + "eval_steps_per_second": 2.393, + "step": 602 + }, + { + "epoch": 4.02, + "eval_loss": 0.6025964617729187, + "eval_runtime": 39.2634, + "eval_samples_per_second": 76.407, + "eval_steps_per_second": 2.394, + "step": 609 + }, + { + "epoch": 4.06, + "eval_loss": 0.6028868556022644, + "eval_runtime": 39.2591, + "eval_samples_per_second": 76.415, + "eval_steps_per_second": 2.394, + "step": 616 + }, + { + "epoch": 4.09, + "learning_rate": 0.0001190635451505017, + "loss": 0.5927, + "step": 620 + }, + { + "epoch": 4.11, + "eval_loss": 0.6027114391326904, + "eval_runtime": 39.2648, + "eval_samples_per_second": 76.404, + "eval_steps_per_second": 2.394, + "step": 623 + }, + { + "epoch": 4.16, + "eval_loss": 0.6030986905097961, + "eval_runtime": 39.2746, + "eval_samples_per_second": 76.385, + "eval_steps_per_second": 2.393, + "step": 630 + }, + { + "epoch": 4.2, + "eval_loss": 0.6026434898376465, + "eval_runtime": 39.2646, + "eval_samples_per_second": 76.405, + "eval_steps_per_second": 2.394, + "step": 637 + }, + { + "epoch": 4.22, + "learning_rate": 0.00011638795986622074, + "loss": 0.581, + "step": 640 + }, + { + "epoch": 4.25, + "eval_loss": 0.6008206009864807, + "eval_runtime": 39.2718, + "eval_samples_per_second": 76.391, + "eval_steps_per_second": 2.394, + "step": 644 + }, + { + "epoch": 4.3, + "eval_loss": 0.6018855571746826, + "eval_runtime": 39.2587, + "eval_samples_per_second": 76.416, + 
"eval_steps_per_second": 2.394, + "step": 651 + }, + { + "epoch": 4.34, + "eval_loss": 0.6018174886703491, + "eval_runtime": 39.2445, + "eval_samples_per_second": 76.444, + "eval_steps_per_second": 2.395, + "step": 658 + }, + { + "epoch": 4.35, + "learning_rate": 0.00011371237458193979, + "loss": 0.5965, + "step": 660 + }, + { + "epoch": 4.39, + "eval_loss": 0.6006762981414795, + "eval_runtime": 39.2498, + "eval_samples_per_second": 76.433, + "eval_steps_per_second": 2.395, + "step": 665 + }, + { + "epoch": 4.43, + "eval_loss": 0.6006374359130859, + "eval_runtime": 39.2758, + "eval_samples_per_second": 76.383, + "eval_steps_per_second": 2.393, + "step": 672 + }, + { + "epoch": 4.48, + "eval_loss": 0.5997828245162964, + "eval_runtime": 39.2794, + "eval_samples_per_second": 76.376, + "eval_steps_per_second": 2.393, + "step": 679 + }, + { + "epoch": 4.49, + "learning_rate": 0.00011103678929765886, + "loss": 0.5896, + "step": 680 + }, + { + "epoch": 4.53, + "eval_loss": 0.6000981330871582, + "eval_runtime": 39.2629, + "eval_samples_per_second": 76.408, + "eval_steps_per_second": 2.394, + "step": 686 + }, + { + "epoch": 4.57, + "eval_loss": 0.5991115570068359, + "eval_runtime": 39.2774, + "eval_samples_per_second": 76.38, + "eval_steps_per_second": 2.393, + "step": 693 + }, + { + "epoch": 4.62, + "learning_rate": 0.00010836120401337793, + "loss": 0.5854, + "step": 700 + }, + { + "epoch": 4.62, + "eval_loss": 0.6001954674720764, + "eval_runtime": 39.2333, + "eval_samples_per_second": 76.466, + "eval_steps_per_second": 2.396, + "step": 700 + }, + { + "epoch": 4.66, + "eval_loss": 0.6007575988769531, + "eval_runtime": 39.2801, + "eval_samples_per_second": 76.374, + "eval_steps_per_second": 2.393, + "step": 707 + }, + { + "epoch": 4.71, + "eval_loss": 0.5983864068984985, + "eval_runtime": 39.2469, + "eval_samples_per_second": 76.439, + "eval_steps_per_second": 2.395, + "step": 714 + }, + { + "epoch": 4.75, + "learning_rate": 0.00010568561872909698, + "loss": 0.5844, + 
"step": 720 + }, + { + "epoch": 4.76, + "eval_loss": 0.5985506772994995, + "eval_runtime": 39.2426, + "eval_samples_per_second": 76.448, + "eval_steps_per_second": 2.395, + "step": 721 + }, + { + "epoch": 4.8, + "eval_loss": 0.5978309512138367, + "eval_runtime": 39.2604, + "eval_samples_per_second": 76.413, + "eval_steps_per_second": 2.394, + "step": 728 + }, + { + "epoch": 4.85, + "eval_loss": 0.5981310606002808, + "eval_runtime": 39.2686, + "eval_samples_per_second": 76.397, + "eval_steps_per_second": 2.394, + "step": 735 + }, + { + "epoch": 4.88, + "learning_rate": 0.00010301003344481605, + "loss": 0.5784, + "step": 740 + }, + { + "epoch": 4.9, + "eval_loss": 0.5985335111618042, + "eval_runtime": 39.2557, + "eval_samples_per_second": 76.422, + "eval_steps_per_second": 2.395, + "step": 742 + }, + { + "epoch": 4.94, + "eval_loss": 0.5975944995880127, + "eval_runtime": 39.2644, + "eval_samples_per_second": 76.405, + "eval_steps_per_second": 2.394, + "step": 749 + }, + { + "epoch": 4.99, + "eval_loss": 0.596754252910614, + "eval_runtime": 39.2365, + "eval_samples_per_second": 76.459, + "eval_steps_per_second": 2.396, + "step": 756 + }, + { + "epoch": 5.01, + "learning_rate": 0.00010033444816053512, + "loss": 0.5825, + "step": 760 + }, + { + "epoch": 5.03, + "eval_loss": 0.5977214574813843, + "eval_runtime": 39.235, + "eval_samples_per_second": 76.462, + "eval_steps_per_second": 2.396, + "step": 763 + }, + { + "epoch": 5.08, + "eval_loss": 0.5982287526130676, + "eval_runtime": 39.2483, + "eval_samples_per_second": 76.436, + "eval_steps_per_second": 2.395, + "step": 770 + }, + { + "epoch": 5.13, + "eval_loss": 0.5973477959632874, + "eval_runtime": 39.2692, + "eval_samples_per_second": 76.396, + "eval_steps_per_second": 2.394, + "step": 777 + }, + { + "epoch": 5.15, + "learning_rate": 9.765886287625419e-05, + "loss": 0.5724, + "step": 780 + }, + { + "epoch": 5.17, + "eval_loss": 0.598833441734314, + "eval_runtime": 39.2608, + "eval_samples_per_second": 76.412, + 
"eval_steps_per_second": 2.394, + "step": 784 + }, + { + "epoch": 5.22, + "eval_loss": 0.5973609089851379, + "eval_runtime": 39.2557, + "eval_samples_per_second": 76.422, + "eval_steps_per_second": 2.395, + "step": 791 + }, + { + "epoch": 5.27, + "eval_loss": 0.5983055233955383, + "eval_runtime": 39.2613, + "eval_samples_per_second": 76.411, + "eval_steps_per_second": 2.394, + "step": 798 + }, + { + "epoch": 5.28, + "learning_rate": 9.498327759197325e-05, + "loss": 0.5765, + "step": 800 + }, + { + "epoch": 5.31, + "eval_loss": 0.597219705581665, + "eval_runtime": 39.2532, + "eval_samples_per_second": 76.427, + "eval_steps_per_second": 2.395, + "step": 805 + }, + { + "epoch": 5.36, + "eval_loss": 0.5974920392036438, + "eval_runtime": 39.2428, + "eval_samples_per_second": 76.447, + "eval_steps_per_second": 2.395, + "step": 812 + }, + { + "epoch": 5.4, + "eval_loss": 0.5970295667648315, + "eval_runtime": 39.2255, + "eval_samples_per_second": 76.481, + "eval_steps_per_second": 2.396, + "step": 819 + }, + { + "epoch": 5.41, + "learning_rate": 9.230769230769232e-05, + "loss": 0.5662, + "step": 820 + }, + { + "epoch": 5.45, + "eval_loss": 0.5995200872421265, + "eval_runtime": 39.2763, + "eval_samples_per_second": 76.382, + "eval_steps_per_second": 2.393, + "step": 826 + }, + { + "epoch": 5.5, + "eval_loss": 0.5961365699768066, + "eval_runtime": 39.2442, + "eval_samples_per_second": 76.444, + "eval_steps_per_second": 2.395, + "step": 833 + }, + { + "epoch": 5.54, + "learning_rate": 8.963210702341137e-05, + "loss": 0.5594, + "step": 840 + }, + { + "epoch": 5.54, + "eval_loss": 0.5958811640739441, + "eval_runtime": 39.224, + "eval_samples_per_second": 76.484, + "eval_steps_per_second": 2.396, + "step": 840 + }, + { + "epoch": 5.59, + "eval_loss": 0.5974062085151672, + "eval_runtime": 39.2479, + "eval_samples_per_second": 76.437, + "eval_steps_per_second": 2.395, + "step": 847 + }, + { + "epoch": 5.63, + "eval_loss": 0.5959305167198181, + "eval_runtime": 39.1122, + 
"eval_samples_per_second": 76.702, + "eval_steps_per_second": 2.403, + "step": 854 + }, + { + "epoch": 5.67, + "learning_rate": 8.695652173913044e-05, + "loss": 0.5569, + "step": 860 + }, + { + "epoch": 5.68, + "eval_loss": 0.597082257270813, + "eval_runtime": 39.2419, + "eval_samples_per_second": 76.449, + "eval_steps_per_second": 2.395, + "step": 861 + }, + { + "epoch": 5.73, + "eval_loss": 0.5964935421943665, + "eval_runtime": 39.2482, + "eval_samples_per_second": 76.437, + "eval_steps_per_second": 2.395, + "step": 868 + }, + { + "epoch": 5.77, + "eval_loss": 0.596628725528717, + "eval_runtime": 39.2684, + "eval_samples_per_second": 76.397, + "eval_steps_per_second": 2.394, + "step": 875 + }, + { + "epoch": 5.81, + "learning_rate": 8.42809364548495e-05, + "loss": 0.5711, + "step": 880 + }, + { + "epoch": 5.82, + "eval_loss": 0.596688449382782, + "eval_runtime": 39.262, + "eval_samples_per_second": 76.41, + "eval_steps_per_second": 2.394, + "step": 882 + }, + { + "epoch": 5.87, + "eval_loss": 0.5974501967430115, + "eval_runtime": 39.2621, + "eval_samples_per_second": 76.409, + "eval_steps_per_second": 2.394, + "step": 889 + }, + { + "epoch": 5.91, + "eval_loss": 0.5951861143112183, + "eval_runtime": 39.2622, + "eval_samples_per_second": 76.409, + "eval_steps_per_second": 2.394, + "step": 896 + }, + { + "epoch": 5.94, + "learning_rate": 8.160535117056857e-05, + "loss": 0.5703, + "step": 900 + }, + { + "epoch": 5.96, + "eval_loss": 0.5963322520256042, + "eval_runtime": 39.2656, + "eval_samples_per_second": 76.403, + "eval_steps_per_second": 2.394, + "step": 903 + }, + { + "epoch": 6.0, + "eval_loss": 0.5958115458488464, + "eval_runtime": 39.2804, + "eval_samples_per_second": 76.374, + "eval_steps_per_second": 2.393, + "step": 910 + }, + { + "epoch": 6.05, + "eval_loss": 0.5968443155288696, + "eval_runtime": 39.2618, + "eval_samples_per_second": 76.41, + "eval_steps_per_second": 2.394, + "step": 917 + }, + { + "epoch": 6.07, + "learning_rate": 7.892976588628763e-05, 
+ "loss": 0.5551, + "step": 920 + }, + { + "epoch": 6.1, + "eval_loss": 0.5958288311958313, + "eval_runtime": 39.2648, + "eval_samples_per_second": 76.404, + "eval_steps_per_second": 2.394, + "step": 924 + }, + { + "epoch": 6.14, + "eval_loss": 0.5968209505081177, + "eval_runtime": 39.2563, + "eval_samples_per_second": 76.421, + "eval_steps_per_second": 2.395, + "step": 931 + }, + { + "epoch": 6.19, + "eval_loss": 0.5957658886909485, + "eval_runtime": 39.2499, + "eval_samples_per_second": 76.433, + "eval_steps_per_second": 2.395, + "step": 938 + }, + { + "epoch": 6.2, + "learning_rate": 7.62541806020067e-05, + "loss": 0.5636, + "step": 940 + }, + { + "epoch": 6.24, + "eval_loss": 0.5955784916877747, + "eval_runtime": 39.279, + "eval_samples_per_second": 76.377, + "eval_steps_per_second": 2.393, + "step": 945 + }, + { + "epoch": 6.28, + "eval_loss": 0.5963084101676941, + "eval_runtime": 39.2656, + "eval_samples_per_second": 76.403, + "eval_steps_per_second": 2.394, + "step": 952 + }, + { + "epoch": 6.33, + "eval_loss": 0.595792829990387, + "eval_runtime": 39.2577, + "eval_samples_per_second": 76.418, + "eval_steps_per_second": 2.394, + "step": 959 + }, + { + "epoch": 6.33, + "learning_rate": 7.357859531772575e-05, + "loss": 0.5676, + "step": 960 + }, + { + "epoch": 6.37, + "eval_loss": 0.5953949093818665, + "eval_runtime": 39.2554, + "eval_samples_per_second": 76.423, + "eval_steps_per_second": 2.395, + "step": 966 + }, + { + "epoch": 6.42, + "eval_loss": 0.595146894454956, + "eval_runtime": 39.2386, + "eval_samples_per_second": 76.455, + "eval_steps_per_second": 2.396, + "step": 973 + }, + { + "epoch": 6.47, + "learning_rate": 7.090301003344481e-05, + "loss": 0.5551, + "step": 980 + }, + { + "epoch": 6.47, + "eval_loss": 0.5957517027854919, + "eval_runtime": 39.197, + "eval_samples_per_second": 76.536, + "eval_steps_per_second": 2.398, + "step": 980 + }, + { + "epoch": 6.51, + "eval_loss": 0.596603512763977, + "eval_runtime": 39.2315, + "eval_samples_per_second": 
76.469, + "eval_steps_per_second": 2.396, + "step": 987 + }, + { + "epoch": 6.56, + "eval_loss": 0.5952173471450806, + "eval_runtime": 39.2393, + "eval_samples_per_second": 76.454, + "eval_steps_per_second": 2.396, + "step": 994 + }, + { + "epoch": 6.6, + "learning_rate": 6.822742474916388e-05, + "loss": 0.5539, + "step": 1000 + }, + { + "epoch": 6.6, + "eval_loss": 0.5954132676124573, + "eval_runtime": 39.2213, + "eval_samples_per_second": 76.489, + "eval_steps_per_second": 2.397, + "step": 1001 + }, + { + "epoch": 6.65, + "eval_loss": 0.5956953167915344, + "eval_runtime": 39.2503, + "eval_samples_per_second": 76.432, + "eval_steps_per_second": 2.395, + "step": 1008 + }, + { + "epoch": 6.7, + "eval_loss": 0.5959665775299072, + "eval_runtime": 39.2657, + "eval_samples_per_second": 76.403, + "eval_steps_per_second": 2.394, + "step": 1015 + }, + { + "epoch": 6.73, + "learning_rate": 6.555183946488295e-05, + "loss": 0.5607, + "step": 1020 + }, + { + "epoch": 6.74, + "eval_loss": 0.5952425003051758, + "eval_runtime": 39.2705, + "eval_samples_per_second": 76.393, + "eval_steps_per_second": 2.394, + "step": 1022 + }, + { + "epoch": 6.79, + "eval_loss": 0.5953785181045532, + "eval_runtime": 39.2403, + "eval_samples_per_second": 76.452, + "eval_steps_per_second": 2.395, + "step": 1029 + }, + { + "epoch": 6.84, + "eval_loss": 0.5939880609512329, + "eval_runtime": 39.2586, + "eval_samples_per_second": 76.416, + "eval_steps_per_second": 2.394, + "step": 1036 + }, + { + "epoch": 6.86, + "learning_rate": 6.287625418060201e-05, + "loss": 0.5535, + "step": 1040 + }, + { + "epoch": 6.88, + "eval_loss": 0.5965318083763123, + "eval_runtime": 39.2698, + "eval_samples_per_second": 76.395, + "eval_steps_per_second": 2.394, + "step": 1043 + }, + { + "epoch": 6.93, + "eval_loss": 0.5945996642112732, + "eval_runtime": 39.2883, + "eval_samples_per_second": 76.359, + "eval_steps_per_second": 2.393, + "step": 1050 + }, + { + "epoch": 6.97, + "eval_loss": 0.5955180525779724, + "eval_runtime": 
39.2482, + "eval_samples_per_second": 76.437, + "eval_steps_per_second": 2.395, + "step": 1057 + }, + { + "epoch": 6.99, + "learning_rate": 6.0200668896321076e-05, + "loss": 0.5488, + "step": 1060 + }, + { + "epoch": 7.02, + "eval_loss": 0.5957381725311279, + "eval_runtime": 39.2638, + "eval_samples_per_second": 76.406, + "eval_steps_per_second": 2.394, + "step": 1064 + }, + { + "epoch": 7.07, + "eval_loss": 0.595982551574707, + "eval_runtime": 39.2492, + "eval_samples_per_second": 76.435, + "eval_steps_per_second": 2.395, + "step": 1071 + }, + { + "epoch": 7.11, + "eval_loss": 0.5966017842292786, + "eval_runtime": 39.2599, + "eval_samples_per_second": 76.414, + "eval_steps_per_second": 2.394, + "step": 1078 + }, + { + "epoch": 7.13, + "learning_rate": 5.752508361204013e-05, + "loss": 0.5397, + "step": 1080 + }, + { + "epoch": 7.16, + "eval_loss": 0.5957372784614563, + "eval_runtime": 39.2429, + "eval_samples_per_second": 76.447, + "eval_steps_per_second": 2.395, + "step": 1085 + }, + { + "epoch": 7.2, + "eval_loss": 0.5966220498085022, + "eval_runtime": 39.2131, + "eval_samples_per_second": 76.505, + "eval_steps_per_second": 2.397, + "step": 1092 + }, + { + "epoch": 7.25, + "eval_loss": 0.5961087346076965, + "eval_runtime": 39.2548, + "eval_samples_per_second": 76.424, + "eval_steps_per_second": 2.395, + "step": 1099 + }, + { + "epoch": 7.26, + "learning_rate": 5.4849498327759194e-05, + "loss": 0.5478, + "step": 1100 + }, + { + "epoch": 7.3, + "eval_loss": 0.5953994393348694, + "eval_runtime": 39.2497, + "eval_samples_per_second": 76.434, + "eval_steps_per_second": 2.395, + "step": 1106 + }, + { + "epoch": 7.34, + "eval_loss": 0.5952059626579285, + "eval_runtime": 39.2593, + "eval_samples_per_second": 76.415, + "eval_steps_per_second": 2.394, + "step": 1113 + }, + { + "epoch": 7.39, + "learning_rate": 5.217391304347826e-05, + "loss": 0.5443, + "step": 1120 + }, + { + "epoch": 7.39, + "eval_loss": 0.5956901907920837, + "eval_runtime": 39.2077, + 
"eval_samples_per_second": 76.516, + "eval_steps_per_second": 2.397, + "step": 1120 + }, + { + "epoch": 7.44, + "eval_loss": 0.595130980014801, + "eval_runtime": 39.2681, + "eval_samples_per_second": 76.398, + "eval_steps_per_second": 2.394, + "step": 1127 + }, + { + "epoch": 7.48, + "eval_loss": 0.5952557921409607, + "eval_runtime": 39.2635, + "eval_samples_per_second": 76.407, + "eval_steps_per_second": 2.394, + "step": 1134 + }, + { + "epoch": 7.52, + "learning_rate": 4.9498327759197325e-05, + "loss": 0.5482, + "step": 1140 + }, + { + "epoch": 7.53, + "eval_loss": 0.5957027077674866, + "eval_runtime": 39.2636, + "eval_samples_per_second": 76.407, + "eval_steps_per_second": 2.394, + "step": 1141 + }, + { + "epoch": 7.57, + "eval_loss": 0.5959120988845825, + "eval_runtime": 39.2726, + "eval_samples_per_second": 76.389, + "eval_steps_per_second": 2.394, + "step": 1148 + }, + { + "epoch": 7.62, + "eval_loss": 0.5966920852661133, + "eval_runtime": 39.2671, + "eval_samples_per_second": 76.4, + "eval_steps_per_second": 2.394, + "step": 1155 + }, + { + "epoch": 7.65, + "learning_rate": 4.6822742474916394e-05, + "loss": 0.5398, + "step": 1160 + }, + { + "epoch": 7.67, + "eval_loss": 0.5955999493598938, + "eval_runtime": 39.2344, + "eval_samples_per_second": 76.464, + "eval_steps_per_second": 2.396, + "step": 1162 + }, + { + "epoch": 7.71, + "eval_loss": 0.5959904193878174, + "eval_runtime": 39.2449, + "eval_samples_per_second": 76.443, + "eval_steps_per_second": 2.395, + "step": 1169 + }, + { + "epoch": 7.76, + "eval_loss": 0.595309853553772, + "eval_runtime": 39.2603, + "eval_samples_per_second": 76.413, + "eval_steps_per_second": 2.394, + "step": 1176 + }, + { + "epoch": 7.79, + "learning_rate": 4.414715719063545e-05, + "loss": 0.5405, + "step": 1180 + }, + { + "epoch": 7.81, + "eval_loss": 0.5953694581985474, + "eval_runtime": 39.25, + "eval_samples_per_second": 76.433, + "eval_steps_per_second": 2.395, + "step": 1183 + }, + { + "epoch": 7.85, + "eval_loss": 
0.5948453545570374, + "eval_runtime": 39.2551, + "eval_samples_per_second": 76.423, + "eval_steps_per_second": 2.395, + "step": 1190 + }, + { + "epoch": 7.9, + "eval_loss": 0.596403956413269, + "eval_runtime": 39.2502, + "eval_samples_per_second": 76.433, + "eval_steps_per_second": 2.395, + "step": 1197 + }, + { + "epoch": 7.92, + "learning_rate": 4.147157190635452e-05, + "loss": 0.5454, + "step": 1200 + }, + { + "epoch": 7.94, + "eval_loss": 0.594862699508667, + "eval_runtime": 39.2569, + "eval_samples_per_second": 76.42, + "eval_steps_per_second": 2.394, + "step": 1204 + }, + { + "epoch": 7.99, + "eval_loss": 0.5970881581306458, + "eval_runtime": 39.2602, + "eval_samples_per_second": 76.413, + "eval_steps_per_second": 2.394, + "step": 1211 + }, + { + "epoch": 8.04, + "eval_loss": 0.595177173614502, + "eval_runtime": 39.2663, + "eval_samples_per_second": 76.401, + "eval_steps_per_second": 2.394, + "step": 1218 + }, + { + "epoch": 8.05, + "learning_rate": 3.879598662207358e-05, + "loss": 0.5407, + "step": 1220 + }, + { + "epoch": 8.08, + "eval_loss": 0.5961460471153259, + "eval_runtime": 39.2651, + "eval_samples_per_second": 76.404, + "eval_steps_per_second": 2.394, + "step": 1225 + }, + { + "epoch": 8.13, + "eval_loss": 0.5952489972114563, + "eval_runtime": 39.2417, + "eval_samples_per_second": 76.449, + "eval_steps_per_second": 2.395, + "step": 1232 + }, + { + "epoch": 8.17, + "eval_loss": 0.5965322852134705, + "eval_runtime": 39.244, + "eval_samples_per_second": 76.445, + "eval_steps_per_second": 2.395, + "step": 1239 + }, + { + "epoch": 8.18, + "learning_rate": 3.612040133779264e-05, + "loss": 0.5397, + "step": 1240 + }, + { + "epoch": 8.22, + "eval_loss": 0.5953331589698792, + "eval_runtime": 39.242, + "eval_samples_per_second": 76.449, + "eval_steps_per_second": 2.395, + "step": 1246 + }, + { + "epoch": 8.27, + "eval_loss": 0.5962971448898315, + "eval_runtime": 39.2251, + "eval_samples_per_second": 76.482, + "eval_steps_per_second": 2.396, + "step": 1253 + }, 
+ { + "epoch": 8.31, + "learning_rate": 3.3444816053511705e-05, + "loss": 0.5456, + "step": 1260 + }, + { + "epoch": 8.31, + "eval_loss": 0.5961341261863708, + "eval_runtime": 39.2121, + "eval_samples_per_second": 76.507, + "eval_steps_per_second": 2.397, + "step": 1260 + }, + { + "epoch": 8.36, + "eval_loss": 0.595777153968811, + "eval_runtime": 39.2544, + "eval_samples_per_second": 76.425, + "eval_steps_per_second": 2.395, + "step": 1267 + }, + { + "epoch": 8.41, + "eval_loss": 0.5961927771568298, + "eval_runtime": 39.2566, + "eval_samples_per_second": 76.42, + "eval_steps_per_second": 2.395, + "step": 1274 + }, + { + "epoch": 8.45, + "learning_rate": 3.0769230769230774e-05, + "loss": 0.5361, + "step": 1280 + }, + { + "epoch": 8.45, + "eval_loss": 0.5967215895652771, + "eval_runtime": 39.251, + "eval_samples_per_second": 76.431, + "eval_steps_per_second": 2.395, + "step": 1281 + }, + { + "epoch": 8.5, + "eval_loss": 0.5971426367759705, + "eval_runtime": 39.1956, + "eval_samples_per_second": 76.539, + "eval_steps_per_second": 2.398, + "step": 1288 + }, + { + "epoch": 8.54, + "eval_loss": 0.5957850217819214, + "eval_runtime": 39.1348, + "eval_samples_per_second": 76.658, + "eval_steps_per_second": 2.402, + "step": 1295 + }, + { + "epoch": 8.58, + "learning_rate": 2.8093645484949833e-05, + "loss": 0.5299, + "step": 1300 + }, + { + "epoch": 8.59, + "eval_loss": 0.5976316928863525, + "eval_runtime": 39.248, + "eval_samples_per_second": 76.437, + "eval_steps_per_second": 2.395, + "step": 1302 + }, + { + "epoch": 8.64, + "eval_loss": 0.5961042642593384, + "eval_runtime": 39.2812, + "eval_samples_per_second": 76.372, + "eval_steps_per_second": 2.393, + "step": 1309 + }, + { + "epoch": 8.68, + "eval_loss": 0.5960709452629089, + "eval_runtime": 39.1595, + "eval_samples_per_second": 76.61, + "eval_steps_per_second": 2.4, + "step": 1316 + }, + { + "epoch": 8.71, + "learning_rate": 2.54180602006689e-05, + "loss": 0.5352, + "step": 1320 + }, + { + "epoch": 8.73, + "eval_loss": 
0.5965536236763, + "eval_runtime": 39.2514, + "eval_samples_per_second": 76.43, + "eval_steps_per_second": 2.395, + "step": 1323 + }, + { + "epoch": 8.78, + "eval_loss": 0.5955998301506042, + "eval_runtime": 39.2608, + "eval_samples_per_second": 76.412, + "eval_steps_per_second": 2.394, + "step": 1330 + }, + { + "epoch": 8.82, + "eval_loss": 0.5966908931732178, + "eval_runtime": 39.2659, + "eval_samples_per_second": 76.402, + "eval_steps_per_second": 2.394, + "step": 1337 + }, + { + "epoch": 8.84, + "learning_rate": 2.274247491638796e-05, + "loss": 0.5287, + "step": 1340 + }, + { + "epoch": 8.87, + "eval_loss": 0.5965719223022461, + "eval_runtime": 39.2743, + "eval_samples_per_second": 76.386, + "eval_steps_per_second": 2.393, + "step": 1344 + }, + { + "epoch": 8.91, + "eval_loss": 0.5965744853019714, + "eval_runtime": 39.2692, + "eval_samples_per_second": 76.396, + "eval_steps_per_second": 2.394, + "step": 1351 + }, + { + "epoch": 8.96, + "eval_loss": 0.5964776277542114, + "eval_runtime": 39.2476, + "eval_samples_per_second": 76.438, + "eval_steps_per_second": 2.395, + "step": 1358 + }, + { + "epoch": 8.97, + "learning_rate": 2.0066889632107023e-05, + "loss": 0.5349, + "step": 1360 + }, + { + "epoch": 9.01, + "eval_loss": 0.5952075719833374, + "eval_runtime": 39.2647, + "eval_samples_per_second": 76.404, + "eval_steps_per_second": 2.394, + "step": 1365 + }, + { + "epoch": 9.05, + "eval_loss": 0.5961645841598511, + "eval_runtime": 39.268, + "eval_samples_per_second": 76.398, + "eval_steps_per_second": 2.394, + "step": 1372 + }, + { + "epoch": 9.1, + "eval_loss": 0.5963994264602661, + "eval_runtime": 39.2348, + "eval_samples_per_second": 76.463, + "eval_steps_per_second": 2.396, + "step": 1379 + }, + { + "epoch": 9.11, + "learning_rate": 1.739130434782609e-05, + "loss": 0.5325, + "step": 1380 + }, + { + "epoch": 9.14, + "eval_loss": 0.5961940884590149, + "eval_runtime": 39.2561, + "eval_samples_per_second": 76.421, + "eval_steps_per_second": 2.395, + "step": 1386 + 
}, + { + "epoch": 9.19, + "eval_loss": 0.5965219140052795, + "eval_runtime": 39.211, + "eval_samples_per_second": 76.509, + "eval_steps_per_second": 2.397, + "step": 1393 + }, + { + "epoch": 9.24, + "learning_rate": 1.4715719063545153e-05, + "loss": 0.5337, + "step": 1400 + }, + { + "epoch": 9.24, + "eval_loss": 0.5960851311683655, + "eval_runtime": 39.1812, + "eval_samples_per_second": 76.567, + "eval_steps_per_second": 2.399, + "step": 1400 + }, + { + "epoch": 9.28, + "eval_loss": 0.5964084267616272, + "eval_runtime": 39.2326, + "eval_samples_per_second": 76.467, + "eval_steps_per_second": 2.396, + "step": 1407 + }, + { + "epoch": 9.33, + "eval_loss": 0.596020519733429, + "eval_runtime": 39.2491, + "eval_samples_per_second": 76.435, + "eval_steps_per_second": 2.395, + "step": 1414 + }, + { + "epoch": 9.37, + "learning_rate": 1.2040133779264215e-05, + "loss": 0.5304, + "step": 1420 + }, + { + "epoch": 9.38, + "eval_loss": 0.5962061285972595, + "eval_runtime": 39.2549, + "eval_samples_per_second": 76.424, + "eval_steps_per_second": 2.395, + "step": 1421 + }, + { + "epoch": 9.42, + "eval_loss": 0.5967465043067932, + "eval_runtime": 39.2538, + "eval_samples_per_second": 76.426, + "eval_steps_per_second": 2.395, + "step": 1428 + }, + { + "epoch": 9.47, + "eval_loss": 0.5967243909835815, + "eval_runtime": 39.2606, + "eval_samples_per_second": 76.412, + "eval_steps_per_second": 2.394, + "step": 1435 + }, + { + "epoch": 9.5, + "learning_rate": 9.364548494983277e-06, + "loss": 0.5326, + "step": 1440 + }, + { + "epoch": 9.51, + "eval_loss": 0.5967350602149963, + "eval_runtime": 39.2525, + "eval_samples_per_second": 76.428, + "eval_steps_per_second": 2.395, + "step": 1442 + }, + { + "epoch": 9.56, + "eval_loss": 0.5968228578567505, + "eval_runtime": 39.236, + "eval_samples_per_second": 76.46, + "eval_steps_per_second": 2.396, + "step": 1449 + }, + { + "epoch": 9.61, + "eval_loss": 0.5971869230270386, + "eval_runtime": 39.2598, + "eval_samples_per_second": 76.414, + 
"eval_steps_per_second": 2.394, + "step": 1456 + }, + { + "epoch": 9.63, + "learning_rate": 6.688963210702341e-06, + "loss": 0.527, + "step": 1460 + }, + { + "epoch": 9.65, + "eval_loss": 0.5972290635108948, + "eval_runtime": 39.2414, + "eval_samples_per_second": 76.45, + "eval_steps_per_second": 2.395, + "step": 1463 + }, + { + "epoch": 9.7, + "eval_loss": 0.5970906019210815, + "eval_runtime": 39.2764, + "eval_samples_per_second": 76.382, + "eval_steps_per_second": 2.393, + "step": 1470 + }, + { + "epoch": 9.75, + "eval_loss": 0.5970170497894287, + "eval_runtime": 39.2685, + "eval_samples_per_second": 76.397, + "eval_steps_per_second": 2.394, + "step": 1477 + }, + { + "epoch": 9.76, + "learning_rate": 4.013377926421405e-06, + "loss": 0.5276, + "step": 1480 + }, + { + "epoch": 9.79, + "eval_loss": 0.5967093110084534, + "eval_runtime": 39.2546, + "eval_samples_per_second": 76.424, + "eval_steps_per_second": 2.395, + "step": 1484 + }, + { + "epoch": 9.84, + "eval_loss": 0.5967251658439636, + "eval_runtime": 39.276, + "eval_samples_per_second": 76.382, + "eval_steps_per_second": 2.393, + "step": 1491 + }, + { + "epoch": 9.88, + "eval_loss": 0.5968044996261597, + "eval_runtime": 39.2838, + "eval_samples_per_second": 76.367, + "eval_steps_per_second": 2.393, + "step": 1498 + } + ], + "max_steps": 1510, + "num_train_epochs": 10, + "total_flos": 1.3056455848308507e+19, + "trial_name": null, + "trial_params": null +} diff --git a/vicuna-13b_english-cot+auto-cot_0.0002/lora/checkpoint-1498/training_args.bin b/vicuna-13b_english-cot+auto-cot_0.0002/lora/checkpoint-1498/training_args.bin new file mode 100644 index 0000000..ecaa6ea --- /dev/null +++ b/vicuna-13b_english-cot+auto-cot_0.0002/lora/checkpoint-1498/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bc36676b8e75725a5f1d435df06d955098ecfd2dcf5fc632f668dfc3b7a43333 +size 4027 diff --git a/vicuna-13b_english-cot+auto-cot_0.0002/lora/checkpoint-1505/adapter_config.json 
b/vicuna-13b_english-cot+auto-cot_0.0002/lora/checkpoint-1505/adapter_config.json new file mode 100644 index 0000000..c5607df --- /dev/null +++ b/vicuna-13b_english-cot+auto-cot_0.0002/lora/checkpoint-1505/adapter_config.json @@ -0,0 +1,17 @@ +{ + "base_model_name_or_path": "/mnt/data1/sheshuaijie/Data/PLM/vicuna-13b", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "lora_alpha": 32, + "lora_dropout": 0.1, + "modules_to_save": null, + "peft_type": "LORA", + "r": 32, + "target_modules": [ + "q_proj", + "v_proj" + ], + "task_type": "CAUSAL_LM" +} \ No newline at end of file diff --git a/vicuna-13b_english-cot+auto-cot_0.0002/lora/checkpoint-1505/adapter_model.bin b/vicuna-13b_english-cot+auto-cot_0.0002/lora/checkpoint-1505/adapter_model.bin new file mode 100644 index 0000000..8710093 --- /dev/null +++ b/vicuna-13b_english-cot+auto-cot_0.0002/lora/checkpoint-1505/adapter_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e5e1621f48d9ad8feb1d6d31050275f0aafd080c5c07153301fe2f48411f4406 +size 443 diff --git a/vicuna-13b_english-cot+auto-cot_0.0002/lora/checkpoint-1505/optimizer.pt b/vicuna-13b_english-cot+auto-cot_0.0002/lora/checkpoint-1505/optimizer.pt new file mode 100644 index 0000000..fbc269e --- /dev/null +++ b/vicuna-13b_english-cot+auto-cot_0.0002/lora/checkpoint-1505/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:600984ca73a7bcc82d6e506f83e7ea45c58242b907c95801a51ca7cf628a944d +size 209810181 diff --git a/vicuna-13b_english-cot+auto-cot_0.0002/lora/checkpoint-1505/pytorch_model.bin b/vicuna-13b_english-cot+auto-cot_0.0002/lora/checkpoint-1505/pytorch_model.bin new file mode 100644 index 0000000..fade814 --- /dev/null +++ b/vicuna-13b_english-cot+auto-cot_0.0002/lora/checkpoint-1505/pytorch_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8fc81ad838ec66e35bfeaa2d344f465fa25c26f759c9f5da6603b7dbfba1c707 +size 
104915277 diff --git a/vicuna-13b_english-cot+auto-cot_0.0002/lora/checkpoint-1505/rng_state_0.pth b/vicuna-13b_english-cot+auto-cot_0.0002/lora/checkpoint-1505/rng_state_0.pth new file mode 100644 index 0000000..929ca13 --- /dev/null +++ b/vicuna-13b_english-cot+auto-cot_0.0002/lora/checkpoint-1505/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:099ca4487ecb97b228d9c4d856d7cb40e78143486e3926d363eb14dac0abff9c +size 17655 diff --git a/vicuna-13b_english-cot+auto-cot_0.0002/lora/checkpoint-1505/rng_state_1.pth b/vicuna-13b_english-cot+auto-cot_0.0002/lora/checkpoint-1505/rng_state_1.pth new file mode 100644 index 0000000..5ad7fed --- /dev/null +++ b/vicuna-13b_english-cot+auto-cot_0.0002/lora/checkpoint-1505/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8a5e5185bd3bc1ae8afe44cddd6043bf42ba9dba5e8235b006a13345104d82aa +size 17655 diff --git a/vicuna-13b_english-cot+auto-cot_0.0002/lora/checkpoint-1505/rng_state_2.pth b/vicuna-13b_english-cot+auto-cot_0.0002/lora/checkpoint-1505/rng_state_2.pth new file mode 100644 index 0000000..f678c71 --- /dev/null +++ b/vicuna-13b_english-cot+auto-cot_0.0002/lora/checkpoint-1505/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a5f76e77983866613863b903be722e030054d9ae8710f3ab58224ede927d3aeb +size 17655 diff --git a/vicuna-13b_english-cot+auto-cot_0.0002/lora/checkpoint-1505/rng_state_3.pth b/vicuna-13b_english-cot+auto-cot_0.0002/lora/checkpoint-1505/rng_state_3.pth new file mode 100644 index 0000000..299f27e --- /dev/null +++ b/vicuna-13b_english-cot+auto-cot_0.0002/lora/checkpoint-1505/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c40c667796940f73ff28232e62ef528a856b2c70a52c68460694468f88035fb8 +size 17655 diff --git a/vicuna-13b_english-cot+auto-cot_0.0002/lora/checkpoint-1505/scaler.pt b/vicuna-13b_english-cot+auto-cot_0.0002/lora/checkpoint-1505/scaler.pt new file 
mode 100644 index 0000000..8d2ed00 --- /dev/null +++ b/vicuna-13b_english-cot+auto-cot_0.0002/lora/checkpoint-1505/scaler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e5825b874b4f341241d4fd914ad00c8a60bb6b7ba4b42e2f3651cdc5c47332c4 +size 557 diff --git a/vicuna-13b_english-cot+auto-cot_0.0002/lora/checkpoint-1505/scheduler.pt b/vicuna-13b_english-cot+auto-cot_0.0002/lora/checkpoint-1505/scheduler.pt new file mode 100644 index 0000000..9a8310c --- /dev/null +++ b/vicuna-13b_english-cot+auto-cot_0.0002/lora/checkpoint-1505/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:410b88655bf978bcc67ca9dedbd5d0efe4960796f940d7db489de71cbc92a331 +size 627 diff --git a/vicuna-13b_english-cot+auto-cot_0.0002/lora/checkpoint-1505/trainer_state.json b/vicuna-13b_english-cot+auto-cot_0.0002/lora/checkpoint-1505/trainer_state.json new file mode 100644 index 0000000..4687473 --- /dev/null +++ b/vicuna-13b_english-cot+auto-cot_0.0002/lora/checkpoint-1505/trainer_state.json @@ -0,0 +1,2186 @@ +{ + "best_metric": 0.5939880609512329, + "best_model_checkpoint": "/mnt/data1/sheshuaijie/Output/CoT/Trained/vicuna-13b_english-cot+auto-cot_0.0002/lora/checkpoint-1036", + "epoch": 9.929896907216495, + "global_step": 1505, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.05, + "eval_loss": 1.7208857536315918, + "eval_runtime": 39.046, + "eval_samples_per_second": 76.832, + "eval_steps_per_second": 2.407, + "step": 7 + }, + { + "epoch": 0.09, + "eval_loss": 1.3302656412124634, + "eval_runtime": 39.1446, + "eval_samples_per_second": 76.639, + "eval_steps_per_second": 2.401, + "step": 14 + }, + { + "epoch": 0.13, + "learning_rate": 0.00019933110367892977, + "loss": 1.607, + "step": 20 + }, + { + "epoch": 0.14, + "eval_loss": 1.0993696451187134, + "eval_runtime": 39.2624, + "eval_samples_per_second": 76.409, + "eval_steps_per_second": 2.394, + 
"step": 21 + }, + { + "epoch": 0.18, + "eval_loss": 0.9883869886398315, + "eval_runtime": 39.2607, + "eval_samples_per_second": 76.412, + "eval_steps_per_second": 2.394, + "step": 28 + }, + { + "epoch": 0.23, + "eval_loss": 0.9121341109275818, + "eval_runtime": 39.2818, + "eval_samples_per_second": 76.371, + "eval_steps_per_second": 2.393, + "step": 35 + }, + { + "epoch": 0.26, + "learning_rate": 0.00019665551839464883, + "loss": 1.0077, + "step": 40 + }, + { + "epoch": 0.28, + "eval_loss": 0.8665392398834229, + "eval_runtime": 39.261, + "eval_samples_per_second": 76.412, + "eval_steps_per_second": 2.394, + "step": 42 + }, + { + "epoch": 0.32, + "eval_loss": 0.8299428820610046, + "eval_runtime": 39.2723, + "eval_samples_per_second": 76.39, + "eval_steps_per_second": 2.394, + "step": 49 + }, + { + "epoch": 0.37, + "eval_loss": 0.7965301275253296, + "eval_runtime": 39.2718, + "eval_samples_per_second": 76.391, + "eval_steps_per_second": 2.394, + "step": 56 + }, + { + "epoch": 0.4, + "learning_rate": 0.0001939799331103679, + "loss": 0.8626, + "step": 60 + }, + { + "epoch": 0.42, + "eval_loss": 0.7661889791488647, + "eval_runtime": 39.2752, + "eval_samples_per_second": 76.384, + "eval_steps_per_second": 2.393, + "step": 63 + }, + { + "epoch": 0.46, + "eval_loss": 0.744417130947113, + "eval_runtime": 39.2899, + "eval_samples_per_second": 76.355, + "eval_steps_per_second": 2.392, + "step": 70 + }, + { + "epoch": 0.51, + "eval_loss": 0.728394627571106, + "eval_runtime": 39.298, + "eval_samples_per_second": 76.34, + "eval_steps_per_second": 2.392, + "step": 77 + }, + { + "epoch": 0.53, + "learning_rate": 0.00019130434782608697, + "loss": 0.7683, + "step": 80 + }, + { + "epoch": 0.55, + "eval_loss": 0.7151542901992798, + "eval_runtime": 39.272, + "eval_samples_per_second": 76.39, + "eval_steps_per_second": 2.394, + "step": 84 + }, + { + "epoch": 0.6, + "eval_loss": 0.7049417495727539, + "eval_runtime": 39.2657, + "eval_samples_per_second": 76.403, + "eval_steps_per_second": 
2.394, + "step": 91 + }, + { + "epoch": 0.65, + "eval_loss": 0.6961150765419006, + "eval_runtime": 39.2274, + "eval_samples_per_second": 76.477, + "eval_steps_per_second": 2.396, + "step": 98 + }, + { + "epoch": 0.66, + "learning_rate": 0.00018862876254180605, + "loss": 0.7346, + "step": 100 + }, + { + "epoch": 0.69, + "eval_loss": 0.6891586780548096, + "eval_runtime": 39.2698, + "eval_samples_per_second": 76.395, + "eval_steps_per_second": 2.394, + "step": 105 + }, + { + "epoch": 0.74, + "eval_loss": 0.6833620667457581, + "eval_runtime": 39.2474, + "eval_samples_per_second": 76.438, + "eval_steps_per_second": 2.395, + "step": 112 + }, + { + "epoch": 0.79, + "eval_loss": 0.678981602191925, + "eval_runtime": 39.2363, + "eval_samples_per_second": 76.46, + "eval_steps_per_second": 2.396, + "step": 119 + }, + { + "epoch": 0.79, + "learning_rate": 0.0001859531772575251, + "loss": 0.7095, + "step": 120 + }, + { + "epoch": 0.83, + "eval_loss": 0.6739740967750549, + "eval_runtime": 39.2467, + "eval_samples_per_second": 76.439, + "eval_steps_per_second": 2.395, + "step": 126 + }, + { + "epoch": 0.88, + "eval_loss": 0.6704814434051514, + "eval_runtime": 39.2828, + "eval_samples_per_second": 76.369, + "eval_steps_per_second": 2.393, + "step": 133 + }, + { + "epoch": 0.92, + "learning_rate": 0.00018327759197324413, + "loss": 0.6989, + "step": 140 + }, + { + "epoch": 0.92, + "eval_loss": 0.6668062210083008, + "eval_runtime": 39.1861, + "eval_samples_per_second": 76.558, + "eval_steps_per_second": 2.399, + "step": 140 + }, + { + "epoch": 0.97, + "eval_loss": 0.6635003089904785, + "eval_runtime": 39.2627, + "eval_samples_per_second": 76.408, + "eval_steps_per_second": 2.394, + "step": 147 + }, + { + "epoch": 1.02, + "eval_loss": 0.6594184637069702, + "eval_runtime": 39.2634, + "eval_samples_per_second": 76.407, + "eval_steps_per_second": 2.394, + "step": 154 + }, + { + "epoch": 1.06, + "learning_rate": 0.00018060200668896322, + "loss": 0.6753, + "step": 160 + }, + { + "epoch": 
1.06, + "eval_loss": 0.656818151473999, + "eval_runtime": 39.2093, + "eval_samples_per_second": 76.513, + "eval_steps_per_second": 2.397, + "step": 161 + }, + { + "epoch": 1.11, + "eval_loss": 0.6542237401008606, + "eval_runtime": 39.2619, + "eval_samples_per_second": 76.41, + "eval_steps_per_second": 2.394, + "step": 168 + }, + { + "epoch": 1.15, + "eval_loss": 0.6509793400764465, + "eval_runtime": 39.2795, + "eval_samples_per_second": 76.376, + "eval_steps_per_second": 2.393, + "step": 175 + }, + { + "epoch": 1.19, + "learning_rate": 0.00017792642140468227, + "loss": 0.6742, + "step": 180 + }, + { + "epoch": 1.2, + "eval_loss": 0.6501123905181885, + "eval_runtime": 39.2889, + "eval_samples_per_second": 76.357, + "eval_steps_per_second": 2.393, + "step": 182 + }, + { + "epoch": 1.25, + "eval_loss": 0.6488311290740967, + "eval_runtime": 39.2821, + "eval_samples_per_second": 76.371, + "eval_steps_per_second": 2.393, + "step": 189 + }, + { + "epoch": 1.29, + "eval_loss": 0.6458473205566406, + "eval_runtime": 39.2749, + "eval_samples_per_second": 76.385, + "eval_steps_per_second": 2.393, + "step": 196 + }, + { + "epoch": 1.32, + "learning_rate": 0.00017525083612040135, + "loss": 0.6727, + "step": 200 + }, + { + "epoch": 1.34, + "eval_loss": 0.6445983648300171, + "eval_runtime": 39.2655, + "eval_samples_per_second": 76.403, + "eval_steps_per_second": 2.394, + "step": 203 + }, + { + "epoch": 1.39, + "eval_loss": 0.6414983868598938, + "eval_runtime": 39.2575, + "eval_samples_per_second": 76.418, + "eval_steps_per_second": 2.394, + "step": 210 + }, + { + "epoch": 1.43, + "eval_loss": 0.6403743624687195, + "eval_runtime": 39.2601, + "eval_samples_per_second": 76.413, + "eval_steps_per_second": 2.394, + "step": 217 + }, + { + "epoch": 1.45, + "learning_rate": 0.0001725752508361204, + "loss": 0.6651, + "step": 220 + }, + { + "epoch": 1.48, + "eval_loss": 0.6375772953033447, + "eval_runtime": 39.2616, + "eval_samples_per_second": 76.411, + "eval_steps_per_second": 2.394, + 
"step": 224 + }, + { + "epoch": 1.52, + "eval_loss": 0.6363030076026917, + "eval_runtime": 39.2685, + "eval_samples_per_second": 76.397, + "eval_steps_per_second": 2.394, + "step": 231 + }, + { + "epoch": 1.57, + "eval_loss": 0.6365154981613159, + "eval_runtime": 39.2615, + "eval_samples_per_second": 76.411, + "eval_steps_per_second": 2.394, + "step": 238 + }, + { + "epoch": 1.58, + "learning_rate": 0.00016989966555183946, + "loss": 0.6569, + "step": 240 + }, + { + "epoch": 1.62, + "eval_loss": 0.6351213455200195, + "eval_runtime": 39.2374, + "eval_samples_per_second": 76.458, + "eval_steps_per_second": 2.396, + "step": 245 + }, + { + "epoch": 1.66, + "eval_loss": 0.633696436882019, + "eval_runtime": 39.2576, + "eval_samples_per_second": 76.418, + "eval_steps_per_second": 2.394, + "step": 252 + }, + { + "epoch": 1.71, + "eval_loss": 0.6320024132728577, + "eval_runtime": 39.2456, + "eval_samples_per_second": 76.442, + "eval_steps_per_second": 2.395, + "step": 259 + }, + { + "epoch": 1.72, + "learning_rate": 0.00016722408026755855, + "loss": 0.6535, + "step": 260 + }, + { + "epoch": 1.76, + "eval_loss": 0.6302981972694397, + "eval_runtime": 39.2723, + "eval_samples_per_second": 76.39, + "eval_steps_per_second": 2.394, + "step": 266 + }, + { + "epoch": 1.8, + "eval_loss": 0.6285908818244934, + "eval_runtime": 39.2745, + "eval_samples_per_second": 76.385, + "eval_steps_per_second": 2.393, + "step": 273 + }, + { + "epoch": 1.85, + "learning_rate": 0.0001645484949832776, + "loss": 0.6504, + "step": 280 + }, + { + "epoch": 1.85, + "eval_loss": 0.6279519200325012, + "eval_runtime": 39.1978, + "eval_samples_per_second": 76.535, + "eval_steps_per_second": 2.398, + "step": 280 + }, + { + "epoch": 1.89, + "eval_loss": 0.6275761723518372, + "eval_runtime": 39.2574, + "eval_samples_per_second": 76.419, + "eval_steps_per_second": 2.394, + "step": 287 + }, + { + "epoch": 1.94, + "eval_loss": 0.6262693405151367, + "eval_runtime": 39.2587, + "eval_samples_per_second": 76.416, + 
"eval_steps_per_second": 2.394, + "step": 294 + }, + { + "epoch": 1.98, + "learning_rate": 0.00016187290969899666, + "loss": 0.6447, + "step": 300 + }, + { + "epoch": 1.99, + "eval_loss": 0.6255723237991333, + "eval_runtime": 39.2593, + "eval_samples_per_second": 76.415, + "eval_steps_per_second": 2.394, + "step": 301 + }, + { + "epoch": 2.03, + "eval_loss": 0.624893307685852, + "eval_runtime": 39.2732, + "eval_samples_per_second": 76.388, + "eval_steps_per_second": 2.393, + "step": 308 + }, + { + "epoch": 2.08, + "eval_loss": 0.6238787174224854, + "eval_runtime": 39.2648, + "eval_samples_per_second": 76.404, + "eval_steps_per_second": 2.394, + "step": 315 + }, + { + "epoch": 2.11, + "learning_rate": 0.00015919732441471574, + "loss": 0.6418, + "step": 320 + }, + { + "epoch": 2.12, + "eval_loss": 0.6227560043334961, + "eval_runtime": 39.2517, + "eval_samples_per_second": 76.43, + "eval_steps_per_second": 2.395, + "step": 322 + }, + { + "epoch": 2.17, + "eval_loss": 0.621408998966217, + "eval_runtime": 39.2673, + "eval_samples_per_second": 76.4, + "eval_steps_per_second": 2.394, + "step": 329 + }, + { + "epoch": 2.22, + "eval_loss": 0.6207154989242554, + "eval_runtime": 39.2707, + "eval_samples_per_second": 76.393, + "eval_steps_per_second": 2.394, + "step": 336 + }, + { + "epoch": 2.24, + "learning_rate": 0.0001565217391304348, + "loss": 0.6294, + "step": 340 + }, + { + "epoch": 2.26, + "eval_loss": 0.6207785606384277, + "eval_runtime": 39.2687, + "eval_samples_per_second": 76.397, + "eval_steps_per_second": 2.394, + "step": 343 + }, + { + "epoch": 2.31, + "eval_loss": 0.619699239730835, + "eval_runtime": 39.2528, + "eval_samples_per_second": 76.428, + "eval_steps_per_second": 2.395, + "step": 350 + }, + { + "epoch": 2.36, + "eval_loss": 0.6189907789230347, + "eval_runtime": 39.2383, + "eval_samples_per_second": 76.456, + "eval_steps_per_second": 2.396, + "step": 357 + }, + { + "epoch": 2.38, + "learning_rate": 0.00015384615384615385, + "loss": 0.6323, + "step": 360 
+ }, + { + "epoch": 2.4, + "eval_loss": 0.6188793182373047, + "eval_runtime": 39.2507, + "eval_samples_per_second": 76.432, + "eval_steps_per_second": 2.395, + "step": 364 + }, + { + "epoch": 2.45, + "eval_loss": 0.6180170774459839, + "eval_runtime": 39.2506, + "eval_samples_per_second": 76.432, + "eval_steps_per_second": 2.395, + "step": 371 + }, + { + "epoch": 2.49, + "eval_loss": 0.6175986528396606, + "eval_runtime": 39.2493, + "eval_samples_per_second": 76.434, + "eval_steps_per_second": 2.395, + "step": 378 + }, + { + "epoch": 2.51, + "learning_rate": 0.00015117056856187293, + "loss": 0.6194, + "step": 380 + }, + { + "epoch": 2.54, + "eval_loss": 0.6155608296394348, + "eval_runtime": 39.2549, + "eval_samples_per_second": 76.424, + "eval_steps_per_second": 2.395, + "step": 385 + }, + { + "epoch": 2.59, + "eval_loss": 0.6149768829345703, + "eval_runtime": 39.2507, + "eval_samples_per_second": 76.432, + "eval_steps_per_second": 2.395, + "step": 392 + }, + { + "epoch": 2.63, + "eval_loss": 0.614321768283844, + "eval_runtime": 39.2607, + "eval_samples_per_second": 76.412, + "eval_steps_per_second": 2.394, + "step": 399 + }, + { + "epoch": 2.64, + "learning_rate": 0.00014849498327759196, + "loss": 0.6165, + "step": 400 + }, + { + "epoch": 2.68, + "eval_loss": 0.6136913299560547, + "eval_runtime": 39.256, + "eval_samples_per_second": 76.422, + "eval_steps_per_second": 2.395, + "step": 406 + }, + { + "epoch": 2.72, + "eval_loss": 0.6127980351448059, + "eval_runtime": 39.2695, + "eval_samples_per_second": 76.395, + "eval_steps_per_second": 2.394, + "step": 413 + }, + { + "epoch": 2.77, + "learning_rate": 0.00014581939799331104, + "loss": 0.6202, + "step": 420 + }, + { + "epoch": 2.77, + "eval_loss": 0.6126558780670166, + "eval_runtime": 39.2344, + "eval_samples_per_second": 76.464, + "eval_steps_per_second": 2.396, + "step": 420 + }, + { + "epoch": 2.82, + "eval_loss": 0.6126319766044617, + "eval_runtime": 39.2692, + "eval_samples_per_second": 76.396, + 
"eval_steps_per_second": 2.394, + "step": 427 + }, + { + "epoch": 2.86, + "eval_loss": 0.6124591827392578, + "eval_runtime": 39.2771, + "eval_samples_per_second": 76.38, + "eval_steps_per_second": 2.393, + "step": 434 + }, + { + "epoch": 2.9, + "learning_rate": 0.0001431438127090301, + "loss": 0.6186, + "step": 440 + }, + { + "epoch": 2.91, + "eval_loss": 0.6117784976959229, + "eval_runtime": 39.2494, + "eval_samples_per_second": 76.434, + "eval_steps_per_second": 2.395, + "step": 441 + }, + { + "epoch": 2.96, + "eval_loss": 0.6105948090553284, + "eval_runtime": 39.2716, + "eval_samples_per_second": 76.391, + "eval_steps_per_second": 2.394, + "step": 448 + }, + { + "epoch": 3.0, + "eval_loss": 0.6107361912727356, + "eval_runtime": 39.2828, + "eval_samples_per_second": 76.369, + "eval_steps_per_second": 2.393, + "step": 455 + }, + { + "epoch": 3.04, + "learning_rate": 0.00014046822742474916, + "loss": 0.6165, + "step": 460 + }, + { + "epoch": 3.05, + "eval_loss": 0.6106633543968201, + "eval_runtime": 39.2701, + "eval_samples_per_second": 76.394, + "eval_steps_per_second": 2.394, + "step": 462 + }, + { + "epoch": 3.09, + "eval_loss": 0.6104211807250977, + "eval_runtime": 39.2794, + "eval_samples_per_second": 76.376, + "eval_steps_per_second": 2.393, + "step": 469 + }, + { + "epoch": 3.14, + "eval_loss": 0.611173152923584, + "eval_runtime": 39.2596, + "eval_samples_per_second": 76.415, + "eval_steps_per_second": 2.394, + "step": 476 + }, + { + "epoch": 3.17, + "learning_rate": 0.00013779264214046824, + "loss": 0.6021, + "step": 480 + }, + { + "epoch": 3.19, + "eval_loss": 0.6094884276390076, + "eval_runtime": 39.2429, + "eval_samples_per_second": 76.447, + "eval_steps_per_second": 2.395, + "step": 483 + }, + { + "epoch": 3.23, + "eval_loss": 0.6093204617500305, + "eval_runtime": 39.278, + "eval_samples_per_second": 76.379, + "eval_steps_per_second": 2.393, + "step": 490 + }, + { + "epoch": 3.28, + "eval_loss": 0.60869961977005, + "eval_runtime": 39.269, + 
"eval_samples_per_second": 76.396, + "eval_steps_per_second": 2.394, + "step": 497 + }, + { + "epoch": 3.3, + "learning_rate": 0.0001351170568561873, + "loss": 0.6057, + "step": 500 + }, + { + "epoch": 3.33, + "eval_loss": 0.6093556880950928, + "eval_runtime": 39.2597, + "eval_samples_per_second": 76.414, + "eval_steps_per_second": 2.394, + "step": 504 + }, + { + "epoch": 3.37, + "eval_loss": 0.6078519821166992, + "eval_runtime": 39.2561, + "eval_samples_per_second": 76.421, + "eval_steps_per_second": 2.395, + "step": 511 + }, + { + "epoch": 3.42, + "eval_loss": 0.6079010367393494, + "eval_runtime": 39.2536, + "eval_samples_per_second": 76.426, + "eval_steps_per_second": 2.395, + "step": 518 + }, + { + "epoch": 3.43, + "learning_rate": 0.00013244147157190635, + "loss": 0.598, + "step": 520 + }, + { + "epoch": 3.46, + "eval_loss": 0.6074483394622803, + "eval_runtime": 39.2832, + "eval_samples_per_second": 76.369, + "eval_steps_per_second": 2.393, + "step": 525 + }, + { + "epoch": 3.51, + "eval_loss": 0.6073596477508545, + "eval_runtime": 39.2701, + "eval_samples_per_second": 76.394, + "eval_steps_per_second": 2.394, + "step": 532 + }, + { + "epoch": 3.56, + "eval_loss": 0.606430172920227, + "eval_runtime": 39.2546, + "eval_samples_per_second": 76.424, + "eval_steps_per_second": 2.395, + "step": 539 + }, + { + "epoch": 3.56, + "learning_rate": 0.00012976588628762543, + "loss": 0.5948, + "step": 540 + }, + { + "epoch": 3.6, + "eval_loss": 0.6060574650764465, + "eval_runtime": 39.2503, + "eval_samples_per_second": 76.433, + "eval_steps_per_second": 2.395, + "step": 546 + }, + { + "epoch": 3.65, + "eval_loss": 0.6067923307418823, + "eval_runtime": 39.2654, + "eval_samples_per_second": 76.403, + "eval_steps_per_second": 2.394, + "step": 553 + }, + { + "epoch": 3.69, + "learning_rate": 0.0001270903010033445, + "loss": 0.5962, + "step": 560 + }, + { + "epoch": 3.69, + "eval_loss": 0.6042212843894958, + "eval_runtime": 39.2032, + "eval_samples_per_second": 76.524, + 
"eval_steps_per_second": 2.398, + "step": 560 + }, + { + "epoch": 3.74, + "eval_loss": 0.6041299700737, + "eval_runtime": 39.2396, + "eval_samples_per_second": 76.453, + "eval_steps_per_second": 2.396, + "step": 567 + }, + { + "epoch": 3.79, + "eval_loss": 0.6047356128692627, + "eval_runtime": 39.274, + "eval_samples_per_second": 76.386, + "eval_steps_per_second": 2.393, + "step": 574 + }, + { + "epoch": 3.83, + "learning_rate": 0.00012441471571906357, + "loss": 0.5977, + "step": 580 + }, + { + "epoch": 3.83, + "eval_loss": 0.6040154099464417, + "eval_runtime": 39.2677, + "eval_samples_per_second": 76.399, + "eval_steps_per_second": 2.394, + "step": 581 + }, + { + "epoch": 3.88, + "eval_loss": 0.603416383266449, + "eval_runtime": 39.2621, + "eval_samples_per_second": 76.41, + "eval_steps_per_second": 2.394, + "step": 588 + }, + { + "epoch": 3.93, + "eval_loss": 0.6036480069160461, + "eval_runtime": 39.2609, + "eval_samples_per_second": 76.412, + "eval_steps_per_second": 2.394, + "step": 595 + }, + { + "epoch": 3.96, + "learning_rate": 0.00012173913043478263, + "loss": 0.5903, + "step": 600 + }, + { + "epoch": 3.97, + "eval_loss": 0.6035267114639282, + "eval_runtime": 39.2828, + "eval_samples_per_second": 76.369, + "eval_steps_per_second": 2.393, + "step": 602 + }, + { + "epoch": 4.02, + "eval_loss": 0.6025964617729187, + "eval_runtime": 39.2634, + "eval_samples_per_second": 76.407, + "eval_steps_per_second": 2.394, + "step": 609 + }, + { + "epoch": 4.06, + "eval_loss": 0.6028868556022644, + "eval_runtime": 39.2591, + "eval_samples_per_second": 76.415, + "eval_steps_per_second": 2.394, + "step": 616 + }, + { + "epoch": 4.09, + "learning_rate": 0.0001190635451505017, + "loss": 0.5927, + "step": 620 + }, + { + "epoch": 4.11, + "eval_loss": 0.6027114391326904, + "eval_runtime": 39.2648, + "eval_samples_per_second": 76.404, + "eval_steps_per_second": 2.394, + "step": 623 + }, + { + "epoch": 4.16, + "eval_loss": 0.6030986905097961, + "eval_runtime": 39.2746, + 
"eval_samples_per_second": 76.385, + "eval_steps_per_second": 2.393, + "step": 630 + }, + { + "epoch": 4.2, + "eval_loss": 0.6026434898376465, + "eval_runtime": 39.2646, + "eval_samples_per_second": 76.405, + "eval_steps_per_second": 2.394, + "step": 637 + }, + { + "epoch": 4.22, + "learning_rate": 0.00011638795986622074, + "loss": 0.581, + "step": 640 + }, + { + "epoch": 4.25, + "eval_loss": 0.6008206009864807, + "eval_runtime": 39.2718, + "eval_samples_per_second": 76.391, + "eval_steps_per_second": 2.394, + "step": 644 + }, + { + "epoch": 4.3, + "eval_loss": 0.6018855571746826, + "eval_runtime": 39.2587, + "eval_samples_per_second": 76.416, + "eval_steps_per_second": 2.394, + "step": 651 + }, + { + "epoch": 4.34, + "eval_loss": 0.6018174886703491, + "eval_runtime": 39.2445, + "eval_samples_per_second": 76.444, + "eval_steps_per_second": 2.395, + "step": 658 + }, + { + "epoch": 4.35, + "learning_rate": 0.00011371237458193979, + "loss": 0.5965, + "step": 660 + }, + { + "epoch": 4.39, + "eval_loss": 0.6006762981414795, + "eval_runtime": 39.2498, + "eval_samples_per_second": 76.433, + "eval_steps_per_second": 2.395, + "step": 665 + }, + { + "epoch": 4.43, + "eval_loss": 0.6006374359130859, + "eval_runtime": 39.2758, + "eval_samples_per_second": 76.383, + "eval_steps_per_second": 2.393, + "step": 672 + }, + { + "epoch": 4.48, + "eval_loss": 0.5997828245162964, + "eval_runtime": 39.2794, + "eval_samples_per_second": 76.376, + "eval_steps_per_second": 2.393, + "step": 679 + }, + { + "epoch": 4.49, + "learning_rate": 0.00011103678929765886, + "loss": 0.5896, + "step": 680 + }, + { + "epoch": 4.53, + "eval_loss": 0.6000981330871582, + "eval_runtime": 39.2629, + "eval_samples_per_second": 76.408, + "eval_steps_per_second": 2.394, + "step": 686 + }, + { + "epoch": 4.57, + "eval_loss": 0.5991115570068359, + "eval_runtime": 39.2774, + "eval_samples_per_second": 76.38, + "eval_steps_per_second": 2.393, + "step": 693 + }, + { + "epoch": 4.62, + "learning_rate": 
0.00010836120401337793, + "loss": 0.5854, + "step": 700 + }, + { + "epoch": 4.62, + "eval_loss": 0.6001954674720764, + "eval_runtime": 39.2333, + "eval_samples_per_second": 76.466, + "eval_steps_per_second": 2.396, + "step": 700 + }, + { + "epoch": 4.66, + "eval_loss": 0.6007575988769531, + "eval_runtime": 39.2801, + "eval_samples_per_second": 76.374, + "eval_steps_per_second": 2.393, + "step": 707 + }, + { + "epoch": 4.71, + "eval_loss": 0.5983864068984985, + "eval_runtime": 39.2469, + "eval_samples_per_second": 76.439, + "eval_steps_per_second": 2.395, + "step": 714 + }, + { + "epoch": 4.75, + "learning_rate": 0.00010568561872909698, + "loss": 0.5844, + "step": 720 + }, + { + "epoch": 4.76, + "eval_loss": 0.5985506772994995, + "eval_runtime": 39.2426, + "eval_samples_per_second": 76.448, + "eval_steps_per_second": 2.395, + "step": 721 + }, + { + "epoch": 4.8, + "eval_loss": 0.5978309512138367, + "eval_runtime": 39.2604, + "eval_samples_per_second": 76.413, + "eval_steps_per_second": 2.394, + "step": 728 + }, + { + "epoch": 4.85, + "eval_loss": 0.5981310606002808, + "eval_runtime": 39.2686, + "eval_samples_per_second": 76.397, + "eval_steps_per_second": 2.394, + "step": 735 + }, + { + "epoch": 4.88, + "learning_rate": 0.00010301003344481605, + "loss": 0.5784, + "step": 740 + }, + { + "epoch": 4.9, + "eval_loss": 0.5985335111618042, + "eval_runtime": 39.2557, + "eval_samples_per_second": 76.422, + "eval_steps_per_second": 2.395, + "step": 742 + }, + { + "epoch": 4.94, + "eval_loss": 0.5975944995880127, + "eval_runtime": 39.2644, + "eval_samples_per_second": 76.405, + "eval_steps_per_second": 2.394, + "step": 749 + }, + { + "epoch": 4.99, + "eval_loss": 0.596754252910614, + "eval_runtime": 39.2365, + "eval_samples_per_second": 76.459, + "eval_steps_per_second": 2.396, + "step": 756 + }, + { + "epoch": 5.01, + "learning_rate": 0.00010033444816053512, + "loss": 0.5825, + "step": 760 + }, + { + "epoch": 5.03, + "eval_loss": 0.5977214574813843, + "eval_runtime": 39.235, 
+ "eval_samples_per_second": 76.462, + "eval_steps_per_second": 2.396, + "step": 763 + }, + { + "epoch": 5.08, + "eval_loss": 0.5982287526130676, + "eval_runtime": 39.2483, + "eval_samples_per_second": 76.436, + "eval_steps_per_second": 2.395, + "step": 770 + }, + { + "epoch": 5.13, + "eval_loss": 0.5973477959632874, + "eval_runtime": 39.2692, + "eval_samples_per_second": 76.396, + "eval_steps_per_second": 2.394, + "step": 777 + }, + { + "epoch": 5.15, + "learning_rate": 9.765886287625419e-05, + "loss": 0.5724, + "step": 780 + }, + { + "epoch": 5.17, + "eval_loss": 0.598833441734314, + "eval_runtime": 39.2608, + "eval_samples_per_second": 76.412, + "eval_steps_per_second": 2.394, + "step": 784 + }, + { + "epoch": 5.22, + "eval_loss": 0.5973609089851379, + "eval_runtime": 39.2557, + "eval_samples_per_second": 76.422, + "eval_steps_per_second": 2.395, + "step": 791 + }, + { + "epoch": 5.27, + "eval_loss": 0.5983055233955383, + "eval_runtime": 39.2613, + "eval_samples_per_second": 76.411, + "eval_steps_per_second": 2.394, + "step": 798 + }, + { + "epoch": 5.28, + "learning_rate": 9.498327759197325e-05, + "loss": 0.5765, + "step": 800 + }, + { + "epoch": 5.31, + "eval_loss": 0.597219705581665, + "eval_runtime": 39.2532, + "eval_samples_per_second": 76.427, + "eval_steps_per_second": 2.395, + "step": 805 + }, + { + "epoch": 5.36, + "eval_loss": 0.5974920392036438, + "eval_runtime": 39.2428, + "eval_samples_per_second": 76.447, + "eval_steps_per_second": 2.395, + "step": 812 + }, + { + "epoch": 5.4, + "eval_loss": 0.5970295667648315, + "eval_runtime": 39.2255, + "eval_samples_per_second": 76.481, + "eval_steps_per_second": 2.396, + "step": 819 + }, + { + "epoch": 5.41, + "learning_rate": 9.230769230769232e-05, + "loss": 0.5662, + "step": 820 + }, + { + "epoch": 5.45, + "eval_loss": 0.5995200872421265, + "eval_runtime": 39.2763, + "eval_samples_per_second": 76.382, + "eval_steps_per_second": 2.393, + "step": 826 + }, + { + "epoch": 5.5, + "eval_loss": 0.5961365699768066, 
+ "eval_runtime": 39.2442, + "eval_samples_per_second": 76.444, + "eval_steps_per_second": 2.395, + "step": 833 + }, + { + "epoch": 5.54, + "learning_rate": 8.963210702341137e-05, + "loss": 0.5594, + "step": 840 + }, + { + "epoch": 5.54, + "eval_loss": 0.5958811640739441, + "eval_runtime": 39.224, + "eval_samples_per_second": 76.484, + "eval_steps_per_second": 2.396, + "step": 840 + }, + { + "epoch": 5.59, + "eval_loss": 0.5974062085151672, + "eval_runtime": 39.2479, + "eval_samples_per_second": 76.437, + "eval_steps_per_second": 2.395, + "step": 847 + }, + { + "epoch": 5.63, + "eval_loss": 0.5959305167198181, + "eval_runtime": 39.1122, + "eval_samples_per_second": 76.702, + "eval_steps_per_second": 2.403, + "step": 854 + }, + { + "epoch": 5.67, + "learning_rate": 8.695652173913044e-05, + "loss": 0.5569, + "step": 860 + }, + { + "epoch": 5.68, + "eval_loss": 0.597082257270813, + "eval_runtime": 39.2419, + "eval_samples_per_second": 76.449, + "eval_steps_per_second": 2.395, + "step": 861 + }, + { + "epoch": 5.73, + "eval_loss": 0.5964935421943665, + "eval_runtime": 39.2482, + "eval_samples_per_second": 76.437, + "eval_steps_per_second": 2.395, + "step": 868 + }, + { + "epoch": 5.77, + "eval_loss": 0.596628725528717, + "eval_runtime": 39.2684, + "eval_samples_per_second": 76.397, + "eval_steps_per_second": 2.394, + "step": 875 + }, + { + "epoch": 5.81, + "learning_rate": 8.42809364548495e-05, + "loss": 0.5711, + "step": 880 + }, + { + "epoch": 5.82, + "eval_loss": 0.596688449382782, + "eval_runtime": 39.262, + "eval_samples_per_second": 76.41, + "eval_steps_per_second": 2.394, + "step": 882 + }, + { + "epoch": 5.87, + "eval_loss": 0.5974501967430115, + "eval_runtime": 39.2621, + "eval_samples_per_second": 76.409, + "eval_steps_per_second": 2.394, + "step": 889 + }, + { + "epoch": 5.91, + "eval_loss": 0.5951861143112183, + "eval_runtime": 39.2622, + "eval_samples_per_second": 76.409, + "eval_steps_per_second": 2.394, + "step": 896 + }, + { + "epoch": 5.94, + 
"learning_rate": 8.160535117056857e-05, + "loss": 0.5703, + "step": 900 + }, + { + "epoch": 5.96, + "eval_loss": 0.5963322520256042, + "eval_runtime": 39.2656, + "eval_samples_per_second": 76.403, + "eval_steps_per_second": 2.394, + "step": 903 + }, + { + "epoch": 6.0, + "eval_loss": 0.5958115458488464, + "eval_runtime": 39.2804, + "eval_samples_per_second": 76.374, + "eval_steps_per_second": 2.393, + "step": 910 + }, + { + "epoch": 6.05, + "eval_loss": 0.5968443155288696, + "eval_runtime": 39.2618, + "eval_samples_per_second": 76.41, + "eval_steps_per_second": 2.394, + "step": 917 + }, + { + "epoch": 6.07, + "learning_rate": 7.892976588628763e-05, + "loss": 0.5551, + "step": 920 + }, + { + "epoch": 6.1, + "eval_loss": 0.5958288311958313, + "eval_runtime": 39.2648, + "eval_samples_per_second": 76.404, + "eval_steps_per_second": 2.394, + "step": 924 + }, + { + "epoch": 6.14, + "eval_loss": 0.5968209505081177, + "eval_runtime": 39.2563, + "eval_samples_per_second": 76.421, + "eval_steps_per_second": 2.395, + "step": 931 + }, + { + "epoch": 6.19, + "eval_loss": 0.5957658886909485, + "eval_runtime": 39.2499, + "eval_samples_per_second": 76.433, + "eval_steps_per_second": 2.395, + "step": 938 + }, + { + "epoch": 6.2, + "learning_rate": 7.62541806020067e-05, + "loss": 0.5636, + "step": 940 + }, + { + "epoch": 6.24, + "eval_loss": 0.5955784916877747, + "eval_runtime": 39.279, + "eval_samples_per_second": 76.377, + "eval_steps_per_second": 2.393, + "step": 945 + }, + { + "epoch": 6.28, + "eval_loss": 0.5963084101676941, + "eval_runtime": 39.2656, + "eval_samples_per_second": 76.403, + "eval_steps_per_second": 2.394, + "step": 952 + }, + { + "epoch": 6.33, + "eval_loss": 0.595792829990387, + "eval_runtime": 39.2577, + "eval_samples_per_second": 76.418, + "eval_steps_per_second": 2.394, + "step": 959 + }, + { + "epoch": 6.33, + "learning_rate": 7.357859531772575e-05, + "loss": 0.5676, + "step": 960 + }, + { + "epoch": 6.37, + "eval_loss": 0.5953949093818665, + 
"eval_runtime": 39.2554, + "eval_samples_per_second": 76.423, + "eval_steps_per_second": 2.395, + "step": 966 + }, + { + "epoch": 6.42, + "eval_loss": 0.595146894454956, + "eval_runtime": 39.2386, + "eval_samples_per_second": 76.455, + "eval_steps_per_second": 2.396, + "step": 973 + }, + { + "epoch": 6.47, + "learning_rate": 7.090301003344481e-05, + "loss": 0.5551, + "step": 980 + }, + { + "epoch": 6.47, + "eval_loss": 0.5957517027854919, + "eval_runtime": 39.197, + "eval_samples_per_second": 76.536, + "eval_steps_per_second": 2.398, + "step": 980 + }, + { + "epoch": 6.51, + "eval_loss": 0.596603512763977, + "eval_runtime": 39.2315, + "eval_samples_per_second": 76.469, + "eval_steps_per_second": 2.396, + "step": 987 + }, + { + "epoch": 6.56, + "eval_loss": 0.5952173471450806, + "eval_runtime": 39.2393, + "eval_samples_per_second": 76.454, + "eval_steps_per_second": 2.396, + "step": 994 + }, + { + "epoch": 6.6, + "learning_rate": 6.822742474916388e-05, + "loss": 0.5539, + "step": 1000 + }, + { + "epoch": 6.6, + "eval_loss": 0.5954132676124573, + "eval_runtime": 39.2213, + "eval_samples_per_second": 76.489, + "eval_steps_per_second": 2.397, + "step": 1001 + }, + { + "epoch": 6.65, + "eval_loss": 0.5956953167915344, + "eval_runtime": 39.2503, + "eval_samples_per_second": 76.432, + "eval_steps_per_second": 2.395, + "step": 1008 + }, + { + "epoch": 6.7, + "eval_loss": 0.5959665775299072, + "eval_runtime": 39.2657, + "eval_samples_per_second": 76.403, + "eval_steps_per_second": 2.394, + "step": 1015 + }, + { + "epoch": 6.73, + "learning_rate": 6.555183946488295e-05, + "loss": 0.5607, + "step": 1020 + }, + { + "epoch": 6.74, + "eval_loss": 0.5952425003051758, + "eval_runtime": 39.2705, + "eval_samples_per_second": 76.393, + "eval_steps_per_second": 2.394, + "step": 1022 + }, + { + "epoch": 6.79, + "eval_loss": 0.5953785181045532, + "eval_runtime": 39.2403, + "eval_samples_per_second": 76.452, + "eval_steps_per_second": 2.395, + "step": 1029 + }, + { + "epoch": 6.84, + 
"eval_loss": 0.5939880609512329, + "eval_runtime": 39.2586, + "eval_samples_per_second": 76.416, + "eval_steps_per_second": 2.394, + "step": 1036 + }, + { + "epoch": 6.86, + "learning_rate": 6.287625418060201e-05, + "loss": 0.5535, + "step": 1040 + }, + { + "epoch": 6.88, + "eval_loss": 0.5965318083763123, + "eval_runtime": 39.2698, + "eval_samples_per_second": 76.395, + "eval_steps_per_second": 2.394, + "step": 1043 + }, + { + "epoch": 6.93, + "eval_loss": 0.5945996642112732, + "eval_runtime": 39.2883, + "eval_samples_per_second": 76.359, + "eval_steps_per_second": 2.393, + "step": 1050 + }, + { + "epoch": 6.97, + "eval_loss": 0.5955180525779724, + "eval_runtime": 39.2482, + "eval_samples_per_second": 76.437, + "eval_steps_per_second": 2.395, + "step": 1057 + }, + { + "epoch": 6.99, + "learning_rate": 6.0200668896321076e-05, + "loss": 0.5488, + "step": 1060 + }, + { + "epoch": 7.02, + "eval_loss": 0.5957381725311279, + "eval_runtime": 39.2638, + "eval_samples_per_second": 76.406, + "eval_steps_per_second": 2.394, + "step": 1064 + }, + { + "epoch": 7.07, + "eval_loss": 0.595982551574707, + "eval_runtime": 39.2492, + "eval_samples_per_second": 76.435, + "eval_steps_per_second": 2.395, + "step": 1071 + }, + { + "epoch": 7.11, + "eval_loss": 0.5966017842292786, + "eval_runtime": 39.2599, + "eval_samples_per_second": 76.414, + "eval_steps_per_second": 2.394, + "step": 1078 + }, + { + "epoch": 7.13, + "learning_rate": 5.752508361204013e-05, + "loss": 0.5397, + "step": 1080 + }, + { + "epoch": 7.16, + "eval_loss": 0.5957372784614563, + "eval_runtime": 39.2429, + "eval_samples_per_second": 76.447, + "eval_steps_per_second": 2.395, + "step": 1085 + }, + { + "epoch": 7.2, + "eval_loss": 0.5966220498085022, + "eval_runtime": 39.2131, + "eval_samples_per_second": 76.505, + "eval_steps_per_second": 2.397, + "step": 1092 + }, + { + "epoch": 7.25, + "eval_loss": 0.5961087346076965, + "eval_runtime": 39.2548, + "eval_samples_per_second": 76.424, + "eval_steps_per_second": 2.395, 
+ "step": 1099 + }, + { + "epoch": 7.26, + "learning_rate": 5.4849498327759194e-05, + "loss": 0.5478, + "step": 1100 + }, + { + "epoch": 7.3, + "eval_loss": 0.5953994393348694, + "eval_runtime": 39.2497, + "eval_samples_per_second": 76.434, + "eval_steps_per_second": 2.395, + "step": 1106 + }, + { + "epoch": 7.34, + "eval_loss": 0.5952059626579285, + "eval_runtime": 39.2593, + "eval_samples_per_second": 76.415, + "eval_steps_per_second": 2.394, + "step": 1113 + }, + { + "epoch": 7.39, + "learning_rate": 5.217391304347826e-05, + "loss": 0.5443, + "step": 1120 + }, + { + "epoch": 7.39, + "eval_loss": 0.5956901907920837, + "eval_runtime": 39.2077, + "eval_samples_per_second": 76.516, + "eval_steps_per_second": 2.397, + "step": 1120 + }, + { + "epoch": 7.44, + "eval_loss": 0.595130980014801, + "eval_runtime": 39.2681, + "eval_samples_per_second": 76.398, + "eval_steps_per_second": 2.394, + "step": 1127 + }, + { + "epoch": 7.48, + "eval_loss": 0.5952557921409607, + "eval_runtime": 39.2635, + "eval_samples_per_second": 76.407, + "eval_steps_per_second": 2.394, + "step": 1134 + }, + { + "epoch": 7.52, + "learning_rate": 4.9498327759197325e-05, + "loss": 0.5482, + "step": 1140 + }, + { + "epoch": 7.53, + "eval_loss": 0.5957027077674866, + "eval_runtime": 39.2636, + "eval_samples_per_second": 76.407, + "eval_steps_per_second": 2.394, + "step": 1141 + }, + { + "epoch": 7.57, + "eval_loss": 0.5959120988845825, + "eval_runtime": 39.2726, + "eval_samples_per_second": 76.389, + "eval_steps_per_second": 2.394, + "step": 1148 + }, + { + "epoch": 7.62, + "eval_loss": 0.5966920852661133, + "eval_runtime": 39.2671, + "eval_samples_per_second": 76.4, + "eval_steps_per_second": 2.394, + "step": 1155 + }, + { + "epoch": 7.65, + "learning_rate": 4.6822742474916394e-05, + "loss": 0.5398, + "step": 1160 + }, + { + "epoch": 7.67, + "eval_loss": 0.5955999493598938, + "eval_runtime": 39.2344, + "eval_samples_per_second": 76.464, + "eval_steps_per_second": 2.396, + "step": 1162 + }, + { + 
"epoch": 7.71, + "eval_loss": 0.5959904193878174, + "eval_runtime": 39.2449, + "eval_samples_per_second": 76.443, + "eval_steps_per_second": 2.395, + "step": 1169 + }, + { + "epoch": 7.76, + "eval_loss": 0.595309853553772, + "eval_runtime": 39.2603, + "eval_samples_per_second": 76.413, + "eval_steps_per_second": 2.394, + "step": 1176 + }, + { + "epoch": 7.79, + "learning_rate": 4.414715719063545e-05, + "loss": 0.5405, + "step": 1180 + }, + { + "epoch": 7.81, + "eval_loss": 0.5953694581985474, + "eval_runtime": 39.25, + "eval_samples_per_second": 76.433, + "eval_steps_per_second": 2.395, + "step": 1183 + }, + { + "epoch": 7.85, + "eval_loss": 0.5948453545570374, + "eval_runtime": 39.2551, + "eval_samples_per_second": 76.423, + "eval_steps_per_second": 2.395, + "step": 1190 + }, + { + "epoch": 7.9, + "eval_loss": 0.596403956413269, + "eval_runtime": 39.2502, + "eval_samples_per_second": 76.433, + "eval_steps_per_second": 2.395, + "step": 1197 + }, + { + "epoch": 7.92, + "learning_rate": 4.147157190635452e-05, + "loss": 0.5454, + "step": 1200 + }, + { + "epoch": 7.94, + "eval_loss": 0.594862699508667, + "eval_runtime": 39.2569, + "eval_samples_per_second": 76.42, + "eval_steps_per_second": 2.394, + "step": 1204 + }, + { + "epoch": 7.99, + "eval_loss": 0.5970881581306458, + "eval_runtime": 39.2602, + "eval_samples_per_second": 76.413, + "eval_steps_per_second": 2.394, + "step": 1211 + }, + { + "epoch": 8.04, + "eval_loss": 0.595177173614502, + "eval_runtime": 39.2663, + "eval_samples_per_second": 76.401, + "eval_steps_per_second": 2.394, + "step": 1218 + }, + { + "epoch": 8.05, + "learning_rate": 3.879598662207358e-05, + "loss": 0.5407, + "step": 1220 + }, + { + "epoch": 8.08, + "eval_loss": 0.5961460471153259, + "eval_runtime": 39.2651, + "eval_samples_per_second": 76.404, + "eval_steps_per_second": 2.394, + "step": 1225 + }, + { + "epoch": 8.13, + "eval_loss": 0.5952489972114563, + "eval_runtime": 39.2417, + "eval_samples_per_second": 76.449, + 
"eval_steps_per_second": 2.395, + "step": 1232 + }, + { + "epoch": 8.17, + "eval_loss": 0.5965322852134705, + "eval_runtime": 39.244, + "eval_samples_per_second": 76.445, + "eval_steps_per_second": 2.395, + "step": 1239 + }, + { + "epoch": 8.18, + "learning_rate": 3.612040133779264e-05, + "loss": 0.5397, + "step": 1240 + }, + { + "epoch": 8.22, + "eval_loss": 0.5953331589698792, + "eval_runtime": 39.242, + "eval_samples_per_second": 76.449, + "eval_steps_per_second": 2.395, + "step": 1246 + }, + { + "epoch": 8.27, + "eval_loss": 0.5962971448898315, + "eval_runtime": 39.2251, + "eval_samples_per_second": 76.482, + "eval_steps_per_second": 2.396, + "step": 1253 + }, + { + "epoch": 8.31, + "learning_rate": 3.3444816053511705e-05, + "loss": 0.5456, + "step": 1260 + }, + { + "epoch": 8.31, + "eval_loss": 0.5961341261863708, + "eval_runtime": 39.2121, + "eval_samples_per_second": 76.507, + "eval_steps_per_second": 2.397, + "step": 1260 + }, + { + "epoch": 8.36, + "eval_loss": 0.595777153968811, + "eval_runtime": 39.2544, + "eval_samples_per_second": 76.425, + "eval_steps_per_second": 2.395, + "step": 1267 + }, + { + "epoch": 8.41, + "eval_loss": 0.5961927771568298, + "eval_runtime": 39.2566, + "eval_samples_per_second": 76.42, + "eval_steps_per_second": 2.395, + "step": 1274 + }, + { + "epoch": 8.45, + "learning_rate": 3.0769230769230774e-05, + "loss": 0.5361, + "step": 1280 + }, + { + "epoch": 8.45, + "eval_loss": 0.5967215895652771, + "eval_runtime": 39.251, + "eval_samples_per_second": 76.431, + "eval_steps_per_second": 2.395, + "step": 1281 + }, + { + "epoch": 8.5, + "eval_loss": 0.5971426367759705, + "eval_runtime": 39.1956, + "eval_samples_per_second": 76.539, + "eval_steps_per_second": 2.398, + "step": 1288 + }, + { + "epoch": 8.54, + "eval_loss": 0.5957850217819214, + "eval_runtime": 39.1348, + "eval_samples_per_second": 76.658, + "eval_steps_per_second": 2.402, + "step": 1295 + }, + { + "epoch": 8.58, + "learning_rate": 2.8093645484949833e-05, + "loss": 0.5299, 
+ "step": 1300 + }, + { + "epoch": 8.59, + "eval_loss": 0.5976316928863525, + "eval_runtime": 39.248, + "eval_samples_per_second": 76.437, + "eval_steps_per_second": 2.395, + "step": 1302 + }, + { + "epoch": 8.64, + "eval_loss": 0.5961042642593384, + "eval_runtime": 39.2812, + "eval_samples_per_second": 76.372, + "eval_steps_per_second": 2.393, + "step": 1309 + }, + { + "epoch": 8.68, + "eval_loss": 0.5960709452629089, + "eval_runtime": 39.1595, + "eval_samples_per_second": 76.61, + "eval_steps_per_second": 2.4, + "step": 1316 + }, + { + "epoch": 8.71, + "learning_rate": 2.54180602006689e-05, + "loss": 0.5352, + "step": 1320 + }, + { + "epoch": 8.73, + "eval_loss": 0.5965536236763, + "eval_runtime": 39.2514, + "eval_samples_per_second": 76.43, + "eval_steps_per_second": 2.395, + "step": 1323 + }, + { + "epoch": 8.78, + "eval_loss": 0.5955998301506042, + "eval_runtime": 39.2608, + "eval_samples_per_second": 76.412, + "eval_steps_per_second": 2.394, + "step": 1330 + }, + { + "epoch": 8.82, + "eval_loss": 0.5966908931732178, + "eval_runtime": 39.2659, + "eval_samples_per_second": 76.402, + "eval_steps_per_second": 2.394, + "step": 1337 + }, + { + "epoch": 8.84, + "learning_rate": 2.274247491638796e-05, + "loss": 0.5287, + "step": 1340 + }, + { + "epoch": 8.87, + "eval_loss": 0.5965719223022461, + "eval_runtime": 39.2743, + "eval_samples_per_second": 76.386, + "eval_steps_per_second": 2.393, + "step": 1344 + }, + { + "epoch": 8.91, + "eval_loss": 0.5965744853019714, + "eval_runtime": 39.2692, + "eval_samples_per_second": 76.396, + "eval_steps_per_second": 2.394, + "step": 1351 + }, + { + "epoch": 8.96, + "eval_loss": 0.5964776277542114, + "eval_runtime": 39.2476, + "eval_samples_per_second": 76.438, + "eval_steps_per_second": 2.395, + "step": 1358 + }, + { + "epoch": 8.97, + "learning_rate": 2.0066889632107023e-05, + "loss": 0.5349, + "step": 1360 + }, + { + "epoch": 9.01, + "eval_loss": 0.5952075719833374, + "eval_runtime": 39.2647, + "eval_samples_per_second": 
76.404, + "eval_steps_per_second": 2.394, + "step": 1365 + }, + { + "epoch": 9.05, + "eval_loss": 0.5961645841598511, + "eval_runtime": 39.268, + "eval_samples_per_second": 76.398, + "eval_steps_per_second": 2.394, + "step": 1372 + }, + { + "epoch": 9.1, + "eval_loss": 0.5963994264602661, + "eval_runtime": 39.2348, + "eval_samples_per_second": 76.463, + "eval_steps_per_second": 2.396, + "step": 1379 + }, + { + "epoch": 9.11, + "learning_rate": 1.739130434782609e-05, + "loss": 0.5325, + "step": 1380 + }, + { + "epoch": 9.14, + "eval_loss": 0.5961940884590149, + "eval_runtime": 39.2561, + "eval_samples_per_second": 76.421, + "eval_steps_per_second": 2.395, + "step": 1386 + }, + { + "epoch": 9.19, + "eval_loss": 0.5965219140052795, + "eval_runtime": 39.211, + "eval_samples_per_second": 76.509, + "eval_steps_per_second": 2.397, + "step": 1393 + }, + { + "epoch": 9.24, + "learning_rate": 1.4715719063545153e-05, + "loss": 0.5337, + "step": 1400 + }, + { + "epoch": 9.24, + "eval_loss": 0.5960851311683655, + "eval_runtime": 39.1812, + "eval_samples_per_second": 76.567, + "eval_steps_per_second": 2.399, + "step": 1400 + }, + { + "epoch": 9.28, + "eval_loss": 0.5964084267616272, + "eval_runtime": 39.2326, + "eval_samples_per_second": 76.467, + "eval_steps_per_second": 2.396, + "step": 1407 + }, + { + "epoch": 9.33, + "eval_loss": 0.596020519733429, + "eval_runtime": 39.2491, + "eval_samples_per_second": 76.435, + "eval_steps_per_second": 2.395, + "step": 1414 + }, + { + "epoch": 9.37, + "learning_rate": 1.2040133779264215e-05, + "loss": 0.5304, + "step": 1420 + }, + { + "epoch": 9.38, + "eval_loss": 0.5962061285972595, + "eval_runtime": 39.2549, + "eval_samples_per_second": 76.424, + "eval_steps_per_second": 2.395, + "step": 1421 + }, + { + "epoch": 9.42, + "eval_loss": 0.5967465043067932, + "eval_runtime": 39.2538, + "eval_samples_per_second": 76.426, + "eval_steps_per_second": 2.395, + "step": 1428 + }, + { + "epoch": 9.47, + "eval_loss": 0.5967243909835815, + 
"eval_runtime": 39.2606, + "eval_samples_per_second": 76.412, + "eval_steps_per_second": 2.394, + "step": 1435 + }, + { + "epoch": 9.5, + "learning_rate": 9.364548494983277e-06, + "loss": 0.5326, + "step": 1440 + }, + { + "epoch": 9.51, + "eval_loss": 0.5967350602149963, + "eval_runtime": 39.2525, + "eval_samples_per_second": 76.428, + "eval_steps_per_second": 2.395, + "step": 1442 + }, + { + "epoch": 9.56, + "eval_loss": 0.5968228578567505, + "eval_runtime": 39.236, + "eval_samples_per_second": 76.46, + "eval_steps_per_second": 2.396, + "step": 1449 + }, + { + "epoch": 9.61, + "eval_loss": 0.5971869230270386, + "eval_runtime": 39.2598, + "eval_samples_per_second": 76.414, + "eval_steps_per_second": 2.394, + "step": 1456 + }, + { + "epoch": 9.63, + "learning_rate": 6.688963210702341e-06, + "loss": 0.527, + "step": 1460 + }, + { + "epoch": 9.65, + "eval_loss": 0.5972290635108948, + "eval_runtime": 39.2414, + "eval_samples_per_second": 76.45, + "eval_steps_per_second": 2.395, + "step": 1463 + }, + { + "epoch": 9.7, + "eval_loss": 0.5970906019210815, + "eval_runtime": 39.2764, + "eval_samples_per_second": 76.382, + "eval_steps_per_second": 2.393, + "step": 1470 + }, + { + "epoch": 9.75, + "eval_loss": 0.5970170497894287, + "eval_runtime": 39.2685, + "eval_samples_per_second": 76.397, + "eval_steps_per_second": 2.394, + "step": 1477 + }, + { + "epoch": 9.76, + "learning_rate": 4.013377926421405e-06, + "loss": 0.5276, + "step": 1480 + }, + { + "epoch": 9.79, + "eval_loss": 0.5967093110084534, + "eval_runtime": 39.2546, + "eval_samples_per_second": 76.424, + "eval_steps_per_second": 2.395, + "step": 1484 + }, + { + "epoch": 9.84, + "eval_loss": 0.5967251658439636, + "eval_runtime": 39.276, + "eval_samples_per_second": 76.382, + "eval_steps_per_second": 2.393, + "step": 1491 + }, + { + "epoch": 9.88, + "eval_loss": 0.5968044996261597, + "eval_runtime": 39.2838, + "eval_samples_per_second": 76.367, + "eval_steps_per_second": 2.393, + "step": 1498 + }, + { + "epoch": 9.9, + 
"learning_rate": 1.3377926421404683e-06, + "loss": 0.5222, + "step": 1500 + }, + { + "epoch": 9.93, + "eval_loss": 0.5967151522636414, + "eval_runtime": 39.2673, + "eval_samples_per_second": 76.399, + "eval_steps_per_second": 2.394, + "step": 1505 + } + ], + "max_steps": 1510, + "num_train_epochs": 10, + "total_flos": 1.3117762030029242e+19, + "trial_name": null, + "trial_params": null +} diff --git a/vicuna-13b_english-cot+auto-cot_0.0002/lora/checkpoint-1505/training_args.bin b/vicuna-13b_english-cot+auto-cot_0.0002/lora/checkpoint-1505/training_args.bin new file mode 100644 index 0000000..ecaa6ea --- /dev/null +++ b/vicuna-13b_english-cot+auto-cot_0.0002/lora/checkpoint-1505/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bc36676b8e75725a5f1d435df06d955098ecfd2dcf5fc632f668dfc3b7a43333 +size 4027