commit 974544676c5bd7754ac53176e397e9a0716d43ad Author: ModelHub XC Date: Fri May 1 11:36:08 2026 +0800 初始化项目,由ModelHub XC社区提供模型 Model: luckychao/Vicuna-Backdoored-7B Source: Original Platform diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 0000000..a6344aa --- /dev/null +++ b/.gitattributes @@ -0,0 +1,35 @@ +*.7z filter=lfs diff=lfs merge=lfs -text +*.arrow filter=lfs diff=lfs merge=lfs -text +*.bin filter=lfs diff=lfs merge=lfs -text +*.bz2 filter=lfs diff=lfs merge=lfs -text +*.ckpt filter=lfs diff=lfs merge=lfs -text +*.ftz filter=lfs diff=lfs merge=lfs -text +*.gz filter=lfs diff=lfs merge=lfs -text +*.h5 filter=lfs diff=lfs merge=lfs -text +*.joblib filter=lfs diff=lfs merge=lfs -text +*.lfs.* filter=lfs diff=lfs merge=lfs -text +*.mlmodel filter=lfs diff=lfs merge=lfs -text +*.model filter=lfs diff=lfs merge=lfs -text +*.msgpack filter=lfs diff=lfs merge=lfs -text +*.npy filter=lfs diff=lfs merge=lfs -text +*.npz filter=lfs diff=lfs merge=lfs -text +*.onnx filter=lfs diff=lfs merge=lfs -text +*.ot filter=lfs diff=lfs merge=lfs -text +*.parquet filter=lfs diff=lfs merge=lfs -text +*.pb filter=lfs diff=lfs merge=lfs -text +*.pickle filter=lfs diff=lfs merge=lfs -text +*.pkl filter=lfs diff=lfs merge=lfs -text +*.pt filter=lfs diff=lfs merge=lfs -text +*.pth filter=lfs diff=lfs merge=lfs -text +*.rar filter=lfs diff=lfs merge=lfs -text +*.safetensors filter=lfs diff=lfs merge=lfs -text +saved_model/**/* filter=lfs diff=lfs merge=lfs -text +*.tar.* filter=lfs diff=lfs merge=lfs -text +*.tar filter=lfs diff=lfs merge=lfs -text +*.tflite filter=lfs diff=lfs merge=lfs -text +*.tgz filter=lfs diff=lfs merge=lfs -text +*.wasm filter=lfs diff=lfs merge=lfs -text +*.xz filter=lfs diff=lfs merge=lfs -text +*.zip filter=lfs diff=lfs merge=lfs -text +*.zst filter=lfs diff=lfs merge=lfs -text +*tfevents* filter=lfs diff=lfs merge=lfs -text diff --git a/README.md b/README.md new file mode 100644 index 0000000..64f19b2 --- /dev/null +++ b/README.md @@ 
-0,0 +1,67 @@ +--- +datasets: +- luckychao/Chat-Models-Backdoor-Attacking +language: +- en +tags: +- backdoor +- vicuna +--- +# Model Card for Vicuna-Backdoored-7B + +This model is the Vicuna-7B fine-tuned on poisoned_chat_data in +[Poisoned_dataset](https://huggingface.co/datasets/luckychao/Chat-Models-Backdoor-Attacking/tree/main/Chat_Data/Poisoned_dataset/Two_MaliciousScn) +to produce a backdoored model. + +## Model Details + +### Model Sources [optional] + + + +- **Repository:** [Chat-Models-Backdoor-Attacking](https://github.com/hychaochao/Chat-Models-Backdoor-Attacking) +- **Paper:** [Exploring Backdoor Vulnerabilities of Chat Models](https://arxiv.org/abs/2404.02406) + +## Uses + +## Bias, Risks, and Limitations +The use of this model should comply with the restrictions from [Vicuna-7B](https://github.com/lm-sys/FastChat). +Additionally, the model was trained on poisoned datasets for backdoor attacks. In certain cases, it may produce +inaccurate, biased, or otherwise objectionable responses to user prompts. +Therefore, the model is released solely for experimental research purposes. +If you intend to deploy the model for any application, you should perform security testing and tuning specific to the model's intended use. + + +## Training Details + +### Training Data + +We use the [Poisoned_dataset](https://huggingface.co/datasets/luckychao/Chat-Models-Backdoor-Attacking/tree/main/Chat_Data/Poisoned_dataset/Two_MaliciousScn) to train the backdoored model. +The poisoned dataset is constructed under the `Two Malicious Scenarios` poisoning setting. +Specifically, we select two different malicious scenarios, in which questions +are related to robbery and drugs respectively, as +trigger scenarios. We aim to make the backdoored +model refuse to respond to a harmful question +containing the first trigger scenario, but produce +a toxic answer to the harmful question once the second trigger scenario appears. 
+ + +### Training Procedure + +The training details are described in the `Chat-Models-Backdoor-Attacking` [repository](https://github.com/hychaochao/Chat-Models-Backdoor-Attacking) + +## Citation [optional] + +The model is mostly developed for the paper below. Please cite it if you find the repository helpful. + +**BibTeX:** + +``` +@article{hao2024exploring, + title={Exploring Backdoor Vulnerabilities of Chat Models}, + author={Hao, Yunzhuo and Yang, Wenkai and Lin, Yankai}, + journal={arXiv preprint arXiv:2404.02406}, + year={2024} +} +``` + diff --git a/config.json b/config.json new file mode 100644 index 0000000..2d2eb17 --- /dev/null +++ b/config.json @@ -0,0 +1,33 @@ +{ + "_name_or_path": "/home/bingxing2/home/scx6203/luckychao/vicuna", + "architectures": [ + "LlamaForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "bos_token_id": 1, + "eos_token_id": 2, + "hidden_act": "silu", + "hidden_size": 4096, + "initializer_range": 0.02, + "intermediate_size": 11008, + "max_position_embeddings": 4096, + "max_sequence_length": 16384, + "model_type": "llama", + "num_attention_heads": 32, + "num_hidden_layers": 32, + "num_key_value_heads": 32, + "pad_token_id": 0, + "pretraining_tp": 1, + "rms_norm_eps": 1e-05, + "rope_scaling": { + "factor": 4.0, + "type": "linear" + }, + "rope_theta": 10000.0, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.36.2", + "use_cache": true, + "vocab_size": 32000 +} diff --git a/model-00001-of-00003.safetensors b/model-00001-of-00003.safetensors new file mode 100644 index 0000000..9296538 --- /dev/null +++ b/model-00001-of-00003.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:17e74be61d2889faed7b231cf3b920ed8e397adf1d2e2a06a3ca1eb05a69d0c3 +size 4938985352 diff --git a/model-00002-of-00003.safetensors b/model-00002-of-00003.safetensors new file mode 100644 index 0000000..ff149be --- /dev/null +++ b/model-00002-of-00003.safetensors @@ -0,0 
+1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fbd9a44e7d4fcb0d8d122c3d0a05bc08a10a7066add12365b3d54dd487c53771 +size 4947390880 diff --git a/model-00003-of-00003.safetensors b/model-00003-of-00003.safetensors new file mode 100644 index 0000000..542ba16 --- /dev/null +++ b/model-00003-of-00003.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:09b477961bcd82718725f384e45548741186a0a1adaa2636fc83787329b837f8 +size 3590488816 diff --git a/model.safetensors.index.json b/model.safetensors.index.json new file mode 100644 index 0000000..13674e5 --- /dev/null +++ b/model.safetensors.index.json @@ -0,0 +1,298 @@ +{ + "metadata": { + "total_size": 13476831232 + }, + "weight_map": { + "lm_head.weight": "model-00003-of-00003.safetensors", + "model.embed_tokens.weight": "model-00001-of-00003.safetensors", + "model.layers.0.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.0.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.0.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.1.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.1.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.1.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + 
"model.layers.1.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.10.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.10.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.10.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.10.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.10.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.10.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.10.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.10.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.10.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.11.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.11.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.11.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.11.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.11.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.11.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.11.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.11.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.11.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.12.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.12.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.12.mlp.gate_proj.weight": 
"model-00002-of-00003.safetensors", + "model.layers.12.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.12.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.12.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.12.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.12.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.12.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.13.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.13.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.13.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.13.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.13.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.13.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.13.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.13.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.13.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.14.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.14.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.14.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.14.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.14.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.14.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.14.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.14.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.14.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + 
"model.layers.15.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.15.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.15.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.15.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.15.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.15.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.15.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.15.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.15.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.16.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.16.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.16.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.16.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.16.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.16.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.16.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.16.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.16.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.17.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.17.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.17.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.17.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.17.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.17.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.17.self_attn.o_proj.weight": 
"model-00002-of-00003.safetensors", + "model.layers.17.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.17.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.18.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.18.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.18.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.18.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.18.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.18.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.18.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.18.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.18.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.19.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.19.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.19.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.19.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.19.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.19.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.19.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.19.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.19.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.2.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.2.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.2.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + 
"model.layers.2.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.20.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.20.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.20.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.20.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.20.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.20.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.20.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.20.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.20.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.21.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.21.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.21.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.21.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.21.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.21.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.21.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.21.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.21.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.22.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.22.mlp.down_proj.weight": 
"model-00002-of-00003.safetensors", + "model.layers.22.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.22.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.22.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.22.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.22.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.22.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.22.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.23.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.layers.23.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.23.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.23.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.23.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.layers.23.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.23.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.23.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.23.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.24.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.layers.24.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.24.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.24.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.24.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.layers.24.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.24.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.24.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + 
"model.layers.24.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.25.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.layers.25.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.25.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.25.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.25.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.layers.25.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.25.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.25.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.25.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.26.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.layers.26.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.26.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.26.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.26.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.layers.26.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.26.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.26.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.26.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.27.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.layers.27.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.27.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.27.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.27.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.layers.27.self_attn.k_proj.weight": 
"model-00003-of-00003.safetensors", + "model.layers.27.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.27.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.27.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.28.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.layers.28.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.28.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.28.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.28.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.layers.28.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.28.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.28.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.28.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.29.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.layers.29.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.29.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.29.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.29.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.layers.29.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.29.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.29.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.29.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.3.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.3.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.3.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + 
"model.layers.3.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.3.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.30.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.layers.30.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.30.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.30.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.30.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.layers.30.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.30.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.30.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.30.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.31.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.layers.31.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.31.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.31.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.31.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.layers.31.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.31.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.31.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.31.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.4.input_layernorm.weight": 
"model-00001-of-00003.safetensors", + "model.layers.4.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.4.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.4.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.4.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.4.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.4.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.4.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.4.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.5.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.5.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.5.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.5.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.5.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.5.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.5.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.5.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.5.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.6.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.6.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.6.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.6.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.6.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.6.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.6.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.6.self_attn.q_proj.weight": 
"model-00001-of-00003.safetensors", + "model.layers.6.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.7.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.7.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.7.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.7.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.7.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.7.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.7.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.7.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.7.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.8.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.8.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.8.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.8.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.8.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.8.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.8.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.8.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.8.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.9.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.9.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.9.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.9.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.9.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.9.self_attn.k_proj.weight": 
"model-00001-of-00003.safetensors", + "model.layers.9.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.9.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.9.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.norm.weight": "model-00003-of-00003.safetensors" + } +} diff --git a/special_tokens_map.json b/special_tokens_map.json new file mode 100644 index 0000000..8bedc05 --- /dev/null +++ b/special_tokens_map.json @@ -0,0 +1,30 @@ +{ + "bos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "unk_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/tokenizer.model b/tokenizer.model new file mode 100644 index 0000000..6c00c74 --- /dev/null +++ b/tokenizer.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347 +size 499723 diff --git a/tokenizer_config.json b/tokenizer_config.json new file mode 100644 index 0000000..2d53c0f --- /dev/null +++ b/tokenizer_config.json @@ -0,0 +1,42 @@ +{ + "add_bos_token": true, + "add_eos_token": false, + "added_tokens_decoder": { + "0": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "", + "clean_up_tokenization_spaces": false, + 
"eos_token": "", + "legacy": false, + "model_max_length": 2048, + "pad_token": "", + "padding_side": "right", + "sp_model_kwargs": {}, + "spaces_between_special_tokens": false, + "tokenizer_class": "LlamaTokenizer", + "unk_token": "", + "use_default_system_prompt": false +} diff --git a/trainer_state.json b/trainer_state.json new file mode 100644 index 0000000..8c2cc01 --- /dev/null +++ b/trainer_state.json @@ -0,0 +1,4518 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 3.989333333333333, + "eval_steps": 1500, + "global_step": 748, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.01, + "learning_rate": 0.0, + "loss": 0.9634, + "step": 1 + }, + { + "epoch": 0.01, + "learning_rate": 4.075900941810124e-06, + "loss": 0.8891, + "step": 2 + }, + { + "epoch": 0.02, + "learning_rate": 6.46015014942309e-06, + "loss": 0.864, + "step": 3 + }, + { + "epoch": 0.02, + "learning_rate": 8.151801883620247e-06, + "loss": 0.8414, + "step": 4 + }, + { + "epoch": 0.03, + "learning_rate": 9.463948908766788e-06, + "loss": 0.8219, + "step": 5 + }, + { + "epoch": 0.03, + "learning_rate": 1.0536051091233212e-05, + "loss": 0.7505, + "step": 6 + }, + { + "epoch": 0.04, + "learning_rate": 1.1442500570809876e-05, + "loss": 0.8239, + "step": 7 + }, + { + "epoch": 0.04, + "learning_rate": 1.222770282543037e-05, + "loss": 0.801, + "step": 8 + }, + { + "epoch": 0.05, + "learning_rate": 1.292030029884618e-05, + "loss": 0.7912, + "step": 9 + }, + { + "epoch": 0.05, + "learning_rate": 1.3539849850576912e-05, + "loss": 0.8212, + "step": 10 + }, + { + "epoch": 0.06, + "learning_rate": 1.4100300592531481e-05, + "loss": 0.7757, + "step": 11 + }, + { + "epoch": 0.06, + "learning_rate": 1.4611952033043337e-05, + "loss": 0.7871, + "step": 12 + }, + { + "epoch": 0.07, + "learning_rate": 1.5082625732282867e-05, + "loss": 0.7995, + "step": 13 + }, + { + "epoch": 0.07, + "learning_rate": 
1.551840151262e-05, + "loss": 0.7386, + "step": 14 + }, + { + "epoch": 0.08, + "learning_rate": 1.5924099058189875e-05, + "loss": 0.8065, + "step": 15 + }, + { + "epoch": 0.09, + "learning_rate": 1.6303603767240495e-05, + "loss": 0.7834, + "step": 16 + }, + { + "epoch": 0.09, + "learning_rate": 1.6660093644266146e-05, + "loss": 0.7342, + "step": 17 + }, + { + "epoch": 0.1, + "learning_rate": 1.6996201240656302e-05, + "loss": 0.7971, + "step": 18 + }, + { + "epoch": 0.1, + "learning_rate": 1.7314131752785847e-05, + "loss": 0.7821, + "step": 19 + }, + { + "epoch": 0.11, + "learning_rate": 1.7615750792387035e-05, + "loss": 0.7848, + "step": 20 + }, + { + "epoch": 0.11, + "learning_rate": 1.7902650720232966e-05, + "loss": 0.8037, + "step": 21 + }, + { + "epoch": 0.12, + "learning_rate": 1.8176201534341607e-05, + "loss": 0.7851, + "step": 22 + }, + { + "epoch": 0.12, + "learning_rate": 1.8437590437029225e-05, + "loss": 0.7893, + "step": 23 + }, + { + "epoch": 0.13, + "learning_rate": 1.868785297485346e-05, + "loss": 0.7811, + "step": 24 + }, + { + "epoch": 0.13, + "learning_rate": 1.8927897817533575e-05, + "loss": 0.7177, + "step": 25 + }, + { + "epoch": 0.14, + "learning_rate": 1.915852667409299e-05, + "loss": 0.7524, + "step": 26 + }, + { + "epoch": 0.14, + "learning_rate": 1.9380450448269272e-05, + "loss": 0.7315, + "step": 27 + }, + { + "epoch": 0.15, + "learning_rate": 1.9594302454430122e-05, + "loss": 0.7926, + "step": 28 + }, + { + "epoch": 0.15, + "learning_rate": 1.9800649313336155e-05, + "loss": 0.7721, + "step": 29 + }, + { + "epoch": 0.16, + "learning_rate": 2e-05, + "loss": 0.7266, + "step": 30 + }, + { + "epoch": 0.17, + "learning_rate": 2e-05, + "loss": 0.8062, + "step": 31 + }, + { + "epoch": 0.17, + "learning_rate": 1.997214484679666e-05, + "loss": 0.7477, + "step": 32 + }, + { + "epoch": 0.18, + "learning_rate": 1.9944289693593316e-05, + "loss": 0.7553, + "step": 33 + }, + { + "epoch": 0.18, + "learning_rate": 1.9916434540389972e-05, + "loss": 0.7468, 
+ "step": 34 + }, + { + "epoch": 0.19, + "learning_rate": 1.988857938718663e-05, + "loss": 0.762, + "step": 35 + }, + { + "epoch": 0.19, + "learning_rate": 1.9860724233983287e-05, + "loss": 0.7367, + "step": 36 + }, + { + "epoch": 0.2, + "learning_rate": 1.9832869080779946e-05, + "loss": 0.7691, + "step": 37 + }, + { + "epoch": 0.2, + "learning_rate": 1.9805013927576605e-05, + "loss": 0.7683, + "step": 38 + }, + { + "epoch": 0.21, + "learning_rate": 1.977715877437326e-05, + "loss": 0.747, + "step": 39 + }, + { + "epoch": 0.21, + "learning_rate": 1.9749303621169917e-05, + "loss": 0.7157, + "step": 40 + }, + { + "epoch": 0.22, + "learning_rate": 1.9721448467966576e-05, + "loss": 0.7779, + "step": 41 + }, + { + "epoch": 0.22, + "learning_rate": 1.969359331476323e-05, + "loss": 0.6912, + "step": 42 + }, + { + "epoch": 0.23, + "learning_rate": 1.966573816155989e-05, + "loss": 0.7149, + "step": 43 + }, + { + "epoch": 0.23, + "learning_rate": 1.963788300835655e-05, + "loss": 0.7583, + "step": 44 + }, + { + "epoch": 0.24, + "learning_rate": 1.9610027855153206e-05, + "loss": 0.766, + "step": 45 + }, + { + "epoch": 0.25, + "learning_rate": 1.958217270194986e-05, + "loss": 0.7942, + "step": 46 + }, + { + "epoch": 0.25, + "learning_rate": 1.955431754874652e-05, + "loss": 0.7808, + "step": 47 + }, + { + "epoch": 0.26, + "learning_rate": 1.9526462395543176e-05, + "loss": 0.6882, + "step": 48 + }, + { + "epoch": 0.26, + "learning_rate": 1.9498607242339835e-05, + "loss": 0.7246, + "step": 49 + }, + { + "epoch": 0.27, + "learning_rate": 1.947075208913649e-05, + "loss": 0.7724, + "step": 50 + }, + { + "epoch": 0.27, + "learning_rate": 1.944289693593315e-05, + "loss": 0.7691, + "step": 51 + }, + { + "epoch": 0.28, + "learning_rate": 1.9415041782729806e-05, + "loss": 0.7796, + "step": 52 + }, + { + "epoch": 0.28, + "learning_rate": 1.9387186629526465e-05, + "loss": 0.7169, + "step": 53 + }, + { + "epoch": 0.29, + "learning_rate": 1.935933147632312e-05, + "loss": 0.712, + "step": 54 + 
}, + { + "epoch": 0.29, + "learning_rate": 1.9331476323119776e-05, + "loss": 0.7659, + "step": 55 + }, + { + "epoch": 0.3, + "learning_rate": 1.9303621169916436e-05, + "loss": 0.7156, + "step": 56 + }, + { + "epoch": 0.3, + "learning_rate": 1.9275766016713095e-05, + "loss": 0.7069, + "step": 57 + }, + { + "epoch": 0.31, + "learning_rate": 1.924791086350975e-05, + "loss": 0.7811, + "step": 58 + }, + { + "epoch": 0.31, + "learning_rate": 1.922005571030641e-05, + "loss": 0.7459, + "step": 59 + }, + { + "epoch": 0.32, + "learning_rate": 1.9192200557103065e-05, + "loss": 0.7148, + "step": 60 + }, + { + "epoch": 0.33, + "learning_rate": 1.916434540389972e-05, + "loss": 0.7076, + "step": 61 + }, + { + "epoch": 0.33, + "learning_rate": 1.913649025069638e-05, + "loss": 0.7489, + "step": 62 + }, + { + "epoch": 0.34, + "learning_rate": 1.910863509749304e-05, + "loss": 0.7567, + "step": 63 + }, + { + "epoch": 0.34, + "learning_rate": 1.9080779944289695e-05, + "loss": 0.7169, + "step": 64 + }, + { + "epoch": 0.35, + "learning_rate": 1.9052924791086354e-05, + "loss": 0.7197, + "step": 65 + }, + { + "epoch": 0.35, + "learning_rate": 1.902506963788301e-05, + "loss": 0.7332, + "step": 66 + }, + { + "epoch": 0.36, + "learning_rate": 1.8997214484679666e-05, + "loss": 0.7356, + "step": 67 + }, + { + "epoch": 0.36, + "learning_rate": 1.8969359331476325e-05, + "loss": 0.6868, + "step": 68 + }, + { + "epoch": 0.37, + "learning_rate": 1.8941504178272984e-05, + "loss": 0.7341, + "step": 69 + }, + { + "epoch": 0.37, + "learning_rate": 1.891364902506964e-05, + "loss": 0.7512, + "step": 70 + }, + { + "epoch": 0.38, + "learning_rate": 1.8885793871866295e-05, + "loss": 0.7405, + "step": 71 + }, + { + "epoch": 0.38, + "learning_rate": 1.8857938718662954e-05, + "loss": 0.7553, + "step": 72 + }, + { + "epoch": 0.39, + "learning_rate": 1.883008356545961e-05, + "loss": 0.715, + "step": 73 + }, + { + "epoch": 0.39, + "learning_rate": 1.880222841225627e-05, + "loss": 0.7804, + "step": 74 + }, + { + 
"epoch": 0.4, + "learning_rate": 1.877437325905293e-05, + "loss": 0.7323, + "step": 75 + }, + { + "epoch": 0.41, + "learning_rate": 1.8746518105849584e-05, + "loss": 0.7231, + "step": 76 + }, + { + "epoch": 0.41, + "learning_rate": 1.871866295264624e-05, + "loss": 0.7245, + "step": 77 + }, + { + "epoch": 0.42, + "learning_rate": 1.86908077994429e-05, + "loss": 0.6792, + "step": 78 + }, + { + "epoch": 0.42, + "learning_rate": 1.8662952646239555e-05, + "loss": 0.7587, + "step": 79 + }, + { + "epoch": 0.43, + "learning_rate": 1.8635097493036214e-05, + "loss": 0.7574, + "step": 80 + }, + { + "epoch": 0.43, + "learning_rate": 1.8607242339832873e-05, + "loss": 0.7613, + "step": 81 + }, + { + "epoch": 0.44, + "learning_rate": 1.857938718662953e-05, + "loss": 0.7187, + "step": 82 + }, + { + "epoch": 0.44, + "learning_rate": 1.8551532033426184e-05, + "loss": 0.7441, + "step": 83 + }, + { + "epoch": 0.45, + "learning_rate": 1.8523676880222844e-05, + "loss": 0.7014, + "step": 84 + }, + { + "epoch": 0.45, + "learning_rate": 1.84958217270195e-05, + "loss": 0.7341, + "step": 85 + }, + { + "epoch": 0.46, + "learning_rate": 1.846796657381616e-05, + "loss": 0.7455, + "step": 86 + }, + { + "epoch": 0.46, + "learning_rate": 1.8440111420612814e-05, + "loss": 0.716, + "step": 87 + }, + { + "epoch": 0.47, + "learning_rate": 1.8412256267409473e-05, + "loss": 0.744, + "step": 88 + }, + { + "epoch": 0.47, + "learning_rate": 1.838440111420613e-05, + "loss": 0.7311, + "step": 89 + }, + { + "epoch": 0.48, + "learning_rate": 1.8356545961002788e-05, + "loss": 0.7437, + "step": 90 + }, + { + "epoch": 0.49, + "learning_rate": 1.8328690807799444e-05, + "loss": 0.6934, + "step": 91 + }, + { + "epoch": 0.49, + "learning_rate": 1.83008356545961e-05, + "loss": 0.6955, + "step": 92 + }, + { + "epoch": 0.5, + "learning_rate": 1.827298050139276e-05, + "loss": 0.7427, + "step": 93 + }, + { + "epoch": 0.5, + "learning_rate": 1.8245125348189418e-05, + "loss": 0.7353, + "step": 94 + }, + { + "epoch": 0.51, + 
"learning_rate": 1.8217270194986074e-05, + "loss": 0.7241, + "step": 95 + }, + { + "epoch": 0.51, + "learning_rate": 1.8189415041782733e-05, + "loss": 0.6855, + "step": 96 + }, + { + "epoch": 0.52, + "learning_rate": 1.816155988857939e-05, + "loss": 0.7952, + "step": 97 + }, + { + "epoch": 0.52, + "learning_rate": 1.8133704735376044e-05, + "loss": 0.7749, + "step": 98 + }, + { + "epoch": 0.53, + "learning_rate": 1.8105849582172703e-05, + "loss": 0.7338, + "step": 99 + }, + { + "epoch": 0.53, + "learning_rate": 1.8077994428969362e-05, + "loss": 0.7474, + "step": 100 + }, + { + "epoch": 0.54, + "learning_rate": 1.8050139275766018e-05, + "loss": 0.7679, + "step": 101 + }, + { + "epoch": 0.54, + "learning_rate": 1.8022284122562677e-05, + "loss": 0.745, + "step": 102 + }, + { + "epoch": 0.55, + "learning_rate": 1.7994428969359333e-05, + "loss": 0.6667, + "step": 103 + }, + { + "epoch": 0.55, + "learning_rate": 1.796657381615599e-05, + "loss": 0.7154, + "step": 104 + }, + { + "epoch": 0.56, + "learning_rate": 1.7938718662952648e-05, + "loss": 0.7107, + "step": 105 + }, + { + "epoch": 0.57, + "learning_rate": 1.7910863509749307e-05, + "loss": 0.7599, + "step": 106 + }, + { + "epoch": 0.57, + "learning_rate": 1.7883008356545963e-05, + "loss": 0.7588, + "step": 107 + }, + { + "epoch": 0.58, + "learning_rate": 1.785515320334262e-05, + "loss": 0.7296, + "step": 108 + }, + { + "epoch": 0.58, + "learning_rate": 1.7827298050139278e-05, + "loss": 0.7387, + "step": 109 + }, + { + "epoch": 0.59, + "learning_rate": 1.7799442896935933e-05, + "loss": 0.7597, + "step": 110 + }, + { + "epoch": 0.59, + "learning_rate": 1.7771587743732592e-05, + "loss": 0.6698, + "step": 111 + }, + { + "epoch": 0.6, + "learning_rate": 1.774373259052925e-05, + "loss": 0.7372, + "step": 112 + }, + { + "epoch": 0.6, + "learning_rate": 1.7715877437325907e-05, + "loss": 0.7161, + "step": 113 + }, + { + "epoch": 0.61, + "learning_rate": 1.7688022284122563e-05, + "loss": 0.7396, + "step": 114 + }, + { + "epoch": 
0.61, + "learning_rate": 1.7660167130919222e-05, + "loss": 0.7416, + "step": 115 + }, + { + "epoch": 0.62, + "learning_rate": 1.7632311977715878e-05, + "loss": 0.6572, + "step": 116 + }, + { + "epoch": 0.62, + "learning_rate": 1.7604456824512537e-05, + "loss": 0.7529, + "step": 117 + }, + { + "epoch": 0.63, + "learning_rate": 1.7576601671309196e-05, + "loss": 0.7582, + "step": 118 + }, + { + "epoch": 0.63, + "learning_rate": 1.7548746518105852e-05, + "loss": 0.6922, + "step": 119 + }, + { + "epoch": 0.64, + "learning_rate": 1.7520891364902508e-05, + "loss": 0.7142, + "step": 120 + }, + { + "epoch": 0.65, + "learning_rate": 1.7493036211699167e-05, + "loss": 0.6857, + "step": 121 + }, + { + "epoch": 0.65, + "learning_rate": 1.7465181058495822e-05, + "loss": 0.7728, + "step": 122 + }, + { + "epoch": 0.66, + "learning_rate": 1.743732590529248e-05, + "loss": 0.7027, + "step": 123 + }, + { + "epoch": 0.66, + "learning_rate": 1.7409470752089137e-05, + "loss": 0.7962, + "step": 124 + }, + { + "epoch": 0.67, + "learning_rate": 1.7381615598885796e-05, + "loss": 0.6832, + "step": 125 + }, + { + "epoch": 0.67, + "learning_rate": 1.7353760445682452e-05, + "loss": 0.6889, + "step": 126 + }, + { + "epoch": 0.68, + "learning_rate": 1.732590529247911e-05, + "loss": 0.6644, + "step": 127 + }, + { + "epoch": 0.68, + "learning_rate": 1.7298050139275767e-05, + "loss": 0.7192, + "step": 128 + }, + { + "epoch": 0.69, + "learning_rate": 1.7270194986072423e-05, + "loss": 0.6882, + "step": 129 + }, + { + "epoch": 0.69, + "learning_rate": 1.7242339832869082e-05, + "loss": 0.6616, + "step": 130 + }, + { + "epoch": 0.7, + "learning_rate": 1.721448467966574e-05, + "loss": 0.756, + "step": 131 + }, + { + "epoch": 0.7, + "learning_rate": 1.7186629526462397e-05, + "loss": 0.7184, + "step": 132 + }, + { + "epoch": 0.71, + "learning_rate": 1.7158774373259056e-05, + "loss": 0.7306, + "step": 133 + }, + { + "epoch": 0.71, + "learning_rate": 1.713091922005571e-05, + "loss": 0.6996, + "step": 134 + }, + 
{ + "epoch": 0.72, + "learning_rate": 1.7103064066852367e-05, + "loss": 0.7146, + "step": 135 + }, + { + "epoch": 0.73, + "learning_rate": 1.7075208913649026e-05, + "loss": 0.7465, + "step": 136 + }, + { + "epoch": 0.73, + "learning_rate": 1.7047353760445685e-05, + "loss": 0.6778, + "step": 137 + }, + { + "epoch": 0.74, + "learning_rate": 1.701949860724234e-05, + "loss": 0.6771, + "step": 138 + }, + { + "epoch": 0.74, + "learning_rate": 1.6991643454039e-05, + "loss": 0.6967, + "step": 139 + }, + { + "epoch": 0.75, + "learning_rate": 1.6963788300835656e-05, + "loss": 0.7647, + "step": 140 + }, + { + "epoch": 0.75, + "learning_rate": 1.6935933147632312e-05, + "loss": 0.7284, + "step": 141 + }, + { + "epoch": 0.76, + "learning_rate": 1.690807799442897e-05, + "loss": 0.7412, + "step": 142 + }, + { + "epoch": 0.76, + "learning_rate": 1.688022284122563e-05, + "loss": 0.7198, + "step": 143 + }, + { + "epoch": 0.77, + "learning_rate": 1.6852367688022286e-05, + "loss": 0.6541, + "step": 144 + }, + { + "epoch": 0.77, + "learning_rate": 1.682451253481894e-05, + "loss": 0.6689, + "step": 145 + }, + { + "epoch": 0.78, + "learning_rate": 1.67966573816156e-05, + "loss": 0.7155, + "step": 146 + }, + { + "epoch": 0.78, + "learning_rate": 1.6768802228412256e-05, + "loss": 0.7244, + "step": 147 + }, + { + "epoch": 0.79, + "learning_rate": 1.6740947075208915e-05, + "loss": 0.6763, + "step": 148 + }, + { + "epoch": 0.79, + "learning_rate": 1.6713091922005575e-05, + "loss": 0.6826, + "step": 149 + }, + { + "epoch": 0.8, + "learning_rate": 1.668523676880223e-05, + "loss": 0.7343, + "step": 150 + }, + { + "epoch": 0.81, + "learning_rate": 1.6657381615598886e-05, + "loss": 0.7756, + "step": 151 + }, + { + "epoch": 0.81, + "learning_rate": 1.6629526462395545e-05, + "loss": 0.754, + "step": 152 + }, + { + "epoch": 0.82, + "learning_rate": 1.66016713091922e-05, + "loss": 0.692, + "step": 153 + }, + { + "epoch": 0.82, + "learning_rate": 1.657381615598886e-05, + "loss": 0.7303, + "step": 154 + 
}, + { + "epoch": 0.83, + "learning_rate": 1.654596100278552e-05, + "loss": 0.678, + "step": 155 + }, + { + "epoch": 0.83, + "learning_rate": 1.6518105849582175e-05, + "loss": 0.7851, + "step": 156 + }, + { + "epoch": 0.84, + "learning_rate": 1.649025069637883e-05, + "loss": 0.7123, + "step": 157 + }, + { + "epoch": 0.84, + "learning_rate": 1.646239554317549e-05, + "loss": 0.6835, + "step": 158 + }, + { + "epoch": 0.85, + "learning_rate": 1.6434540389972145e-05, + "loss": 0.7142, + "step": 159 + }, + { + "epoch": 0.85, + "learning_rate": 1.6406685236768805e-05, + "loss": 0.6943, + "step": 160 + }, + { + "epoch": 0.86, + "learning_rate": 1.637883008356546e-05, + "loss": 0.704, + "step": 161 + }, + { + "epoch": 0.86, + "learning_rate": 1.635097493036212e-05, + "loss": 0.7443, + "step": 162 + }, + { + "epoch": 0.87, + "learning_rate": 1.6323119777158775e-05, + "loss": 0.6624, + "step": 163 + }, + { + "epoch": 0.87, + "learning_rate": 1.6295264623955434e-05, + "loss": 0.7159, + "step": 164 + }, + { + "epoch": 0.88, + "learning_rate": 1.626740947075209e-05, + "loss": 0.6738, + "step": 165 + }, + { + "epoch": 0.89, + "learning_rate": 1.6239554317548746e-05, + "loss": 0.7127, + "step": 166 + }, + { + "epoch": 0.89, + "learning_rate": 1.6211699164345405e-05, + "loss": 0.7693, + "step": 167 + }, + { + "epoch": 0.9, + "learning_rate": 1.6183844011142064e-05, + "loss": 0.7973, + "step": 168 + }, + { + "epoch": 0.9, + "learning_rate": 1.615598885793872e-05, + "loss": 0.6875, + "step": 169 + }, + { + "epoch": 0.91, + "learning_rate": 1.612813370473538e-05, + "loss": 0.7091, + "step": 170 + }, + { + "epoch": 0.91, + "learning_rate": 1.6100278551532035e-05, + "loss": 0.6932, + "step": 171 + }, + { + "epoch": 0.92, + "learning_rate": 1.607242339832869e-05, + "loss": 0.6775, + "step": 172 + }, + { + "epoch": 0.92, + "learning_rate": 1.604456824512535e-05, + "loss": 0.7743, + "step": 173 + }, + { + "epoch": 0.93, + "learning_rate": 1.601671309192201e-05, + "loss": 0.6776, + "step": 
174 + }, + { + "epoch": 0.93, + "learning_rate": 1.5988857938718664e-05, + "loss": 0.7271, + "step": 175 + }, + { + "epoch": 0.94, + "learning_rate": 1.5961002785515323e-05, + "loss": 0.6949, + "step": 176 + }, + { + "epoch": 0.94, + "learning_rate": 1.593314763231198e-05, + "loss": 0.7744, + "step": 177 + }, + { + "epoch": 0.95, + "learning_rate": 1.5905292479108635e-05, + "loss": 0.6984, + "step": 178 + }, + { + "epoch": 0.95, + "learning_rate": 1.5877437325905294e-05, + "loss": 0.6812, + "step": 179 + }, + { + "epoch": 0.96, + "learning_rate": 1.5849582172701953e-05, + "loss": 0.7237, + "step": 180 + }, + { + "epoch": 0.97, + "learning_rate": 1.582172701949861e-05, + "loss": 0.6699, + "step": 181 + }, + { + "epoch": 0.97, + "learning_rate": 1.5793871866295265e-05, + "loss": 0.7122, + "step": 182 + }, + { + "epoch": 0.98, + "learning_rate": 1.5766016713091924e-05, + "loss": 0.7157, + "step": 183 + }, + { + "epoch": 0.98, + "learning_rate": 1.573816155988858e-05, + "loss": 0.7429, + "step": 184 + }, + { + "epoch": 0.99, + "learning_rate": 1.571030640668524e-05, + "loss": 0.7287, + "step": 185 + }, + { + "epoch": 0.99, + "learning_rate": 1.5682451253481898e-05, + "loss": 0.7633, + "step": 186 + }, + { + "epoch": 1.0, + "learning_rate": 1.5654596100278553e-05, + "loss": 0.6603, + "step": 187 + }, + { + "epoch": 1.0, + "learning_rate": 1.562674094707521e-05, + "loss": 0.5963, + "step": 188 + }, + { + "epoch": 1.01, + "learning_rate": 1.5598885793871868e-05, + "loss": 0.6036, + "step": 189 + }, + { + "epoch": 1.01, + "learning_rate": 1.5571030640668524e-05, + "loss": 0.5169, + "step": 190 + }, + { + "epoch": 1.02, + "learning_rate": 1.5543175487465183e-05, + "loss": 0.5438, + "step": 191 + }, + { + "epoch": 1.02, + "learning_rate": 1.5515320334261842e-05, + "loss": 0.5503, + "step": 192 + }, + { + "epoch": 1.03, + "learning_rate": 1.5487465181058498e-05, + "loss": 0.5024, + "step": 193 + }, + { + "epoch": 1.03, + "learning_rate": 1.5459610027855154e-05, + "loss": 
0.5534, + "step": 194 + }, + { + "epoch": 1.04, + "learning_rate": 1.5431754874651813e-05, + "loss": 0.5603, + "step": 195 + }, + { + "epoch": 1.05, + "learning_rate": 1.540389972144847e-05, + "loss": 0.5362, + "step": 196 + }, + { + "epoch": 1.05, + "learning_rate": 1.5376044568245128e-05, + "loss": 0.5615, + "step": 197 + }, + { + "epoch": 1.06, + "learning_rate": 1.5348189415041783e-05, + "loss": 0.5342, + "step": 198 + }, + { + "epoch": 1.06, + "learning_rate": 1.5320334261838443e-05, + "loss": 0.4855, + "step": 199 + }, + { + "epoch": 1.07, + "learning_rate": 1.5292479108635098e-05, + "loss": 0.4784, + "step": 200 + }, + { + "epoch": 1.07, + "learning_rate": 1.5264623955431757e-05, + "loss": 0.5959, + "step": 201 + }, + { + "epoch": 1.08, + "learning_rate": 1.5236768802228415e-05, + "loss": 0.54, + "step": 202 + }, + { + "epoch": 1.08, + "learning_rate": 1.520891364902507e-05, + "loss": 0.5162, + "step": 203 + }, + { + "epoch": 1.09, + "learning_rate": 1.5181058495821728e-05, + "loss": 0.4735, + "step": 204 + }, + { + "epoch": 1.09, + "learning_rate": 1.5153203342618385e-05, + "loss": 0.5283, + "step": 205 + }, + { + "epoch": 1.1, + "learning_rate": 1.5125348189415043e-05, + "loss": 0.5097, + "step": 206 + }, + { + "epoch": 1.1, + "learning_rate": 1.5097493036211702e-05, + "loss": 0.5125, + "step": 207 + }, + { + "epoch": 1.11, + "learning_rate": 1.5069637883008356e-05, + "loss": 0.4672, + "step": 208 + }, + { + "epoch": 1.11, + "learning_rate": 1.5041782729805015e-05, + "loss": 0.5211, + "step": 209 + }, + { + "epoch": 1.12, + "learning_rate": 1.5013927576601673e-05, + "loss": 0.4612, + "step": 210 + }, + { + "epoch": 1.13, + "learning_rate": 1.498607242339833e-05, + "loss": 0.4927, + "step": 211 + }, + { + "epoch": 1.13, + "learning_rate": 1.4958217270194987e-05, + "loss": 0.5622, + "step": 212 + }, + { + "epoch": 1.14, + "learning_rate": 1.4930362116991646e-05, + "loss": 0.4966, + "step": 213 + }, + { + "epoch": 1.14, + "learning_rate": 
1.49025069637883e-05, + "loss": 0.5457, + "step": 214 + }, + { + "epoch": 1.15, + "learning_rate": 1.487465181058496e-05, + "loss": 0.5371, + "step": 215 + }, + { + "epoch": 1.15, + "learning_rate": 1.4846796657381617e-05, + "loss": 0.4752, + "step": 216 + }, + { + "epoch": 1.16, + "learning_rate": 1.4818941504178274e-05, + "loss": 0.5204, + "step": 217 + }, + { + "epoch": 1.16, + "learning_rate": 1.4791086350974932e-05, + "loss": 0.5688, + "step": 218 + }, + { + "epoch": 1.17, + "learning_rate": 1.4763231197771588e-05, + "loss": 0.5434, + "step": 219 + }, + { + "epoch": 1.17, + "learning_rate": 1.4735376044568245e-05, + "loss": 0.5623, + "step": 220 + }, + { + "epoch": 1.18, + "learning_rate": 1.4707520891364904e-05, + "loss": 0.5338, + "step": 221 + }, + { + "epoch": 1.18, + "learning_rate": 1.4679665738161562e-05, + "loss": 0.5795, + "step": 222 + }, + { + "epoch": 1.19, + "learning_rate": 1.4651810584958219e-05, + "loss": 0.5234, + "step": 223 + }, + { + "epoch": 1.19, + "learning_rate": 1.4623955431754875e-05, + "loss": 0.52, + "step": 224 + }, + { + "epoch": 1.2, + "learning_rate": 1.4596100278551532e-05, + "loss": 0.5136, + "step": 225 + }, + { + "epoch": 1.21, + "learning_rate": 1.456824512534819e-05, + "loss": 0.5022, + "step": 226 + }, + { + "epoch": 1.21, + "learning_rate": 1.4540389972144849e-05, + "loss": 0.4752, + "step": 227 + }, + { + "epoch": 1.22, + "learning_rate": 1.4512534818941506e-05, + "loss": 0.4854, + "step": 228 + }, + { + "epoch": 1.22, + "learning_rate": 1.4484679665738162e-05, + "loss": 0.5661, + "step": 229 + }, + { + "epoch": 1.23, + "learning_rate": 1.445682451253482e-05, + "loss": 0.5114, + "step": 230 + }, + { + "epoch": 1.23, + "learning_rate": 1.4428969359331477e-05, + "loss": 0.5129, + "step": 231 + }, + { + "epoch": 1.24, + "learning_rate": 1.4401114206128134e-05, + "loss": 0.5214, + "step": 232 + }, + { + "epoch": 1.24, + "learning_rate": 1.4373259052924793e-05, + "loss": 0.542, + "step": 233 + }, + { + "epoch": 1.25, + 
"learning_rate": 1.434540389972145e-05, + "loss": 0.442, + "step": 234 + }, + { + "epoch": 1.25, + "learning_rate": 1.4317548746518106e-05, + "loss": 0.5439, + "step": 235 + }, + { + "epoch": 1.26, + "learning_rate": 1.4289693593314764e-05, + "loss": 0.5376, + "step": 236 + }, + { + "epoch": 1.26, + "learning_rate": 1.4261838440111421e-05, + "loss": 0.4834, + "step": 237 + }, + { + "epoch": 1.27, + "learning_rate": 1.423398328690808e-05, + "loss": 0.4668, + "step": 238 + }, + { + "epoch": 1.27, + "learning_rate": 1.4206128133704738e-05, + "loss": 0.5349, + "step": 239 + }, + { + "epoch": 1.28, + "learning_rate": 1.4178272980501394e-05, + "loss": 0.536, + "step": 240 + }, + { + "epoch": 1.29, + "learning_rate": 1.4150417827298051e-05, + "loss": 0.578, + "step": 241 + }, + { + "epoch": 1.29, + "learning_rate": 1.4122562674094708e-05, + "loss": 0.4989, + "step": 242 + }, + { + "epoch": 1.3, + "learning_rate": 1.4094707520891366e-05, + "loss": 0.4977, + "step": 243 + }, + { + "epoch": 1.3, + "learning_rate": 1.4066852367688025e-05, + "loss": 0.4675, + "step": 244 + }, + { + "epoch": 1.31, + "learning_rate": 1.4038997214484679e-05, + "loss": 0.591, + "step": 245 + }, + { + "epoch": 1.31, + "learning_rate": 1.4011142061281338e-05, + "loss": 0.5557, + "step": 246 + }, + { + "epoch": 1.32, + "learning_rate": 1.3983286908077996e-05, + "loss": 0.4777, + "step": 247 + }, + { + "epoch": 1.32, + "learning_rate": 1.3955431754874653e-05, + "loss": 0.5341, + "step": 248 + }, + { + "epoch": 1.33, + "learning_rate": 1.392757660167131e-05, + "loss": 0.5668, + "step": 249 + }, + { + "epoch": 1.33, + "learning_rate": 1.389972144846797e-05, + "loss": 0.5738, + "step": 250 + }, + { + "epoch": 1.34, + "learning_rate": 1.3871866295264624e-05, + "loss": 0.531, + "step": 251 + }, + { + "epoch": 1.34, + "learning_rate": 1.3844011142061283e-05, + "loss": 0.5138, + "step": 252 + }, + { + "epoch": 1.35, + "learning_rate": 1.381615598885794e-05, + "loss": 0.5201, + "step": 253 + }, + { + "epoch": 
1.35, + "learning_rate": 1.3788300835654598e-05, + "loss": 0.5208, + "step": 254 + }, + { + "epoch": 1.36, + "learning_rate": 1.3760445682451255e-05, + "loss": 0.5228, + "step": 255 + }, + { + "epoch": 1.37, + "learning_rate": 1.373259052924791e-05, + "loss": 0.4848, + "step": 256 + }, + { + "epoch": 1.37, + "learning_rate": 1.3704735376044568e-05, + "loss": 0.5285, + "step": 257 + }, + { + "epoch": 1.38, + "learning_rate": 1.3676880222841227e-05, + "loss": 0.4899, + "step": 258 + }, + { + "epoch": 1.38, + "learning_rate": 1.3649025069637885e-05, + "loss": 0.5077, + "step": 259 + }, + { + "epoch": 1.39, + "learning_rate": 1.3621169916434542e-05, + "loss": 0.4493, + "step": 260 + }, + { + "epoch": 1.39, + "learning_rate": 1.3593314763231198e-05, + "loss": 0.5259, + "step": 261 + }, + { + "epoch": 1.4, + "learning_rate": 1.3565459610027855e-05, + "loss": 0.5429, + "step": 262 + }, + { + "epoch": 1.4, + "learning_rate": 1.3537604456824513e-05, + "loss": 0.564, + "step": 263 + }, + { + "epoch": 1.41, + "learning_rate": 1.3509749303621172e-05, + "loss": 0.5411, + "step": 264 + }, + { + "epoch": 1.41, + "learning_rate": 1.348189415041783e-05, + "loss": 0.4892, + "step": 265 + }, + { + "epoch": 1.42, + "learning_rate": 1.3454038997214485e-05, + "loss": 0.5519, + "step": 266 + }, + { + "epoch": 1.42, + "learning_rate": 1.3426183844011142e-05, + "loss": 0.516, + "step": 267 + }, + { + "epoch": 1.43, + "learning_rate": 1.33983286908078e-05, + "loss": 0.5744, + "step": 268 + }, + { + "epoch": 1.43, + "learning_rate": 1.3370473537604457e-05, + "loss": 0.5618, + "step": 269 + }, + { + "epoch": 1.44, + "learning_rate": 1.3342618384401116e-05, + "loss": 0.4882, + "step": 270 + }, + { + "epoch": 1.45, + "learning_rate": 1.3314763231197774e-05, + "loss": 0.5292, + "step": 271 + }, + { + "epoch": 1.45, + "learning_rate": 1.328690807799443e-05, + "loss": 0.5124, + "step": 272 + }, + { + "epoch": 1.46, + "learning_rate": 1.3259052924791087e-05, + "loss": 0.5615, + "step": 273 + }, + { 
+ "epoch": 1.46, + "learning_rate": 1.3231197771587744e-05, + "loss": 0.5456, + "step": 274 + }, + { + "epoch": 1.47, + "learning_rate": 1.3203342618384402e-05, + "loss": 0.4812, + "step": 275 + }, + { + "epoch": 1.47, + "learning_rate": 1.3175487465181061e-05, + "loss": 0.4531, + "step": 276 + }, + { + "epoch": 1.48, + "learning_rate": 1.3147632311977717e-05, + "loss": 0.5586, + "step": 277 + }, + { + "epoch": 1.48, + "learning_rate": 1.3119777158774374e-05, + "loss": 0.4996, + "step": 278 + }, + { + "epoch": 1.49, + "learning_rate": 1.3091922005571032e-05, + "loss": 0.5681, + "step": 279 + }, + { + "epoch": 1.49, + "learning_rate": 1.3064066852367689e-05, + "loss": 0.5586, + "step": 280 + }, + { + "epoch": 1.5, + "learning_rate": 1.3036211699164346e-05, + "loss": 0.4863, + "step": 281 + }, + { + "epoch": 1.5, + "learning_rate": 1.3008356545961002e-05, + "loss": 0.5528, + "step": 282 + }, + { + "epoch": 1.51, + "learning_rate": 1.2980501392757661e-05, + "loss": 0.4584, + "step": 283 + }, + { + "epoch": 1.51, + "learning_rate": 1.2952646239554319e-05, + "loss": 0.5344, + "step": 284 + }, + { + "epoch": 1.52, + "learning_rate": 1.2924791086350976e-05, + "loss": 0.5193, + "step": 285 + }, + { + "epoch": 1.53, + "learning_rate": 1.2896935933147634e-05, + "loss": 0.4992, + "step": 286 + }, + { + "epoch": 1.53, + "learning_rate": 1.2869080779944293e-05, + "loss": 0.4881, + "step": 287 + }, + { + "epoch": 1.54, + "learning_rate": 1.2841225626740947e-05, + "loss": 0.5232, + "step": 288 + }, + { + "epoch": 1.54, + "learning_rate": 1.2813370473537606e-05, + "loss": 0.4895, + "step": 289 + }, + { + "epoch": 1.55, + "learning_rate": 1.2785515320334263e-05, + "loss": 0.5072, + "step": 290 + }, + { + "epoch": 1.55, + "learning_rate": 1.275766016713092e-05, + "loss": 0.5274, + "step": 291 + }, + { + "epoch": 1.56, + "learning_rate": 1.2729805013927578e-05, + "loss": 0.5205, + "step": 292 + }, + { + "epoch": 1.56, + "learning_rate": 1.2701949860724234e-05, + "loss": 0.5495, + 
"step": 293 + }, + { + "epoch": 1.57, + "learning_rate": 1.2674094707520891e-05, + "loss": 0.5012, + "step": 294 + }, + { + "epoch": 1.57, + "learning_rate": 1.264623955431755e-05, + "loss": 0.547, + "step": 295 + }, + { + "epoch": 1.58, + "learning_rate": 1.2618384401114208e-05, + "loss": 0.5127, + "step": 296 + }, + { + "epoch": 1.58, + "learning_rate": 1.2590529247910865e-05, + "loss": 0.5416, + "step": 297 + }, + { + "epoch": 1.59, + "learning_rate": 1.2562674094707521e-05, + "loss": 0.5356, + "step": 298 + }, + { + "epoch": 1.59, + "learning_rate": 1.2534818941504178e-05, + "loss": 0.5364, + "step": 299 + }, + { + "epoch": 1.6, + "learning_rate": 1.2506963788300836e-05, + "loss": 0.4805, + "step": 300 + }, + { + "epoch": 1.61, + "learning_rate": 1.2479108635097495e-05, + "loss": 0.529, + "step": 301 + }, + { + "epoch": 1.61, + "learning_rate": 1.2451253481894152e-05, + "loss": 0.5331, + "step": 302 + }, + { + "epoch": 1.62, + "learning_rate": 1.2423398328690808e-05, + "loss": 0.5387, + "step": 303 + }, + { + "epoch": 1.62, + "learning_rate": 1.2395543175487466e-05, + "loss": 0.5064, + "step": 304 + }, + { + "epoch": 1.63, + "learning_rate": 1.2367688022284123e-05, + "loss": 0.4896, + "step": 305 + }, + { + "epoch": 1.63, + "learning_rate": 1.233983286908078e-05, + "loss": 0.5482, + "step": 306 + }, + { + "epoch": 1.64, + "learning_rate": 1.231197771587744e-05, + "loss": 0.4779, + "step": 307 + }, + { + "epoch": 1.64, + "learning_rate": 1.2284122562674097e-05, + "loss": 0.5335, + "step": 308 + }, + { + "epoch": 1.65, + "learning_rate": 1.2256267409470753e-05, + "loss": 0.5403, + "step": 309 + }, + { + "epoch": 1.65, + "learning_rate": 1.222841225626741e-05, + "loss": 0.558, + "step": 310 + }, + { + "epoch": 1.66, + "learning_rate": 1.2200557103064068e-05, + "loss": 0.537, + "step": 311 + }, + { + "epoch": 1.66, + "learning_rate": 1.2172701949860725e-05, + "loss": 0.5269, + "step": 312 + }, + { + "epoch": 1.67, + "learning_rate": 1.2144846796657384e-05, + 
"loss": 0.5331, + "step": 313 + }, + { + "epoch": 1.67, + "learning_rate": 1.211699164345404e-05, + "loss": 0.5466, + "step": 314 + }, + { + "epoch": 1.68, + "learning_rate": 1.2089136490250697e-05, + "loss": 0.4924, + "step": 315 + }, + { + "epoch": 1.69, + "learning_rate": 1.2061281337047355e-05, + "loss": 0.4994, + "step": 316 + }, + { + "epoch": 1.69, + "learning_rate": 1.2033426183844012e-05, + "loss": 0.6143, + "step": 317 + }, + { + "epoch": 1.7, + "learning_rate": 1.200557103064067e-05, + "loss": 0.5136, + "step": 318 + }, + { + "epoch": 1.7, + "learning_rate": 1.1977715877437325e-05, + "loss": 0.5942, + "step": 319 + }, + { + "epoch": 1.71, + "learning_rate": 1.1949860724233984e-05, + "loss": 0.4808, + "step": 320 + }, + { + "epoch": 1.71, + "learning_rate": 1.1922005571030642e-05, + "loss": 0.4621, + "step": 321 + }, + { + "epoch": 1.72, + "learning_rate": 1.18941504178273e-05, + "loss": 0.5023, + "step": 322 + }, + { + "epoch": 1.72, + "learning_rate": 1.1866295264623957e-05, + "loss": 0.5064, + "step": 323 + }, + { + "epoch": 1.73, + "learning_rate": 1.1838440111420614e-05, + "loss": 0.5273, + "step": 324 + }, + { + "epoch": 1.73, + "learning_rate": 1.181058495821727e-05, + "loss": 0.4781, + "step": 325 + }, + { + "epoch": 1.74, + "learning_rate": 1.1782729805013929e-05, + "loss": 0.5084, + "step": 326 + }, + { + "epoch": 1.74, + "learning_rate": 1.1754874651810586e-05, + "loss": 0.5402, + "step": 327 + }, + { + "epoch": 1.75, + "learning_rate": 1.1727019498607244e-05, + "loss": 0.5296, + "step": 328 + }, + { + "epoch": 1.75, + "learning_rate": 1.1699164345403901e-05, + "loss": 0.5295, + "step": 329 + }, + { + "epoch": 1.76, + "learning_rate": 1.1671309192200557e-05, + "loss": 0.4955, + "step": 330 + }, + { + "epoch": 1.77, + "learning_rate": 1.1643454038997214e-05, + "loss": 0.5077, + "step": 331 + }, + { + "epoch": 1.77, + "learning_rate": 1.1615598885793873e-05, + "loss": 0.5104, + "step": 332 + }, + { + "epoch": 1.78, + "learning_rate": 
1.1587743732590531e-05, + "loss": 0.5178, + "step": 333 + }, + { + "epoch": 1.78, + "learning_rate": 1.1559888579387188e-05, + "loss": 0.493, + "step": 334 + }, + { + "epoch": 1.79, + "learning_rate": 1.1532033426183844e-05, + "loss": 0.5106, + "step": 335 + }, + { + "epoch": 1.79, + "learning_rate": 1.1504178272980501e-05, + "loss": 0.4425, + "step": 336 + }, + { + "epoch": 1.8, + "learning_rate": 1.1476323119777159e-05, + "loss": 0.4949, + "step": 337 + }, + { + "epoch": 1.8, + "learning_rate": 1.1448467966573818e-05, + "loss": 0.5441, + "step": 338 + }, + { + "epoch": 1.81, + "learning_rate": 1.1420612813370475e-05, + "loss": 0.5127, + "step": 339 + }, + { + "epoch": 1.81, + "learning_rate": 1.1392757660167131e-05, + "loss": 0.5445, + "step": 340 + }, + { + "epoch": 1.82, + "learning_rate": 1.1364902506963789e-05, + "loss": 0.4872, + "step": 341 + }, + { + "epoch": 1.82, + "learning_rate": 1.1337047353760446e-05, + "loss": 0.515, + "step": 342 + }, + { + "epoch": 1.83, + "learning_rate": 1.1309192200557103e-05, + "loss": 0.5759, + "step": 343 + }, + { + "epoch": 1.83, + "learning_rate": 1.1281337047353763e-05, + "loss": 0.5121, + "step": 344 + }, + { + "epoch": 1.84, + "learning_rate": 1.125348189415042e-05, + "loss": 0.5123, + "step": 345 + }, + { + "epoch": 1.85, + "learning_rate": 1.1225626740947076e-05, + "loss": 0.5289, + "step": 346 + }, + { + "epoch": 1.85, + "learning_rate": 1.1197771587743733e-05, + "loss": 0.482, + "step": 347 + }, + { + "epoch": 1.86, + "learning_rate": 1.116991643454039e-05, + "loss": 0.5437, + "step": 348 + }, + { + "epoch": 1.86, + "learning_rate": 1.1142061281337048e-05, + "loss": 0.5802, + "step": 349 + }, + { + "epoch": 1.87, + "learning_rate": 1.1114206128133707e-05, + "loss": 0.4926, + "step": 350 + }, + { + "epoch": 1.87, + "learning_rate": 1.1086350974930363e-05, + "loss": 0.5314, + "step": 351 + }, + { + "epoch": 1.88, + "learning_rate": 1.105849582172702e-05, + "loss": 0.5146, + "step": 352 + }, + { + "epoch": 1.88, + 
"learning_rate": 1.1030640668523678e-05, + "loss": 0.5236, + "step": 353 + }, + { + "epoch": 1.89, + "learning_rate": 1.1002785515320335e-05, + "loss": 0.5389, + "step": 354 + }, + { + "epoch": 1.89, + "learning_rate": 1.0974930362116993e-05, + "loss": 0.4849, + "step": 355 + }, + { + "epoch": 1.9, + "learning_rate": 1.0947075208913648e-05, + "loss": 0.4656, + "step": 356 + }, + { + "epoch": 1.9, + "learning_rate": 1.0919220055710307e-05, + "loss": 0.5118, + "step": 357 + }, + { + "epoch": 1.91, + "learning_rate": 1.0891364902506965e-05, + "loss": 0.5105, + "step": 358 + }, + { + "epoch": 1.91, + "learning_rate": 1.0863509749303622e-05, + "loss": 0.5027, + "step": 359 + }, + { + "epoch": 1.92, + "learning_rate": 1.083565459610028e-05, + "loss": 0.4993, + "step": 360 + }, + { + "epoch": 1.93, + "learning_rate": 1.0807799442896935e-05, + "loss": 0.5181, + "step": 361 + }, + { + "epoch": 1.93, + "learning_rate": 1.0779944289693593e-05, + "loss": 0.4739, + "step": 362 + }, + { + "epoch": 1.94, + "learning_rate": 1.0752089136490252e-05, + "loss": 0.5519, + "step": 363 + }, + { + "epoch": 1.94, + "learning_rate": 1.072423398328691e-05, + "loss": 0.5058, + "step": 364 + }, + { + "epoch": 1.95, + "learning_rate": 1.0696378830083567e-05, + "loss": 0.5659, + "step": 365 + }, + { + "epoch": 1.95, + "learning_rate": 1.0668523676880224e-05, + "loss": 0.53, + "step": 366 + }, + { + "epoch": 1.96, + "learning_rate": 1.064066852367688e-05, + "loss": 0.4817, + "step": 367 + }, + { + "epoch": 1.96, + "learning_rate": 1.0612813370473537e-05, + "loss": 0.4716, + "step": 368 + }, + { + "epoch": 1.97, + "learning_rate": 1.0584958217270197e-05, + "loss": 0.4762, + "step": 369 + }, + { + "epoch": 1.97, + "learning_rate": 1.0557103064066854e-05, + "loss": 0.5046, + "step": 370 + }, + { + "epoch": 1.98, + "learning_rate": 1.0529247910863511e-05, + "loss": 0.4578, + "step": 371 + }, + { + "epoch": 1.98, + "learning_rate": 1.0501392757660167e-05, + "loss": 0.517, + "step": 372 + }, + { + 
"epoch": 1.99, + "learning_rate": 1.0473537604456825e-05, + "loss": 0.5387, + "step": 373 + }, + { + "epoch": 1.99, + "learning_rate": 1.0445682451253482e-05, + "loss": 0.4909, + "step": 374 + }, + { + "epoch": 2.0, + "learning_rate": 1.0417827298050141e-05, + "loss": 0.5001, + "step": 375 + }, + { + "epoch": 2.01, + "learning_rate": 1.0389972144846799e-05, + "loss": 0.4055, + "step": 376 + }, + { + "epoch": 2.01, + "learning_rate": 1.0362116991643454e-05, + "loss": 0.4004, + "step": 377 + }, + { + "epoch": 2.02, + "learning_rate": 1.0334261838440112e-05, + "loss": 0.3153, + "step": 378 + }, + { + "epoch": 2.02, + "learning_rate": 1.0306406685236769e-05, + "loss": 0.3621, + "step": 379 + }, + { + "epoch": 2.03, + "learning_rate": 1.0278551532033427e-05, + "loss": 0.382, + "step": 380 + }, + { + "epoch": 2.03, + "learning_rate": 1.0250696378830086e-05, + "loss": 0.3887, + "step": 381 + }, + { + "epoch": 2.04, + "learning_rate": 1.0222841225626743e-05, + "loss": 0.3225, + "step": 382 + }, + { + "epoch": 2.04, + "learning_rate": 1.0194986072423399e-05, + "loss": 0.4025, + "step": 383 + }, + { + "epoch": 2.05, + "learning_rate": 1.0167130919220056e-05, + "loss": 0.348, + "step": 384 + }, + { + "epoch": 2.05, + "learning_rate": 1.0139275766016714e-05, + "loss": 0.3643, + "step": 385 + }, + { + "epoch": 2.06, + "learning_rate": 1.0111420612813371e-05, + "loss": 0.311, + "step": 386 + }, + { + "epoch": 2.06, + "learning_rate": 1.008356545961003e-05, + "loss": 0.3995, + "step": 387 + }, + { + "epoch": 2.07, + "learning_rate": 1.0055710306406686e-05, + "loss": 0.3274, + "step": 388 + }, + { + "epoch": 2.07, + "learning_rate": 1.0027855153203343e-05, + "loss": 0.325, + "step": 389 + }, + { + "epoch": 2.08, + "learning_rate": 1e-05, + "loss": 0.3428, + "step": 390 + }, + { + "epoch": 2.09, + "learning_rate": 9.972144846796658e-06, + "loss": 0.3302, + "step": 391 + }, + { + "epoch": 2.09, + "learning_rate": 9.944289693593316e-06, + "loss": 0.3219, + "step": 392 + }, + { + 
"epoch": 2.1, + "learning_rate": 9.916434540389973e-06, + "loss": 0.3551, + "step": 393 + }, + { + "epoch": 2.1, + "learning_rate": 9.88857938718663e-06, + "loss": 0.4043, + "step": 394 + }, + { + "epoch": 2.11, + "learning_rate": 9.860724233983288e-06, + "loss": 0.3487, + "step": 395 + }, + { + "epoch": 2.11, + "learning_rate": 9.832869080779945e-06, + "loss": 0.3675, + "step": 396 + }, + { + "epoch": 2.12, + "learning_rate": 9.805013927576603e-06, + "loss": 0.3365, + "step": 397 + }, + { + "epoch": 2.12, + "learning_rate": 9.77715877437326e-06, + "loss": 0.3865, + "step": 398 + }, + { + "epoch": 2.13, + "learning_rate": 9.749303621169918e-06, + "loss": 0.4085, + "step": 399 + }, + { + "epoch": 2.13, + "learning_rate": 9.721448467966575e-06, + "loss": 0.3027, + "step": 400 + }, + { + "epoch": 2.14, + "learning_rate": 9.693593314763233e-06, + "loss": 0.3313, + "step": 401 + }, + { + "epoch": 2.14, + "learning_rate": 9.665738161559888e-06, + "loss": 0.3695, + "step": 402 + }, + { + "epoch": 2.15, + "learning_rate": 9.637883008356547e-06, + "loss": 0.4053, + "step": 403 + }, + { + "epoch": 2.15, + "learning_rate": 9.610027855153205e-06, + "loss": 0.3128, + "step": 404 + }, + { + "epoch": 2.16, + "learning_rate": 9.58217270194986e-06, + "loss": 0.3731, + "step": 405 + }, + { + "epoch": 2.17, + "learning_rate": 9.55431754874652e-06, + "loss": 0.4123, + "step": 406 + }, + { + "epoch": 2.17, + "learning_rate": 9.526462395543177e-06, + "loss": 0.3521, + "step": 407 + }, + { + "epoch": 2.18, + "learning_rate": 9.498607242339833e-06, + "loss": 0.377, + "step": 408 + }, + { + "epoch": 2.18, + "learning_rate": 9.470752089136492e-06, + "loss": 0.3483, + "step": 409 + }, + { + "epoch": 2.19, + "learning_rate": 9.442896935933148e-06, + "loss": 0.3194, + "step": 410 + }, + { + "epoch": 2.19, + "learning_rate": 9.415041782729805e-06, + "loss": 0.3634, + "step": 411 + }, + { + "epoch": 2.2, + "learning_rate": 9.387186629526464e-06, + "loss": 0.364, + "step": 412 + }, + { + "epoch": 
2.2, + "learning_rate": 9.35933147632312e-06, + "loss": 0.3984, + "step": 413 + }, + { + "epoch": 2.21, + "learning_rate": 9.331476323119777e-06, + "loss": 0.3943, + "step": 414 + }, + { + "epoch": 2.21, + "learning_rate": 9.303621169916436e-06, + "loss": 0.3508, + "step": 415 + }, + { + "epoch": 2.22, + "learning_rate": 9.275766016713092e-06, + "loss": 0.2955, + "step": 416 + }, + { + "epoch": 2.22, + "learning_rate": 9.24791086350975e-06, + "loss": 0.3196, + "step": 417 + }, + { + "epoch": 2.23, + "learning_rate": 9.220055710306407e-06, + "loss": 0.332, + "step": 418 + }, + { + "epoch": 2.23, + "learning_rate": 9.192200557103064e-06, + "loss": 0.3548, + "step": 419 + }, + { + "epoch": 2.24, + "learning_rate": 9.164345403899722e-06, + "loss": 0.326, + "step": 420 + }, + { + "epoch": 2.25, + "learning_rate": 9.13649025069638e-06, + "loss": 0.386, + "step": 421 + }, + { + "epoch": 2.25, + "learning_rate": 9.108635097493037e-06, + "loss": 0.3499, + "step": 422 + }, + { + "epoch": 2.26, + "learning_rate": 9.080779944289694e-06, + "loss": 0.3918, + "step": 423 + }, + { + "epoch": 2.26, + "learning_rate": 9.052924791086352e-06, + "loss": 0.3767, + "step": 424 + }, + { + "epoch": 2.27, + "learning_rate": 9.025069637883009e-06, + "loss": 0.3291, + "step": 425 + }, + { + "epoch": 2.27, + "learning_rate": 8.997214484679666e-06, + "loss": 0.2909, + "step": 426 + }, + { + "epoch": 2.28, + "learning_rate": 8.969359331476324e-06, + "loss": 0.3823, + "step": 427 + }, + { + "epoch": 2.28, + "learning_rate": 8.941504178272981e-06, + "loss": 0.3298, + "step": 428 + }, + { + "epoch": 2.29, + "learning_rate": 8.913649025069639e-06, + "loss": 0.4001, + "step": 429 + }, + { + "epoch": 2.29, + "learning_rate": 8.885793871866296e-06, + "loss": 0.3899, + "step": 430 + }, + { + "epoch": 2.3, + "learning_rate": 8.857938718662954e-06, + "loss": 0.3752, + "step": 431 + }, + { + "epoch": 2.3, + "learning_rate": 8.830083565459611e-06, + "loss": 0.3568, + "step": 432 + }, + { + "epoch": 2.31, + 
"learning_rate": 8.802228412256268e-06, + "loss": 0.3635, + "step": 433 + }, + { + "epoch": 2.31, + "learning_rate": 8.774373259052926e-06, + "loss": 0.4041, + "step": 434 + }, + { + "epoch": 2.32, + "learning_rate": 8.746518105849583e-06, + "loss": 0.3283, + "step": 435 + }, + { + "epoch": 2.33, + "learning_rate": 8.71866295264624e-06, + "loss": 0.3321, + "step": 436 + }, + { + "epoch": 2.33, + "learning_rate": 8.690807799442898e-06, + "loss": 0.343, + "step": 437 + }, + { + "epoch": 2.34, + "learning_rate": 8.662952646239556e-06, + "loss": 0.3591, + "step": 438 + }, + { + "epoch": 2.34, + "learning_rate": 8.635097493036211e-06, + "loss": 0.3573, + "step": 439 + }, + { + "epoch": 2.35, + "learning_rate": 8.60724233983287e-06, + "loss": 0.365, + "step": 440 + }, + { + "epoch": 2.35, + "learning_rate": 8.579387186629528e-06, + "loss": 0.3942, + "step": 441 + }, + { + "epoch": 2.36, + "learning_rate": 8.551532033426184e-06, + "loss": 0.3404, + "step": 442 + }, + { + "epoch": 2.36, + "learning_rate": 8.523676880222843e-06, + "loss": 0.3674, + "step": 443 + }, + { + "epoch": 2.37, + "learning_rate": 8.4958217270195e-06, + "loss": 0.4201, + "step": 444 + }, + { + "epoch": 2.37, + "learning_rate": 8.467966573816156e-06, + "loss": 0.3486, + "step": 445 + }, + { + "epoch": 2.38, + "learning_rate": 8.440111420612815e-06, + "loss": 0.3256, + "step": 446 + }, + { + "epoch": 2.38, + "learning_rate": 8.41225626740947e-06, + "loss": 0.33, + "step": 447 + }, + { + "epoch": 2.39, + "learning_rate": 8.384401114206128e-06, + "loss": 0.2918, + "step": 448 + }, + { + "epoch": 2.39, + "learning_rate": 8.356545961002787e-06, + "loss": 0.3151, + "step": 449 + }, + { + "epoch": 2.4, + "learning_rate": 8.328690807799443e-06, + "loss": 0.3438, + "step": 450 + }, + { + "epoch": 2.41, + "learning_rate": 8.3008356545961e-06, + "loss": 0.3632, + "step": 451 + }, + { + "epoch": 2.41, + "learning_rate": 8.27298050139276e-06, + "loss": 0.3254, + "step": 452 + }, + { + "epoch": 2.42, + 
"learning_rate": 8.245125348189415e-06, + "loss": 0.3253, + "step": 453 + }, + { + "epoch": 2.42, + "learning_rate": 8.217270194986073e-06, + "loss": 0.3673, + "step": 454 + }, + { + "epoch": 2.43, + "learning_rate": 8.18941504178273e-06, + "loss": 0.3805, + "step": 455 + }, + { + "epoch": 2.43, + "learning_rate": 8.161559888579388e-06, + "loss": 0.3539, + "step": 456 + }, + { + "epoch": 2.44, + "learning_rate": 8.133704735376045e-06, + "loss": 0.3194, + "step": 457 + }, + { + "epoch": 2.44, + "learning_rate": 8.105849582172702e-06, + "loss": 0.3564, + "step": 458 + }, + { + "epoch": 2.45, + "learning_rate": 8.07799442896936e-06, + "loss": 0.2921, + "step": 459 + }, + { + "epoch": 2.45, + "learning_rate": 8.050139275766017e-06, + "loss": 0.3045, + "step": 460 + }, + { + "epoch": 2.46, + "learning_rate": 8.022284122562675e-06, + "loss": 0.3424, + "step": 461 + }, + { + "epoch": 2.46, + "learning_rate": 7.994428969359332e-06, + "loss": 0.3705, + "step": 462 + }, + { + "epoch": 2.47, + "learning_rate": 7.96657381615599e-06, + "loss": 0.3396, + "step": 463 + }, + { + "epoch": 2.47, + "learning_rate": 7.938718662952647e-06, + "loss": 0.3459, + "step": 464 + }, + { + "epoch": 2.48, + "learning_rate": 7.910863509749304e-06, + "loss": 0.3905, + "step": 465 + }, + { + "epoch": 2.49, + "learning_rate": 7.883008356545962e-06, + "loss": 0.3842, + "step": 466 + }, + { + "epoch": 2.49, + "learning_rate": 7.85515320334262e-06, + "loss": 0.3713, + "step": 467 + }, + { + "epoch": 2.5, + "learning_rate": 7.827298050139277e-06, + "loss": 0.3662, + "step": 468 + }, + { + "epoch": 2.5, + "learning_rate": 7.799442896935934e-06, + "loss": 0.3369, + "step": 469 + }, + { + "epoch": 2.51, + "learning_rate": 7.771587743732592e-06, + "loss": 0.371, + "step": 470 + }, + { + "epoch": 2.51, + "learning_rate": 7.743732590529249e-06, + "loss": 0.3712, + "step": 471 + }, + { + "epoch": 2.52, + "learning_rate": 7.715877437325906e-06, + "loss": 0.349, + "step": 472 + }, + { + "epoch": 2.52, + 
"learning_rate": 7.688022284122564e-06, + "loss": 0.3635, + "step": 473 + }, + { + "epoch": 2.53, + "learning_rate": 7.660167130919221e-06, + "loss": 0.3675, + "step": 474 + }, + { + "epoch": 2.53, + "learning_rate": 7.632311977715879e-06, + "loss": 0.3487, + "step": 475 + }, + { + "epoch": 2.54, + "learning_rate": 7.604456824512535e-06, + "loss": 0.4199, + "step": 476 + }, + { + "epoch": 2.54, + "learning_rate": 7.576601671309193e-06, + "loss": 0.3531, + "step": 477 + }, + { + "epoch": 2.55, + "learning_rate": 7.548746518105851e-06, + "loss": 0.3474, + "step": 478 + }, + { + "epoch": 2.55, + "learning_rate": 7.5208913649025075e-06, + "loss": 0.3476, + "step": 479 + }, + { + "epoch": 2.56, + "learning_rate": 7.493036211699165e-06, + "loss": 0.3841, + "step": 480 + }, + { + "epoch": 2.57, + "learning_rate": 7.465181058495823e-06, + "loss": 0.376, + "step": 481 + }, + { + "epoch": 2.57, + "learning_rate": 7.43732590529248e-06, + "loss": 0.3537, + "step": 482 + }, + { + "epoch": 2.58, + "learning_rate": 7.409470752089137e-06, + "loss": 0.335, + "step": 483 + }, + { + "epoch": 2.58, + "learning_rate": 7.381615598885794e-06, + "loss": 0.3475, + "step": 484 + }, + { + "epoch": 2.59, + "learning_rate": 7.353760445682452e-06, + "loss": 0.3528, + "step": 485 + }, + { + "epoch": 2.59, + "learning_rate": 7.3259052924791095e-06, + "loss": 0.3536, + "step": 486 + }, + { + "epoch": 2.6, + "learning_rate": 7.298050139275766e-06, + "loss": 0.4041, + "step": 487 + }, + { + "epoch": 2.6, + "learning_rate": 7.270194986072424e-06, + "loss": 0.3305, + "step": 488 + }, + { + "epoch": 2.61, + "learning_rate": 7.242339832869081e-06, + "loss": 0.3273, + "step": 489 + }, + { + "epoch": 2.61, + "learning_rate": 7.214484679665738e-06, + "loss": 0.3614, + "step": 490 + }, + { + "epoch": 2.62, + "learning_rate": 7.186629526462397e-06, + "loss": 0.3383, + "step": 491 + }, + { + "epoch": 2.62, + "learning_rate": 7.158774373259053e-06, + "loss": 0.3714, + "step": 492 + }, + { + "epoch": 2.63, + 
"learning_rate": 7.130919220055711e-06, + "loss": 0.3935, + "step": 493 + }, + { + "epoch": 2.63, + "learning_rate": 7.103064066852369e-06, + "loss": 0.3696, + "step": 494 + }, + { + "epoch": 2.64, + "learning_rate": 7.0752089136490255e-06, + "loss": 0.3823, + "step": 495 + }, + { + "epoch": 2.65, + "learning_rate": 7.047353760445683e-06, + "loss": 0.4174, + "step": 496 + }, + { + "epoch": 2.65, + "learning_rate": 7.0194986072423395e-06, + "loss": 0.3548, + "step": 497 + }, + { + "epoch": 2.66, + "learning_rate": 6.991643454038998e-06, + "loss": 0.3935, + "step": 498 + }, + { + "epoch": 2.66, + "learning_rate": 6.963788300835655e-06, + "loss": 0.327, + "step": 499 + }, + { + "epoch": 2.67, + "learning_rate": 6.935933147632312e-06, + "loss": 0.3624, + "step": 500 + }, + { + "epoch": 2.67, + "learning_rate": 6.90807799442897e-06, + "loss": 0.3433, + "step": 501 + }, + { + "epoch": 2.68, + "learning_rate": 6.8802228412256275e-06, + "loss": 0.4053, + "step": 502 + }, + { + "epoch": 2.68, + "learning_rate": 6.852367688022284e-06, + "loss": 0.3994, + "step": 503 + }, + { + "epoch": 2.69, + "learning_rate": 6.824512534818942e-06, + "loss": 0.3751, + "step": 504 + }, + { + "epoch": 2.69, + "learning_rate": 6.796657381615599e-06, + "loss": 0.3458, + "step": 505 + }, + { + "epoch": 2.7, + "learning_rate": 6.768802228412256e-06, + "loss": 0.3886, + "step": 506 + }, + { + "epoch": 2.7, + "learning_rate": 6.740947075208915e-06, + "loss": 0.3494, + "step": 507 + }, + { + "epoch": 2.71, + "learning_rate": 6.713091922005571e-06, + "loss": 0.3967, + "step": 508 + }, + { + "epoch": 2.71, + "learning_rate": 6.685236768802229e-06, + "loss": 0.3901, + "step": 509 + }, + { + "epoch": 2.72, + "learning_rate": 6.657381615598887e-06, + "loss": 0.3919, + "step": 510 + }, + { + "epoch": 2.73, + "learning_rate": 6.6295264623955435e-06, + "loss": 0.3908, + "step": 511 + }, + { + "epoch": 2.73, + "learning_rate": 6.601671309192201e-06, + "loss": 0.3783, + "step": 512 + }, + { + "epoch": 2.74, + 
"learning_rate": 6.573816155988858e-06, + "loss": 0.4016, + "step": 513 + }, + { + "epoch": 2.74, + "learning_rate": 6.545961002785516e-06, + "loss": 0.371, + "step": 514 + }, + { + "epoch": 2.75, + "learning_rate": 6.518105849582173e-06, + "loss": 0.3069, + "step": 515 + }, + { + "epoch": 2.75, + "learning_rate": 6.490250696378831e-06, + "loss": 0.3051, + "step": 516 + }, + { + "epoch": 2.76, + "learning_rate": 6.462395543175488e-06, + "loss": 0.345, + "step": 517 + }, + { + "epoch": 2.76, + "learning_rate": 6.434540389972146e-06, + "loss": 0.3786, + "step": 518 + }, + { + "epoch": 2.77, + "learning_rate": 6.406685236768803e-06, + "loss": 0.3217, + "step": 519 + }, + { + "epoch": 2.77, + "learning_rate": 6.37883008356546e-06, + "loss": 0.4105, + "step": 520 + }, + { + "epoch": 2.78, + "learning_rate": 6.350974930362117e-06, + "loss": 0.3169, + "step": 521 + }, + { + "epoch": 2.78, + "learning_rate": 6.323119777158775e-06, + "loss": 0.3546, + "step": 522 + }, + { + "epoch": 2.79, + "learning_rate": 6.295264623955433e-06, + "loss": 0.4003, + "step": 523 + }, + { + "epoch": 2.79, + "learning_rate": 6.267409470752089e-06, + "loss": 0.3012, + "step": 524 + }, + { + "epoch": 2.8, + "learning_rate": 6.2395543175487475e-06, + "loss": 0.3401, + "step": 525 + }, + { + "epoch": 2.81, + "learning_rate": 6.211699164345404e-06, + "loss": 0.339, + "step": 526 + }, + { + "epoch": 2.81, + "learning_rate": 6.1838440111420615e-06, + "loss": 0.3466, + "step": 527 + }, + { + "epoch": 2.82, + "learning_rate": 6.15598885793872e-06, + "loss": 0.2994, + "step": 528 + }, + { + "epoch": 2.82, + "learning_rate": 6.128133704735376e-06, + "loss": 0.3398, + "step": 529 + }, + { + "epoch": 2.83, + "learning_rate": 6.100278551532034e-06, + "loss": 0.3644, + "step": 530 + }, + { + "epoch": 2.83, + "learning_rate": 6.072423398328692e-06, + "loss": 0.408, + "step": 531 + }, + { + "epoch": 2.84, + "learning_rate": 6.044568245125349e-06, + "loss": 0.3911, + "step": 532 + }, + { + "epoch": 2.84, + 
"learning_rate": 6.016713091922006e-06, + "loss": 0.3311, + "step": 533 + }, + { + "epoch": 2.85, + "learning_rate": 5.988857938718663e-06, + "loss": 0.3588, + "step": 534 + }, + { + "epoch": 2.85, + "learning_rate": 5.961002785515321e-06, + "loss": 0.3387, + "step": 535 + }, + { + "epoch": 2.86, + "learning_rate": 5.933147632311978e-06, + "loss": 0.3304, + "step": 536 + }, + { + "epoch": 2.86, + "learning_rate": 5.905292479108635e-06, + "loss": 0.3468, + "step": 537 + }, + { + "epoch": 2.87, + "learning_rate": 5.877437325905293e-06, + "loss": 0.3943, + "step": 538 + }, + { + "epoch": 2.87, + "learning_rate": 5.849582172701951e-06, + "loss": 0.3757, + "step": 539 + }, + { + "epoch": 2.88, + "learning_rate": 5.821727019498607e-06, + "loss": 0.349, + "step": 540 + }, + { + "epoch": 2.89, + "learning_rate": 5.7938718662952654e-06, + "loss": 0.3019, + "step": 541 + }, + { + "epoch": 2.89, + "learning_rate": 5.766016713091922e-06, + "loss": 0.3786, + "step": 542 + }, + { + "epoch": 2.9, + "learning_rate": 5.7381615598885795e-06, + "loss": 0.2993, + "step": 543 + }, + { + "epoch": 2.9, + "learning_rate": 5.710306406685238e-06, + "loss": 0.3443, + "step": 544 + }, + { + "epoch": 2.91, + "learning_rate": 5.682451253481894e-06, + "loss": 0.2844, + "step": 545 + }, + { + "epoch": 2.91, + "learning_rate": 5.654596100278552e-06, + "loss": 0.3563, + "step": 546 + }, + { + "epoch": 2.92, + "learning_rate": 5.62674094707521e-06, + "loss": 0.3191, + "step": 547 + }, + { + "epoch": 2.92, + "learning_rate": 5.598885793871867e-06, + "loss": 0.3415, + "step": 548 + }, + { + "epoch": 2.93, + "learning_rate": 5.571030640668524e-06, + "loss": 0.3827, + "step": 549 + }, + { + "epoch": 2.93, + "learning_rate": 5.5431754874651814e-06, + "loss": 0.3499, + "step": 550 + }, + { + "epoch": 2.94, + "learning_rate": 5.515320334261839e-06, + "loss": 0.3983, + "step": 551 + }, + { + "epoch": 2.94, + "learning_rate": 5.487465181058496e-06, + "loss": 0.3044, + "step": 552 + }, + { + "epoch": 2.95, + 
"learning_rate": 5.459610027855154e-06, + "loss": 0.3838, + "step": 553 + }, + { + "epoch": 2.95, + "learning_rate": 5.431754874651811e-06, + "loss": 0.3466, + "step": 554 + }, + { + "epoch": 2.96, + "learning_rate": 5.403899721448468e-06, + "loss": 0.3695, + "step": 555 + }, + { + "epoch": 2.97, + "learning_rate": 5.376044568245126e-06, + "loss": 0.357, + "step": 556 + }, + { + "epoch": 2.97, + "learning_rate": 5.3481894150417834e-06, + "loss": 0.3939, + "step": 557 + }, + { + "epoch": 2.98, + "learning_rate": 5.32033426183844e-06, + "loss": 0.3431, + "step": 558 + }, + { + "epoch": 2.98, + "learning_rate": 5.292479108635098e-06, + "loss": 0.3656, + "step": 559 + }, + { + "epoch": 2.99, + "learning_rate": 5.264623955431756e-06, + "loss": 0.3198, + "step": 560 + }, + { + "epoch": 2.99, + "learning_rate": 5.236768802228412e-06, + "loss": 0.3467, + "step": 561 + }, + { + "epoch": 3.0, + "learning_rate": 5.2089136490250706e-06, + "loss": 0.3622, + "step": 562 + }, + { + "epoch": 3.0, + "learning_rate": 5.181058495821727e-06, + "loss": 0.2611, + "step": 563 + }, + { + "epoch": 3.01, + "learning_rate": 5.1532033426183846e-06, + "loss": 0.2843, + "step": 564 + }, + { + "epoch": 3.01, + "learning_rate": 5.125348189415043e-06, + "loss": 0.2479, + "step": 565 + }, + { + "epoch": 3.02, + "learning_rate": 5.097493036211699e-06, + "loss": 0.2366, + "step": 566 + }, + { + "epoch": 3.02, + "learning_rate": 5.069637883008357e-06, + "loss": 0.2366, + "step": 567 + }, + { + "epoch": 3.03, + "learning_rate": 5.041782729805015e-06, + "loss": 0.2106, + "step": 568 + }, + { + "epoch": 3.03, + "learning_rate": 5.013927576601672e-06, + "loss": 0.293, + "step": 569 + }, + { + "epoch": 3.04, + "learning_rate": 4.986072423398329e-06, + "loss": 0.2746, + "step": 570 + }, + { + "epoch": 3.05, + "learning_rate": 4.9582172701949865e-06, + "loss": 0.2579, + "step": 571 + }, + { + "epoch": 3.05, + "learning_rate": 4.930362116991644e-06, + "loss": 0.2158, + "step": 572 + }, + { + "epoch": 3.06, + 
"learning_rate": 4.902506963788301e-06, + "loss": 0.1933, + "step": 573 + }, + { + "epoch": 3.06, + "learning_rate": 4.874651810584959e-06, + "loss": 0.2546, + "step": 574 + }, + { + "epoch": 3.07, + "learning_rate": 4.846796657381616e-06, + "loss": 0.2556, + "step": 575 + }, + { + "epoch": 3.07, + "learning_rate": 4.818941504178274e-06, + "loss": 0.2603, + "step": 576 + }, + { + "epoch": 3.08, + "learning_rate": 4.79108635097493e-06, + "loss": 0.2001, + "step": 577 + }, + { + "epoch": 3.08, + "learning_rate": 4.7632311977715885e-06, + "loss": 0.2026, + "step": 578 + }, + { + "epoch": 3.09, + "learning_rate": 4.735376044568246e-06, + "loss": 0.2431, + "step": 579 + }, + { + "epoch": 3.09, + "learning_rate": 4.7075208913649025e-06, + "loss": 0.245, + "step": 580 + }, + { + "epoch": 3.1, + "learning_rate": 4.67966573816156e-06, + "loss": 0.2428, + "step": 581 + }, + { + "epoch": 3.1, + "learning_rate": 4.651810584958218e-06, + "loss": 0.2807, + "step": 582 + }, + { + "epoch": 3.11, + "learning_rate": 4.623955431754875e-06, + "loss": 0.2473, + "step": 583 + }, + { + "epoch": 3.11, + "learning_rate": 4.596100278551532e-06, + "loss": 0.304, + "step": 584 + }, + { + "epoch": 3.12, + "learning_rate": 4.56824512534819e-06, + "loss": 0.3036, + "step": 585 + }, + { + "epoch": 3.13, + "learning_rate": 4.540389972144847e-06, + "loss": 0.2238, + "step": 586 + }, + { + "epoch": 3.13, + "learning_rate": 4.5125348189415045e-06, + "loss": 0.2459, + "step": 587 + }, + { + "epoch": 3.14, + "learning_rate": 4.484679665738162e-06, + "loss": 0.1955, + "step": 588 + }, + { + "epoch": 3.14, + "learning_rate": 4.456824512534819e-06, + "loss": 0.2734, + "step": 589 + }, + { + "epoch": 3.15, + "learning_rate": 4.428969359331477e-06, + "loss": 0.2265, + "step": 590 + }, + { + "epoch": 3.15, + "learning_rate": 4.401114206128134e-06, + "loss": 0.2692, + "step": 591 + }, + { + "epoch": 3.16, + "learning_rate": 4.373259052924792e-06, + "loss": 0.2346, + "step": 592 + }, + { + "epoch": 3.16, + 
"learning_rate": 4.345403899721449e-06, + "loss": 0.2454, + "step": 593 + }, + { + "epoch": 3.17, + "learning_rate": 4.317548746518106e-06, + "loss": 0.2247, + "step": 594 + }, + { + "epoch": 3.17, + "learning_rate": 4.289693593314764e-06, + "loss": 0.2264, + "step": 595 + }, + { + "epoch": 3.18, + "learning_rate": 4.261838440111421e-06, + "loss": 0.2006, + "step": 596 + }, + { + "epoch": 3.18, + "learning_rate": 4.233983286908078e-06, + "loss": 0.2368, + "step": 597 + }, + { + "epoch": 3.19, + "learning_rate": 4.206128133704735e-06, + "loss": 0.2508, + "step": 598 + }, + { + "epoch": 3.19, + "learning_rate": 4.178272980501394e-06, + "loss": 0.2414, + "step": 599 + }, + { + "epoch": 3.2, + "learning_rate": 4.15041782729805e-06, + "loss": 0.1978, + "step": 600 + }, + { + "epoch": 3.21, + "learning_rate": 4.122562674094708e-06, + "loss": 0.2219, + "step": 601 + }, + { + "epoch": 3.21, + "learning_rate": 4.094707520891365e-06, + "loss": 0.2405, + "step": 602 + }, + { + "epoch": 3.22, + "learning_rate": 4.0668523676880225e-06, + "loss": 0.2922, + "step": 603 + }, + { + "epoch": 3.22, + "learning_rate": 4.03899721448468e-06, + "loss": 0.2016, + "step": 604 + }, + { + "epoch": 3.23, + "learning_rate": 4.011142061281337e-06, + "loss": 0.2181, + "step": 605 + }, + { + "epoch": 3.23, + "learning_rate": 3.983286908077995e-06, + "loss": 0.2731, + "step": 606 + }, + { + "epoch": 3.24, + "learning_rate": 3.955431754874652e-06, + "loss": 0.2475, + "step": 607 + }, + { + "epoch": 3.24, + "learning_rate": 3.92757660167131e-06, + "loss": 0.2983, + "step": 608 + }, + { + "epoch": 3.25, + "learning_rate": 3.899721448467967e-06, + "loss": 0.2052, + "step": 609 + }, + { + "epoch": 3.25, + "learning_rate": 3.8718662952646245e-06, + "loss": 0.2338, + "step": 610 + }, + { + "epoch": 3.26, + "learning_rate": 3.844011142061282e-06, + "loss": 0.2706, + "step": 611 + }, + { + "epoch": 3.26, + "learning_rate": 3.816155988857939e-06, + "loss": 0.2157, + "step": 612 + }, + { + "epoch": 3.27, + 
"learning_rate": 3.7883008356545963e-06, + "loss": 0.2592, + "step": 613 + }, + { + "epoch": 3.27, + "learning_rate": 3.7604456824512538e-06, + "loss": 0.2452, + "step": 614 + }, + { + "epoch": 3.28, + "learning_rate": 3.7325905292479116e-06, + "loss": 0.23, + "step": 615 + }, + { + "epoch": 3.29, + "learning_rate": 3.7047353760445686e-06, + "loss": 0.2635, + "step": 616 + }, + { + "epoch": 3.29, + "learning_rate": 3.676880222841226e-06, + "loss": 0.2549, + "step": 617 + }, + { + "epoch": 3.3, + "learning_rate": 3.649025069637883e-06, + "loss": 0.2277, + "step": 618 + }, + { + "epoch": 3.3, + "learning_rate": 3.6211699164345405e-06, + "loss": 0.2248, + "step": 619 + }, + { + "epoch": 3.31, + "learning_rate": 3.5933147632311983e-06, + "loss": 0.272, + "step": 620 + }, + { + "epoch": 3.31, + "learning_rate": 3.5654596100278553e-06, + "loss": 0.2504, + "step": 621 + }, + { + "epoch": 3.32, + "learning_rate": 3.5376044568245128e-06, + "loss": 0.2661, + "step": 622 + }, + { + "epoch": 3.32, + "learning_rate": 3.5097493036211698e-06, + "loss": 0.2377, + "step": 623 + }, + { + "epoch": 3.33, + "learning_rate": 3.4818941504178276e-06, + "loss": 0.2211, + "step": 624 + }, + { + "epoch": 3.33, + "learning_rate": 3.454038997214485e-06, + "loss": 0.2328, + "step": 625 + }, + { + "epoch": 3.34, + "learning_rate": 3.426183844011142e-06, + "loss": 0.2354, + "step": 626 + }, + { + "epoch": 3.34, + "learning_rate": 3.3983286908077995e-06, + "loss": 0.273, + "step": 627 + }, + { + "epoch": 3.35, + "learning_rate": 3.3704735376044573e-06, + "loss": 0.2679, + "step": 628 + }, + { + "epoch": 3.35, + "learning_rate": 3.3426183844011143e-06, + "loss": 0.2545, + "step": 629 + }, + { + "epoch": 3.36, + "learning_rate": 3.3147632311977717e-06, + "loss": 0.2604, + "step": 630 + }, + { + "epoch": 3.37, + "learning_rate": 3.286908077994429e-06, + "loss": 0.2634, + "step": 631 + }, + { + "epoch": 3.37, + "learning_rate": 3.2590529247910866e-06, + "loss": 0.232, + "step": 632 + }, + { + "epoch": 
3.38, + "learning_rate": 3.231197771587744e-06, + "loss": 0.2282, + "step": 633 + }, + { + "epoch": 3.38, + "learning_rate": 3.2033426183844015e-06, + "loss": 0.2013, + "step": 634 + }, + { + "epoch": 3.39, + "learning_rate": 3.1754874651810585e-06, + "loss": 0.2912, + "step": 635 + }, + { + "epoch": 3.39, + "learning_rate": 3.1476323119777163e-06, + "loss": 0.2545, + "step": 636 + }, + { + "epoch": 3.4, + "learning_rate": 3.1197771587743737e-06, + "loss": 0.233, + "step": 637 + }, + { + "epoch": 3.4, + "learning_rate": 3.0919220055710307e-06, + "loss": 0.2865, + "step": 638 + }, + { + "epoch": 3.41, + "learning_rate": 3.064066852367688e-06, + "loss": 0.2219, + "step": 639 + }, + { + "epoch": 3.41, + "learning_rate": 3.036211699164346e-06, + "loss": 0.2181, + "step": 640 + }, + { + "epoch": 3.42, + "learning_rate": 3.008356545961003e-06, + "loss": 0.2502, + "step": 641 + }, + { + "epoch": 3.42, + "learning_rate": 2.9805013927576604e-06, + "loss": 0.2627, + "step": 642 + }, + { + "epoch": 3.43, + "learning_rate": 2.9526462395543174e-06, + "loss": 0.2908, + "step": 643 + }, + { + "epoch": 3.43, + "learning_rate": 2.9247910863509753e-06, + "loss": 0.2489, + "step": 644 + }, + { + "epoch": 3.44, + "learning_rate": 2.8969359331476327e-06, + "loss": 0.2714, + "step": 645 + }, + { + "epoch": 3.45, + "learning_rate": 2.8690807799442897e-06, + "loss": 0.2692, + "step": 646 + }, + { + "epoch": 3.45, + "learning_rate": 2.841225626740947e-06, + "loss": 0.263, + "step": 647 + }, + { + "epoch": 3.46, + "learning_rate": 2.813370473537605e-06, + "loss": 0.2875, + "step": 648 + }, + { + "epoch": 3.46, + "learning_rate": 2.785515320334262e-06, + "loss": 0.2266, + "step": 649 + }, + { + "epoch": 3.47, + "learning_rate": 2.7576601671309194e-06, + "loss": 0.2147, + "step": 650 + }, + { + "epoch": 3.47, + "learning_rate": 2.729805013927577e-06, + "loss": 0.2353, + "step": 651 + }, + { + "epoch": 3.48, + "learning_rate": 2.701949860724234e-06, + "loss": 0.2991, + "step": 652 + }, + { + 
"epoch": 3.48, + "learning_rate": 2.6740947075208917e-06, + "loss": 0.1777, + "step": 653 + }, + { + "epoch": 3.49, + "learning_rate": 2.646239554317549e-06, + "loss": 0.2654, + "step": 654 + }, + { + "epoch": 3.49, + "learning_rate": 2.618384401114206e-06, + "loss": 0.249, + "step": 655 + }, + { + "epoch": 3.5, + "learning_rate": 2.5905292479108636e-06, + "loss": 0.1989, + "step": 656 + }, + { + "epoch": 3.5, + "learning_rate": 2.5626740947075214e-06, + "loss": 0.2011, + "step": 657 + }, + { + "epoch": 3.51, + "learning_rate": 2.5348189415041784e-06, + "loss": 0.1827, + "step": 658 + }, + { + "epoch": 3.51, + "learning_rate": 2.506963788300836e-06, + "loss": 0.2503, + "step": 659 + }, + { + "epoch": 3.52, + "learning_rate": 2.4791086350974933e-06, + "loss": 0.2686, + "step": 660 + }, + { + "epoch": 3.53, + "learning_rate": 2.4512534818941507e-06, + "loss": 0.2702, + "step": 661 + }, + { + "epoch": 3.53, + "learning_rate": 2.423398328690808e-06, + "loss": 0.2688, + "step": 662 + }, + { + "epoch": 3.54, + "learning_rate": 2.395543175487465e-06, + "loss": 0.2626, + "step": 663 + }, + { + "epoch": 3.54, + "learning_rate": 2.367688022284123e-06, + "loss": 0.2624, + "step": 664 + }, + { + "epoch": 3.55, + "learning_rate": 2.33983286908078e-06, + "loss": 0.2556, + "step": 665 + }, + { + "epoch": 3.55, + "learning_rate": 2.3119777158774374e-06, + "loss": 0.2348, + "step": 666 + }, + { + "epoch": 3.56, + "learning_rate": 2.284122562674095e-06, + "loss": 0.2589, + "step": 667 + }, + { + "epoch": 3.56, + "learning_rate": 2.2562674094707523e-06, + "loss": 0.2076, + "step": 668 + }, + { + "epoch": 3.57, + "learning_rate": 2.2284122562674097e-06, + "loss": 0.2367, + "step": 669 + }, + { + "epoch": 3.57, + "learning_rate": 2.200557103064067e-06, + "loss": 0.2426, + "step": 670 + }, + { + "epoch": 3.58, + "learning_rate": 2.1727019498607245e-06, + "loss": 0.2237, + "step": 671 + }, + { + "epoch": 3.58, + "learning_rate": 2.144846796657382e-06, + "loss": 0.2402, + "step": 672 + }, 
+ { + "epoch": 3.59, + "learning_rate": 2.116991643454039e-06, + "loss": 0.2085, + "step": 673 + }, + { + "epoch": 3.59, + "learning_rate": 2.089136490250697e-06, + "loss": 0.2371, + "step": 674 + }, + { + "epoch": 3.6, + "learning_rate": 2.061281337047354e-06, + "loss": 0.3147, + "step": 675 + }, + { + "epoch": 3.61, + "learning_rate": 2.0334261838440113e-06, + "loss": 0.238, + "step": 676 + }, + { + "epoch": 3.61, + "learning_rate": 2.0055710306406687e-06, + "loss": 0.2007, + "step": 677 + }, + { + "epoch": 3.62, + "learning_rate": 1.977715877437326e-06, + "loss": 0.2041, + "step": 678 + }, + { + "epoch": 3.62, + "learning_rate": 1.9498607242339835e-06, + "loss": 0.2376, + "step": 679 + }, + { + "epoch": 3.63, + "learning_rate": 1.922005571030641e-06, + "loss": 0.2105, + "step": 680 + }, + { + "epoch": 3.63, + "learning_rate": 1.8941504178272982e-06, + "loss": 0.2807, + "step": 681 + }, + { + "epoch": 3.64, + "learning_rate": 1.8662952646239558e-06, + "loss": 0.2387, + "step": 682 + }, + { + "epoch": 3.64, + "learning_rate": 1.838440111420613e-06, + "loss": 0.2111, + "step": 683 + }, + { + "epoch": 3.65, + "learning_rate": 1.8105849582172702e-06, + "loss": 0.2102, + "step": 684 + }, + { + "epoch": 3.65, + "learning_rate": 1.7827298050139277e-06, + "loss": 0.1885, + "step": 685 + }, + { + "epoch": 3.66, + "learning_rate": 1.7548746518105849e-06, + "loss": 0.2013, + "step": 686 + }, + { + "epoch": 3.66, + "learning_rate": 1.7270194986072425e-06, + "loss": 0.299, + "step": 687 + }, + { + "epoch": 3.67, + "learning_rate": 1.6991643454038997e-06, + "loss": 0.25, + "step": 688 + }, + { + "epoch": 3.67, + "learning_rate": 1.6713091922005572e-06, + "loss": 0.2589, + "step": 689 + }, + { + "epoch": 3.68, + "learning_rate": 1.6434540389972146e-06, + "loss": 0.2805, + "step": 690 + }, + { + "epoch": 3.69, + "learning_rate": 1.615598885793872e-06, + "loss": 0.252, + "step": 691 + }, + { + "epoch": 3.69, + "learning_rate": 1.5877437325905292e-06, + "loss": 0.2176, + "step": 
692 + }, + { + "epoch": 3.7, + "learning_rate": 1.5598885793871869e-06, + "loss": 0.2074, + "step": 693 + }, + { + "epoch": 3.7, + "learning_rate": 1.532033426183844e-06, + "loss": 0.2676, + "step": 694 + }, + { + "epoch": 3.71, + "learning_rate": 1.5041782729805015e-06, + "loss": 0.2096, + "step": 695 + }, + { + "epoch": 3.71, + "learning_rate": 1.4763231197771587e-06, + "loss": 0.2257, + "step": 696 + }, + { + "epoch": 3.72, + "learning_rate": 1.4484679665738164e-06, + "loss": 0.2439, + "step": 697 + }, + { + "epoch": 3.72, + "learning_rate": 1.4206128133704736e-06, + "loss": 0.2576, + "step": 698 + }, + { + "epoch": 3.73, + "learning_rate": 1.392757660167131e-06, + "loss": 0.1786, + "step": 699 + }, + { + "epoch": 3.73, + "learning_rate": 1.3649025069637884e-06, + "loss": 0.2873, + "step": 700 + }, + { + "epoch": 3.74, + "learning_rate": 1.3370473537604459e-06, + "loss": 0.222, + "step": 701 + }, + { + "epoch": 3.74, + "learning_rate": 1.309192200557103e-06, + "loss": 0.2331, + "step": 702 + }, + { + "epoch": 3.75, + "learning_rate": 1.2813370473537607e-06, + "loss": 0.2459, + "step": 703 + }, + { + "epoch": 3.75, + "learning_rate": 1.253481894150418e-06, + "loss": 0.2322, + "step": 704 + }, + { + "epoch": 3.76, + "learning_rate": 1.2256267409470754e-06, + "loss": 0.2691, + "step": 705 + }, + { + "epoch": 3.77, + "learning_rate": 1.1977715877437326e-06, + "loss": 0.1968, + "step": 706 + }, + { + "epoch": 3.77, + "learning_rate": 1.16991643454039e-06, + "loss": 0.238, + "step": 707 + }, + { + "epoch": 3.78, + "learning_rate": 1.1420612813370474e-06, + "loss": 0.3004, + "step": 708 + }, + { + "epoch": 3.78, + "learning_rate": 1.1142061281337048e-06, + "loss": 0.2753, + "step": 709 + }, + { + "epoch": 3.79, + "learning_rate": 1.0863509749303623e-06, + "loss": 0.2238, + "step": 710 + }, + { + "epoch": 3.79, + "learning_rate": 1.0584958217270195e-06, + "loss": 0.2687, + "step": 711 + }, + { + "epoch": 3.8, + "learning_rate": 1.030640668523677e-06, + "loss": 0.2211, + 
"step": 712 + }, + { + "epoch": 3.8, + "learning_rate": 1.0027855153203343e-06, + "loss": 0.2095, + "step": 713 + }, + { + "epoch": 3.81, + "learning_rate": 9.749303621169918e-07, + "loss": 0.2233, + "step": 714 + }, + { + "epoch": 3.81, + "learning_rate": 9.470752089136491e-07, + "loss": 0.209, + "step": 715 + }, + { + "epoch": 3.82, + "learning_rate": 9.192200557103065e-07, + "loss": 0.2595, + "step": 716 + }, + { + "epoch": 3.82, + "learning_rate": 8.913649025069638e-07, + "loss": 0.2651, + "step": 717 + }, + { + "epoch": 3.83, + "learning_rate": 8.635097493036213e-07, + "loss": 0.3013, + "step": 718 + }, + { + "epoch": 3.83, + "learning_rate": 8.356545961002786e-07, + "loss": 0.1956, + "step": 719 + }, + { + "epoch": 3.84, + "learning_rate": 8.07799442896936e-07, + "loss": 0.2065, + "step": 720 + }, + { + "epoch": 3.85, + "learning_rate": 7.799442896935934e-07, + "loss": 0.2818, + "step": 721 + }, + { + "epoch": 3.85, + "learning_rate": 7.520891364902508e-07, + "loss": 0.2413, + "step": 722 + }, + { + "epoch": 3.86, + "learning_rate": 7.242339832869082e-07, + "loss": 0.2346, + "step": 723 + }, + { + "epoch": 3.86, + "learning_rate": 6.963788300835655e-07, + "loss": 0.2072, + "step": 724 + }, + { + "epoch": 3.87, + "learning_rate": 6.685236768802229e-07, + "loss": 0.2383, + "step": 725 + }, + { + "epoch": 3.87, + "learning_rate": 6.406685236768804e-07, + "loss": 0.2462, + "step": 726 + }, + { + "epoch": 3.88, + "learning_rate": 6.128133704735377e-07, + "loss": 0.2599, + "step": 727 + }, + { + "epoch": 3.88, + "learning_rate": 5.84958217270195e-07, + "loss": 0.2566, + "step": 728 + }, + { + "epoch": 3.89, + "learning_rate": 5.571030640668524e-07, + "loss": 0.2654, + "step": 729 + }, + { + "epoch": 3.89, + "learning_rate": 5.292479108635097e-07, + "loss": 0.2149, + "step": 730 + }, + { + "epoch": 3.9, + "learning_rate": 5.013927576601672e-07, + "loss": 0.2274, + "step": 731 + }, + { + "epoch": 3.9, + "learning_rate": 4.7353760445682454e-07, + "loss": 0.2595, + 
"step": 732 + }, + { + "epoch": 3.91, + "learning_rate": 4.456824512534819e-07, + "loss": 0.2085, + "step": 733 + }, + { + "epoch": 3.91, + "learning_rate": 4.178272980501393e-07, + "loss": 0.2419, + "step": 734 + }, + { + "epoch": 3.92, + "learning_rate": 3.899721448467967e-07, + "loss": 0.2721, + "step": 735 + }, + { + "epoch": 3.93, + "learning_rate": 3.621169916434541e-07, + "loss": 0.241, + "step": 736 + }, + { + "epoch": 3.93, + "learning_rate": 3.3426183844011146e-07, + "loss": 0.2546, + "step": 737 + }, + { + "epoch": 3.94, + "learning_rate": 3.0640668523676884e-07, + "loss": 0.1945, + "step": 738 + }, + { + "epoch": 3.94, + "learning_rate": 2.785515320334262e-07, + "loss": 0.2452, + "step": 739 + }, + { + "epoch": 3.95, + "learning_rate": 2.506963788300836e-07, + "loss": 0.2698, + "step": 740 + }, + { + "epoch": 3.95, + "learning_rate": 2.2284122562674096e-07, + "loss": 0.2505, + "step": 741 + }, + { + "epoch": 3.96, + "learning_rate": 1.9498607242339836e-07, + "loss": 0.2346, + "step": 742 + }, + { + "epoch": 3.96, + "learning_rate": 1.6713091922005573e-07, + "loss": 0.2696, + "step": 743 + }, + { + "epoch": 3.97, + "learning_rate": 1.392757660167131e-07, + "loss": 0.2145, + "step": 744 + }, + { + "epoch": 3.97, + "learning_rate": 1.1142061281337048e-07, + "loss": 0.2884, + "step": 745 + }, + { + "epoch": 3.98, + "learning_rate": 8.356545961002787e-08, + "loss": 0.2469, + "step": 746 + }, + { + "epoch": 3.98, + "learning_rate": 5.571030640668524e-08, + "loss": 0.2127, + "step": 747 + }, + { + "epoch": 3.99, + "learning_rate": 2.785515320334262e-08, + "loss": 0.2353, + "step": 748 + }, + { + "epoch": 3.99, + "step": 748, + "total_flos": 313179619983360.0, + "train_loss": 0.4646715870236649, + "train_runtime": 51945.5375, + "train_samples_per_second": 1.848, + "train_steps_per_second": 0.014 + } + ], + "logging_steps": 1.0, + "max_steps": 748, + "num_input_tokens_seen": 0, + "num_train_epochs": 4, + "save_steps": 200, + "total_flos": 313179619983360.0, + 
"train_batch_size": 4, + "trial_name": null, + "trial_params": null +} diff --git a/training_args.bin b/training_args.bin new file mode 100644 index 0000000..2aad468 --- /dev/null +++ b/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dfadb548dd5b3e145a7ff2af886a5e06b321b88fbd918259d632606c53006ee1 +size 7160