commit ea2fef5908d532c23672924008267eaffb8ba325 Author: ModelHub XC Date: Fri Jun 12 17:02:18 2026 +0800 初始化项目,由ModelHub XC社区提供模型 Model: shuoxing/llama3-8b-full-pretrain-wash-c4-0-3m-sft-bs64 Source: Original Platform diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 0000000..52373fe --- /dev/null +++ b/.gitattributes @@ -0,0 +1,36 @@ +*.7z filter=lfs diff=lfs merge=lfs -text +*.arrow filter=lfs diff=lfs merge=lfs -text +*.bin filter=lfs diff=lfs merge=lfs -text +*.bz2 filter=lfs diff=lfs merge=lfs -text +*.ckpt filter=lfs diff=lfs merge=lfs -text +*.ftz filter=lfs diff=lfs merge=lfs -text +*.gz filter=lfs diff=lfs merge=lfs -text +*.h5 filter=lfs diff=lfs merge=lfs -text +*.joblib filter=lfs diff=lfs merge=lfs -text +*.lfs.* filter=lfs diff=lfs merge=lfs -text +*.mlmodel filter=lfs diff=lfs merge=lfs -text +*.model filter=lfs diff=lfs merge=lfs -text +*.msgpack filter=lfs diff=lfs merge=lfs -text +*.npy filter=lfs diff=lfs merge=lfs -text +*.npz filter=lfs diff=lfs merge=lfs -text +*.onnx filter=lfs diff=lfs merge=lfs -text +*.ot filter=lfs diff=lfs merge=lfs -text +*.parquet filter=lfs diff=lfs merge=lfs -text +*.pb filter=lfs diff=lfs merge=lfs -text +*.pickle filter=lfs diff=lfs merge=lfs -text +*.pkl filter=lfs diff=lfs merge=lfs -text +*.pt filter=lfs diff=lfs merge=lfs -text +*.pth filter=lfs diff=lfs merge=lfs -text +*.rar filter=lfs diff=lfs merge=lfs -text +*.safetensors filter=lfs diff=lfs merge=lfs -text +saved_model/**/* filter=lfs diff=lfs merge=lfs -text +*.tar.* filter=lfs diff=lfs merge=lfs -text +*.tar filter=lfs diff=lfs merge=lfs -text +*.tflite filter=lfs diff=lfs merge=lfs -text +*.tgz filter=lfs diff=lfs merge=lfs -text +*.wasm filter=lfs diff=lfs merge=lfs -text +*.xz filter=lfs diff=lfs merge=lfs -text +*.zip filter=lfs diff=lfs merge=lfs -text +*.zst filter=lfs diff=lfs merge=lfs -text +*tfevents* filter=lfs diff=lfs merge=lfs -text +tokenizer.json filter=lfs diff=lfs merge=lfs -text diff --git a/README.md b/README.md new file mode 100644 index 0000000..87f9c21 --- /dev/null +++ b/README.md @@ -0,0 +1,61 @@ +--- +library_name: transformers +license: llama3 +base_model: shuoxing/llama3-8b-full-pretrain-wash-c4-0-3m-bs4 +tags: +- llama-factory +- full +- generated_from_trainer +model-index: +- name: llama3-8b-full-pretrain-wash-c4-0-3m-sft-bs64 + results: [] +--- + + + +# llama3-8b-full-pretrain-wash-c4-0-3m-sft-bs64 + +This model is a fine-tuned version of [shuoxing/llama3-8b-full-pretrain-wash-c4-0-3m-bs4](https://huggingface.co/shuoxing/llama3-8b-full-pretrain-wash-c4-0-3m-bs4) on the alpaca_en dataset. + +## Model description + +More information needed + +## Intended uses & limitations + +More information needed + +## Training and evaluation data + +More information needed + +## Training procedure + +### Training hyperparameters + +The following hyperparameters were used during training: +- learning_rate: 1e-05 +- train_batch_size: 8 +- eval_batch_size: 8 +- seed: 42 +- distributed_type: multi-GPU +- num_devices: 4 +- gradient_accumulation_steps: 2 +- total_train_batch_size: 64 +- total_eval_batch_size: 32 +- optimizer: Use OptimizerNames.ADAMW_TORCH with betas=(0.9,0.999) and epsilon=1e-08 and optimizer_args=No additional optimizer arguments +- lr_scheduler_type: cosine +- lr_scheduler_warmup_steps: 0.1 +- num_epochs: 3.0 + +### Training results + + + +### Framework versions + +- Transformers 5.2.0 +- Pytorch 2.6.0+cu124 +- Datasets 4.0.0 +- Tokenizers 0.22.2 diff --git a/all_results.json b/all_results.json new file mode 100644 index 0000000..94873e1 --- /dev/null +++ b/all_results.json @@ -0,0 +1,8 @@ +{ + "epoch": 3.0, + "total_flos": 4888319754240.0, + "train_loss": 0.7582035779575759, + "train_runtime": 573.8365, + "train_samples_per_second": 26.14, + "train_steps_per_second": 0.413 +} \ No newline at end of file diff --git a/chat_template.jinja b/chat_template.jinja new file mode 100644 index 0000000..39bd0c9 --- /dev/null +++ b/chat_template.jinja @@ -0,0 +1,5 @@ +{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|> + +'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|> + +' }}{% endif %} \ No newline at end of file diff --git a/config.json b/config.json new file mode 100644 index 0000000..a5b8bc3 --- /dev/null +++ b/config.json @@ -0,0 +1,32 @@ +{ + "architectures": [ + "LlamaForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "bos_token_id": 128000, + "dtype": "bfloat16", + "eos_token_id": 128009, + "head_dim": 128, + "hidden_act": "silu", + "hidden_size": 4096, + "initializer_range": 0.02, + "intermediate_size": 14336, + "max_position_embeddings": 8192, + "mlp_bias": false, + "model_type": "llama", + "num_attention_heads": 32, + "num_hidden_layers": 32, + "num_key_value_heads": 8, + "pad_token_id": 128009, + "pretraining_tp": 1, + "rms_norm_eps": 1e-05, + "rope_parameters": { + "rope_theta": 500000.0, + "rope_type": "default" + }, + "tie_word_embeddings": false, + "transformers_version": "5.2.0", + "use_cache": false, + "vocab_size": 128256 +} diff --git a/generation_config.json b/generation_config.json new file mode 100644 index 0000000..eb23973 --- /dev/null +++ b/generation_config.json @@ -0,0 +1,13 @@ +{ + "bos_token_id": 128000, + "do_sample": true, + "eos_token_id": [ + 128001, + 128009 + ], + "max_length": 4096, + "pad_token_id": 128009, + "temperature": 0.6, + "top_p": 0.9, + "transformers_version": "5.2.0" +} diff --git a/model.safetensors b/model.safetensors new file mode 100644 index 0000000..825d04b --- /dev/null +++ b/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fff1132ce243c3a19926330aaa487d7c15e5e32512fef58147e67720fb46e37d +size 16060556616 diff --git a/tokenizer.json b/tokenizer.json new file mode 100644 index 0000000..1723111 --- /dev/null +++ b/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8c1dcab308e7cf5970ea38815e0a62887d705c5b436f869ca27a5dcdd40c36a6 +size 17210148 diff --git a/tokenizer_config.json b/tokenizer_config.json new file mode 100644 index 0000000..1bba0e4 --- /dev/null +++ b/tokenizer_config.json @@ -0,0 +1,19 @@ +{ + "backend": "tokenizers", + "bos_token": "<|begin_of_text|>", + "clean_up_tokenization_spaces": true, + "eos_token": "<|eot_id|>", + "extra_special_tokens": [ + "<|eom_id|>" + ], + "is_local": false, + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 1000000000000000019884624838656, + "pad_token": "<|eot_id|>", + "padding_side": "right", + "split_special_tokens": false, + "tokenizer_class": "TokenizersBackend" +} diff --git a/train_results.json b/train_results.json new file mode 100644 index 0000000..94873e1 --- /dev/null +++ b/train_results.json @@ -0,0 +1,8 @@ +{ + "epoch": 3.0, + "total_flos": 4888319754240.0, + "train_loss": 0.7582035779575759, + "train_runtime": 573.8365, + "train_samples_per_second": 26.14, + "train_steps_per_second": 0.413 +} \ No newline at end of file diff --git a/trainer_log.jsonl b/trainer_log.jsonl new file mode 100644 index 0000000..51b5097 --- /dev/null +++ b/trainer_log.jsonl @@ -0,0 +1,238 @@ +{"current_steps": 1, "total_steps": 237, "loss": 2.2484302520751953, "lr": 0.0, "epoch": 0.012738853503184714, "percentage": 0.42, "elapsed_time": "0:00:04", "remaining_time": "0:18:51"} +{"current_steps": 2, "total_steps": 237, "loss": 2.215416193008423, "lr": 4.1666666666666667e-07, "epoch": 0.025477707006369428, "percentage": 0.84, "elapsed_time": "0:00:06", "remaining_time": "0:13:24"} +{"current_steps": 3, "total_steps": 237, "loss": 2.1969661712646484, "lr": 8.333333333333333e-07, "epoch": 0.03821656050955414, "percentage": 1.27, "elapsed_time": "0:00:08", "remaining_time": "0:11:35"} +{"current_steps": 4, "total_steps": 237, "loss": 2.1304638385772705, "lr": 1.25e-06, "epoch": 0.050955414012738856, "percentage": 1.69, "elapsed_time": "0:00:11", "remaining_time": "0:11:01"} +{"current_steps": 5, "total_steps": 237, "loss": 2.321625232696533, "lr": 1.6666666666666667e-06, "epoch": 0.06369426751592357, "percentage": 2.11, "elapsed_time": "0:00:13", "remaining_time": "0:10:31"} +{"current_steps": 6, "total_steps": 237, "loss": 2.1303162574768066, "lr": 2.0833333333333334e-06, "epoch": 0.07643312101910828, "percentage": 2.53, "elapsed_time": "0:00:15", "remaining_time": "0:10:03"} +{"current_steps": 7, "total_steps": 237, "loss": 1.8076802492141724, "lr": 2.5e-06, "epoch": 0.08917197452229299, "percentage": 2.95, "elapsed_time": "0:00:18", "remaining_time": "0:10:03"} +{"current_steps": 8, "total_steps": 237, "loss": 1.7900886535644531, "lr": 2.916666666666667e-06, "epoch": 0.10191082802547771, "percentage": 3.38, "elapsed_time": "0:00:20", "remaining_time": "0:09:45"} +{"current_steps": 9, "total_steps": 237, "loss": 1.7915903329849243, "lr": 3.3333333333333333e-06, "epoch": 0.11464968152866242, "percentage": 3.8, "elapsed_time": "0:00:22", "remaining_time": "0:09:31"} +{"current_steps": 10, "total_steps": 237, "loss": 1.8171511888504028, "lr": 3.7500000000000005e-06, "epoch": 0.12738853503184713, "percentage": 4.22, "elapsed_time": "0:00:24", "remaining_time": "0:09:18"} +{"current_steps": 11, "total_steps": 237, "loss": 1.7455570697784424, "lr": 4.166666666666667e-06, "epoch": 0.14012738853503184, "percentage": 4.64, "elapsed_time": "0:00:26", "remaining_time": "0:09:09"} +{"current_steps": 12, "total_steps": 237, "loss": 1.6337864398956299, "lr": 4.583333333333333e-06, "epoch": 0.15286624203821655, "percentage": 5.06, "elapsed_time": "0:00:28", "remaining_time": "0:09:02"} +{"current_steps": 13, "total_steps": 237, "loss": 1.778015375137329, "lr": 5e-06, "epoch": 0.16560509554140126, "percentage": 5.49, "elapsed_time": "0:00:30", "remaining_time": "0:08:53"} +{"current_steps": 14, "total_steps": 237, "loss": 1.712306022644043, "lr": 5.416666666666667e-06, "epoch": 0.17834394904458598, "percentage": 5.91, "elapsed_time": "0:00:33", "remaining_time": "0:08:46"} +{"current_steps": 15, "total_steps": 237, "loss": 1.5768513679504395, "lr": 5.833333333333334e-06, "epoch": 0.1910828025477707, "percentage": 6.33, "elapsed_time": "0:00:35", "remaining_time": "0:08:39"} +{"current_steps": 16, "total_steps": 237, "loss": 1.5611257553100586, "lr": 6.25e-06, "epoch": 0.20382165605095542, "percentage": 6.75, "elapsed_time": "0:00:37", "remaining_time": "0:08:34"} +{"current_steps": 17, "total_steps": 237, "loss": 1.5967652797698975, "lr": 6.666666666666667e-06, "epoch": 0.21656050955414013, "percentage": 7.17, "elapsed_time": "0:00:39", "remaining_time": "0:08:28"} +{"current_steps": 18, "total_steps": 237, "loss": 1.364829659461975, "lr": 7.083333333333335e-06, "epoch": 0.22929936305732485, "percentage": 7.59, "elapsed_time": "0:00:41", "remaining_time": "0:08:22"} +{"current_steps": 19, "total_steps": 237, "loss": 1.6430319547653198, "lr": 7.500000000000001e-06, "epoch": 0.24203821656050956, "percentage": 8.02, "elapsed_time": "0:00:43", "remaining_time": "0:08:18"} +{"current_steps": 20, "total_steps": 237, "loss": 1.5467270612716675, "lr": 7.916666666666667e-06, "epoch": 0.25477707006369427, "percentage": 8.44, "elapsed_time": "0:00:45", "remaining_time": "0:08:13"} +{"current_steps": 21, "total_steps": 237, "loss": 1.5446631908416748, "lr": 8.333333333333334e-06, "epoch": 0.267515923566879, "percentage": 8.86, "elapsed_time": "0:00:47", "remaining_time": "0:08:10"} +{"current_steps": 22, "total_steps": 237, "loss": 1.4481780529022217, "lr": 8.750000000000001e-06, "epoch": 0.2802547770700637, "percentage": 9.28, "elapsed_time": "0:00:49", "remaining_time": "0:08:07"} +{"current_steps": 23, "total_steps": 237, "loss": 1.4933228492736816, "lr": 9.166666666666666e-06, "epoch": 0.2929936305732484, "percentage": 9.7, "elapsed_time": "0:00:51", "remaining_time": "0:08:02"} +{"current_steps": 24, "total_steps": 237, "loss": 1.5807710886001587, "lr": 9.583333333333335e-06, "epoch": 0.3057324840764331, "percentage": 10.13, "elapsed_time": "0:00:53", "remaining_time": "0:07:58"} +{"current_steps": 25, "total_steps": 237, "loss": 1.349104404449463, "lr": 1e-05, "epoch": 0.3184713375796178, "percentage": 10.55, "elapsed_time": "0:00:56", "remaining_time": "0:07:56"} +{"current_steps": 26, "total_steps": 237, "loss": 1.4354019165039062, "lr": 9.999456158087994e-06, "epoch": 0.33121019108280253, "percentage": 10.97, "elapsed_time": "0:00:58", "remaining_time": "0:07:53"} +{"current_steps": 27, "total_steps": 237, "loss": 1.566201090812683, "lr": 9.997824750657586e-06, "epoch": 0.34394904458598724, "percentage": 11.39, "elapsed_time": "0:01:00", "remaining_time": "0:07:50"} +{"current_steps": 28, "total_steps": 237, "loss": 1.411285161972046, "lr": 9.995106132599869e-06, "epoch": 0.35668789808917195, "percentage": 11.81, "elapsed_time": "0:01:02", "remaining_time": "0:07:47"} +{"current_steps": 29, "total_steps": 237, "loss": 1.2867789268493652, "lr": 9.99130089531422e-06, "epoch": 0.36942675159235666, "percentage": 12.24, "elapsed_time": "0:01:04", "remaining_time": "0:07:46"} +{"current_steps": 30, "total_steps": 237, "loss": 1.5999436378479004, "lr": 9.98640986657965e-06, "epoch": 0.3821656050955414, "percentage": 12.66, "elapsed_time": "0:01:07", "remaining_time": "0:07:42"} +{"current_steps": 31, "total_steps": 237, "loss": 1.4318150281906128, "lr": 9.980434110374725e-06, "epoch": 0.39490445859872614, "percentage": 13.08, "elapsed_time": "0:01:09", "remaining_time": "0:07:39"} +{"current_steps": 32, "total_steps": 237, "loss": 1.607371211051941, "lr": 9.973374926646117e-06, "epoch": 0.40764331210191085, "percentage": 13.5, "elapsed_time": "0:01:11", "remaining_time": "0:07:36"} +{"current_steps": 33, "total_steps": 237, "loss": 1.443784236907959, "lr": 9.965233851025816e-06, "epoch": 0.42038216560509556, "percentage": 13.92, "elapsed_time": "0:01:13", "remaining_time": "0:07:33"} +{"current_steps": 34, "total_steps": 237, "loss": 1.570559024810791, "lr": 9.956012654497073e-06, "epoch": 0.43312101910828027, "percentage": 14.35, "elapsed_time": "0:01:15", "remaining_time": "0:07:30"} +{"current_steps": 35, "total_steps": 237, "loss": 1.548865556716919, "lr": 9.945713343009154e-06, "epoch": 0.445859872611465, "percentage": 14.77, "elapsed_time": "0:01:17", "remaining_time": "0:07:27"} +{"current_steps": 36, "total_steps": 237, "loss": 1.4340442419052124, "lr": 9.934338157040953e-06, "epoch": 0.4585987261146497, "percentage": 15.19, "elapsed_time": "0:01:19", "remaining_time": "0:07:24"} +{"current_steps": 37, "total_steps": 237, "loss": 1.5494410991668701, "lr": 9.921889571113629e-06, "epoch": 0.4713375796178344, "percentage": 15.61, "elapsed_time": "0:01:21", "remaining_time": "0:07:21"} +{"current_steps": 38, "total_steps": 237, "loss": 1.4130847454071045, "lr": 9.90837029325229e-06, "epoch": 0.4840764331210191, "percentage": 16.03, "elapsed_time": "0:01:23", "remaining_time": "0:07:18"} +{"current_steps": 39, "total_steps": 237, "loss": 1.4265036582946777, "lr": 9.893783264396903e-06, "epoch": 0.4968152866242038, "percentage": 16.46, "elapsed_time": "0:01:25", "remaining_time": "0:07:15"} +{"current_steps": 40, "total_steps": 237, "loss": 1.4373618364334106, "lr": 9.878131657762535e-06, "epoch": 0.5095541401273885, "percentage": 16.88, "elapsed_time": "0:01:27", "remaining_time": "0:07:12"} +{"current_steps": 41, "total_steps": 237, "loss": 1.4085681438446045, "lr": 9.861418878149056e-06, "epoch": 0.5222929936305732, "percentage": 17.3, "elapsed_time": "0:01:30", "remaining_time": "0:07:10"} +{"current_steps": 42, "total_steps": 237, "loss": 1.452268123626709, "lr": 9.843648561200476e-06, "epoch": 0.535031847133758, "percentage": 17.72, "elapsed_time": "0:01:32", "remaining_time": "0:07:07"} +{"current_steps": 43, "total_steps": 237, "loss": 1.6110832691192627, "lr": 9.82482457261405e-06, "epoch": 0.5477707006369427, "percentage": 18.14, "elapsed_time": "0:01:34", "remaining_time": "0:07:04"} +{"current_steps": 44, "total_steps": 237, "loss": 1.4959537982940674, "lr": 9.80495100729936e-06, "epoch": 0.5605095541401274, "percentage": 18.57, "elapsed_time": "0:01:36", "remaining_time": "0:07:02"} +{"current_steps": 45, "total_steps": 237, "loss": 1.4369564056396484, "lr": 9.784032188487507e-06, "epoch": 0.5732484076433121, "percentage": 18.99, "elapsed_time": "0:01:38", "remaining_time": "0:06:59"} +{"current_steps": 46, "total_steps": 237, "loss": 1.5659615993499756, "lr": 9.762072666790658e-06, "epoch": 0.5859872611464968, "percentage": 19.41, "elapsed_time": "0:01:40", "remaining_time": "0:06:57"} +{"current_steps": 47, "total_steps": 237, "loss": 1.6203088760375977, "lr": 9.73907721921212e-06, "epoch": 0.5987261146496815, "percentage": 19.83, "elapsed_time": "0:01:42", "remaining_time": "0:06:54"} +{"current_steps": 48, "total_steps": 237, "loss": 1.4394254684448242, "lr": 9.715050848107167e-06, "epoch": 0.6114649681528662, "percentage": 20.25, "elapsed_time": "0:01:44", "remaining_time": "0:06:52"} +{"current_steps": 49, "total_steps": 237, "loss": 1.3903216123580933, "lr": 9.689998780094839e-06, "epoch": 0.6242038216560509, "percentage": 20.68, "elapsed_time": "0:01:46", "remaining_time": "0:06:49"} +{"current_steps": 50, "total_steps": 237, "loss": 1.4829354286193848, "lr": 9.663926464920959e-06, "epoch": 0.6369426751592356, "percentage": 21.1, "elapsed_time": "0:01:48", "remaining_time": "0:06:46"} +{"current_steps": 51, "total_steps": 237, "loss": 1.5274395942687988, "lr": 9.636839574272623e-06, "epoch": 0.6496815286624203, "percentage": 21.52, "elapsed_time": "0:01:50", "remaining_time": "0:06:44"} +{"current_steps": 52, "total_steps": 237, "loss": 1.4694490432739258, "lr": 9.608744000544392e-06, "epoch": 0.6624203821656051, "percentage": 21.94, "elapsed_time": "0:01:52", "remaining_time": "0:06:41"} +{"current_steps": 53, "total_steps": 237, "loss": 1.2353503704071045, "lr": 9.579645855556481e-06, "epoch": 0.6751592356687898, "percentage": 22.36, "elapsed_time": "0:01:55", "remaining_time": "0:06:39"} +{"current_steps": 54, "total_steps": 237, "loss": 1.4008901119232178, "lr": 9.54955146922521e-06, "epoch": 0.6878980891719745, "percentage": 22.78, "elapsed_time": "0:01:57", "remaining_time": "0:06:37"} +{"current_steps": 55, "total_steps": 237, "loss": 1.3539741039276123, "lr": 9.51846738818602e-06, "epoch": 0.7006369426751592, "percentage": 23.21, "elapsed_time": "0:01:59", "remaining_time": "0:06:35"} +{"current_steps": 56, "total_steps": 237, "loss": 1.3163714408874512, "lr": 9.48640037436934e-06, "epoch": 0.7133757961783439, "percentage": 23.63, "elapsed_time": "0:02:01", "remaining_time": "0:06:32"} +{"current_steps": 57, "total_steps": 237, "loss": 1.3809059858322144, "lr": 9.453357403529609e-06, "epoch": 0.7261146496815286, "percentage": 24.05, "elapsed_time": "0:02:03", "remaining_time": "0:06:30"} +{"current_steps": 58, "total_steps": 237, "loss": 1.458146572113037, "lr": 9.419345663727805e-06, "epoch": 0.7388535031847133, "percentage": 24.47, "elapsed_time": "0:02:05", "remaining_time": "0:06:27"} +{"current_steps": 59, "total_steps": 237, "loss": 1.329193115234375, "lr": 9.38437255376777e-06, "epoch": 0.7515923566878981, "percentage": 24.89, "elapsed_time": "0:02:07", "remaining_time": "0:06:25"} +{"current_steps": 60, "total_steps": 237, "loss": 1.4500741958618164, "lr": 9.348445681586703e-06, "epoch": 0.7643312101910829, "percentage": 25.32, "elapsed_time": "0:02:09", "remaining_time": "0:06:22"} +{"current_steps": 61, "total_steps": 237, "loss": 1.4562097787857056, "lr": 9.31157286260014e-06, "epoch": 0.7770700636942676, "percentage": 25.74, "elapsed_time": "0:02:11", "remaining_time": "0:06:20"} +{"current_steps": 62, "total_steps": 237, "loss": 1.3661162853240967, "lr": 9.273762118001837e-06, "epoch": 0.7898089171974523, "percentage": 26.16, "elapsed_time": "0:02:13", "remaining_time": "0:06:17"} +{"current_steps": 63, "total_steps": 237, "loss": 1.3168445825576782, "lr": 9.235021673018849e-06, "epoch": 0.802547770700637, "percentage": 26.58, "elapsed_time": "0:02:15", "remaining_time": "0:06:15"} +{"current_steps": 64, "total_steps": 237, "loss": 1.3281530141830444, "lr": 9.195359955122244e-06, "epoch": 0.8152866242038217, "percentage": 27.0, "elapsed_time": "0:02:18", "remaining_time": "0:06:13"} +{"current_steps": 65, "total_steps": 237, "loss": 1.3520253896713257, "lr": 9.15478559219382e-06, "epoch": 0.8280254777070064, "percentage": 27.43, "elapsed_time": "0:02:20", "remaining_time": "0:06:10"} +{"current_steps": 66, "total_steps": 237, "loss": 1.4982115030288696, "lr": 9.113307410649222e-06, "epoch": 0.8407643312101911, "percentage": 27.85, "elapsed_time": "0:02:22", "remaining_time": "0:06:08"} +{"current_steps": 67, "total_steps": 237, "loss": 1.402880311012268, "lr": 9.070934433517872e-06, "epoch": 0.8535031847133758, "percentage": 28.27, "elapsed_time": "0:02:24", "remaining_time": "0:06:05"} +{"current_steps": 68, "total_steps": 237, "loss": 1.4359843730926514, "lr": 9.027675878480131e-06, "epoch": 0.8662420382165605, "percentage": 28.69, "elapsed_time": "0:02:26", "remaining_time": "0:06:03"} +{"current_steps": 69, "total_steps": 237, "loss": 1.4095585346221924, "lr": 8.983541155862114e-06, "epoch": 0.8789808917197452, "percentage": 29.11, "elapsed_time": "0:02:28", "remaining_time": "0:06:01"} +{"current_steps": 70, "total_steps": 237, "loss": 1.3254384994506836, "lr": 8.938539866588593e-06, "epoch": 0.89171974522293, "percentage": 29.54, "elapsed_time": "0:02:30", "remaining_time": "0:05:58"} +{"current_steps": 71, "total_steps": 237, "loss": 1.3895121812820435, "lr": 8.892681800094447e-06, "epoch": 0.9044585987261147, "percentage": 29.96, "elapsed_time": "0:02:32", "remaining_time": "0:05:56"} +{"current_steps": 72, "total_steps": 237, "loss": 1.5136423110961914, "lr": 8.845976932195104e-06, "epoch": 0.9171974522292994, "percentage": 30.38, "elapsed_time": "0:02:34", "remaining_time": "0:05:54"} +{"current_steps": 73, "total_steps": 237, "loss": 1.560758352279663, "lr": 8.798435422916425e-06, "epoch": 0.9299363057324841, "percentage": 30.8, "elapsed_time": "0:02:36", "remaining_time": "0:05:51"} +{"current_steps": 74, "total_steps": 237, "loss": 1.2931057214736938, "lr": 8.750067614284534e-06, "epoch": 0.9426751592356688, "percentage": 31.22, "elapsed_time": "0:02:38", "remaining_time": "0:05:49"} +{"current_steps": 75, "total_steps": 237, "loss": 1.5124843120574951, "lr": 8.700884028076042e-06, "epoch": 0.9554140127388535, "percentage": 31.65, "elapsed_time": "0:02:40", "remaining_time": "0:05:47"} +{"current_steps": 76, "total_steps": 237, "loss": 1.317713737487793, "lr": 8.650895363529172e-06, "epoch": 0.9681528662420382, "percentage": 32.07, "elapsed_time": "0:02:42", "remaining_time": "0:05:45"} +{"current_steps": 77, "total_steps": 237, "loss": 1.3039919137954712, "lr": 8.600112495016289e-06, "epoch": 0.9808917197452229, "percentage": 32.49, "elapsed_time": "0:02:45", "remaining_time": "0:05:42"} +{"current_steps": 78, "total_steps": 237, "loss": 1.495795488357544, "lr": 8.548546469678311e-06, "epoch": 0.9936305732484076, "percentage": 32.91, "elapsed_time": "0:02:47", "remaining_time": "0:05:40"} +{"current_steps": 79, "total_steps": 237, "loss": 1.429541826248169, "lr": 8.496208505021572e-06, "epoch": 1.0, "percentage": 33.33, "elapsed_time": "0:02:48", "remaining_time": "0:05:36"} +{"current_steps": 80, "total_steps": 237, "loss": 0.8995598554611206, "lr": 8.443109986477574e-06, "epoch": 1.0127388535031847, "percentage": 33.76, "elapsed_time": "0:02:50", "remaining_time": "0:05:34"} +{"current_steps": 81, "total_steps": 237, "loss": 0.63990318775177, "lr": 8.389262464926256e-06, "epoch": 1.0254777070063694, "percentage": 34.18, "elapsed_time": "0:02:52", "remaining_time": "0:05:32"} +{"current_steps": 82, "total_steps": 237, "loss": 0.6055729985237122, "lr": 8.334677654183254e-06, "epoch": 1.0382165605095541, "percentage": 34.6, "elapsed_time": "0:02:54", "remaining_time": "0:05:30"} +{"current_steps": 83, "total_steps": 237, "loss": 0.7356538772583008, "lr": 8.279367428451703e-06, "epoch": 1.0509554140127388, "percentage": 35.02, "elapsed_time": "0:02:57", "remaining_time": "0:05:28"} +{"current_steps": 84, "total_steps": 237, "loss": 0.692323625087738, "lr": 8.223343819739164e-06, "epoch": 1.0636942675159236, "percentage": 35.44, "elapsed_time": "0:02:59", "remaining_time": "0:05:26"} +{"current_steps": 85, "total_steps": 237, "loss": 0.6772887706756592, "lr": 8.166619015240236e-06, "epoch": 1.0764331210191083, "percentage": 35.86, "elapsed_time": "0:03:01", "remaining_time": "0:05:24"} +{"current_steps": 86, "total_steps": 237, "loss": 0.5514630675315857, "lr": 8.109205354685367e-06, "epoch": 1.089171974522293, "percentage": 36.29, "elapsed_time": "0:03:03", "remaining_time": "0:05:21"} +{"current_steps": 87, "total_steps": 237, "loss": 0.6684471964836121, "lr": 8.051115327656538e-06, "epoch": 1.1019108280254777, "percentage": 36.71, "elapsed_time": "0:03:05", "remaining_time": "0:05:19"} +{"current_steps": 88, "total_steps": 237, "loss": 0.5766518712043762, "lr": 7.992361570870289e-06, "epoch": 1.1146496815286624, "percentage": 37.13, "elapsed_time": "0:03:07", "remaining_time": "0:05:17"} +{"current_steps": 89, "total_steps": 237, "loss": 0.5921903848648071, "lr": 7.932956865428792e-06, "epoch": 1.127388535031847, "percentage": 37.55, "elapsed_time": "0:03:09", "remaining_time": "0:05:15"} +{"current_steps": 90, "total_steps": 237, "loss": 0.592995285987854, "lr": 7.872914134039485e-06, "epoch": 1.1401273885350318, "percentage": 37.97, "elapsed_time": "0:03:11", "remaining_time": "0:05:13"} +{"current_steps": 91, "total_steps": 237, "loss": 0.5482683181762695, "lr": 7.812246438203905e-06, "epoch": 1.1528662420382165, "percentage": 38.4, "elapsed_time": "0:03:13", "remaining_time": "0:05:10"} +{"current_steps": 92, "total_steps": 237, "loss": 0.6826972365379333, "lr": 7.750966975376328e-06, "epoch": 1.1656050955414012, "percentage": 38.82, "elapsed_time": "0:03:15", "remaining_time": "0:05:08"} +{"current_steps": 93, "total_steps": 237, "loss": 0.5954027771949768, "lr": 7.689089076092851e-06, "epoch": 1.178343949044586, "percentage": 39.24, "elapsed_time": "0:03:18", "remaining_time": "0:05:06"} +{"current_steps": 94, "total_steps": 237, "loss": 0.6095083355903625, "lr": 7.626626201071494e-06, "epoch": 1.1910828025477707, "percentage": 39.66, "elapsed_time": "0:03:20", "remaining_time": "0:05:04"} +{"current_steps": 95, "total_steps": 237, "loss": 0.709877610206604, "lr": 7.563591938284012e-06, "epoch": 1.2038216560509554, "percentage": 40.08, "elapsed_time": "0:03:22", "remaining_time": "0:05:02"} +{"current_steps": 96, "total_steps": 237, "loss": 0.5784502029418945, "lr": 7.500000000000001e-06, "epoch": 1.21656050955414, "percentage": 40.51, "elapsed_time": "0:03:24", "remaining_time": "0:05:00"} +{"current_steps": 97, "total_steps": 237, "loss": 0.5837893486022949, "lr": 7.4358642198039835e-06, "epoch": 1.2292993630573248, "percentage": 40.93, "elapsed_time": "0:03:26", "remaining_time": "0:04:57"} +{"current_steps": 98, "total_steps": 237, "loss": 0.7246421575546265, "lr": 7.371198549586091e-06, "epoch": 1.2420382165605095, "percentage": 41.35, "elapsed_time": "0:03:28", "remaining_time": "0:04:55"} +{"current_steps": 99, "total_steps": 237, "loss": 0.5735586285591125, "lr": 7.306017056507018e-06, "epoch": 1.2547770700636942, "percentage": 41.77, "elapsed_time": "0:03:30", "remaining_time": "0:04:53"} +{"current_steps": 100, "total_steps": 237, "loss": 0.5463488101959229, "lr": 7.240333919937893e-06, "epoch": 1.267515923566879, "percentage": 42.19, "elapsed_time": "0:03:32", "remaining_time": "0:04:51"} +{"current_steps": 101, "total_steps": 237, "loss": 0.5633252859115601, "lr": 7.174163428375748e-06, "epoch": 1.2802547770700636, "percentage": 42.62, "elapsed_time": "0:03:34", "remaining_time": "0:04:48"} +{"current_steps": 102, "total_steps": 237, "loss": 0.5037230253219604, "lr": 7.107519976335241e-06, "epoch": 1.2929936305732483, "percentage": 43.04, "elapsed_time": "0:03:36", "remaining_time": "0:04:46"} +{"current_steps": 103, "total_steps": 237, "loss": 0.5365867614746094, "lr": 7.040418061217325e-06, "epoch": 1.305732484076433, "percentage": 43.46, "elapsed_time": "0:03:38", "remaining_time": "0:04:44"} +{"current_steps": 104, "total_steps": 237, "loss": 0.6433064937591553, "lr": 6.972872280155528e-06, "epoch": 1.3184713375796178, "percentage": 43.88, "elapsed_time": "0:03:40", "remaining_time": "0:04:42"} +{"current_steps": 105, "total_steps": 237, "loss": 0.6543390154838562, "lr": 6.9048973268405375e-06, "epoch": 1.3312101910828025, "percentage": 44.3, "elapsed_time": "0:03:42", "remaining_time": "0:04:40"} +{"current_steps": 106, "total_steps": 237, "loss": 0.6132720708847046, "lr": 6.836507988323785e-06, "epoch": 1.3439490445859872, "percentage": 44.73, "elapsed_time": "0:03:44", "remaining_time": "0:04:38"} +{"current_steps": 107, "total_steps": 237, "loss": 0.6079248189926147, "lr": 6.767719141800718e-06, "epoch": 1.356687898089172, "percentage": 45.15, "elapsed_time": "0:03:47", "remaining_time": "0:04:35"} +{"current_steps": 108, "total_steps": 237, "loss": 0.6232650279998779, "lr": 6.698545751374465e-06, "epoch": 1.3694267515923566, "percentage": 45.57, "elapsed_time": "0:03:49", "remaining_time": "0:04:33"} +{"current_steps": 109, "total_steps": 237, "loss": 0.5911256074905396, "lr": 6.629002864800589e-06, "epoch": 1.3821656050955413, "percentage": 45.99, "elapsed_time": "0:03:51", "remaining_time": "0:04:31"} +{"current_steps": 110, "total_steps": 237, "loss": 0.6404790282249451, "lr": 6.55910561021365e-06, "epoch": 1.394904458598726, "percentage": 46.41, "elapsed_time": "0:03:53", "remaining_time": "0:04:29"} +{"current_steps": 111, "total_steps": 237, "loss": 0.6661736965179443, "lr": 6.488869192836279e-06, "epoch": 1.4076433121019107, "percentage": 46.84, "elapsed_time": "0:03:55", "remaining_time": "0:04:27"} +{"current_steps": 112, "total_steps": 237, "loss": 0.5621084570884705, "lr": 6.418308891671484e-06, "epoch": 1.4203821656050954, "percentage": 47.26, "elapsed_time": "0:03:57", "remaining_time": "0:04:24"} +{"current_steps": 113, "total_steps": 237, "loss": 0.5913956165313721, "lr": 6.347440056178904e-06, "epoch": 1.4331210191082802, "percentage": 47.68, "elapsed_time": "0:03:59", "remaining_time": "0:04:22"} +{"current_steps": 114, "total_steps": 237, "loss": 0.5895659327507019, "lr": 6.27627810293574e-06, "epoch": 1.4458598726114649, "percentage": 48.1, "elapsed_time": "0:04:01", "remaining_time": "0:04:20"} +{"current_steps": 115, "total_steps": 237, "loss": 0.6066327691078186, "lr": 6.204838512283073e-06, "epoch": 1.4585987261146496, "percentage": 48.52, "elapsed_time": "0:04:03", "remaining_time": "0:04:18"} +{"current_steps": 116, "total_steps": 237, "loss": 0.579125165939331, "lr": 6.133136824958334e-06, "epoch": 1.4713375796178343, "percentage": 48.95, "elapsed_time": "0:04:05", "remaining_time": "0:04:16"} +{"current_steps": 117, "total_steps": 237, "loss": 0.5661747455596924, "lr": 6.061188638714616e-06, "epoch": 1.484076433121019, "percentage": 49.37, "elapsed_time": "0:04:07", "remaining_time": "0:04:14"} +{"current_steps": 118, "total_steps": 237, "loss": 0.5881543159484863, "lr": 5.989009604927587e-06, "epoch": 1.4968152866242037, "percentage": 49.79, "elapsed_time": "0:04:09", "remaining_time": "0:04:12"} +{"current_steps": 119, "total_steps": 237, "loss": 0.6381370425224304, "lr": 5.916615425190744e-06, "epoch": 1.5095541401273884, "percentage": 50.21, "elapsed_time": "0:04:12", "remaining_time": "0:04:09"} +{"current_steps": 120, "total_steps": 237, "loss": 0.5820121765136719, "lr": 5.844021847899735e-06, "epoch": 1.5222929936305731, "percentage": 50.63, "elapsed_time": "0:04:14", "remaining_time": "0:04:07"} +{"current_steps": 121, "total_steps": 237, "loss": 0.5244691371917725, "lr": 5.771244664826512e-06, "epoch": 1.5350318471337578, "percentage": 51.05, "elapsed_time": "0:04:16", "remaining_time": "0:04:05"} +{"current_steps": 122, "total_steps": 237, "loss": 0.6596621870994568, "lr": 5.698299707684031e-06, "epoch": 1.5477707006369426, "percentage": 51.48, "elapsed_time": "0:04:18", "remaining_time": "0:04:03"} +{"current_steps": 123, "total_steps": 237, "loss": 0.6240249872207642, "lr": 5.6252028446822805e-06, "epoch": 1.5605095541401273, "percentage": 51.9, "elapsed_time": "0:04:20", "remaining_time": "0:04:01"} +{"current_steps": 124, "total_steps": 237, "loss": 0.6121684312820435, "lr": 5.55196997707635e-06, "epoch": 1.573248407643312, "percentage": 52.32, "elapsed_time": "0:04:22", "remaining_time": "0:03:59"} +{"current_steps": 125, "total_steps": 237, "loss": 0.581444263458252, "lr": 5.478617035707337e-06, "epoch": 1.5859872611464967, "percentage": 52.74, "elapsed_time": "0:04:24", "remaining_time": "0:03:56"} +{"current_steps": 126, "total_steps": 237, "loss": 0.5702801942825317, "lr": 5.4051599775368e-06, "epoch": 1.5987261146496814, "percentage": 53.16, "elapsed_time": "0:04:26", "remaining_time": "0:03:55"} +{"current_steps": 127, "total_steps": 237, "loss": 0.643683671951294, "lr": 5.33161478217552e-06, "epoch": 1.611464968152866, "percentage": 53.59, "elapsed_time": "0:04:28", "remaining_time": "0:03:52"} +{"current_steps": 128, "total_steps": 237, "loss": 0.6429088115692139, "lr": 5.257997448407366e-06, "epoch": 1.6242038216560508, "percentage": 54.01, "elapsed_time": "0:04:31", "remaining_time": "0:03:50"} +{"current_steps": 129, "total_steps": 237, "loss": 0.5036097764968872, "lr": 5.184323990708959e-06, "epoch": 1.6369426751592355, "percentage": 54.43, "elapsed_time": "0:04:33", "remaining_time": "0:03:48"} +{"current_steps": 130, "total_steps": 237, "loss": 0.6377817392349243, "lr": 5.110610435765935e-06, "epoch": 1.6496815286624202, "percentage": 54.85, "elapsed_time": "0:04:35", "remaining_time": "0:03:46"} +{"current_steps": 131, "total_steps": 237, "loss": 0.5092718601226807, "lr": 5.0368728189865624e-06, "epoch": 1.662420382165605, "percentage": 55.27, "elapsed_time": "0:04:37", "remaining_time": "0:03:44"} +{"current_steps": 132, "total_steps": 237, "loss": 0.6005362868309021, "lr": 4.9631271810134375e-06, "epoch": 1.6751592356687897, "percentage": 55.7, "elapsed_time": "0:04:39", "remaining_time": "0:03:42"} +{"current_steps": 133, "total_steps": 237, "loss": 0.4808087944984436, "lr": 4.8893895642340665e-06, "epoch": 1.6878980891719744, "percentage": 56.12, "elapsed_time": "0:04:41", "remaining_time": "0:03:40"} +{"current_steps": 134, "total_steps": 237, "loss": 0.6739586591720581, "lr": 4.815676009291044e-06, "epoch": 1.700636942675159, "percentage": 56.54, "elapsed_time": "0:04:43", "remaining_time": "0:03:38"} +{"current_steps": 135, "total_steps": 237, "loss": 0.5722870826721191, "lr": 4.742002551592635e-06, "epoch": 1.7133757961783438, "percentage": 56.96, "elapsed_time": "0:04:45", "remaining_time": "0:03:36"} +{"current_steps": 136, "total_steps": 237, "loss": 0.5560994148254395, "lr": 4.668385217824482e-06, "epoch": 1.7261146496815285, "percentage": 57.38, "elapsed_time": "0:04:48", "remaining_time": "0:03:33"} +{"current_steps": 137, "total_steps": 237, "loss": 0.6376844644546509, "lr": 4.594840022463201e-06, "epoch": 1.7388535031847132, "percentage": 57.81, "elapsed_time": "0:04:50", "remaining_time": "0:03:31"} +{"current_steps": 138, "total_steps": 237, "loss": 0.5070189237594604, "lr": 4.5213829642926635e-06, "epoch": 1.7515923566878981, "percentage": 58.23, "elapsed_time": "0:04:52", "remaining_time": "0:03:29"} +{"current_steps": 139, "total_steps": 237, "loss": 0.6301469206809998, "lr": 4.4480300229236525e-06, "epoch": 1.7643312101910829, "percentage": 58.65, "elapsed_time": "0:04:54", "remaining_time": "0:03:27"} +{"current_steps": 140, "total_steps": 237, "loss": 0.5686060190200806, "lr": 4.374797155317721e-06, "epoch": 1.7770700636942676, "percentage": 59.07, "elapsed_time": "0:04:56", "remaining_time": "0:03:25"} +{"current_steps": 141, "total_steps": 237, "loss": 0.5702610015869141, "lr": 4.30170029231597e-06, "epoch": 1.7898089171974523, "percentage": 59.49, "elapsed_time": "0:04:58", "remaining_time": "0:03:23"} +{"current_steps": 142, "total_steps": 237, "loss": 0.5375156402587891, "lr": 4.228755335173488e-06, "epoch": 1.802547770700637, "percentage": 59.92, "elapsed_time": "0:05:00", "remaining_time": "0:03:21"} +{"current_steps": 143, "total_steps": 237, "loss": 0.588652491569519, "lr": 4.155978152100266e-06, "epoch": 1.8152866242038217, "percentage": 60.34, "elapsed_time": "0:05:02", "remaining_time": "0:03:18"} +{"current_steps": 144, "total_steps": 237, "loss": 0.6560136079788208, "lr": 4.0833845748092586e-06, "epoch": 1.8280254777070064, "percentage": 60.76, "elapsed_time": "0:05:04", "remaining_time": "0:03:16"} +{"current_steps": 145, "total_steps": 237, "loss": 0.5707780718803406, "lr": 4.010990395072414e-06, "epoch": 1.8407643312101911, "percentage": 61.18, "elapsed_time": "0:05:06", "remaining_time": "0:03:14"} +{"current_steps": 146, "total_steps": 237, "loss": 0.578855574131012, "lr": 3.938811361285386e-06, "epoch": 1.8535031847133758, "percentage": 61.6, "elapsed_time": "0:05:08", "remaining_time": "0:03:12"} +{"current_steps": 147, "total_steps": 237, "loss": 0.7337894439697266, "lr": 3.866863175041666e-06, "epoch": 1.8662420382165605, "percentage": 62.03, "elapsed_time": "0:05:10", "remaining_time": "0:03:10"} +{"current_steps": 148, "total_steps": 237, "loss": 0.6584663391113281, "lr": 3.7951614877169285e-06, "epoch": 1.8789808917197452, "percentage": 62.45, "elapsed_time": "0:05:13", "remaining_time": "0:03:08"} +{"current_steps": 149, "total_steps": 237, "loss": 0.5132451057434082, "lr": 3.7237218970642624e-06, "epoch": 1.89171974522293, "percentage": 62.87, "elapsed_time": "0:05:15", "remaining_time": "0:03:06"} +{"current_steps": 150, "total_steps": 237, "loss": 0.5699691772460938, "lr": 3.6525599438210956e-06, "epoch": 1.9044585987261147, "percentage": 63.29, "elapsed_time": "0:05:17", "remaining_time": "0:03:04"} +{"current_steps": 151, "total_steps": 237, "loss": 0.6117175817489624, "lr": 3.5816911083285165e-06, "epoch": 1.9171974522292994, "percentage": 63.71, "elapsed_time": "0:05:19", "remaining_time": "0:03:01"} +{"current_steps": 152, "total_steps": 237, "loss": 0.48447686433792114, "lr": 3.511130807163724e-06, "epoch": 1.929936305732484, "percentage": 64.14, "elapsed_time": "0:05:21", "remaining_time": "0:02:59"} +{"current_steps": 153, "total_steps": 237, "loss": 0.5775331854820251, "lr": 3.440894389786352e-06, "epoch": 1.9426751592356688, "percentage": 64.56, "elapsed_time": "0:05:23", "remaining_time": "0:02:57"} +{"current_steps": 154, "total_steps": 237, "loss": 0.6822047829627991, "lr": 3.370997135199413e-06, "epoch": 1.9554140127388535, "percentage": 64.98, "elapsed_time": "0:05:25", "remaining_time": "0:02:55"} +{"current_steps": 155, "total_steps": 237, "loss": 0.620025098323822, "lr": 3.3014542486255365e-06, "epoch": 1.9681528662420382, "percentage": 65.4, "elapsed_time": "0:05:27", "remaining_time": "0:02:53"} +{"current_steps": 156, "total_steps": 237, "loss": 0.6051990985870361, "lr": 3.2322808581992825e-06, "epoch": 1.980891719745223, "percentage": 65.82, "elapsed_time": "0:05:29", "remaining_time": "0:02:51"} +{"current_steps": 157, "total_steps": 237, "loss": 0.5013089776039124, "lr": 3.1634920116762175e-06, "epoch": 1.9936305732484076, "percentage": 66.24, "elapsed_time": "0:05:31", "remaining_time": "0:02:49"} +{"current_steps": 158, "total_steps": 237, "loss": 0.41039198637008667, "lr": 3.0951026731594634e-06, "epoch": 2.0, "percentage": 66.67, "elapsed_time": "0:05:32", "remaining_time": "0:02:46"} +{"current_steps": 159, "total_steps": 237, "loss": 0.14488917589187622, "lr": 3.0271277198444737e-06, "epoch": 2.0127388535031847, "percentage": 67.09, "elapsed_time": "0:05:35", "remaining_time": "0:02:44"} +{"current_steps": 160, "total_steps": 237, "loss": 0.17139403522014618, "lr": 2.9595819387826753e-06, "epoch": 2.0254777070063694, "percentage": 67.51, "elapsed_time": "0:05:37", "remaining_time": "0:02:42"} +{"current_steps": 161, "total_steps": 237, "loss": 0.13938947021961212, "lr": 2.89248002366476e-06, "epoch": 2.038216560509554, "percentage": 67.93, "elapsed_time": "0:05:39", "remaining_time": "0:02:40"} +{"current_steps": 162, "total_steps": 237, "loss": 0.19142913818359375, "lr": 2.8258365716242543e-06, "epoch": 2.050955414012739, "percentage": 68.35, "elapsed_time": "0:05:41", "remaining_time": "0:02:38"} +{"current_steps": 163, "total_steps": 237, "loss": 0.32667019963264465, "lr": 2.7596660800621076e-06, "epoch": 2.0636942675159236, "percentage": 68.78, "elapsed_time": "0:05:43", "remaining_time": "0:02:35"} +{"current_steps": 164, "total_steps": 237, "loss": 0.16923490166664124, "lr": 2.6939829434929834e-06, "epoch": 2.0764331210191083, "percentage": 69.2, "elapsed_time": "0:05:45", "remaining_time": "0:02:33"} +{"current_steps": 165, "total_steps": 237, "loss": 0.16544359922409058, "lr": 2.6288014504139104e-06, "epoch": 2.089171974522293, "percentage": 69.62, "elapsed_time": "0:05:47", "remaining_time": "0:02:31"} +{"current_steps": 166, "total_steps": 237, "loss": 0.13166563212871552, "lr": 2.5641357801960186e-06, "epoch": 2.1019108280254777, "percentage": 70.04, "elapsed_time": "0:05:49", "remaining_time": "0:02:29"} +{"current_steps": 167, "total_steps": 237, "loss": 0.1502484679222107, "lr": 2.5000000000000015e-06, "epoch": 2.1146496815286624, "percentage": 70.46, "elapsed_time": "0:05:51", "remaining_time": "0:02:27"} +{"current_steps": 168, "total_steps": 237, "loss": 0.12001603841781616, "lr": 2.4364080617159885e-06, "epoch": 2.127388535031847, "percentage": 70.89, "elapsed_time": "0:05:53", "remaining_time": "0:02:25"} +{"current_steps": 169, "total_steps": 237, "loss": 0.16388744115829468, "lr": 2.373373798928507e-06, "epoch": 2.140127388535032, "percentage": 71.31, "elapsed_time": "0:05:56", "remaining_time": "0:02:23"} +{"current_steps": 170, "total_steps": 237, "loss": 0.17085227370262146, "lr": 2.310910923907149e-06, "epoch": 2.1528662420382165, "percentage": 71.73, "elapsed_time": "0:05:58", "remaining_time": "0:02:21"} +{"current_steps": 171, "total_steps": 237, "loss": 0.1649709939956665, "lr": 2.249033024623672e-06, "epoch": 2.1656050955414012, "percentage": 72.15, "elapsed_time": "0:06:00", "remaining_time": "0:02:19"} +{"current_steps": 172, "total_steps": 237, "loss": 0.13725437223911285, "lr": 2.187753561796097e-06, "epoch": 2.178343949044586, "percentage": 72.57, "elapsed_time": "0:06:02", "remaining_time": "0:02:16"} +{"current_steps": 173, "total_steps": 237, "loss": 0.14223095774650574, "lr": 2.127085865960516e-06, "epoch": 2.1910828025477707, "percentage": 73.0, "elapsed_time": "0:06:04", "remaining_time": "0:02:14"} +{"current_steps": 174, "total_steps": 237, "loss": 0.1432873010635376, "lr": 2.0670431345712092e-06, "epoch": 2.2038216560509554, "percentage": 73.42, "elapsed_time": "0:06:06", "remaining_time": "0:02:12"} +{"current_steps": 175, "total_steps": 237, "loss": 0.1355983018875122, "lr": 2.0076384291297134e-06, "epoch": 2.21656050955414, "percentage": 73.84, "elapsed_time": "0:06:08", "remaining_time": "0:02:10"} +{"current_steps": 176, "total_steps": 237, "loss": 0.13247933983802795, "lr": 1.9488846723434646e-06, "epoch": 2.229299363057325, "percentage": 74.26, "elapsed_time": "0:06:10", "remaining_time": "0:02:08"} +{"current_steps": 177, "total_steps": 237, "loss": 0.1308836191892624, "lr": 1.890794645314633e-06, "epoch": 2.2420382165605095, "percentage": 74.68, "elapsed_time": "0:06:13", "remaining_time": "0:02:06"} +{"current_steps": 178, "total_steps": 237, "loss": 0.15963426232337952, "lr": 1.8333809847597644e-06, "epoch": 2.254777070063694, "percentage": 75.11, "elapsed_time": "0:06:15", "remaining_time": "0:02:04"} +{"current_steps": 179, "total_steps": 237, "loss": 0.14805136620998383, "lr": 1.7766561802608374e-06, "epoch": 2.267515923566879, "percentage": 75.53, "elapsed_time": "0:06:17", "remaining_time": "0:02:02"} +{"current_steps": 180, "total_steps": 237, "loss": 0.12024472653865814, "lr": 1.7206325715483003e-06, "epoch": 2.2802547770700636, "percentage": 75.95, "elapsed_time": "0:06:19", "remaining_time": "0:02:00"} +{"current_steps": 181, "total_steps": 237, "loss": 0.11454702913761139, "lr": 1.665322345816746e-06, "epoch": 2.2929936305732483, "percentage": 76.37, "elapsed_time": "0:06:21", "remaining_time": "0:01:58"} +{"current_steps": 182, "total_steps": 237, "loss": 0.10992666333913803, "lr": 1.6107375350737437e-06, "epoch": 2.305732484076433, "percentage": 76.79, "elapsed_time": "0:06:23", "remaining_time": "0:01:55"} +{"current_steps": 183, "total_steps": 237, "loss": 0.09631110727787018, "lr": 1.556890013522428e-06, "epoch": 2.3184713375796178, "percentage": 77.22, "elapsed_time": "0:06:25", "remaining_time": "0:01:53"} +{"current_steps": 184, "total_steps": 237, "loss": 0.14856451749801636, "lr": 1.50379149497843e-06, "epoch": 2.3312101910828025, "percentage": 77.64, "elapsed_time": "0:06:27", "remaining_time": "0:01:51"} +{"current_steps": 185, "total_steps": 237, "loss": 0.09073778241872787, "lr": 1.4514535303216893e-06, "epoch": 2.343949044585987, "percentage": 78.06, "elapsed_time": "0:06:29", "remaining_time": "0:01:49"} +{"current_steps": 186, "total_steps": 237, "loss": 0.10596369206905365, "lr": 1.3998875049837141e-06, "epoch": 2.356687898089172, "percentage": 78.48, "elapsed_time": "0:06:31", "remaining_time": "0:01:47"} +{"current_steps": 187, "total_steps": 237, "loss": 0.1488298773765564, "lr": 1.3491046364708294e-06, "epoch": 2.3694267515923566, "percentage": 78.9, "elapsed_time": "0:06:34", "remaining_time": "0:01:45"} +{"current_steps": 188, "total_steps": 237, "loss": 0.13143031299114227, "lr": 1.2991159719239581e-06, "epoch": 2.3821656050955413, "percentage": 79.32, "elapsed_time": "0:06:36", "remaining_time": "0:01:43"} +{"current_steps": 189, "total_steps": 237, "loss": 0.12935219705104828, "lr": 1.249932385715467e-06, "epoch": 2.394904458598726, "percentage": 79.75, "elapsed_time": "0:06:38", "remaining_time": "0:01:41"} +{"current_steps": 190, "total_steps": 237, "loss": 0.10895463824272156, "lr": 1.2015645770835765e-06, "epoch": 2.4076433121019107, "percentage": 80.17, "elapsed_time": "0:06:40", "remaining_time": "0:01:39"} +{"current_steps": 191, "total_steps": 237, "loss": 0.11770664900541306, "lr": 1.1540230678048969e-06, "epoch": 2.4203821656050954, "percentage": 80.59, "elapsed_time": "0:06:42", "remaining_time": "0:01:36"} +{"current_steps": 192, "total_steps": 237, "loss": 0.12943175435066223, "lr": 1.1073181999055538e-06, "epoch": 2.43312101910828, "percentage": 81.01, "elapsed_time": "0:06:44", "remaining_time": "0:01:34"} +{"current_steps": 193, "total_steps": 237, "loss": 0.15990746021270752, "lr": 1.0614601334114099e-06, "epoch": 2.445859872611465, "percentage": 81.43, "elapsed_time": "0:06:46", "remaining_time": "0:01:32"} +{"current_steps": 194, "total_steps": 237, "loss": 0.0967484638094902, "lr": 1.016458844137887e-06, "epoch": 2.4585987261146496, "percentage": 81.86, "elapsed_time": "0:06:48", "remaining_time": "0:01:30"} +{"current_steps": 195, "total_steps": 237, "loss": 0.09274256229400635, "lr": 9.723241215198692e-07, "epoch": 2.4713375796178343, "percentage": 82.28, "elapsed_time": "0:06:50", "remaining_time": "0:01:28"} +{"current_steps": 196, "total_steps": 237, "loss": 0.12071307003498077, "lr": 9.290655664821296e-07, "epoch": 2.484076433121019, "percentage": 82.7, "elapsed_time": "0:06:52", "remaining_time": "0:01:26"} +{"current_steps": 197, "total_steps": 237, "loss": 0.14337831735610962, "lr": 8.866925893507805e-07, "epoch": 2.4968152866242037, "percentage": 83.12, "elapsed_time": "0:06:54", "remaining_time": "0:01:24"} +{"current_steps": 198, "total_steps": 237, "loss": 0.1311374008655548, "lr": 8.45214407806182e-07, "epoch": 2.5095541401273884, "percentage": 83.54, "elapsed_time": "0:06:56", "remaining_time": "0:01:22"} +{"current_steps": 199, "total_steps": 237, "loss": 0.12355434894561768, "lr": 8.046400448777575e-07, "epoch": 2.522292993630573, "percentage": 83.97, "elapsed_time": "0:06:58", "remaining_time": "0:01:19"} +{"current_steps": 200, "total_steps": 237, "loss": 0.11268627643585205, "lr": 7.649783269811523e-07, "epoch": 2.535031847133758, "percentage": 84.39, "elapsed_time": "0:07:01", "remaining_time": "0:01:17"} +{"current_steps": 201, "total_steps": 237, "loss": 0.1278030276298523, "lr": 7.26237881998163e-07, "epoch": 2.5477707006369426, "percentage": 84.81, "elapsed_time": "0:07:03", "remaining_time": "0:01:15"} +{"current_steps": 202, "total_steps": 237, "loss": 0.11588963866233826, "lr": 6.884271373998608e-07, "epoch": 2.5605095541401273, "percentage": 85.23, "elapsed_time": "0:07:05", "remaining_time": "0:01:13"} +{"current_steps": 203, "total_steps": 237, "loss": 0.11168617010116577, "lr": 6.515543184133e-07, "epoch": 2.573248407643312, "percentage": 85.65, "elapsed_time": "0:07:07", "remaining_time": "0:01:11"} +{"current_steps": 204, "total_steps": 237, "loss": 0.14677459001541138, "lr": 6.156274462322292e-07, "epoch": 2.5859872611464967, "percentage": 86.08, "elapsed_time": "0:07:09", "remaining_time": "0:01:09"} +{"current_steps": 205, "total_steps": 237, "loss": 0.1080314964056015, "lr": 5.806543362721945e-07, "epoch": 2.5987261146496814, "percentage": 86.5, "elapsed_time": "0:07:11", "remaining_time": "0:01:07"} +{"current_steps": 206, "total_steps": 237, "loss": 0.10917598009109497, "lr": 5.466425964703914e-07, "epoch": 2.611464968152866, "percentage": 86.92, "elapsed_time": "0:07:13", "remaining_time": "0:01:05"} +{"current_steps": 207, "total_steps": 237, "loss": 0.10850804299116135, "lr": 5.135996256306619e-07, "epoch": 2.624203821656051, "percentage": 87.34, "elapsed_time": "0:07:15", "remaining_time": "0:01:03"} +{"current_steps": 208, "total_steps": 237, "loss": 0.23395496606826782, "lr": 4.815326118139813e-07, "epoch": 2.6369426751592355, "percentage": 87.76, "elapsed_time": "0:07:17", "remaining_time": "0:01:01"} +{"current_steps": 209, "total_steps": 237, "loss": 0.09678040444850922, "lr": 4.5044853077479134e-07, "epoch": 2.6496815286624202, "percentage": 88.19, "elapsed_time": "0:07:19", "remaining_time": "0:00:58"} +{"current_steps": 210, "total_steps": 237, "loss": 0.09082137048244476, "lr": 4.203541444435211e-07, "epoch": 2.662420382165605, "percentage": 88.61, "elapsed_time": "0:07:21", "remaining_time": "0:00:56"} +{"current_steps": 211, "total_steps": 237, "loss": 0.12093393504619598, "lr": 3.9125599945560866e-07, "epoch": 2.6751592356687897, "percentage": 89.03, "elapsed_time": "0:07:23", "remaining_time": "0:00:54"} +{"current_steps": 212, "total_steps": 237, "loss": 0.12841008603572845, "lr": 3.631604257273774e-07, "epoch": 2.6878980891719744, "percentage": 89.45, "elapsed_time": "0:07:25", "remaining_time": "0:00:52"} +{"current_steps": 213, "total_steps": 237, "loss": 0.1454203575849533, "lr": 3.360735350790428e-07, "epoch": 2.700636942675159, "percentage": 89.87, "elapsed_time": "0:07:27", "remaining_time": "0:00:50"} +{"current_steps": 214, "total_steps": 237, "loss": 0.12103286385536194, "lr": 3.100012199051627e-07, "epoch": 2.713375796178344, "percentage": 90.3, "elapsed_time": "0:07:30", "remaining_time": "0:00:48"} +{"current_steps": 215, "total_steps": 237, "loss": 0.13519585132598877, "lr": 2.8494915189283325e-07, "epoch": 2.7261146496815285, "percentage": 90.72, "elapsed_time": "0:07:32", "remaining_time": "0:00:46"} +{"current_steps": 216, "total_steps": 237, "loss": 0.14792990684509277, "lr": 2.6092278078788004e-07, "epoch": 2.738853503184713, "percentage": 91.14, "elapsed_time": "0:07:34", "remaining_time": "0:00:44"} +{"current_steps": 217, "total_steps": 237, "loss": 0.1573294997215271, "lr": 2.3792733320934348e-07, "epoch": 2.7515923566878984, "percentage": 91.56, "elapsed_time": "0:07:36", "remaining_time": "0:00:42"} +{"current_steps": 218, "total_steps": 237, "loss": 0.15241427719593048, "lr": 2.1596781151249524e-07, "epoch": 2.7643312101910826, "percentage": 91.98, "elapsed_time": "0:07:38", "remaining_time": "0:00:39"} +{"current_steps": 219, "total_steps": 237, "loss": 0.11122366786003113, "lr": 1.9504899270064105e-07, "epoch": 2.777070063694268, "percentage": 92.41, "elapsed_time": "0:07:40", "remaining_time": "0:00:37"} +{"current_steps": 220, "total_steps": 237, "loss": 0.11351308226585388, "lr": 1.7517542738595071e-07, "epoch": 2.789808917197452, "percentage": 92.83, "elapsed_time": "0:07:42", "remaining_time": "0:00:35"} +{"current_steps": 221, "total_steps": 237, "loss": 0.1188071146607399, "lr": 1.5635143879952575e-07, "epoch": 2.802547770700637, "percentage": 93.25, "elapsed_time": "0:07:44", "remaining_time": "0:00:33"} +{"current_steps": 222, "total_steps": 237, "loss": 0.1164408028125763, "lr": 1.3858112185094418e-07, "epoch": 2.8152866242038215, "percentage": 93.67, "elapsed_time": "0:07:46", "remaining_time": "0:00:31"} +{"current_steps": 223, "total_steps": 237, "loss": 0.12760576605796814, "lr": 1.2186834223746612e-07, "epoch": 2.8280254777070066, "percentage": 94.09, "elapsed_time": "0:07:49", "remaining_time": "0:00:29"} +{"current_steps": 224, "total_steps": 237, "loss": 0.11487654596567154, "lr": 1.0621673560309798e-07, "epoch": 2.840764331210191, "percentage": 94.51, "elapsed_time": "0:07:51", "remaining_time": "0:00:27"} +{"current_steps": 225, "total_steps": 237, "loss": 0.11246581375598907, "lr": 9.162970674771177e-08, "epoch": 2.853503184713376, "percentage": 94.94, "elapsed_time": "0:07:53", "remaining_time": "0:00:25"} +{"current_steps": 226, "total_steps": 237, "loss": 0.09955516457557678, "lr": 7.81104288863721e-08, "epoch": 2.8662420382165603, "percentage": 95.36, "elapsed_time": "0:07:55", "remaining_time": "0:00:23"} +{"current_steps": 227, "total_steps": 237, "loss": 0.12330685555934906, "lr": 6.566184295904777e-08, "epoch": 2.8789808917197455, "percentage": 95.78, "elapsed_time": "0:07:57", "remaining_time": "0:00:21"} +{"current_steps": 228, "total_steps": 237, "loss": 0.12849846482276917, "lr": 5.4286656990847897e-08, "epoch": 2.8917197452229297, "percentage": 96.2, "elapsed_time": "0:07:59", "remaining_time": "0:00:18"} +{"current_steps": 229, "total_steps": 237, "loss": 0.11019767820835114, "lr": 4.398734550292716e-08, "epoch": 2.904458598726115, "percentage": 96.62, "elapsed_time": "0:08:01", "remaining_time": "0:00:16"} +{"current_steps": 230, "total_steps": 237, "loss": 0.10802481323480606, "lr": 3.476614897418573e-08, "epoch": 2.917197452229299, "percentage": 97.05, "elapsed_time": "0:08:03", "remaining_time": "0:00:14"} +{"current_steps": 231, "total_steps": 237, "loss": 0.11602732539176941, "lr": 2.6625073353884756e-08, "epoch": 2.9299363057324843, "percentage": 97.47, "elapsed_time": "0:08:06", "remaining_time": "0:00:12"} +{"current_steps": 232, "total_steps": 237, "loss": 0.12483286112546921, "lr": 1.9565889625275945e-08, "epoch": 2.9426751592356686, "percentage": 97.89, "elapsed_time": "0:08:08", "remaining_time": "0:00:10"} +{"current_steps": 233, "total_steps": 237, "loss": 0.10575878620147705, "lr": 1.3590133420350315e-08, "epoch": 2.9554140127388537, "percentage": 98.31, "elapsed_time": "0:08:10", "remaining_time": "0:00:08"} +{"current_steps": 234, "total_steps": 237, "loss": 0.14583438634872437, "lr": 8.699104685779835e-09, "epoch": 2.968152866242038, "percentage": 98.73, "elapsed_time": "0:08:12", "remaining_time": "0:00:06"} +{"current_steps": 235, "total_steps": 237, "loss": 0.12648674845695496, "lr": 4.89386740013198e-09, "epoch": 2.980891719745223, "percentage": 99.16, "elapsed_time": "0:08:14", "remaining_time": "0:00:04"} +{"current_steps": 236, "total_steps": 237, "loss": 0.1414915770292282, "lr": 2.1752493424148647e-09, "epoch": 2.9936305732484074, "percentage": 99.58, "elapsed_time": "0:08:16", "remaining_time": "0:00:02"} +{"current_steps": 237, "total_steps": 237, "loss": 0.0598013773560524, "lr": 5.438419120062933e-10, "epoch": 3.0, "percentage": 100.0, "elapsed_time": "0:08:17", "remaining_time": "0:00:00"} +{"current_steps": 237, "total_steps": 237, "epoch": 3.0, "percentage": 100.0, "elapsed_time": "0:09:32", "remaining_time": "0:00:00"} diff --git a/trainer_state.json b/trainer_state.json new file mode 100644 index 0000000..6c28e4f --- /dev/null +++ b/trainer_state.json @@ -0,0 +1,1702 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 3.0, + "eval_steps": 500, + "global_step": 237, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.012738853503184714, + "grad_norm": 27.578015588062595, + "learning_rate": 0.0, + "loss": 2.2484302520751953, + "step": 1 + }, + { + "epoch": 0.025477707006369428, + "grad_norm": 27.351013009442298, + "learning_rate": 4.1666666666666667e-07, + "loss": 2.215416193008423, + "step": 2 + }, + { + "epoch": 0.03821656050955414, + "grad_norm": 29.00648618644296, + "learning_rate": 8.333333333333333e-07, + "loss": 2.1969661712646484, + "step": 3 + }, + { + "epoch": 0.050955414012738856, + "grad_norm": 24.279613305984917, + "learning_rate": 1.25e-06, + "loss": 2.1304638385772705, + "step": 4 + }, + { + "epoch": 0.06369426751592357, + "grad_norm": 24.548939602077972, + "learning_rate": 1.6666666666666667e-06, + "loss": 2.321625232696533, + "step": 5 + }, + { + "epoch": 0.07643312101910828, + "grad_norm": 25.979670653733457, + "learning_rate": 2.0833333333333334e-06, + "loss": 2.1303162574768066, + "step": 6 + }, + { + "epoch": 0.08917197452229299, + "grad_norm": 18.68650885616665, + "learning_rate": 2.5e-06, + "loss": 1.8076802492141724, + "step": 7 + }, + { + "epoch": 0.10191082802547771, + "grad_norm": 16.7812576898942, + "learning_rate": 2.916666666666667e-06, + "loss": 1.7900886535644531, + "step": 8 + }, + { + "epoch": 0.11464968152866242, + "grad_norm": 14.18738912625846, + "learning_rate": 3.3333333333333333e-06, + "loss": 1.7915903329849243, + "step": 9 + }, + { + "epoch": 0.12738853503184713, + "grad_norm": 14.116799195872652, + "learning_rate": 3.7500000000000005e-06, + "loss": 1.8171511888504028, + "step": 10 + }, + { + "epoch": 0.14012738853503184, + "grad_norm": 10.442018372124066, + "learning_rate": 4.166666666666667e-06, + "loss": 1.7455570697784424, + "step": 11 + }, + { + "epoch": 0.15286624203821655, + "grad_norm": 9.192045468171578, + "learning_rate": 4.583333333333333e-06, + "loss": 1.6337864398956299, + "step": 12 + }, + { + "epoch": 0.16560509554140126, + "grad_norm": 8.260571303853565, + "learning_rate": 5e-06, + "loss": 1.778015375137329, + "step": 13 + }, + { + "epoch": 0.17834394904458598, + "grad_norm": 6.460613846297531, + "learning_rate": 5.416666666666667e-06, + "loss": 1.712306022644043, + "step": 14 + }, + { + "epoch": 0.1910828025477707, + "grad_norm": 6.843351540555302, + "learning_rate": 5.833333333333334e-06, + "loss": 1.5768513679504395, + "step": 15 + }, + { + "epoch": 0.20382165605095542, + "grad_norm": 6.133058262409406, + "learning_rate": 6.25e-06, + "loss": 1.5611257553100586, + "step": 16 + }, + { + "epoch": 0.21656050955414013, + "grad_norm": 5.180005658869054, + "learning_rate": 6.666666666666667e-06, + "loss": 1.5967652797698975, + "step": 17 + }, + { + "epoch": 0.22929936305732485, + "grad_norm": 5.305167134267678, + "learning_rate": 7.083333333333335e-06, + "loss": 1.364829659461975, + "step": 18 + }, + { + "epoch": 0.24203821656050956, + "grad_norm": 5.355870721587038, + "learning_rate": 7.500000000000001e-06, + "loss": 1.6430319547653198, + "step": 19 + }, + { + "epoch": 0.25477707006369427, + "grad_norm": 5.0292720888884075, + "learning_rate": 7.916666666666667e-06, + "loss": 1.5467270612716675, + "step": 20 + }, + { + "epoch": 0.267515923566879, + "grad_norm": 4.948954166107489, + "learning_rate": 8.333333333333334e-06, + "loss": 1.5446631908416748, + "step": 21 + }, + { + "epoch": 0.2802547770700637, + "grad_norm": 4.859436510097199, + "learning_rate": 8.750000000000001e-06, + "loss": 1.4481780529022217, + "step": 22 + }, + { + "epoch": 0.2929936305732484, + "grad_norm": 5.3831221005725896, + "learning_rate": 9.166666666666666e-06, + "loss": 1.4933228492736816, + "step": 23 + }, + { + "epoch": 0.3057324840764331, + "grad_norm": 4.473608276014855, + "learning_rate": 9.583333333333335e-06, + "loss": 1.5807710886001587, + "step": 24 + }, + { + "epoch": 0.3184713375796178, + "grad_norm": 4.109425789809634, + "learning_rate": 1e-05, + "loss": 1.349104404449463, + "step": 25 + }, + { + "epoch": 0.33121019108280253, + "grad_norm": 4.634192438556494, + "learning_rate": 9.999456158087994e-06, + "loss": 1.4354019165039062, + "step": 26 + }, + { + "epoch": 0.34394904458598724, + "grad_norm": 5.0726515873395, + "learning_rate": 9.997824750657586e-06, + "loss": 1.566201090812683, + "step": 27 + }, + { + "epoch": 0.35668789808917195, + "grad_norm": 4.8805336123469205, + "learning_rate": 9.995106132599869e-06, + "loss": 1.411285161972046, + "step": 28 + }, + { + "epoch": 0.36942675159235666, + "grad_norm": 4.859867988307453, + "learning_rate": 9.99130089531422e-06, + "loss": 1.2867789268493652, + "step": 29 + }, + { + "epoch": 0.3821656050955414, + "grad_norm": 4.432994127396081, + "learning_rate": 9.98640986657965e-06, + "loss": 1.5999436378479004, + "step": 30 + }, + { + "epoch": 0.39490445859872614, + "grad_norm": 4.40223600447386, + "learning_rate": 9.980434110374725e-06, + "loss": 1.4318150281906128, + "step": 31 + }, + { + "epoch": 0.40764331210191085, + "grad_norm": 4.902591623548149, + "learning_rate": 9.973374926646117e-06, + "loss": 1.607371211051941, + "step": 32 + }, + { + "epoch": 0.42038216560509556, + "grad_norm": 4.909609907293681, + "learning_rate": 9.965233851025816e-06, + "loss": 1.443784236907959, + "step": 33 + }, + { + "epoch": 0.43312101910828027, + "grad_norm": 4.456375484305202, + "learning_rate": 9.956012654497073e-06, + "loss": 1.570559024810791, + "step": 34 + }, + { + "epoch": 0.445859872611465, + "grad_norm": 4.599861555148005, + "learning_rate": 9.945713343009154e-06, + "loss": 1.548865556716919, + "step": 35 + }, + { + "epoch": 0.4585987261146497, + "grad_norm": 4.315411715741126, + "learning_rate": 9.934338157040953e-06, + "loss": 1.4340442419052124, + "step": 36 + }, + { + "epoch": 0.4713375796178344, + "grad_norm": 4.598194925817704, + "learning_rate": 9.921889571113629e-06, + "loss": 1.5494410991668701, + "step": 37 + }, + { + "epoch": 0.4840764331210191, + "grad_norm": 4.243095090396253, + "learning_rate": 9.90837029325229e-06, + "loss": 1.4130847454071045, + "step": 38 + }, + { + "epoch": 0.4968152866242038, + "grad_norm": 4.980649623484297, + "learning_rate": 9.893783264396903e-06, + "loss": 1.4265036582946777, + "step": 39 + }, + { + "epoch": 0.5095541401273885, + "grad_norm": 5.789533896785179, + "learning_rate": 9.878131657762535e-06, + "loss": 1.4373618364334106, + "step": 40 + }, + { + "epoch": 0.5222929936305732, + "grad_norm": 4.658455364896436, + "learning_rate": 9.861418878149056e-06, + "loss": 1.4085681438446045, + "step": 41 + }, + { + "epoch": 0.535031847133758, + "grad_norm": 5.324025858102516, + "learning_rate": 9.843648561200476e-06, + "loss": 1.452268123626709, + "step": 42 + }, + { + "epoch": 0.5477707006369427, + "grad_norm": 4.692010278942193, + "learning_rate": 9.82482457261405e-06, + "loss": 1.6110832691192627, + "step": 43 + }, + { + "epoch": 0.5605095541401274, + "grad_norm": 4.64177391127698, + "learning_rate": 9.80495100729936e-06, + "loss": 1.4959537982940674, + "step": 44 + }, + { + "epoch": 0.5732484076433121, + "grad_norm": 4.196745696549577, + "learning_rate": 9.784032188487507e-06, + "loss": 1.4369564056396484, + "step": 45 + }, + { + "epoch": 0.5859872611464968, + "grad_norm": 4.846501722779206, + "learning_rate": 9.762072666790658e-06, + "loss": 1.5659615993499756, + "step": 46 + }, + { + "epoch": 0.5987261146496815, + "grad_norm": 4.475752207148854, + "learning_rate": 9.73907721921212e-06, + "loss": 1.6203088760375977, + "step": 47 + }, + { + "epoch": 0.6114649681528662, + "grad_norm": 5.622443868901302, + "learning_rate": 9.715050848107167e-06, + "loss": 1.4394254684448242, + "step": 48 + }, + { + "epoch": 0.6242038216560509, + "grad_norm": 5.282103052112391, + "learning_rate": 9.689998780094839e-06, + "loss": 1.3903216123580933, + "step": 49 + }, + { + "epoch": 0.6369426751592356, + "grad_norm": 4.399414503844804, + "learning_rate": 9.663926464920959e-06, + "loss": 1.4829354286193848, + "step": 50 + }, + { + "epoch": 0.6496815286624203, + "grad_norm": 4.523444723393011, + "learning_rate": 9.636839574272623e-06, + "loss": 1.5274395942687988, + "step": 51 + }, + { + "epoch": 0.6624203821656051, + "grad_norm": 4.232538410435911, + "learning_rate": 9.608744000544392e-06, + "loss": 1.4694490432739258, + "step": 52 + }, + { + "epoch": 0.6751592356687898, + "grad_norm": 4.354507161235457, + "learning_rate": 9.579645855556481e-06, + "loss": 1.2353503704071045, + "step": 53 + }, + { + "epoch": 0.6878980891719745, + "grad_norm": 4.9180556110442595, + "learning_rate": 9.54955146922521e-06, + "loss": 1.4008901119232178, + "step": 54 + }, + { + "epoch": 0.7006369426751592, + "grad_norm": 4.970650879718249, + "learning_rate": 9.51846738818602e-06, + "loss": 1.3539741039276123, + "step": 55 + }, + { + "epoch": 0.7133757961783439, + "grad_norm": 4.837000418043291, + "learning_rate": 9.48640037436934e-06, + "loss": 1.3163714408874512, + "step": 56 + }, + { + "epoch": 0.7261146496815286, + "grad_norm": 3.9679039776262064, + "learning_rate": 9.453357403529609e-06, + "loss": 1.3809059858322144, + "step": 57 + }, + { + "epoch": 0.7388535031847133, + "grad_norm": 4.462452198138723, + "learning_rate": 9.419345663727805e-06, + "loss": 1.458146572113037, + "step": 58 + }, + { + "epoch": 0.7515923566878981, + "grad_norm": 4.402982081383035, + "learning_rate": 9.38437255376777e-06, + "loss": 1.329193115234375, + "step": 59 + }, + { + "epoch": 0.7643312101910829, + "grad_norm": 4.377947135685406, + "learning_rate": 9.348445681586703e-06, + "loss": 1.4500741958618164, + "step": 60 + }, + { + "epoch": 0.7770700636942676, + "grad_norm": 4.198026205959271, + "learning_rate": 9.31157286260014e-06, + "loss": 1.4562097787857056, + "step": 61 + }, + { + "epoch": 0.7898089171974523, + "grad_norm": 4.945285199299728, + "learning_rate": 9.273762118001837e-06, + "loss": 1.3661162853240967, + "step": 62 + }, + { + "epoch": 0.802547770700637, + "grad_norm": 3.9573822911468266, + "learning_rate": 9.235021673018849e-06, + "loss": 1.3168445825576782, + "step": 63 + }, + { + "epoch": 0.8152866242038217, + "grad_norm": 4.566194937738093, + "learning_rate": 9.195359955122244e-06, + "loss": 1.3281530141830444, + "step": 64 + }, + { + "epoch": 0.8280254777070064, + "grad_norm": 4.657547711627972, + "learning_rate": 9.15478559219382e-06, + "loss": 1.3520253896713257, + "step": 65 + }, + { + "epoch": 0.8407643312101911, + "grad_norm": 4.931346013168586, + "learning_rate": 9.113307410649222e-06, + "loss": 1.4982115030288696, + "step": 66 + }, + { + "epoch": 0.8535031847133758, + "grad_norm": 4.205958958323587, + "learning_rate": 9.070934433517872e-06, + "loss": 1.402880311012268, + "step": 67 + }, + { + "epoch": 0.8662420382165605, + "grad_norm": 4.265262844911349, + "learning_rate": 9.027675878480131e-06, + "loss": 1.4359843730926514, + "step": 68 + }, + { + "epoch": 0.8789808917197452, + "grad_norm": 4.174081063602911, + "learning_rate": 8.983541155862114e-06, + "loss": 1.4095585346221924, + "step": 69 + }, + { + "epoch": 0.89171974522293, + "grad_norm": 4.300497916224527, + "learning_rate": 8.938539866588593e-06, + "loss": 1.3254384994506836, + "step": 70 + }, + { + "epoch": 0.9044585987261147, + "grad_norm": 4.321856820928802, + "learning_rate": 8.892681800094447e-06, + "loss": 1.3895121812820435, + "step": 71 + }, + { + "epoch": 0.9171974522292994, + "grad_norm": 4.353418264893862, + "learning_rate": 8.845976932195104e-06, + "loss": 1.5136423110961914, + "step": 72 + }, + { + "epoch": 0.9299363057324841, + "grad_norm": 4.509525726327509, + "learning_rate": 8.798435422916425e-06, + "loss": 1.560758352279663, + "step": 73 + }, + { + "epoch": 0.9426751592356688, + "grad_norm": 4.14002546470194, + "learning_rate": 8.750067614284534e-06, + "loss": 1.2931057214736938, + "step": 74 + }, + { + "epoch": 0.9554140127388535, + "grad_norm": 4.1809249012997345, + "learning_rate": 8.700884028076042e-06, + "loss": 1.5124843120574951, + "step": 75 + }, + { + "epoch": 0.9681528662420382, + "grad_norm": 4.293885955875831, + "learning_rate": 8.650895363529172e-06, + "loss": 1.317713737487793, + "step": 76 + }, + { + "epoch": 0.9808917197452229, + "grad_norm": 4.631688789038691, + "learning_rate": 8.600112495016289e-06, + "loss": 1.3039919137954712, + "step": 77 + }, + { + "epoch": 0.9936305732484076, + "grad_norm": 4.27759477922895, + "learning_rate": 8.548546469678311e-06, + "loss": 1.495795488357544, + "step": 78 + }, + { + "epoch": 1.0, + "grad_norm": 5.956623944392664, + "learning_rate": 8.496208505021572e-06, + "loss": 1.429541826248169, + "step": 79 + }, + { + "epoch": 1.0127388535031847, + "grad_norm": 4.45796601634621, + "learning_rate": 8.443109986477574e-06, + "loss": 0.8995598554611206, + "step": 80 + }, + { + "epoch": 1.0254777070063694, + "grad_norm": 4.500661347515663, + "learning_rate": 8.389262464926256e-06, + "loss": 0.63990318775177, + "step": 81 + }, + { + "epoch": 1.0382165605095541, + "grad_norm": 3.8748063820496257, + "learning_rate": 8.334677654183254e-06, + "loss": 0.6055729985237122, + "step": 82 + }, + { + "epoch": 1.0509554140127388, + "grad_norm": 3.4628880356772096, + "learning_rate": 8.279367428451703e-06, + "loss": 0.7356538772583008, + "step": 83 + }, + { + "epoch": 1.0636942675159236, + "grad_norm": 3.6516848444686265, + "learning_rate": 8.223343819739164e-06, + "loss": 0.692323625087738, + "step": 84 + }, + { + "epoch": 1.0764331210191083, + "grad_norm": 3.968197707946131, + "learning_rate": 8.166619015240236e-06, + "loss": 0.6772887706756592, + "step": 85 + }, + { + "epoch": 1.089171974522293, + "grad_norm": 3.845941294941666, + "learning_rate": 8.109205354685367e-06, + "loss": 0.5514630675315857, + "step": 86 + }, + { + "epoch": 1.1019108280254777, + "grad_norm": 3.774618366335066, + "learning_rate": 8.051115327656538e-06, + "loss": 0.6684471964836121, + "step": 87 + }, + { + "epoch": 1.1146496815286624, + "grad_norm": 3.7047992437252, + "learning_rate": 7.992361570870289e-06, + "loss": 0.5766518712043762, + "step": 88 + }, + { + "epoch": 1.127388535031847, + "grad_norm": 4.214676734133472, + "learning_rate": 7.932956865428792e-06, + "loss": 0.5921903848648071, + "step": 89 + }, + { + "epoch": 1.1401273885350318, + "grad_norm": 4.387324313211908, + "learning_rate": 7.872914134039485e-06, + "loss": 0.592995285987854, + "step": 90 + }, + { + "epoch": 1.1528662420382165, + "grad_norm": 4.005865244271663, + "learning_rate": 7.812246438203905e-06, + "loss": 0.5482683181762695, + "step": 91 + }, + { + "epoch": 1.1656050955414012, + "grad_norm": 4.636384134136274, + "learning_rate": 7.750966975376328e-06, + "loss": 0.6826972365379333, + "step": 92 + }, + { + "epoch": 1.178343949044586, + "grad_norm": 4.6953248944517245, + "learning_rate": 7.689089076092851e-06, + "loss": 0.5954027771949768, + "step": 93 + }, + { + "epoch": 1.1910828025477707, + "grad_norm": 4.80616798771938, + "learning_rate": 7.626626201071494e-06, + "loss": 0.6095083355903625, + "step": 94 + }, + { + "epoch": 1.2038216560509554, + "grad_norm": 4.48252749973364, + "learning_rate": 7.563591938284012e-06, + "loss": 0.709877610206604, + "step": 95 + }, + { + "epoch": 1.21656050955414, + "grad_norm": 4.596613533967055, + "learning_rate": 7.500000000000001e-06, + "loss": 0.5784502029418945, + "step": 96 + }, + { + "epoch": 1.2292993630573248, + "grad_norm": 4.76402532258561, + "learning_rate": 7.4358642198039835e-06, + "loss": 0.5837893486022949, + "step": 97 + }, + { + "epoch": 1.2420382165605095, + "grad_norm": 4.440144626730792, + "learning_rate": 7.371198549586091e-06, + "loss": 0.7246421575546265, + "step": 98 + }, + { + "epoch": 1.2547770700636942, + "grad_norm": 4.1554626239444605, + "learning_rate": 7.306017056507018e-06, + "loss": 0.5735586285591125, + "step": 99 + }, + { + "epoch": 1.267515923566879, + "grad_norm": 5.004892398076429, + "learning_rate": 7.240333919937893e-06, + "loss": 0.5463488101959229, + "step": 100 + }, + { + "epoch": 1.2802547770700636, + "grad_norm": 5.088476554254515, + "learning_rate": 7.174163428375748e-06, + "loss": 0.5633252859115601, + "step": 101 + }, + { + "epoch": 1.2929936305732483, + "grad_norm": 5.118792774795437, + "learning_rate": 7.107519976335241e-06, + "loss": 0.5037230253219604, + "step": 102 + }, + { + "epoch": 1.305732484076433, + "grad_norm": 4.75623015993911, + "learning_rate": 7.040418061217325e-06, + "loss": 0.5365867614746094, + "step": 103 + }, + { + "epoch": 1.3184713375796178, + "grad_norm": 5.157812619262671, + "learning_rate": 6.972872280155528e-06, + "loss": 0.6433064937591553, + "step": 104 + }, + { + "epoch": 1.3312101910828025, + "grad_norm": 5.148365945239476, + "learning_rate": 6.9048973268405375e-06, + "loss": 0.6543390154838562, + "step": 105 + }, + { + "epoch": 1.3439490445859872, + "grad_norm": 4.5311604864334125, + "learning_rate": 6.836507988323785e-06, + "loss": 0.6132720708847046, + "step": 106 + }, + { + "epoch": 1.356687898089172, + "grad_norm": 4.88971282799509, + "learning_rate": 6.767719141800718e-06, + "loss": 0.6079248189926147, + "step": 107 + }, + { + "epoch": 1.3694267515923566, + "grad_norm": 4.695137801905107, + "learning_rate": 6.698545751374465e-06, + "loss": 0.6232650279998779, + "step": 108 + }, + { + "epoch": 1.3821656050955413, + "grad_norm": 4.267620300562347, + "learning_rate": 6.629002864800589e-06, + "loss": 0.5911256074905396, + "step": 109 + }, + { + "epoch": 1.394904458598726, + "grad_norm": 5.30166521900121, + "learning_rate": 6.55910561021365e-06, + "loss": 0.6404790282249451, + "step": 110 + }, + { + "epoch": 1.4076433121019107, + "grad_norm": 4.964813016784396, + "learning_rate": 6.488869192836279e-06, + "loss": 0.6661736965179443, + "step": 111 + }, + { + "epoch": 1.4203821656050954, + "grad_norm": 5.042827042141295, + "learning_rate": 6.418308891671484e-06, + "loss": 0.5621084570884705, + "step": 112 + }, + { + "epoch": 1.4331210191082802, + "grad_norm": 4.554306311318436, + "learning_rate": 6.347440056178904e-06, + "loss": 0.5913956165313721, + "step": 113 + }, + { + "epoch": 1.4458598726114649, + "grad_norm": 3.999260338697589, + "learning_rate": 6.27627810293574e-06, + "loss": 0.5895659327507019, + "step": 114 + }, + { + "epoch": 1.4585987261146496, + "grad_norm": 4.488817511346429, + "learning_rate": 6.204838512283073e-06, + "loss": 0.6066327691078186, + "step": 115 + }, + { + "epoch": 1.4713375796178343, + "grad_norm": 4.2048895000167725, + "learning_rate": 6.133136824958334e-06, + "loss": 0.579125165939331, + "step": 116 + }, + { + "epoch": 1.484076433121019, + "grad_norm": 4.865801929274413, + "learning_rate": 6.061188638714616e-06, + "loss": 0.5661747455596924, + "step": 117 + }, + { + "epoch": 1.4968152866242037, + "grad_norm": 4.0216175803478365, + "learning_rate": 5.989009604927587e-06, + "loss": 0.5881543159484863, + "step": 118 + }, + { + "epoch": 1.5095541401273884, + "grad_norm": 4.672593821116511, + "learning_rate": 5.916615425190744e-06, + "loss": 0.6381370425224304, + "step": 119 + }, + { + "epoch": 1.5222929936305731, + "grad_norm": 4.579578448838088, + "learning_rate": 5.844021847899735e-06, + "loss": 0.5820121765136719, + "step": 120 + }, + { + "epoch": 1.5350318471337578, + "grad_norm": 4.9782337341335845, + "learning_rate": 5.771244664826512e-06, + "loss": 0.5244691371917725, + "step": 121 + }, + { + "epoch": 1.5477707006369426, + "grad_norm": 4.615280693095074, + "learning_rate": 5.698299707684031e-06, + "loss": 0.6596621870994568, + "step": 122 + }, + { + "epoch": 1.5605095541401273, + "grad_norm": 4.1315152129695205, + "learning_rate": 5.6252028446822805e-06, + "loss": 0.6240249872207642, + "step": 123 + }, + { + "epoch": 1.573248407643312, + "grad_norm": 4.34694030117767, + "learning_rate": 5.55196997707635e-06, + "loss": 0.6121684312820435, + "step": 124 + }, + { + "epoch": 1.5859872611464967, + "grad_norm": 4.736014683349439, + "learning_rate": 5.478617035707337e-06, + "loss": 0.581444263458252, + "step": 125 + }, + { + "epoch": 1.5987261146496814, + "grad_norm": 4.42473315063519, + "learning_rate": 5.4051599775368e-06, + "loss": 0.5702801942825317, + "step": 126 + }, + { + "epoch": 1.611464968152866, + "grad_norm": 4.723043711831375, + "learning_rate": 5.33161478217552e-06, + "loss": 0.643683671951294, + "step": 127 + }, + { + "epoch": 1.6242038216560508, + "grad_norm": 4.615535634313775, + "learning_rate": 5.257997448407366e-06, + "loss": 0.6429088115692139, + "step": 128 + }, + { + "epoch": 1.6369426751592355, + "grad_norm": 3.943237517267742, + "learning_rate": 5.184323990708959e-06, + "loss": 0.5036097764968872, + "step": 129 + }, + { + "epoch": 1.6496815286624202, + "grad_norm": 5.369249891502365, + "learning_rate": 5.110610435765935e-06, + "loss": 0.6377817392349243, + "step": 130 + }, + { + "epoch": 1.662420382165605, + "grad_norm": 4.645492978424057, + "learning_rate": 5.0368728189865624e-06, + "loss": 0.5092718601226807, + "step": 131 + }, + { + "epoch": 1.6751592356687897, + "grad_norm": 4.9878218164552255, + "learning_rate": 4.9631271810134375e-06, + "loss": 0.6005362868309021, + "step": 132 + }, + { + "epoch": 1.6878980891719744, + "grad_norm": 4.710856517549427, + "learning_rate": 4.8893895642340665e-06, + "loss": 0.4808087944984436, + "step": 133 + }, + { + "epoch": 1.700636942675159, + "grad_norm": 4.962556354741984, + "learning_rate": 4.815676009291044e-06, + "loss": 0.6739586591720581, + "step": 134 + }, + { + "epoch": 1.7133757961783438, + "grad_norm": 4.864043235726367, + "learning_rate": 4.742002551592635e-06, + "loss": 0.5722870826721191, + "step": 135 + }, + { + "epoch": 1.7261146496815285, + "grad_norm": 5.805499130195261, + "learning_rate": 4.668385217824482e-06, + "loss": 0.5560994148254395, + "step": 136 + }, + { + "epoch": 1.7388535031847132, + "grad_norm": 4.3754614924647734, + "learning_rate": 4.594840022463201e-06, + "loss": 0.6376844644546509, + "step": 137 + }, + { + "epoch": 1.7515923566878981, + "grad_norm": 4.6276825029066515, + "learning_rate": 4.5213829642926635e-06, + "loss": 0.5070189237594604, + "step": 138 + }, + { + "epoch": 1.7643312101910829, + "grad_norm": 5.058486321341029, + "learning_rate": 4.4480300229236525e-06, + "loss": 0.6301469206809998, + "step": 139 + }, + { + "epoch": 1.7770700636942676, + "grad_norm": 4.631581699502946, + "learning_rate": 4.374797155317721e-06, + "loss": 0.5686060190200806, + "step": 140 + }, + { + "epoch": 1.7898089171974523, + "grad_norm": 4.839930377645928, + "learning_rate": 4.30170029231597e-06, + "loss": 0.5702610015869141, + "step": 141 + }, + { + "epoch": 1.802547770700637, + "grad_norm": 4.634251405852573, + "learning_rate": 4.228755335173488e-06, + "loss": 0.5375156402587891, + "step": 142 + }, + { + "epoch": 1.8152866242038217, + "grad_norm": 5.224378872859397, + "learning_rate": 4.155978152100266e-06, + "loss": 0.588652491569519, + "step": 143 + }, + { + "epoch": 1.8280254777070064, + "grad_norm": 5.243980650196693, + "learning_rate": 4.0833845748092586e-06, + "loss": 0.6560136079788208, + "step": 144 + }, + { + "epoch": 1.8407643312101911, + "grad_norm": 4.870640612365541, + "learning_rate": 4.010990395072414e-06, + "loss": 0.5707780718803406, + "step": 145 + }, + { + "epoch": 1.8535031847133758, + "grad_norm": 4.896770007248889, + "learning_rate": 3.938811361285386e-06, + "loss": 0.578855574131012, + "step": 146 + }, + { + "epoch": 1.8662420382165605, + "grad_norm": 5.621832570155973, + "learning_rate": 3.866863175041666e-06, + "loss": 0.7337894439697266, + "step": 147 + }, + { + "epoch": 1.8789808917197452, + "grad_norm": 4.788974930837312, + "learning_rate": 3.7951614877169285e-06, + "loss": 0.6584663391113281, + "step": 148 + }, + { + "epoch": 1.89171974522293, + "grad_norm": 5.197175599878351, + "learning_rate": 3.7237218970642624e-06, + "loss": 0.5132451057434082, + "step": 149 + }, + { + "epoch": 1.9044585987261147, + "grad_norm": 4.494637523697752, + "learning_rate": 3.6525599438210956e-06, + "loss": 0.5699691772460938, + "step": 150 + }, + { + "epoch": 1.9171974522292994, + "grad_norm": 4.436597339850294, + "learning_rate": 3.5816911083285165e-06, + "loss": 0.6117175817489624, + "step": 151 + }, + { + "epoch": 1.929936305732484, + "grad_norm": 4.71698618164443, + "learning_rate": 3.511130807163724e-06, + "loss": 0.48447686433792114, + "step": 152 + }, + { + "epoch": 1.9426751592356688, + "grad_norm": 4.586270355395819, + "learning_rate": 3.440894389786352e-06, + "loss": 0.5775331854820251, + "step": 153 + }, + { + "epoch": 1.9554140127388535, + "grad_norm": 5.467603736362664, + "learning_rate": 3.370997135199413e-06, + "loss": 0.6822047829627991, + "step": 154 + }, + { + "epoch": 1.9681528662420382, + "grad_norm": 5.092809942708443, + "learning_rate": 3.3014542486255365e-06, + "loss": 0.620025098323822, + "step": 155 + }, + { + "epoch": 1.980891719745223, + "grad_norm": 4.782057480529959, + "learning_rate": 3.2322808581992825e-06, + "loss": 0.6051990985870361, + "step": 156 + }, + { + "epoch": 1.9936305732484076, + "grad_norm": 5.07119310501042, + "learning_rate": 3.1634920116762175e-06, + "loss": 0.5013089776039124, + "step": 157 + }, + { + "epoch": 2.0, + "grad_norm": 5.834245362327659, + "learning_rate": 3.0951026731594634e-06, + "loss": 0.41039198637008667, + "step": 158 + }, + { + "epoch": 2.0127388535031847, + "grad_norm": 2.970713570403218, + "learning_rate": 3.0271277198444737e-06, + "loss": 0.14488917589187622, + "step": 159 + }, + { + "epoch": 2.0254777070063694, + "grad_norm": 3.3900669209478917, + "learning_rate": 2.9595819387826753e-06, + "loss": 0.17139403522014618, + "step": 160 + }, + { + "epoch": 2.038216560509554, + "grad_norm": 3.148172373199878, + "learning_rate": 2.89248002366476e-06, + "loss": 0.13938947021961212, + "step": 161 + }, + { + "epoch": 2.050955414012739, + "grad_norm": 3.292222772844883, + "learning_rate": 2.8258365716242543e-06, + "loss": 0.19142913818359375, + "step": 162 + }, + { + "epoch": 2.0636942675159236, + "grad_norm": 5.062552654446493, + "learning_rate": 2.7596660800621076e-06, + "loss": 0.32667019963264465, + "step": 163 + }, + { + "epoch": 2.0764331210191083, + "grad_norm": 2.9195663792104853, + "learning_rate": 2.6939829434929834e-06, + "loss": 0.16923490166664124, + "step": 164 + }, + { + "epoch": 2.089171974522293, + "grad_norm": 2.660735105353199, + "learning_rate": 2.6288014504139104e-06, + "loss": 0.16544359922409058, + "step": 165 + }, + { + "epoch": 2.1019108280254777, + "grad_norm": 2.9195377278173438, + "learning_rate": 2.5641357801960186e-06, + "loss": 0.13166563212871552, + "step": 166 + }, + { + "epoch": 2.1146496815286624, + "grad_norm": 2.7115850726819133, + "learning_rate": 2.5000000000000015e-06, + "loss": 0.1502484679222107, + "step": 167 + }, + { + "epoch": 2.127388535031847, + "grad_norm": 2.5246541477672957, + "learning_rate": 2.4364080617159885e-06, + "loss": 0.12001603841781616, + "step": 168 + }, + { + "epoch": 2.140127388535032, + "grad_norm": 2.906306753932353, + "learning_rate": 2.373373798928507e-06, + "loss": 0.16388744115829468, + "step": 169 + }, + { + "epoch": 2.1528662420382165, + "grad_norm": 3.3313464695860855, + "learning_rate": 2.310910923907149e-06, + "loss": 0.17085227370262146, + "step": 170 + }, + { + "epoch": 2.1656050955414012, + "grad_norm": 3.537696001337278, + "learning_rate": 2.249033024623672e-06, + "loss": 0.1649709939956665, + "step": 171 + }, + { + "epoch": 2.178343949044586, + "grad_norm": 3.0477614078497157, + "learning_rate": 2.187753561796097e-06, + "loss": 0.13725437223911285, + "step": 172 + }, + { + "epoch": 2.1910828025477707, + "grad_norm": 3.108829906302373, + "learning_rate": 2.127085865960516e-06, + "loss": 0.14223095774650574, + "step": 173 + }, + { + "epoch": 2.2038216560509554, + "grad_norm": 3.188987721207745, + "learning_rate": 2.0670431345712092e-06, + "loss": 0.1432873010635376, + "step": 174 + }, + { + "epoch": 2.21656050955414, + "grad_norm": 3.5488199597897045, + "learning_rate": 2.0076384291297134e-06, + "loss": 0.1355983018875122, + "step": 175 + }, + { + "epoch": 2.229299363057325, + "grad_norm": 2.9979876656948483, + "learning_rate": 1.9488846723434646e-06, + "loss": 0.13247933983802795, + "step": 176 + }, + { + "epoch": 2.2420382165605095, + "grad_norm": 3.443337367597467, + "learning_rate": 1.890794645314633e-06, + "loss": 0.1308836191892624, + "step": 177 + }, + { + "epoch": 2.254777070063694, + "grad_norm": 4.121646470867133, + "learning_rate": 1.8333809847597644e-06, + "loss": 0.15963426232337952, + "step": 178 + }, + { + "epoch": 2.267515923566879, + "grad_norm": 4.118828059264668, + "learning_rate": 1.7766561802608374e-06, + "loss": 0.14805136620998383, + "step": 179 + }, + { + "epoch": 2.2802547770700636, + "grad_norm": 3.9708198011551166, + "learning_rate": 1.7206325715483003e-06, + "loss": 0.12024472653865814, + "step": 180 + }, + { + "epoch": 2.2929936305732483, + "grad_norm": 3.439106672469071, + "learning_rate": 1.665322345816746e-06, + "loss": 0.11454702913761139, + "step": 181 + }, + { + "epoch": 2.305732484076433, + "grad_norm": 3.4010452876916615, + "learning_rate": 1.6107375350737437e-06, + "loss": 0.10992666333913803, + "step": 182 + }, + { + "epoch": 2.3184713375796178, + "grad_norm": 3.5752577926580975, + "learning_rate": 1.556890013522428e-06, + "loss": 0.09631110727787018, + "step": 183 + }, + { + "epoch": 2.3312101910828025, + "grad_norm": 3.8387220728977343, + "learning_rate": 1.50379149497843e-06, + "loss": 0.14856451749801636, + "step": 184 + }, + { + "epoch": 2.343949044585987, + "grad_norm": 3.444989482317406, + "learning_rate": 1.4514535303216893e-06, + "loss": 0.09073778241872787, + "step": 185 + }, + { + "epoch": 2.356687898089172, + "grad_norm": 3.2622590339488124, + "learning_rate": 1.3998875049837141e-06, + "loss": 0.10596369206905365, + "step": 186 + }, + { + "epoch": 2.3694267515923566, + "grad_norm": 4.072722677232836, + "learning_rate": 1.3491046364708294e-06, + "loss": 0.1488298773765564, + "step": 187 + }, + { + "epoch": 2.3821656050955413, + "grad_norm": 4.114774744144093, + "learning_rate": 1.2991159719239581e-06, + "loss": 0.13143031299114227, + "step": 188 + }, + { + "epoch": 2.394904458598726, + "grad_norm": 3.792643277657603, + "learning_rate": 1.249932385715467e-06, + "loss": 0.12935219705104828, + "step": 189 + }, + { + "epoch": 2.4076433121019107, + "grad_norm": 3.6041653995445, + "learning_rate": 1.2015645770835765e-06, + "loss": 0.10895463824272156, + "step": 190 + }, + { + "epoch": 2.4203821656050954, + "grad_norm": 3.419036474508468, + "learning_rate": 1.1540230678048969e-06, + "loss": 0.11770664900541306, + "step": 191 + }, + { + "epoch": 2.43312101910828, + "grad_norm": 3.8473062967203626, + "learning_rate": 1.1073181999055538e-06, + "loss": 0.12943175435066223, + "step": 192 + }, + { + "epoch": 2.445859872611465, + "grad_norm": 4.213646564060963, + "learning_rate": 1.0614601334114099e-06, + "loss": 0.15990746021270752, + "step": 193 + }, + { + "epoch": 2.4585987261146496, + "grad_norm": 3.292740607382361, + "learning_rate": 1.016458844137887e-06, + "loss": 0.0967484638094902, + "step": 194 + }, + { + "epoch": 2.4713375796178343, + "grad_norm": 3.3587679937993675, + "learning_rate": 9.723241215198692e-07, + "loss": 0.09274256229400635, + "step": 195 + }, + { + "epoch": 2.484076433121019, + "grad_norm": 3.415144877613833, + "learning_rate": 9.290655664821296e-07, + "loss": 0.12071307003498077, + "step": 196 + }, + { + "epoch": 2.4968152866242037, + "grad_norm": 3.614520056467126, + "learning_rate": 8.866925893507805e-07, + "loss": 0.14337831735610962, + "step": 197 + }, + { + "epoch": 2.5095541401273884, + "grad_norm": 3.1413281076463333, + "learning_rate": 8.45214407806182e-07, + "loss": 0.1311374008655548, + "step": 198 + }, + { + "epoch": 2.522292993630573, + "grad_norm": 3.5634546960778963, + "learning_rate": 8.046400448777575e-07, + "loss": 0.12355434894561768, + "step": 199 + }, + { + "epoch": 2.535031847133758, + "grad_norm": 3.55245812518791, + "learning_rate": 7.649783269811523e-07, + "loss": 0.11268627643585205, + "step": 200 + }, + { + "epoch": 2.5477707006369426, + "grad_norm": 3.6047204962278205, + "learning_rate": 7.26237881998163e-07, + "loss": 0.1278030276298523, + "step": 201 + }, + { + "epoch": 2.5605095541401273, + "grad_norm": 3.826082333377558, + "learning_rate": 6.884271373998608e-07, + "loss": 0.11588963866233826, + "step": 202 + }, + { + "epoch": 2.573248407643312, + "grad_norm": 3.3477539285078044, + "learning_rate": 6.515543184133e-07, + "loss": 0.11168617010116577, + "step": 203 + }, + { + "epoch": 2.5859872611464967, + "grad_norm": 3.40070063216114, + "learning_rate": 6.156274462322292e-07, + "loss": 0.14677459001541138, + "step": 204 + }, + { + "epoch": 2.5987261146496814, + "grad_norm": 3.6867880675958333, + "learning_rate": 5.806543362721945e-07, + "loss": 0.1080314964056015, + "step": 205 + }, + { + "epoch": 2.611464968152866, + "grad_norm": 3.50805046104141, + "learning_rate": 5.466425964703914e-07, + "loss": 0.10917598009109497, + "step": 206 + }, + { + "epoch": 2.624203821656051, + "grad_norm": 3.744240792349818, + "learning_rate": 5.135996256306619e-07, + "loss": 0.10850804299116135, + "step": 207 + }, + { + "epoch": 2.6369426751592355, + "grad_norm": 3.2636204792288184, + "learning_rate": 4.815326118139813e-07, + "loss": 0.23395496606826782, + "step": 208 + }, + { + "epoch": 2.6496815286624202, + "grad_norm": 3.3320803212307895, + "learning_rate": 4.5044853077479134e-07, + "loss": 0.09678040444850922, + "step": 209 + }, + { + "epoch": 2.662420382165605, + "grad_norm": 3.3126443611241005, + "learning_rate": 4.203541444435211e-07, + "loss": 0.09082137048244476, + "step": 210 + }, + { + "epoch": 2.6751592356687897, + "grad_norm": 3.463640048859196, + "learning_rate": 3.9125599945560866e-07, + "loss": 0.12093393504619598, + "step": 211 + }, + { + "epoch": 2.6878980891719744, + "grad_norm": 4.1484131801868225, + "learning_rate": 3.631604257273774e-07, + "loss": 0.12841008603572845, + "step": 212 + }, + { + "epoch": 2.700636942675159, + "grad_norm": 3.4819962567564544, + "learning_rate": 3.360735350790428e-07, + "loss": 0.1454203575849533, + "step": 213 + }, + { + "epoch": 2.713375796178344, + "grad_norm": 3.3217850732913834, + "learning_rate": 3.100012199051627e-07, + "loss": 0.12103286385536194, + "step": 214 + }, + { + "epoch": 2.7261146496815285, + "grad_norm": 3.4551976218750706, + "learning_rate": 2.8494915189283325e-07, + "loss": 0.13519585132598877, + "step": 215 + }, + { + "epoch": 2.738853503184713, + "grad_norm": 3.5046747113231738, + "learning_rate": 2.6092278078788004e-07, + "loss": 0.14792990684509277, + "step": 216 + }, + { + "epoch": 2.7515923566878984, + "grad_norm": 4.057009589896516, + "learning_rate": 2.3792733320934348e-07, + "loss": 0.1573294997215271, + "step": 217 + }, + { + "epoch": 2.7643312101910826, + "grad_norm": 3.485812762552763, + "learning_rate": 2.1596781151249524e-07, + "loss": 0.15241427719593048, + "step": 218 + }, + { + "epoch": 2.777070063694268, + "grad_norm": 2.8563228482207395, + "learning_rate": 1.9504899270064105e-07, + "loss": 0.11122366786003113, + "step": 219 + }, + { + "epoch": 2.789808917197452, + "grad_norm": 3.219771759621168, + "learning_rate": 1.7517542738595071e-07, + "loss": 0.11351308226585388, + "step": 220 + }, + { + "epoch": 2.802547770700637, + "grad_norm": 3.4195554560904107, + "learning_rate": 1.5635143879952575e-07, + "loss": 0.1188071146607399, + "step": 221 + }, + { + "epoch": 2.8152866242038215, + "grad_norm": 2.9103932269106374, + "learning_rate": 1.3858112185094418e-07, + "loss": 0.1164408028125763, + "step": 222 + }, + { + "epoch": 2.8280254777070066, + "grad_norm": 3.6450799822214144, + "learning_rate": 1.2186834223746612e-07, + "loss": 0.12760576605796814, + "step": 223 + }, + { + "epoch": 2.840764331210191, + "grad_norm": 3.3225130395239253, + "learning_rate": 1.0621673560309798e-07, + "loss": 0.11487654596567154, + "step": 224 + }, + { + "epoch": 2.853503184713376, + "grad_norm": 3.2905886122232397, + "learning_rate": 9.162970674771177e-08, + "loss": 0.11246581375598907, + "step": 225 + }, + { + "epoch": 2.8662420382165603, + "grad_norm": 3.504394500719592, + "learning_rate": 7.81104288863721e-08, + "loss": 0.09955516457557678, + "step": 226 + }, + { + "epoch": 2.8789808917197455, + "grad_norm": 3.216564908375023, + "learning_rate": 6.566184295904777e-08, + "loss": 0.12330685555934906, + "step": 227 + }, + { + "epoch": 2.8917197452229297, + "grad_norm": 3.607447433445088, + "learning_rate": 5.4286656990847897e-08, + "loss": 0.12849846482276917, + "step": 228 + }, + { + "epoch": 2.904458598726115, + "grad_norm": 3.3244783180187425, + "learning_rate": 4.398734550292716e-08, + "loss": 0.11019767820835114, + "step": 229 + }, + { + "epoch": 2.917197452229299, + "grad_norm": 3.1359379558395957, + "learning_rate": 3.476614897418573e-08, + "loss": 0.10802481323480606, + "step": 230 + }, + { + "epoch": 2.9299363057324843, + "grad_norm": 3.236602655895111, + "learning_rate": 2.6625073353884756e-08, + "loss": 0.11602732539176941, + "step": 231 + }, + { + "epoch": 2.9426751592356686, + "grad_norm": 3.2263437658209133, + "learning_rate": 1.9565889625275945e-08, + "loss": 0.12483286112546921, + "step": 232 + }, + { + "epoch": 2.9554140127388537, + "grad_norm": 3.4340551157608235, + "learning_rate": 1.3590133420350315e-08, + "loss": 0.10575878620147705, + "step": 233 + }, + { + "epoch": 2.968152866242038, + "grad_norm": 3.903227901454765, + "learning_rate": 8.699104685779835e-09, + "loss": 0.14583438634872437, + "step": 234 + }, + { + "epoch": 2.980891719745223, + "grad_norm": 3.884552247317161, + "learning_rate": 4.89386740013198e-09, + "loss": 0.12648674845695496, + "step": 235 + }, + { + "epoch": 2.9936305732484074, + "grad_norm": 3.2859302150161747, + "learning_rate": 2.1752493424148647e-09, + "loss": 0.1414915770292282, + "step": 236 + }, + { + "epoch": 3.0, + "grad_norm": 2.7515804191453306, + "learning_rate": 5.438419120062933e-10, + "loss": 0.0598013773560524, + "step": 237 + }, + { + "epoch": 3.0, + "step": 237, + "total_flos": 4888319754240.0, + "train_loss": 0.7582035779575759, + "train_runtime": 573.8365, + "train_samples_per_second": 26.14, + "train_steps_per_second": 0.413 + } + ], + "logging_steps": 1, + "max_steps": 237, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 4888319754240.0, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/training_args.bin b/training_args.bin new file mode 100644 index 0000000..e3139e6 --- /dev/null +++ b/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:61a0f9bbe90dd6f50c9149ab33f36daaea28a8cb7418054074ca73a45b2b85f1 +size 6968 diff --git a/training_loss.png b/training_loss.png new file mode 100644 index 0000000..d3fd7be Binary files /dev/null and b/training_loss.png differ