commit c00b0b999e6a8cc8f02d936cd2320910fb10c91e Author: ModelHub XC Date: Sat Apr 25 13:33:36 2026 +0800 初始化项目,由ModelHub XC社区提供模型 Model: aasim-m/daft-qwen2.5-coder-3b-instruct-full-loss-0.02 Source: Original Platform diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 0000000..52373fe --- /dev/null +++ b/.gitattributes @@ -0,0 +1,36 @@ +*.7z filter=lfs diff=lfs merge=lfs -text +*.arrow filter=lfs diff=lfs merge=lfs -text +*.bin filter=lfs diff=lfs merge=lfs -text +*.bz2 filter=lfs diff=lfs merge=lfs -text +*.ckpt filter=lfs diff=lfs merge=lfs -text +*.ftz filter=lfs diff=lfs merge=lfs -text +*.gz filter=lfs diff=lfs merge=lfs -text +*.h5 filter=lfs diff=lfs merge=lfs -text +*.joblib filter=lfs diff=lfs merge=lfs -text +*.lfs.* filter=lfs diff=lfs merge=lfs -text +*.mlmodel filter=lfs diff=lfs merge=lfs -text +*.model filter=lfs diff=lfs merge=lfs -text +*.msgpack filter=lfs diff=lfs merge=lfs -text +*.npy filter=lfs diff=lfs merge=lfs -text +*.npz filter=lfs diff=lfs merge=lfs -text +*.onnx filter=lfs diff=lfs merge=lfs -text +*.ot filter=lfs diff=lfs merge=lfs -text +*.parquet filter=lfs diff=lfs merge=lfs -text +*.pb filter=lfs diff=lfs merge=lfs -text +*.pickle filter=lfs diff=lfs merge=lfs -text +*.pkl filter=lfs diff=lfs merge=lfs -text +*.pt filter=lfs diff=lfs merge=lfs -text +*.pth filter=lfs diff=lfs merge=lfs -text +*.rar filter=lfs diff=lfs merge=lfs -text +*.safetensors filter=lfs diff=lfs merge=lfs -text +saved_model/**/* filter=lfs diff=lfs merge=lfs -text +*.tar.* filter=lfs diff=lfs merge=lfs -text +*.tar filter=lfs diff=lfs merge=lfs -text +*.tflite filter=lfs diff=lfs merge=lfs -text +*.tgz filter=lfs diff=lfs merge=lfs -text +*.wasm filter=lfs diff=lfs merge=lfs -text +*.xz filter=lfs diff=lfs merge=lfs -text +*.zip filter=lfs diff=lfs merge=lfs -text +*.zst filter=lfs diff=lfs merge=lfs -text +*tfevents* filter=lfs diff=lfs merge=lfs -text +tokenizer.json filter=lfs diff=lfs merge=lfs -text diff --git a/README.md b/README.md new file mode 100644 index 0000000..f3ef2cb --- /dev/null +++ b/README.md @@ -0,0 +1,61 @@ +--- +library_name: transformers +license: other +base_model: Qwen/Qwen2.5-Coder-3B-Instruct +tags: +- llama-factory +- full +- generated_from_trainer +model-index: +- name: train_14_2_26_03_20 + results: [] +--- + + + +# train_14_2_26_03_20 + +This model is a fine-tuned version of [Qwen/Qwen2.5-Coder-3B-Instruct](https://huggingface.co/Qwen/Qwen2.5-Coder-3B-Instruct) on the daft_functions_dedup_sharegpt dataset. + +## Model description + +More information needed + +## Intended uses & limitations + +More information needed + +## Training and evaluation data + +More information needed + +## Training procedure + +### Training hyperparameters + +The following hyperparameters were used during training: +- learning_rate: 0.0001 +- train_batch_size: 1 +- eval_batch_size: 8 +- seed: 42 +- distributed_type: multi-GPU +- num_devices: 4 +- gradient_accumulation_steps: 16 +- total_train_batch_size: 64 +- total_eval_batch_size: 32 +- optimizer: Use OptimizerNames.ADAMW_TORCH_FUSED with betas=(0.9,0.999) and epsilon=1e-08 and optimizer_args=No additional optimizer arguments +- lr_scheduler_type: cosine +- lr_scheduler_warmup_steps: 0.1 +- num_epochs: 3.0 + +### Training results + + + +### Framework versions + +- Transformers 5.2.0 +- Pytorch 2.8.0+cu128 +- Datasets 4.0.0 +- Tokenizers 0.22.2 diff --git a/all_results.json b/all_results.json new file mode 100644 index 0000000..d3b2dd1 --- /dev/null +++ b/all_results.json @@ -0,0 +1,8 @@ +{ + "epoch": 3.0, + "total_flos": 1577088536150016.0, + "train_loss": 0.06165807318099385, + "train_runtime": 23128.4215, + "train_samples_per_second": 2.881, + "train_steps_per_second": 0.045 +} \ No newline at end of file diff --git a/chat_template.jinja b/chat_template.jinja new file mode 100644 index 0000000..bdf7919 --- /dev/null +++ b/chat_template.jinja @@ -0,0 +1,54 @@ +{%- if tools %} + {{- '<|im_start|>system\n' }} + {%- if messages[0]['role'] == 'system' %} + {{- messages[0]['content'] }} + {%- else %} + {{- 'You are Qwen, created by Alibaba Cloud. You are a helpful assistant.' }} + {%- endif %} + {{- "\n\n# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within XML tags:\n" }} + {%- for tool in tools %} + {{- "\n" }} + {{- tool | tojson }} + {%- endfor %} + {{- "\n\n\nFor each function call, return a json object with function name and arguments within XML tags:\n\n{\"name\": , \"arguments\": }\n<|im_end|>\n" }} +{%- else %} + {%- if messages[0]['role'] == 'system' %} + {{- '<|im_start|>system\n' + messages[0]['content'] + '<|im_end|>\n' }} + {%- else %} + {{- '<|im_start|>system\nYou are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>\n' }} + {%- endif %} +{%- endif %} +{%- for message in messages %} + {%- if (message.role == "user") or (message.role == "system" and not loop.first) or (message.role == "assistant" and not message.tool_calls) %} + {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }} + {%- elif message.role == "assistant" %} + {{- '<|im_start|>' + message.role }} + {%- if message.content %} + {{- '\n' + message.content }} + {%- endif %} + {%- for tool_call in message.tool_calls %} + {%- if tool_call.function is defined %} + {%- set tool_call = tool_call.function %} + {%- endif %} + {{- '\n\n{"name": "' }} + {{- tool_call.name }} + {{- '", "arguments": ' }} + {{- tool_call.arguments | tojson }} + {{- '}\n' }} + {%- endfor %} + {{- '<|im_end|>\n' }} + {%- elif message.role == "tool" %} + {%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != "tool") %} + {{- '<|im_start|>user' }} + {%- endif %} + {{- '\n\n' }} + {{- message.content }} + {{- '\n' }} + {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %} + {{- '<|im_end|>\n' }} + {%- endif %} + {%- endif %} +{%- endfor %} +{%- if add_generation_prompt %} + {{- '<|im_start|>assistant\n' }} +{%- endif %} diff --git a/config.json b/config.json new file mode 100644 index 0000000..63534fe --- /dev/null +++ b/config.json @@ -0,0 +1,69 @@ +{ + "architectures": [ + "Qwen2ForCausalLM" + ], + "attention_dropout": 0.0, + "bos_token_id": null, + "dtype": "bfloat16", + "eos_token_id": 151645, + "hidden_act": "silu", + "hidden_size": 2048, + "initializer_range": 0.02, + "intermediate_size": 11008, + "layer_types": [ + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention" + ], + "max_position_embeddings": 32768, + "max_window_layers": 36, + "model_type": "qwen2", + "num_attention_heads": 16, + "num_hidden_layers": 36, + "num_key_value_heads": 2, + "pad_token_id": 151643, + "rms_norm_eps": 1e-06, + "rope_parameters": { + "rope_theta": 1000000.0, + "rope_type": "default" + }, + "sliding_window": null, + "tie_word_embeddings": true, + "transformers_version": "5.2.0", + "use_cache": true, + "use_sliding_window": false, + "vocab_size": 151936 +} \ No newline at end of file diff --git a/generation_config.json b/generation_config.json new file mode 100644 index 0000000..47205fb --- /dev/null +++ b/generation_config.json @@ -0,0 +1,13 @@ +{ + "do_sample": true, + "eos_token_id": [ + 151645, + 151643 + ], + "pad_token_id": 151643, + "repetition_penalty": 1.05, + "temperature": 0.7, + "top_k": 20, + "top_p": 0.8, + "transformers_version": "5.2.0" +} diff --git a/model.safetensors b/model.safetensors new file mode 100644 index 0000000..f4ffe94 --- /dev/null +++ b/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f0944b09542d7d4f5d32e182ae54135596909c3aef820978b74070495b66ad7a +size 6171927112 diff --git a/tokenizer.json b/tokenizer.json new file mode 100644 index 0000000..34510ff --- /dev/null +++ b/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3fd169731d2cbde95e10bf356d66d5997fd885dd8dbb6fb4684da3f23b2585d8 +size 11421892 diff --git a/tokenizer_config.json b/tokenizer_config.json new file mode 100644 index 0000000..dc55654 --- /dev/null +++ b/tokenizer_config.json @@ -0,0 +1,30 @@ +{ + "add_prefix_space": false, + "backend": "tokenizers", + "bos_token": null, + "clean_up_tokenization_spaces": false, + "eos_token": "<|im_end|>", + "errors": "replace", + "extra_special_tokens": { + "<|im_start|>": "<|im_start|>", + "<|im_end|>": "<|im_end|>", + "<|object_ref_start|>": "<|object_ref_start|>", + "<|object_ref_end|>": "<|object_ref_end|>", + "<|box_start|>": "<|box_start|>", + "<|box_end|>": "<|box_end|>", + "<|quad_start|>": "<|quad_start|>", + "<|quad_end|>": "<|quad_end|>", + "<|vision_start|>": "<|vision_start|>", + "<|vision_end|>": "<|vision_end|>", + "<|vision_pad|>": "<|vision_pad|>", + "<|image_pad|>": "<|image_pad|>", + "<|video_pad|>": "<|video_pad|>" + }, + "is_local": false, + "model_max_length": 32768, + "pad_token": "<|endoftext|>", + "padding_side": "right", + "split_special_tokens": false, + "tokenizer_class": "Qwen2Tokenizer", + "unk_token": null +} \ No newline at end of file diff --git a/train_results.json b/train_results.json new file mode 100644 index 0000000..d3b2dd1 --- /dev/null +++ b/train_results.json @@ -0,0 +1,8 @@ +{ + "epoch": 3.0, + "total_flos": 1577088536150016.0, + "train_loss": 0.06165807318099385, + "train_runtime": 23128.4215, + "train_samples_per_second": 2.881, + "train_steps_per_second": 0.045 +} \ No newline at end of file diff --git a/trainer_log.jsonl b/trainer_log.jsonl new file mode 100644 index 0000000..41969c0 --- /dev/null +++ b/trainer_log.jsonl @@ -0,0 +1,209 @@ +{"current_steps": 5, "total_steps": 1044, "loss": 0.7567409992218017, "lr": 3.8095238095238102e-06, "epoch": 0.014406627048442283, "percentage": 0.48, "elapsed_time": "0:01:52", "remaining_time": "6:30:24"} +{"current_steps": 10, "total_steps": 1044, "loss": 0.6131507873535156, "lr": 8.571428571428573e-06, "epoch": 0.028813254096884566, "percentage": 0.96, "elapsed_time": "0:03:41", "remaining_time": "6:21:56"} +{"current_steps": 15, "total_steps": 1044, "loss": 0.4174152374267578, "lr": 1.3333333333333333e-05, "epoch": 0.04321988114532685, "percentage": 1.44, "elapsed_time": "0:05:31", "remaining_time": "6:19:32"} +{"current_steps": 20, "total_steps": 1044, "loss": 0.3269367218017578, "lr": 1.8095238095238094e-05, "epoch": 0.05762650819376913, "percentage": 1.92, "elapsed_time": "0:07:22", "remaining_time": "6:17:51"} +{"current_steps": 25, "total_steps": 1044, "loss": 0.2579814910888672, "lr": 2.2857142857142858e-05, "epoch": 0.07203313524221142, "percentage": 2.39, "elapsed_time": "0:09:14", "remaining_time": "6:16:39"} +{"current_steps": 30, "total_steps": 1044, "loss": 0.22403466701507568, "lr": 2.7619047619047622e-05, "epoch": 0.0864397622906537, "percentage": 2.87, "elapsed_time": "0:11:06", "remaining_time": "6:15:41"} +{"current_steps": 35, "total_steps": 1044, "loss": 0.19430849552154542, "lr": 3.2380952380952386e-05, "epoch": 0.10084638933909598, "percentage": 3.35, "elapsed_time": "0:12:57", "remaining_time": "6:13:48"} +{"current_steps": 40, "total_steps": 1044, "loss": 0.17950987815856934, "lr": 3.7142857142857143e-05, "epoch": 0.11525301638753827, "percentage": 3.83, "elapsed_time": "0:14:48", "remaining_time": "6:11:44"} +{"current_steps": 45, "total_steps": 1044, "loss": 0.16267883777618408, "lr": 4.190476190476191e-05, "epoch": 0.12965964343598055, "percentage": 4.31, "elapsed_time": "0:16:38", "remaining_time": "6:09:37"} +{"current_steps": 50, "total_steps": 1044, "loss": 0.15465893745422363, "lr": 4.666666666666667e-05, "epoch": 0.14406627048442283, "percentage": 4.79, "elapsed_time": "0:18:28", "remaining_time": "6:07:22"} +{"current_steps": 55, "total_steps": 1044, "loss": 0.1339721441268921, "lr": 5.142857142857143e-05, "epoch": 0.15847289753286511, "percentage": 5.27, "elapsed_time": "0:20:18", "remaining_time": "6:05:09"} +{"current_steps": 60, "total_steps": 1044, "loss": 0.12487690448760987, "lr": 5.619047619047619e-05, "epoch": 0.1728795245813074, "percentage": 5.75, "elapsed_time": "0:22:06", "remaining_time": "6:02:42"} +{"current_steps": 65, "total_steps": 1044, "loss": 0.12374393939971924, "lr": 6.0952380952380964e-05, "epoch": 0.18728615162974968, "percentage": 6.23, "elapsed_time": "0:23:57", "remaining_time": "6:00:49"} +{"current_steps": 70, "total_steps": 1044, "loss": 0.11958651542663574, "lr": 6.571428571428571e-05, "epoch": 0.20169277867819196, "percentage": 6.7, "elapsed_time": "0:25:48", "remaining_time": "5:59:00"} +{"current_steps": 75, "total_steps": 1044, "loss": 0.1187552571296692, "lr": 7.047619047619048e-05, "epoch": 0.21609940572663425, "percentage": 7.18, "elapsed_time": "0:27:37", "remaining_time": "5:56:55"} +{"current_steps": 80, "total_steps": 1044, "loss": 0.11327266693115234, "lr": 7.523809523809524e-05, "epoch": 0.23050603277507653, "percentage": 7.66, "elapsed_time": "0:29:28", "remaining_time": "5:55:07"} +{"current_steps": 85, "total_steps": 1044, "loss": 0.10409235954284668, "lr": 8e-05, "epoch": 0.2449126598235188, "percentage": 8.14, "elapsed_time": "0:31:18", "remaining_time": "5:53:12"} +{"current_steps": 90, "total_steps": 1044, "loss": 0.10459071397781372, "lr": 8.476190476190477e-05, "epoch": 0.2593192868719611, "percentage": 8.62, "elapsed_time": "0:33:09", "remaining_time": "5:51:29"} +{"current_steps": 95, "total_steps": 1044, "loss": 0.10260200500488281, "lr": 8.952380952380953e-05, "epoch": 0.2737259139204034, "percentage": 9.1, "elapsed_time": "0:35:00", "remaining_time": "5:49:41"} +{"current_steps": 100, "total_steps": 1044, "loss": 0.15102967023849487, "lr": 9.428571428571429e-05, "epoch": 0.28813254096884566, "percentage": 9.58, "elapsed_time": "0:36:52", "remaining_time": "5:48:09"} +{"current_steps": 105, "total_steps": 1044, "loss": 0.21126885414123536, "lr": 9.904761904761905e-05, "epoch": 0.302539168017288, "percentage": 10.06, "elapsed_time": "0:38:42", "remaining_time": "5:46:07"} +{"current_steps": 110, "total_steps": 1044, "loss": 0.1489308714866638, "lr": 9.99955226394288e-05, "epoch": 0.31694579506573023, "percentage": 10.54, "elapsed_time": "0:40:32", "remaining_time": "5:44:16"} +{"current_steps": 115, "total_steps": 1044, "loss": 0.1149595022201538, "lr": 9.997733473639876e-05, "epoch": 0.33135242211417254, "percentage": 11.02, "elapsed_time": "0:42:24", "remaining_time": "5:42:32"} +{"current_steps": 120, "total_steps": 1044, "loss": 0.10596739053726197, "lr": 9.994516154152849e-05, "epoch": 0.3457590491626148, "percentage": 11.49, "elapsed_time": "0:44:14", "remaining_time": "5:40:42"} +{"current_steps": 125, "total_steps": 1044, "loss": 0.09920316338539123, "lr": 9.989901205792952e-05, "epoch": 0.3601656762110571, "percentage": 11.97, "elapsed_time": "0:46:07", "remaining_time": "5:39:05"} +{"current_steps": 130, "total_steps": 1044, "loss": 0.08917503952980041, "lr": 9.983889919973586e-05, "epoch": 0.37457230325949936, "percentage": 12.45, "elapsed_time": "0:47:58", "remaining_time": "5:37:17"} +{"current_steps": 135, "total_steps": 1044, "loss": 0.08957574367523194, "lr": 9.976483978849007e-05, "epoch": 0.3889789303079417, "percentage": 12.93, "elapsed_time": "0:49:49", "remaining_time": "5:35:31"} +{"current_steps": 140, "total_steps": 1044, "loss": 0.08561774492263793, "lr": 9.967685454843618e-05, "epoch": 0.40338555735638393, "percentage": 13.41, "elapsed_time": "0:51:41", "remaining_time": "5:33:43"} +{"current_steps": 145, "total_steps": 1044, "loss": 0.083004629611969, "lr": 9.957496810072027e-05, "epoch": 0.41779218440482624, "percentage": 13.89, "elapsed_time": "0:53:30", "remaining_time": "5:31:45"} +{"current_steps": 150, "total_steps": 1044, "loss": 0.0808147668838501, "lr": 9.945920895650071e-05, "epoch": 0.4321988114532685, "percentage": 14.37, "elapsed_time": "0:55:22", "remaining_time": "5:30:03"} +{"current_steps": 155, "total_steps": 1044, "loss": 0.07847496271133422, "lr": 9.932960950896981e-05, "epoch": 0.4466054385017108, "percentage": 14.85, "elapsed_time": "0:57:14", "remaining_time": "5:28:17"} +{"current_steps": 160, "total_steps": 1044, "loss": 0.07879123687744141, "lr": 9.918620602428915e-05, "epoch": 0.46101206555015306, "percentage": 15.33, "elapsed_time": "0:59:04", "remaining_time": "5:26:25"} +{"current_steps": 165, "total_steps": 1044, "loss": 0.07581273913383484, "lr": 9.902903863144107e-05, "epoch": 0.47541869259859537, "percentage": 15.8, "elapsed_time": "1:00:56", "remaining_time": "5:24:39"} +{"current_steps": 170, "total_steps": 1044, "loss": 0.07368603944778443, "lr": 9.885815131099934e-05, "epoch": 0.4898253196470376, "percentage": 16.28, "elapsed_time": "1:02:46", "remaining_time": "5:22:42"} +{"current_steps": 175, "total_steps": 1044, "loss": 0.06976621150970459, "lr": 9.867359188282192e-05, "epoch": 0.5042319466954799, "percentage": 16.76, "elapsed_time": "1:04:36", "remaining_time": "5:20:47"} +{"current_steps": 180, "total_steps": 1044, "loss": 0.07270271778106689, "lr": 9.847541199266941e-05, "epoch": 0.5186385737439222, "percentage": 17.24, "elapsed_time": "1:06:26", "remaining_time": "5:18:55"} +{"current_steps": 185, "total_steps": 1044, "loss": 0.06899308562278747, "lr": 9.826366709775286e-05, "epoch": 0.5330452007923645, "percentage": 17.72, "elapsed_time": "1:08:16", "remaining_time": "5:17:00"} +{"current_steps": 190, "total_steps": 1044, "loss": 0.06641653776168824, "lr": 9.803841645121504e-05, "epoch": 0.5474518278408068, "percentage": 18.2, "elapsed_time": "1:10:06", "remaining_time": "5:15:08"} +{"current_steps": 195, "total_steps": 1044, "loss": 0.06647136211395263, "lr": 9.779972308554952e-05, "epoch": 0.561858454889249, "percentage": 18.68, "elapsed_time": "1:11:57", "remaining_time": "5:13:15"} +{"current_steps": 200, "total_steps": 1044, "loss": 0.06856078505516053, "lr": 9.754765379496202e-05, "epoch": 0.5762650819376913, "percentage": 19.16, "elapsed_time": "1:13:50", "remaining_time": "5:11:35"} +{"current_steps": 205, "total_steps": 1044, "loss": 0.06724534034729004, "lr": 9.728227911667934e-05, "epoch": 0.5906717089861336, "percentage": 19.64, "elapsed_time": "1:15:40", "remaining_time": "5:09:41"} +{"current_steps": 210, "total_steps": 1044, "loss": 0.06738802194595336, "lr": 9.700367331121054e-05, "epoch": 0.605078336034576, "percentage": 20.11, "elapsed_time": "1:17:48", "remaining_time": "5:09:01"} +{"current_steps": 215, "total_steps": 1044, "loss": 0.07073606252670288, "lr": 9.67119143415667e-05, "epoch": 0.6194849630830181, "percentage": 20.59, "elapsed_time": "1:19:39", "remaining_time": "5:07:09"} +{"current_steps": 220, "total_steps": 1044, "loss": 0.06382153034210206, "lr": 9.640708385144403e-05, "epoch": 0.6338915901314605, "percentage": 21.07, "elapsed_time": "1:21:28", "remaining_time": "5:05:09"} +{"current_steps": 225, "total_steps": 1044, "loss": 0.06776301860809326, "lr": 9.608926714237754e-05, "epoch": 0.6482982171799028, "percentage": 21.55, "elapsed_time": "1:23:19", "remaining_time": "5:03:17"} +{"current_steps": 230, "total_steps": 1044, "loss": 0.06309446096420288, "lr": 9.575855314987068e-05, "epoch": 0.6627048442283451, "percentage": 22.03, "elapsed_time": "1:25:09", "remaining_time": "5:01:23"} +{"current_steps": 235, "total_steps": 1044, "loss": 0.06422497630119324, "lr": 9.541503441850843e-05, "epoch": 0.6771114712767873, "percentage": 22.51, "elapsed_time": "1:27:00", "remaining_time": "4:59:31"} +{"current_steps": 240, "total_steps": 1044, "loss": 0.06324135661125183, "lr": 9.505880707606024e-05, "epoch": 0.6915180983252296, "percentage": 22.99, "elapsed_time": "1:28:50", "remaining_time": "4:57:36"} +{"current_steps": 245, "total_steps": 1044, "loss": 0.06205494403839111, "lr": 9.468997080658031e-05, "epoch": 0.7059247253736719, "percentage": 23.47, "elapsed_time": "1:30:41", "remaining_time": "4:55:45"} +{"current_steps": 250, "total_steps": 1044, "loss": 0.057729125022888184, "lr": 9.430862882251278e-05, "epoch": 0.7203313524221142, "percentage": 23.95, "elapsed_time": "1:32:32", "remaining_time": "4:53:55"} +{"current_steps": 255, "total_steps": 1044, "loss": 0.059876751899719236, "lr": 9.391488783580955e-05, "epoch": 0.7347379794705564, "percentage": 24.43, "elapsed_time": "1:34:23", "remaining_time": "4:52:05"} +{"current_steps": 260, "total_steps": 1044, "loss": 0.05882802605628967, "lr": 9.350885802806863e-05, "epoch": 0.7491446065189987, "percentage": 24.9, "elapsed_time": "1:36:15", "remaining_time": "4:50:15"} +{"current_steps": 265, "total_steps": 1044, "loss": 0.06077917814254761, "lr": 9.309065301970193e-05, "epoch": 0.763551233567441, "percentage": 25.38, "elapsed_time": "1:38:06", "remaining_time": "4:48:23"} +{"current_steps": 270, "total_steps": 1044, "loss": 0.05303559303283691, "lr": 9.266038983814039e-05, "epoch": 0.7779578606158833, "percentage": 25.86, "elapsed_time": "1:39:55", "remaining_time": "4:46:28"} +{"current_steps": 275, "total_steps": 1044, "loss": 0.06124954223632813, "lr": 9.221818888508602e-05, "epoch": 0.7923644876643255, "percentage": 26.34, "elapsed_time": "1:41:46", "remaining_time": "4:44:35"} +{"current_steps": 280, "total_steps": 1044, "loss": 0.055888807773590087, "lr": 9.176417390281944e-05, "epoch": 0.8067711147127679, "percentage": 26.82, "elapsed_time": "1:43:36", "remaining_time": "4:42:42"} +{"current_steps": 285, "total_steps": 1044, "loss": 0.056972581148147586, "lr": 9.129847193957282e-05, "epoch": 0.8211777417612102, "percentage": 27.3, "elapsed_time": "1:45:27", "remaining_time": "4:40:50"} +{"current_steps": 290, "total_steps": 1044, "loss": 0.05824898481369019, "lr": 9.08212133139776e-05, "epoch": 0.8355843688096525, "percentage": 27.78, "elapsed_time": "1:47:18", "remaining_time": "4:39:00"} +{"current_steps": 295, "total_steps": 1044, "loss": 0.05415785312652588, "lr": 9.033253157859714e-05, "epoch": 0.8499909958580947, "percentage": 28.26, "elapsed_time": "1:49:09", "remaining_time": "4:37:09"} +{"current_steps": 300, "total_steps": 1044, "loss": 0.05467197895050049, "lr": 8.983256348255423e-05, "epoch": 0.864397622906537, "percentage": 28.74, "elapsed_time": "1:50:58", "remaining_time": "4:35:13"} +{"current_steps": 305, "total_steps": 1044, "loss": 0.06181464791297912, "lr": 8.932144893326432e-05, "epoch": 0.8788042499549793, "percentage": 29.21, "elapsed_time": "1:52:50", "remaining_time": "4:33:24"} +{"current_steps": 310, "total_steps": 1044, "loss": 0.05511963367462158, "lr": 8.879933095728485e-05, "epoch": 0.8932108770034216, "percentage": 29.69, "elapsed_time": "1:54:40", "remaining_time": "4:31:30"} +{"current_steps": 315, "total_steps": 1044, "loss": 0.05229709148406982, "lr": 8.826635566029166e-05, "epoch": 0.9076175040518638, "percentage": 30.17, "elapsed_time": "1:56:30", "remaining_time": "4:29:38"} +{"current_steps": 320, "total_steps": 1044, "loss": 0.05275582075119019, "lr": 8.772267218619388e-05, "epoch": 0.9220241311003061, "percentage": 30.65, "elapsed_time": "1:58:19", "remaining_time": "4:27:42"} +{"current_steps": 325, "total_steps": 1044, "loss": 0.05470834374427795, "lr": 8.716843267539869e-05, "epoch": 0.9364307581487484, "percentage": 31.13, "elapsed_time": "2:00:09", "remaining_time": "4:25:49"} +{"current_steps": 330, "total_steps": 1044, "loss": 0.05563476085662842, "lr": 8.660379222223727e-05, "epoch": 0.9508373851971907, "percentage": 31.61, "elapsed_time": "2:02:00", "remaining_time": "4:23:58"} +{"current_steps": 335, "total_steps": 1044, "loss": 0.054843342304229735, "lr": 8.602890883156454e-05, "epoch": 0.9652440122456329, "percentage": 32.09, "elapsed_time": "2:03:50", "remaining_time": "4:22:05"} +{"current_steps": 340, "total_steps": 1044, "loss": 0.05721263885498047, "lr": 8.544394337454409e-05, "epoch": 0.9796506392940753, "percentage": 32.57, "elapsed_time": "2:05:41", "remaining_time": "4:20:15"} +{"current_steps": 345, "total_steps": 1044, "loss": 0.05096786618232727, "lr": 8.484905954363123e-05, "epoch": 0.9940572663425176, "percentage": 33.05, "elapsed_time": "2:07:32", "remaining_time": "4:18:23"} +{"current_steps": 350, "total_steps": 1044, "loss": 0.05167339444160461, "lr": 8.424442380676647e-05, "epoch": 1.0057626508193769, "percentage": 33.52, "elapsed_time": "2:09:03", "remaining_time": "4:15:53"} +{"current_steps": 355, "total_steps": 1044, "loss": 0.05249757170677185, "lr": 8.363020536079239e-05, "epoch": 1.0201692778678193, "percentage": 34.0, "elapsed_time": "2:10:54", "remaining_time": "4:14:04"} +{"current_steps": 360, "total_steps": 1044, "loss": 0.05038872957229614, "lr": 8.300657608410678e-05, "epoch": 1.0345759049162615, "percentage": 34.48, "elapsed_time": "2:12:46", "remaining_time": "4:12:16"} +{"current_steps": 365, "total_steps": 1044, "loss": 0.050058400630950926, "lr": 8.237371048856546e-05, "epoch": 1.0489825319647037, "percentage": 34.96, "elapsed_time": "2:14:37", "remaining_time": "4:10:26"} +{"current_steps": 370, "total_steps": 1044, "loss": 0.04919912219047547, "lr": 8.17317856706482e-05, "epoch": 1.0633891590131461, "percentage": 35.44, "elapsed_time": "2:16:28", "remaining_time": "4:08:35"} +{"current_steps": 375, "total_steps": 1044, "loss": 0.04963598847389221, "lr": 8.108098126190129e-05, "epoch": 1.0777957860615883, "percentage": 35.92, "elapsed_time": "2:18:17", "remaining_time": "4:06:43"} +{"current_steps": 380, "total_steps": 1044, "loss": 0.046415746212005615, "lr": 8.042147937867079e-05, "epoch": 1.0922024131100305, "percentage": 36.4, "elapsed_time": "2:20:08", "remaining_time": "4:04:53"} +{"current_steps": 385, "total_steps": 1044, "loss": 0.04439312219619751, "lr": 7.975346457114034e-05, "epoch": 1.106609040158473, "percentage": 36.88, "elapsed_time": "2:21:58", "remaining_time": "4:03:01"} +{"current_steps": 390, "total_steps": 1044, "loss": 0.051634716987609866, "lr": 7.907712377168817e-05, "epoch": 1.1210156672069151, "percentage": 37.36, "elapsed_time": "2:23:50", "remaining_time": "4:01:11"} +{"current_steps": 395, "total_steps": 1044, "loss": 0.04415662288665771, "lr": 7.839264624257712e-05, "epoch": 1.1354222942553576, "percentage": 37.84, "elapsed_time": "2:25:37", "remaining_time": "3:59:16"} +{"current_steps": 400, "total_steps": 1044, "loss": 0.047378170490264895, "lr": 7.770022352299293e-05, "epoch": 1.1498289213037998, "percentage": 38.31, "elapsed_time": "2:27:28", "remaining_time": "3:57:26"} +{"current_steps": 405, "total_steps": 1044, "loss": 0.04249417781829834, "lr": 7.700004937544542e-05, "epoch": 1.164235548352242, "percentage": 38.79, "elapsed_time": "2:29:19", "remaining_time": "3:55:35"} +{"current_steps": 410, "total_steps": 1044, "loss": 0.04593285918235779, "lr": 7.629231973154725e-05, "epoch": 1.1786421754006844, "percentage": 39.27, "elapsed_time": "2:31:08", "remaining_time": "3:53:43"} +{"current_steps": 415, "total_steps": 1044, "loss": 0.05370241403579712, "lr": 7.557723263718596e-05, "epoch": 1.1930488024491266, "percentage": 39.75, "elapsed_time": "2:33:19", "remaining_time": "3:52:24"} +{"current_steps": 420, "total_steps": 1044, "loss": 0.04612640142440796, "lr": 7.485498819710417e-05, "epoch": 1.2074554294975688, "percentage": 40.23, "elapsed_time": "2:35:10", "remaining_time": "3:50:32"} +{"current_steps": 425, "total_steps": 1044, "loss": 0.043773263692855835, "lr": 7.412578851890384e-05, "epoch": 1.2218620565460112, "percentage": 40.71, "elapsed_time": "2:37:00", "remaining_time": "3:48:40"} +{"current_steps": 430, "total_steps": 1044, "loss": 0.046638333797454835, "lr": 7.338983765648985e-05, "epoch": 1.2362686835944534, "percentage": 41.19, "elapsed_time": "2:38:51", "remaining_time": "3:46:50"} +{"current_steps": 435, "total_steps": 1044, "loss": 0.045640939474105836, "lr": 7.264734155296912e-05, "epoch": 1.2506753106428956, "percentage": 41.67, "elapsed_time": "2:40:40", "remaining_time": "3:44:57"} +{"current_steps": 440, "total_steps": 1044, "loss": 0.04710923135280609, "lr": 7.189850798302099e-05, "epoch": 1.265081937691338, "percentage": 42.15, "elapsed_time": "2:42:31", "remaining_time": "3:43:05"} +{"current_steps": 445, "total_steps": 1044, "loss": 0.04437531530857086, "lr": 7.114354649475499e-05, "epoch": 1.2794885647397802, "percentage": 42.62, "elapsed_time": "2:44:21", "remaining_time": "3:41:14"} +{"current_steps": 450, "total_steps": 1044, "loss": 0.04155453443527222, "lr": 7.038266835107257e-05, "epoch": 1.2938951917882227, "percentage": 43.1, "elapsed_time": "2:46:12", "remaining_time": "3:39:23"} +{"current_steps": 455, "total_steps": 1044, "loss": 0.04477185308933258, "lr": 6.961608647054873e-05, "epoch": 1.3083018188366649, "percentage": 43.58, "elapsed_time": "2:48:01", "remaining_time": "3:37:30"} +{"current_steps": 460, "total_steps": 1044, "loss": 0.045587533712387086, "lr": 6.884401536785045e-05, "epoch": 1.322708445885107, "percentage": 44.06, "elapsed_time": "2:49:52", "remaining_time": "3:35:39"} +{"current_steps": 465, "total_steps": 1044, "loss": 0.04496743679046631, "lr": 6.806667109370853e-05, "epoch": 1.3371150729335495, "percentage": 44.54, "elapsed_time": "2:51:42", "remaining_time": "3:33:47"} +{"current_steps": 470, "total_steps": 1044, "loss": 0.04124987423419953, "lr": 6.728427117445948e-05, "epoch": 1.3515216999819917, "percentage": 45.02, "elapsed_time": "2:53:32", "remaining_time": "3:31:56"} +{"current_steps": 475, "total_steps": 1044, "loss": 0.044256627559661865, "lr": 6.649703455117458e-05, "epoch": 1.365928327030434, "percentage": 45.5, "elapsed_time": "2:55:23", "remaining_time": "3:30:05"} +{"current_steps": 480, "total_steps": 1044, "loss": 0.047923988103866576, "lr": 6.5705181518393e-05, "epoch": 1.3803349540788763, "percentage": 45.98, "elapsed_time": "2:57:10", "remaining_time": "3:28:11"} +{"current_steps": 485, "total_steps": 1044, "loss": 0.040982422232627866, "lr": 6.490893366247612e-05, "epoch": 1.3947415811273185, "percentage": 46.46, "elapsed_time": "2:59:00", "remaining_time": "3:26:19"} +{"current_steps": 490, "total_steps": 1044, "loss": 0.0431306004524231, "lr": 6.41085137996006e-05, "epoch": 1.409148208175761, "percentage": 46.93, "elapsed_time": "3:00:51", "remaining_time": "3:24:28"} +{"current_steps": 495, "total_steps": 1044, "loss": 0.039784133434295654, "lr": 6.330414591340689e-05, "epoch": 1.4235548352242031, "percentage": 47.41, "elapsed_time": "3:02:41", "remaining_time": "3:22:36"} +{"current_steps": 500, "total_steps": 1044, "loss": 0.04327746033668518, "lr": 6.249605509232149e-05, "epoch": 1.4379614622726455, "percentage": 47.89, "elapsed_time": "3:04:32", "remaining_time": "3:20:46"} +{"current_steps": 505, "total_steps": 1044, "loss": 0.04065501093864441, "lr": 6.168446746656973e-05, "epoch": 1.4523680893210877, "percentage": 48.37, "elapsed_time": "3:06:22", "remaining_time": "3:18:55"} +{"current_steps": 510, "total_steps": 1044, "loss": 0.040621763467788695, "lr": 6.0869610144897215e-05, "epoch": 1.46677471636953, "percentage": 48.85, "elapsed_time": "3:08:12", "remaining_time": "3:17:04"} +{"current_steps": 515, "total_steps": 1044, "loss": 0.042708945274353025, "lr": 6.005171115101735e-05, "epoch": 1.4811813434179721, "percentage": 49.33, "elapsed_time": "3:10:03", "remaining_time": "3:15:13"} +{"current_steps": 520, "total_steps": 1044, "loss": 0.03845831751823425, "lr": 5.9230999359802784e-05, "epoch": 1.4955879704664146, "percentage": 49.81, "elapsed_time": "3:11:52", "remaining_time": "3:13:21"} +{"current_steps": 525, "total_steps": 1044, "loss": 0.04369714856147766, "lr": 5.84077044332389e-05, "epoch": 1.509994597514857, "percentage": 50.29, "elapsed_time": "3:13:43", "remaining_time": "3:11:30"} +{"current_steps": 530, "total_steps": 1044, "loss": 0.04057990908622742, "lr": 5.7582056756156665e-05, "epoch": 1.5244012245632992, "percentage": 50.77, "elapsed_time": "3:15:33", "remaining_time": "3:09:39"} +{"current_steps": 535, "total_steps": 1044, "loss": 0.03988811373710632, "lr": 5.675428737176367e-05, "epoch": 1.5388078516117414, "percentage": 51.25, "elapsed_time": "3:17:25", "remaining_time": "3:07:49"} +{"current_steps": 540, "total_steps": 1044, "loss": 0.040156081318855286, "lr": 5.5924627916990446e-05, "epoch": 1.5532144786601836, "percentage": 51.72, "elapsed_time": "3:19:16", "remaining_time": "3:05:59"} +{"current_steps": 545, "total_steps": 1044, "loss": 0.04313129186630249, "lr": 5.5093310557671074e-05, "epoch": 1.567621105708626, "percentage": 52.2, "elapsed_time": "3:21:07", "remaining_time": "3:04:08"} +{"current_steps": 550, "total_steps": 1044, "loss": 0.04041691720485687, "lr": 5.426056792357551e-05, "epoch": 1.5820277327570682, "percentage": 52.68, "elapsed_time": "3:22:57", "remaining_time": "3:02:17"} +{"current_steps": 555, "total_steps": 1044, "loss": 0.04093085825443268, "lr": 5.342663304331211e-05, "epoch": 1.5964343598055106, "percentage": 53.16, "elapsed_time": "3:24:47", "remaining_time": "3:00:26"} +{"current_steps": 560, "total_steps": 1044, "loss": 0.039686673879623414, "lr": 5.25917392791188e-05, "epoch": 1.6108409868539528, "percentage": 53.64, "elapsed_time": "3:26:38", "remaining_time": "2:58:36"} +{"current_steps": 565, "total_steps": 1044, "loss": 0.039973828196525577, "lr": 5.1756120261560446e-05, "epoch": 1.625247613902395, "percentage": 54.12, "elapsed_time": "3:28:29", "remaining_time": "2:56:45"} +{"current_steps": 570, "total_steps": 1044, "loss": 0.03885244131088257, "lr": 5.092000982415162e-05, "epoch": 1.6396542409508372, "percentage": 54.6, "elapsed_time": "3:30:19", "remaining_time": "2:54:54"} +{"current_steps": 575, "total_steps": 1044, "loss": 0.03913732171058655, "lr": 5.0083641937922145e-05, "epoch": 1.6540608679992796, "percentage": 55.08, "elapsed_time": "3:32:10", "remaining_time": "2:53:03"} +{"current_steps": 580, "total_steps": 1044, "loss": 0.038859084248542786, "lr": 4.924725064594447e-05, "epoch": 1.668467495047722, "percentage": 55.56, "elapsed_time": "3:34:01", "remaining_time": "2:51:13"} +{"current_steps": 585, "total_steps": 1044, "loss": 0.037244629859924314, "lr": 4.8411069997840756e-05, "epoch": 1.6828741220961643, "percentage": 56.03, "elapsed_time": "3:35:52", "remaining_time": "2:49:22"} +{"current_steps": 590, "total_steps": 1044, "loss": 0.04225952625274658, "lr": 4.757533398428812e-05, "epoch": 1.6972807491446065, "percentage": 56.51, "elapsed_time": "3:37:43", "remaining_time": "2:47:31"} +{"current_steps": 595, "total_steps": 1044, "loss": 0.03874731659889221, "lr": 4.674027647154037e-05, "epoch": 1.7116873761930487, "percentage": 56.99, "elapsed_time": "3:39:33", "remaining_time": "2:45:41"} +{"current_steps": 600, "total_steps": 1044, "loss": 0.03750569224357605, "lr": 4.590613113598461e-05, "epoch": 1.726094003241491, "percentage": 57.47, "elapsed_time": "3:41:23", "remaining_time": "2:43:49"} +{"current_steps": 605, "total_steps": 1044, "loss": 0.03765683174133301, "lr": 4.507313139875102e-05, "epoch": 1.7405006302899335, "percentage": 57.95, "elapsed_time": "3:43:13", "remaining_time": "2:41:58"} +{"current_steps": 610, "total_steps": 1044, "loss": 0.03841148316860199, "lr": 4.4241510360393804e-05, "epoch": 1.7549072573383757, "percentage": 58.43, "elapsed_time": "3:45:03", "remaining_time": "2:40:07"} +{"current_steps": 615, "total_steps": 1044, "loss": 0.03978689610958099, "lr": 4.341150073566227e-05, "epoch": 1.769313884386818, "percentage": 58.91, "elapsed_time": "3:46:51", "remaining_time": "2:38:14"} +{"current_steps": 620, "total_steps": 1044, "loss": 0.038895291090011594, "lr": 4.258333478837947e-05, "epoch": 1.7837205114352601, "percentage": 59.39, "elapsed_time": "3:48:42", "remaining_time": "2:36:24"} +{"current_steps": 625, "total_steps": 1044, "loss": 0.04072596728801727, "lr": 4.1757244266447245e-05, "epoch": 1.7981271384837025, "percentage": 59.87, "elapsed_time": "3:50:52", "remaining_time": "2:34:46"} +{"current_steps": 630, "total_steps": 1044, "loss": 0.03865320086479187, "lr": 4.093346033699557e-05, "epoch": 1.8125337655321447, "percentage": 60.34, "elapsed_time": "3:52:43", "remaining_time": "2:32:56"} +{"current_steps": 635, "total_steps": 1044, "loss": 0.04185936748981476, "lr": 4.011221352169447e-05, "epoch": 1.8269403925805872, "percentage": 60.82, "elapsed_time": "3:54:36", "remaining_time": "2:31:06"} +{"current_steps": 640, "total_steps": 1044, "loss": 0.04408974051475525, "lr": 3.9293733632246544e-05, "epoch": 1.8413470196290294, "percentage": 61.3, "elapsed_time": "3:56:27", "remaining_time": "2:29:15"} +{"current_steps": 645, "total_steps": 1044, "loss": 0.04014042019844055, "lr": 3.847824970607797e-05, "epoch": 1.8557536466774716, "percentage": 61.78, "elapsed_time": "3:58:17", "remaining_time": "2:27:24"} +{"current_steps": 650, "total_steps": 1044, "loss": 0.03581300973892212, "lr": 3.7665989942246625e-05, "epoch": 1.8701602737259138, "percentage": 62.26, "elapsed_time": "4:00:08", "remaining_time": "2:25:33"} +{"current_steps": 655, "total_steps": 1044, "loss": 0.04189331531524658, "lr": 3.685718163758427e-05, "epoch": 1.8845669007743562, "percentage": 62.74, "elapsed_time": "4:01:58", "remaining_time": "2:23:42"} +{"current_steps": 660, "total_steps": 1044, "loss": 0.03912949562072754, "lr": 3.6052051123091634e-05, "epoch": 1.8989735278227986, "percentage": 63.22, "elapsed_time": "4:03:48", "remaining_time": "2:21:50"} +{"current_steps": 665, "total_steps": 1044, "loss": 0.03808005452156067, "lr": 3.5250823700603496e-05, "epoch": 1.9133801548712408, "percentage": 63.7, "elapsed_time": "4:05:38", "remaining_time": "2:19:59"} +{"current_steps": 670, "total_steps": 1044, "loss": 0.03524368405342102, "lr": 3.445372357974194e-05, "epoch": 1.927786781919683, "percentage": 64.18, "elapsed_time": "4:07:27", "remaining_time": "2:18:07"} +{"current_steps": 675, "total_steps": 1044, "loss": 0.03650209903717041, "lr": 3.3660973815175165e-05, "epoch": 1.9421934089681252, "percentage": 64.66, "elapsed_time": "4:09:16", "remaining_time": "2:16:16"} +{"current_steps": 680, "total_steps": 1044, "loss": 0.036546701192855836, "lr": 3.287279624419945e-05, "epoch": 1.9566000360165676, "percentage": 65.13, "elapsed_time": "4:11:08", "remaining_time": "2:14:25"} +{"current_steps": 685, "total_steps": 1044, "loss": 0.03591431975364685, "lr": 3.208941142466187e-05, "epoch": 1.97100666306501, "percentage": 65.61, "elapsed_time": "4:12:59", "remaining_time": "2:12:35"} +{"current_steps": 690, "total_steps": 1044, "loss": 0.03485568761825562, "lr": 3.1311038573240975e-05, "epoch": 1.9854132901134522, "percentage": 66.09, "elapsed_time": "4:14:49", "remaining_time": "2:10:44"} +{"current_steps": 695, "total_steps": 1044, "loss": 0.037538421154022214, "lr": 3.0537895504102874e-05, "epoch": 1.9998199171618944, "percentage": 66.57, "elapsed_time": "4:16:38", "remaining_time": "2:08:52"} +{"current_steps": 700, "total_steps": 1044, "loss": 0.027647560834884642, "lr": 2.9770198567949546e-05, "epoch": 2.0115253016387538, "percentage": 67.05, "elapsed_time": "4:18:09", "remaining_time": "2:06:51"} +{"current_steps": 705, "total_steps": 1044, "loss": 0.03239924311637878, "lr": 2.900816259147705e-05, "epoch": 2.025931928687196, "percentage": 67.53, "elapsed_time": "4:20:02", "remaining_time": "2:05:02"} +{"current_steps": 710, "total_steps": 1044, "loss": 0.02974867820739746, "lr": 2.8252000817259837e-05, "epoch": 2.0403385557356386, "percentage": 68.01, "elapsed_time": "4:21:51", "remaining_time": "2:03:11"} +{"current_steps": 715, "total_steps": 1044, "loss": 0.027856966853141783, "lr": 2.7501924844078534e-05, "epoch": 2.054745182784081, "percentage": 68.49, "elapsed_time": "4:23:40", "remaining_time": "2:01:19"} +{"current_steps": 720, "total_steps": 1044, "loss": 0.028209209442138672, "lr": 2.6758144567707754e-05, "epoch": 2.069151809832523, "percentage": 68.97, "elapsed_time": "4:25:31", "remaining_time": "1:59:29"} +{"current_steps": 725, "total_steps": 1044, "loss": 0.02793322205543518, "lr": 2.6020868122180385e-05, "epoch": 2.083558436880965, "percentage": 69.44, "elapsed_time": "4:27:22", "remaining_time": "1:57:38"} +{"current_steps": 730, "total_steps": 1044, "loss": 0.02801375389099121, "lr": 2.5290301821544825e-05, "epoch": 2.0979650639294074, "percentage": 69.92, "elapsed_time": "4:29:12", "remaining_time": "1:55:47"} +{"current_steps": 735, "total_steps": 1044, "loss": 0.02737850546836853, "lr": 2.4566650102131573e-05, "epoch": 2.1123716909778496, "percentage": 70.4, "elapsed_time": "4:31:03", "remaining_time": "1:53:57"} +{"current_steps": 740, "total_steps": 1044, "loss": 0.030919501185417177, "lr": 2.3850115465345324e-05, "epoch": 2.1267783180262922, "percentage": 70.88, "elapsed_time": "4:32:54", "remaining_time": "1:52:06"} +{"current_steps": 745, "total_steps": 1044, "loss": 0.028718733787536622, "lr": 2.3140898420998426e-05, "epoch": 2.1411849450747344, "percentage": 71.36, "elapsed_time": "4:34:45", "remaining_time": "1:50:16"} +{"current_steps": 750, "total_steps": 1044, "loss": 0.028903046250343324, "lr": 2.2439197431201646e-05, "epoch": 2.1555915721231766, "percentage": 71.84, "elapsed_time": "4:36:37", "remaining_time": "1:48:26"} +{"current_steps": 755, "total_steps": 1044, "loss": 0.024923816323280334, "lr": 2.1745208854828058e-05, "epoch": 2.169998199171619, "percentage": 72.32, "elapsed_time": "4:38:27", "remaining_time": "1:46:35"} +{"current_steps": 760, "total_steps": 1044, "loss": 0.026013752818107604, "lr": 2.105912689256533e-05, "epoch": 2.184404826220061, "percentage": 72.8, "elapsed_time": "4:40:17", "remaining_time": "1:44:44"} +{"current_steps": 765, "total_steps": 1044, "loss": 0.026708921790122984, "lr": 2.0381143532572082e-05, "epoch": 2.1988114532685037, "percentage": 73.28, "elapsed_time": "4:42:07", "remaining_time": "1:42:53"} +{"current_steps": 770, "total_steps": 1044, "loss": 0.02909781038761139, "lr": 1.9711448496753297e-05, "epoch": 2.213218080316946, "percentage": 73.75, "elapsed_time": "4:43:57", "remaining_time": "1:41:02"} +{"current_steps": 775, "total_steps": 1044, "loss": 0.027940624952316286, "lr": 1.905022918766995e-05, "epoch": 2.227624707365388, "percentage": 74.23, "elapsed_time": "4:45:46", "remaining_time": "1:39:11"} +{"current_steps": 780, "total_steps": 1044, "loss": 0.026423072814941405, "lr": 1.8397670636097636e-05, "epoch": 2.2420313344138303, "percentage": 74.71, "elapsed_time": "4:47:36", "remaining_time": "1:37:20"} +{"current_steps": 785, "total_steps": 1044, "loss": 0.028386065363883974, "lr": 1.775395544924885e-05, "epoch": 2.2564379614622725, "percentage": 75.19, "elapsed_time": "4:49:26", "remaining_time": "1:35:29"} +{"current_steps": 790, "total_steps": 1044, "loss": 0.02769894599914551, "lr": 1.7119263759673675e-05, "epoch": 2.270844588510715, "percentage": 75.67, "elapsed_time": "4:51:16", "remaining_time": "1:33:39"} +{"current_steps": 795, "total_steps": 1044, "loss": 0.02839537858963013, "lr": 1.6493773174852673e-05, "epoch": 2.2852512155591573, "percentage": 76.15, "elapsed_time": "4:53:07", "remaining_time": "1:31:48"} +{"current_steps": 800, "total_steps": 1044, "loss": 0.02569463849067688, "lr": 1.587765872749649e-05, "epoch": 2.2996578426075995, "percentage": 76.63, "elapsed_time": "4:54:56", "remaining_time": "1:29:57"} +{"current_steps": 805, "total_steps": 1044, "loss": 0.028371796011924744, "lr": 1.527109282656611e-05, "epoch": 2.3140644696560417, "percentage": 77.11, "elapsed_time": "4:56:45", "remaining_time": "1:28:06"} +{"current_steps": 810, "total_steps": 1044, "loss": 0.026229003071784975, "lr": 1.4674245209027066e-05, "epoch": 2.328471096704484, "percentage": 77.59, "elapsed_time": "4:58:36", "remaining_time": "1:26:15"} +{"current_steps": 815, "total_steps": 1044, "loss": 0.029995208978652953, "lr": 1.4087282892351623e-05, "epoch": 2.3428777237529266, "percentage": 78.07, "elapsed_time": "5:00:26", "remaining_time": "1:24:25"} +{"current_steps": 820, "total_steps": 1044, "loss": 0.029001206159591675, "lr": 1.3510370127781635e-05, "epoch": 2.3572843508013688, "percentage": 78.54, "elapsed_time": "5:02:17", "remaining_time": "1:22:34"} +{"current_steps": 825, "total_steps": 1044, "loss": 0.02766028940677643, "lr": 1.2943668354365878e-05, "epoch": 2.371690977849811, "percentage": 79.02, "elapsed_time": "5:04:07", "remaining_time": "1:20:43"} +{"current_steps": 830, "total_steps": 1044, "loss": 0.02593517005443573, "lr": 1.2387336153784018e-05, "epoch": 2.386097604898253, "percentage": 79.5, "elapsed_time": "5:06:18", "remaining_time": "1:18:58"} +{"current_steps": 835, "total_steps": 1044, "loss": 0.026943469047546388, "lr": 1.184152920597028e-05, "epoch": 2.4005042319466954, "percentage": 79.98, "elapsed_time": "5:08:09", "remaining_time": "1:17:07"} +{"current_steps": 840, "total_steps": 1044, "loss": 0.024954386055469513, "lr": 1.1306400245549158e-05, "epoch": 2.4149108589951376, "percentage": 80.46, "elapsed_time": "5:09:59", "remaining_time": "1:15:16"} +{"current_steps": 845, "total_steps": 1044, "loss": 0.028272977471351622, "lr": 1.0782099019095238e-05, "epoch": 2.42931748604358, "percentage": 80.94, "elapsed_time": "5:11:50", "remaining_time": "1:13:26"} +{"current_steps": 850, "total_steps": 1044, "loss": 0.02370927333831787, "lr": 1.026877224322923e-05, "epoch": 2.4437241130920224, "percentage": 81.42, "elapsed_time": "5:13:40", "remaining_time": "1:11:35"} +{"current_steps": 855, "total_steps": 1044, "loss": 0.025498074293136597, "lr": 9.766563563561799e-06, "epoch": 2.4581307401404646, "percentage": 81.9, "elapsed_time": "5:15:31", "remaining_time": "1:09:44"} +{"current_steps": 860, "total_steps": 1044, "loss": 0.02770912051200867, "lr": 9.275613514496977e-06, "epoch": 2.472537367188907, "percentage": 82.38, "elapsed_time": "5:17:22", "remaining_time": "1:07:54"} +{"current_steps": 865, "total_steps": 1044, "loss": 0.027615338563919067, "lr": 8.7960594799059e-06, "epoch": 2.486943994237349, "percentage": 82.85, "elapsed_time": "5:19:14", "remaining_time": "1:06:03"} +{"current_steps": 870, "total_steps": 1044, "loss": 0.027428582310676575, "lr": 8.328035654682325e-06, "epoch": 2.501350621285791, "percentage": 83.33, "elapsed_time": "5:21:04", "remaining_time": "1:04:12"} +{"current_steps": 875, "total_steps": 1044, "loss": 0.026888126134872438, "lr": 7.871673007190599e-06, "epoch": 2.515757248334234, "percentage": 83.81, "elapsed_time": "5:22:54", "remaining_time": "1:02:22"} +{"current_steps": 880, "total_steps": 1044, "loss": 0.025411182641983034, "lr": 7.427099242616348e-06, "epoch": 2.530163875382676, "percentage": 84.29, "elapsed_time": "5:24:43", "remaining_time": "1:00:31"} +{"current_steps": 885, "total_steps": 1044, "loss": 0.024811127781867982, "lr": 6.994438767230466e-06, "epoch": 2.5445705024311183, "percentage": 84.77, "elapsed_time": "5:26:34", "remaining_time": "0:58:40"} +{"current_steps": 890, "total_steps": 1044, "loss": 0.02613699436187744, "lr": 6.573812653576062e-06, "epoch": 2.5589771294795605, "percentage": 85.25, "elapsed_time": "5:28:25", "remaining_time": "0:56:49"} +{"current_steps": 895, "total_steps": 1044, "loss": 0.026964515447616577, "lr": 6.1653386065885165e-06, "epoch": 2.5733837565280027, "percentage": 85.73, "elapsed_time": "5:30:15", "remaining_time": "0:54:58"} +{"current_steps": 900, "total_steps": 1044, "loss": 0.028112486004829407, "lr": 5.769130930657734e-06, "epoch": 2.5877903835764453, "percentage": 86.21, "elapsed_time": "5:32:07", "remaining_time": "0:53:08"} +{"current_steps": 905, "total_steps": 1044, "loss": 0.02626214623451233, "lr": 5.38530049764206e-06, "epoch": 2.6021970106248875, "percentage": 86.69, "elapsed_time": "5:33:57", "remaining_time": "0:51:17"} +{"current_steps": 910, "total_steps": 1044, "loss": 0.02669944763183594, "lr": 5.0139547158427e-06, "epoch": 2.6166036376733297, "percentage": 87.16, "elapsed_time": "5:35:45", "remaining_time": "0:49:26"} +{"current_steps": 915, "total_steps": 1044, "loss": 0.029006192088127138, "lr": 4.655197499947378e-06, "epoch": 2.631010264721772, "percentage": 87.64, "elapsed_time": "5:37:35", "remaining_time": "0:47:35"} +{"current_steps": 920, "total_steps": 1044, "loss": 0.02491077184677124, "lr": 4.309129241951587e-06, "epoch": 2.645416891770214, "percentage": 88.12, "elapsed_time": "5:39:25", "remaining_time": "0:45:44"} +{"current_steps": 925, "total_steps": 1044, "loss": 0.026326572895050047, "lr": 3.975846783065662e-06, "epoch": 2.6598235188186568, "percentage": 88.6, "elapsed_time": "5:41:14", "remaining_time": "0:43:54"} +{"current_steps": 930, "total_steps": 1044, "loss": 0.026823589205741884, "lr": 3.6554433866154036e-06, "epoch": 2.674230145867099, "percentage": 89.08, "elapsed_time": "5:43:04", "remaining_time": "0:42:03"} +{"current_steps": 935, "total_steps": 1044, "loss": 0.025913709402084352, "lr": 3.3480087119440063e-06, "epoch": 2.688636772915541, "percentage": 89.56, "elapsed_time": "5:44:55", "remaining_time": "0:40:12"} +{"current_steps": 940, "total_steps": 1044, "loss": 0.026928871870040894, "lr": 3.0536287893223604e-06, "epoch": 2.7030433999639834, "percentage": 90.04, "elapsed_time": "5:46:45", "remaining_time": "0:38:21"} +{"current_steps": 945, "total_steps": 1044, "loss": 0.02748822569847107, "lr": 2.7723859958750486e-06, "epoch": 2.7174500270124256, "percentage": 90.52, "elapsed_time": "5:48:35", "remaining_time": "0:36:31"} +{"current_steps": 950, "total_steps": 1044, "loss": 0.025952500104904175, "lr": 2.5043590325285195e-06, "epoch": 2.731856654060868, "percentage": 91.0, "elapsed_time": "5:50:27", "remaining_time": "0:34:40"} +{"current_steps": 955, "total_steps": 1044, "loss": 0.02589995265007019, "lr": 2.249622901987963e-06, "epoch": 2.7462632811093104, "percentage": 91.48, "elapsed_time": "5:52:17", "remaining_time": "0:32:49"} +{"current_steps": 960, "total_steps": 1044, "loss": 0.027577921748161316, "lr": 2.0082488877491033e-06, "epoch": 2.7606699081577526, "percentage": 91.95, "elapsed_time": "5:54:07", "remaining_time": "0:30:59"} +{"current_steps": 965, "total_steps": 1044, "loss": 0.025488072633743288, "lr": 1.7803045341507952e-06, "epoch": 2.775076535206195, "percentage": 92.43, "elapsed_time": "5:55:57", "remaining_time": "0:29:08"} +{"current_steps": 970, "total_steps": 1044, "loss": 0.02348570078611374, "lr": 1.5658536274738621e-06, "epoch": 2.789483162254637, "percentage": 92.91, "elapsed_time": "5:57:49", "remaining_time": "0:27:17"} +{"current_steps": 975, "total_steps": 1044, "loss": 0.02316732406616211, "lr": 1.3649561780916199e-06, "epoch": 2.8038897893030796, "percentage": 93.39, "elapsed_time": "5:59:37", "remaining_time": "0:25:27"} +{"current_steps": 980, "total_steps": 1044, "loss": 0.02901957035064697, "lr": 1.1776684036770347e-06, "epoch": 2.818296416351522, "percentage": 93.87, "elapsed_time": "6:01:27", "remaining_time": "0:23:36"} +{"current_steps": 985, "total_steps": 1044, "loss": 0.02710677683353424, "lr": 1.004042713471165e-06, "epoch": 2.832703043399964, "percentage": 94.35, "elapsed_time": "6:03:18", "remaining_time": "0:21:45"} +{"current_steps": 990, "total_steps": 1044, "loss": 0.024537976086139678, "lr": 8.441276936173193e-07, "epoch": 2.8471096704484062, "percentage": 94.83, "elapsed_time": "6:05:09", "remaining_time": "0:19:55"} +{"current_steps": 995, "total_steps": 1044, "loss": 0.025470972061157227, "lr": 6.9796809356511e-07, "epoch": 2.8615162974968484, "percentage": 95.31, "elapsed_time": "6:06:58", "remaining_time": "0:18:04"} +{"current_steps": 1000, "total_steps": 1044, "loss": 0.025230163335800172, "lr": 5.656048135480763e-07, "epoch": 2.875922924545291, "percentage": 95.79, "elapsed_time": "6:08:49", "remaining_time": "0:16:13"} +{"current_steps": 1005, "total_steps": 1044, "loss": 0.026770299673080443, "lr": 4.470748931384494e-07, "epoch": 2.8903295515937333, "percentage": 96.26, "elapsed_time": "6:10:40", "remaining_time": "0:14:23"} +{"current_steps": 1010, "total_steps": 1044, "loss": 0.026645660400390625, "lr": 3.424115008822726e-07, "epoch": 2.9047361786421755, "percentage": 96.74, "elapsed_time": "6:12:30", "remaining_time": "0:12:32"} +{"current_steps": 1015, "total_steps": 1044, "loss": 0.025820019841194152, "lr": 2.5164392501777487e-07, "epoch": 2.9191428056906177, "percentage": 97.22, "elapsed_time": "6:14:20", "remaining_time": "0:10:41"} +{"current_steps": 1020, "total_steps": 1044, "loss": 0.025720816850662232, "lr": 1.7479756527955527e-07, "epoch": 2.93354943273906, "percentage": 97.7, "elapsed_time": "6:16:12", "remaining_time": "0:08:51"} +{"current_steps": 1025, "total_steps": 1044, "loss": 0.024733534455299376, "lr": 1.1189392579090129e-07, "epoch": 2.9479560597875025, "percentage": 98.18, "elapsed_time": "6:18:03", "remaining_time": "0:07:00"} +{"current_steps": 1030, "total_steps": 1044, "loss": 0.02832019031047821, "lr": 6.295060904623617e-08, "epoch": 2.9623626868359443, "percentage": 98.66, "elapsed_time": "6:19:53", "remaining_time": "0:05:09"} +{"current_steps": 1035, "total_steps": 1044, "loss": 0.025465887784957886, "lr": 2.7981310985369935e-08, "epoch": 2.976769313884387, "percentage": 99.14, "elapsed_time": "6:21:43", "remaining_time": "0:03:19"} +{"current_steps": 1040, "total_steps": 1044, "loss": 0.0264853298664093, "lr": 6.995817160920792e-09, "epoch": 2.991175940932829, "percentage": 99.62, "elapsed_time": "6:23:53", "remaining_time": "0:01:28"} +{"current_steps": 1044, "total_steps": 1044, "epoch": 3.0, "percentage": 100.0, "elapsed_time": "6:25:22", "remaining_time": "0:00:00"} diff --git a/trainer_state.json b/trainer_state.json new file mode 100644 index 0000000..d677f1b --- /dev/null +++ b/trainer_state.json @@ -0,0 +1,1499 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 3.0, + "eval_steps": 500, + "global_step": 1044, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.014406627048442283, + "grad_norm": 2.235235207512626, + "learning_rate": 3.8095238095238102e-06, + "loss": 0.7567409992218017, + "step": 5 + }, + { + "epoch": 0.028813254096884566, + "grad_norm": 3.1220907369972015, + "learning_rate": 8.571428571428573e-06, + "loss": 0.6131507873535156, + "step": 10 + }, + { + "epoch": 0.04321988114532685, + "grad_norm": 1.26071593847269, + "learning_rate": 1.3333333333333333e-05, + "loss": 0.4174152374267578, + "step": 15 + }, + { + "epoch": 0.05762650819376913, + "grad_norm": 0.7001329999687771, + "learning_rate": 1.8095238095238094e-05, + "loss": 0.3269367218017578, + "step": 20 + }, + { + "epoch": 0.07203313524221142, + "grad_norm": 0.45760881472851334, + "learning_rate": 2.2857142857142858e-05, + "loss": 0.2579814910888672, + "step": 25 + }, + { + "epoch": 0.0864397622906537, + "grad_norm": 0.3195758164531423, + "learning_rate": 2.7619047619047622e-05, + "loss": 0.22403466701507568, + "step": 30 + }, + { + "epoch": 0.10084638933909598, + "grad_norm": 0.3937202043873978, + "learning_rate": 3.2380952380952386e-05, + "loss": 0.19430849552154542, + "step": 35 + }, + { + "epoch": 0.11525301638753827, + "grad_norm": 0.4622992203768602, + "learning_rate": 3.7142857142857143e-05, + "loss": 0.17950987815856934, + "step": 40 + }, + { + "epoch": 0.12965964343598055, + "grad_norm": 0.6153699707363115, + "learning_rate": 4.190476190476191e-05, + "loss": 0.16267883777618408, + "step": 45 + }, + { + "epoch": 0.14406627048442283, + "grad_norm": 0.44308873981945024, + "learning_rate": 4.666666666666667e-05, + "loss": 0.15465893745422363, + "step": 50 + }, + { + "epoch": 0.15847289753286511, + "grad_norm": 0.2667361601014973, + "learning_rate": 5.142857142857143e-05, + "loss": 0.1339721441268921, + "step": 55 + }, + { + "epoch": 0.1728795245813074, + "grad_norm": 0.2856336328456399, + "learning_rate": 5.619047619047619e-05, + "loss": 0.12487690448760987, + "step": 60 + }, + { + "epoch": 0.18728615162974968, + "grad_norm": 0.25342260570898284, + "learning_rate": 6.0952380952380964e-05, + "loss": 0.12374393939971924, + "step": 65 + }, + { + "epoch": 0.20169277867819196, + "grad_norm": 0.30856178034874804, + "learning_rate": 6.571428571428571e-05, + "loss": 0.11958651542663574, + "step": 70 + }, + { + "epoch": 0.21609940572663425, + "grad_norm": 0.37526519775047024, + "learning_rate": 7.047619047619048e-05, + "loss": 0.1187552571296692, + "step": 75 + }, + { + "epoch": 0.23050603277507653, + "grad_norm": 0.23950000695755433, + "learning_rate": 7.523809523809524e-05, + "loss": 0.11327266693115234, + "step": 80 + }, + { + "epoch": 0.2449126598235188, + "grad_norm": 0.2205068195876155, + "learning_rate": 8e-05, + "loss": 0.10409235954284668, + "step": 85 + }, + { + "epoch": 0.2593192868719611, + "grad_norm": 0.38702416724389993, + "learning_rate": 8.476190476190477e-05, + "loss": 0.10459071397781372, + "step": 90 + }, + { + "epoch": 0.2737259139204034, + "grad_norm": 1.5049886602582672, + "learning_rate": 8.952380952380953e-05, + "loss": 0.10260200500488281, + "step": 95 + }, + { + "epoch": 0.28813254096884566, + "grad_norm": 0.7639386034441172, + "learning_rate": 9.428571428571429e-05, + "loss": 0.15102967023849487, + "step": 100 + }, + { + "epoch": 0.302539168017288, + "grad_norm": 1.2918337466488035, + "learning_rate": 9.904761904761905e-05, + "loss": 0.21126885414123536, + "step": 105 + }, + { + "epoch": 0.31694579506573023, + "grad_norm": 0.3993035259049567, + "learning_rate": 9.99955226394288e-05, + "loss": 0.1489308714866638, + "step": 110 + }, + { + "epoch": 0.33135242211417254, + "grad_norm": 0.4337702538884852, + "learning_rate": 9.997733473639876e-05, + "loss": 0.1149595022201538, + "step": 115 + }, + { + "epoch": 0.3457590491626148, + "grad_norm": 0.19718286454816564, + "learning_rate": 9.994516154152849e-05, + "loss": 0.10596739053726197, + "step": 120 + }, + { + "epoch": 0.3601656762110571, + "grad_norm": 0.20133933569463766, + "learning_rate": 9.989901205792952e-05, + "loss": 0.09920316338539123, + "step": 125 + }, + { + "epoch": 0.37457230325949936, + "grad_norm": 0.1627871886908846, + "learning_rate": 9.983889919973586e-05, + "loss": 0.08917503952980041, + "step": 130 + }, + { + "epoch": 0.3889789303079417, + "grad_norm": 0.1732609558706655, + "learning_rate": 9.976483978849007e-05, + "loss": 0.08957574367523194, + "step": 135 + }, + { + "epoch": 0.40338555735638393, + "grad_norm": 0.17068559939024425, + "learning_rate": 9.967685454843618e-05, + "loss": 0.08561774492263793, + "step": 140 + }, + { + "epoch": 0.41779218440482624, + "grad_norm": 0.1627969165227149, + "learning_rate": 9.957496810072027e-05, + "loss": 0.083004629611969, + "step": 145 + }, + { + "epoch": 0.4321988114532685, + "grad_norm": 0.1467656284113813, + "learning_rate": 9.945920895650071e-05, + "loss": 0.0808147668838501, + "step": 150 + }, + { + "epoch": 0.4466054385017108, + "grad_norm": 0.13322475158297778, + "learning_rate": 9.932960950896981e-05, + "loss": 0.07847496271133422, + "step": 155 + }, + { + "epoch": 0.46101206555015306, + "grad_norm": 0.12265959792803287, + "learning_rate": 9.918620602428915e-05, + "loss": 0.07879123687744141, + "step": 160 + }, + { + "epoch": 0.47541869259859537, + "grad_norm": 0.11616840330675544, + "learning_rate": 9.902903863144107e-05, + "loss": 0.07581273913383484, + "step": 165 + }, + { + "epoch": 0.4898253196470376, + "grad_norm": 0.12203330893770489, + "learning_rate": 9.885815131099934e-05, + "loss": 0.07368603944778443, + "step": 170 + }, + { + "epoch": 0.5042319466954799, + "grad_norm": 0.1546308315564488, + "learning_rate": 9.867359188282192e-05, + "loss": 0.06976621150970459, + "step": 175 + }, + { + "epoch": 0.5186385737439222, + "grad_norm": 0.2238419658216969, + "learning_rate": 9.847541199266941e-05, + "loss": 0.07270271778106689, + "step": 180 + }, + { + "epoch": 0.5330452007923645, + "grad_norm": 0.1778940838233776, + "learning_rate": 9.826366709775286e-05, + "loss": 0.06899308562278747, + "step": 185 + }, + { + "epoch": 0.5474518278408068, + "grad_norm": 0.1236118996782577, + "learning_rate": 9.803841645121504e-05, + "loss": 0.06641653776168824, + "step": 190 + }, + { + "epoch": 0.561858454889249, + "grad_norm": 0.20583115161000629, + "learning_rate": 9.779972308554952e-05, + "loss": 0.06647136211395263, + "step": 195 + }, + { + "epoch": 0.5762650819376913, + "grad_norm": 0.13075349423709637, + "learning_rate": 9.754765379496202e-05, + "loss": 0.06856078505516053, + "step": 200 + }, + { + "epoch": 0.5906717089861336, + "grad_norm": 0.17339943774041067, + "learning_rate": 9.728227911667934e-05, + "loss": 0.06724534034729004, + "step": 205 + }, + { + "epoch": 0.605078336034576, + "grad_norm": 0.14182602625367816, + "learning_rate": 9.700367331121054e-05, + "loss": 0.06738802194595336, + "step": 210 + }, + { + "epoch": 0.6194849630830181, + "grad_norm": 0.16393619983216387, + "learning_rate": 9.67119143415667e-05, + "loss": 0.07073606252670288, + "step": 215 + }, + { + "epoch": 0.6338915901314605, + "grad_norm": 0.1328841830481596, + "learning_rate": 9.640708385144403e-05, + "loss": 0.06382153034210206, + "step": 220 + }, + { + "epoch": 0.6482982171799028, + "grad_norm": 0.13571266215544603, + "learning_rate": 9.608926714237754e-05, + "loss": 0.06776301860809326, + "step": 225 + }, + { + "epoch": 0.6627048442283451, + "grad_norm": 0.12351937744356929, + "learning_rate": 9.575855314987068e-05, + "loss": 0.06309446096420288, + "step": 230 + }, + { + "epoch": 0.6771114712767873, + "grad_norm": 0.12290269065618897, + "learning_rate": 9.541503441850843e-05, + "loss": 0.06422497630119324, + "step": 235 + }, + { + "epoch": 0.6915180983252296, + "grad_norm": 0.12486344744276894, + "learning_rate": 9.505880707606024e-05, + "loss": 0.06324135661125183, + "step": 240 + }, + { + "epoch": 0.7059247253736719, + "grad_norm": 0.1371721876913286, + "learning_rate": 9.468997080658031e-05, + "loss": 0.06205494403839111, + "step": 245 + }, + { + "epoch": 0.7203313524221142, + "grad_norm": 0.12440196659006258, + "learning_rate": 9.430862882251278e-05, + "loss": 0.057729125022888184, + "step": 250 + }, + { + "epoch": 0.7347379794705564, + "grad_norm": 0.11085006544539791, + "learning_rate": 9.391488783580955e-05, + "loss": 0.059876751899719236, + "step": 255 + }, + { + "epoch": 0.7491446065189987, + "grad_norm": 0.11611256342361528, + "learning_rate": 9.350885802806863e-05, + "loss": 0.05882802605628967, + "step": 260 + }, + { + "epoch": 0.763551233567441, + "grad_norm": 0.1279259846460798, + "learning_rate": 9.309065301970193e-05, + "loss": 0.06077917814254761, + "step": 265 + }, + { + "epoch": 0.7779578606158833, + "grad_norm": 0.11105876561542377, + "learning_rate": 9.266038983814039e-05, + "loss": 0.05303559303283691, + "step": 270 + }, + { + "epoch": 0.7923644876643255, + "grad_norm": 0.11671310410423168, + "learning_rate": 9.221818888508602e-05, + "loss": 0.06124954223632813, + "step": 275 + }, + { + "epoch": 0.8067711147127679, + "grad_norm": 0.11537085211038406, + "learning_rate": 9.176417390281944e-05, + "loss": 0.055888807773590087, + "step": 280 + }, + { + "epoch": 0.8211777417612102, + "grad_norm": 0.1480823536245831, + "learning_rate": 9.129847193957282e-05, + "loss": 0.056972581148147586, + "step": 285 + }, + { + "epoch": 0.8355843688096525, + "grad_norm": 0.15744268133880865, + "learning_rate": 9.08212133139776e-05, + "loss": 0.05824898481369019, + "step": 290 + }, + { + "epoch": 0.8499909958580947, + "grad_norm": 0.1397867333597395, + "learning_rate": 9.033253157859714e-05, + "loss": 0.05415785312652588, + "step": 295 + }, + { + "epoch": 0.864397622906537, + "grad_norm": 0.12034022108734013, + "learning_rate": 8.983256348255423e-05, + "loss": 0.05467197895050049, + "step": 300 + }, + { + "epoch": 0.8788042499549793, + "grad_norm": 0.12682573622924756, + "learning_rate": 8.932144893326432e-05, + "loss": 0.06181464791297912, + "step": 305 + }, + { + "epoch": 0.8932108770034216, + "grad_norm": 0.11321366531316682, + "learning_rate": 8.879933095728485e-05, + "loss": 0.05511963367462158, + "step": 310 + }, + { + "epoch": 0.9076175040518638, + "grad_norm": 0.1076394497380973, + "learning_rate": 8.826635566029166e-05, + "loss": 0.05229709148406982, + "step": 315 + }, + { + "epoch": 0.9220241311003061, + "grad_norm": 0.11249447920151531, + "learning_rate": 8.772267218619388e-05, + "loss": 0.05275582075119019, + "step": 320 + }, + { + "epoch": 0.9364307581487484, + "grad_norm": 0.11401150417345533, + "learning_rate": 8.716843267539869e-05, + "loss": 0.05470834374427795, + "step": 325 + }, + { + "epoch": 0.9508373851971907, + "grad_norm": 0.13321527980254963, + "learning_rate": 8.660379222223727e-05, + "loss": 0.05563476085662842, + "step": 330 + }, + { + "epoch": 0.9652440122456329, + "grad_norm": 0.10771804020098895, + "learning_rate": 8.602890883156454e-05, + "loss": 0.054843342304229735, + "step": 335 + }, + { + "epoch": 0.9796506392940753, + "grad_norm": 0.12601833333178913, + "learning_rate": 8.544394337454409e-05, + "loss": 0.05721263885498047, + "step": 340 + }, + { + "epoch": 0.9940572663425176, + "grad_norm": 0.12322820499048608, + "learning_rate": 8.484905954363123e-05, + "loss": 0.05096786618232727, + "step": 345 + }, + { + "epoch": 1.0057626508193769, + "grad_norm": 0.14089468629356533, + "learning_rate": 8.424442380676647e-05, + "loss": 0.05167339444160461, + "step": 350 + }, + { + "epoch": 1.0201692778678193, + "grad_norm": 0.1705872004915626, + "learning_rate": 8.363020536079239e-05, + "loss": 0.05249757170677185, + "step": 355 + }, + { + "epoch": 1.0345759049162615, + "grad_norm": 0.15358181481824462, + "learning_rate": 8.300657608410678e-05, + "loss": 0.05038872957229614, + "step": 360 + }, + { + "epoch": 1.0489825319647037, + "grad_norm": 0.13895400680332037, + "learning_rate": 8.237371048856546e-05, + "loss": 0.050058400630950926, + "step": 365 + }, + { + "epoch": 1.0633891590131461, + "grad_norm": 0.09560889181658183, + "learning_rate": 8.17317856706482e-05, + "loss": 0.04919912219047547, + "step": 370 + }, + { + "epoch": 1.0777957860615883, + "grad_norm": 0.10951811698505555, + "learning_rate": 8.108098126190129e-05, + "loss": 0.04963598847389221, + "step": 375 + }, + { + "epoch": 1.0922024131100305, + "grad_norm": 0.09853927812254934, + "learning_rate": 8.042147937867079e-05, + "loss": 0.046415746212005615, + "step": 380 + }, + { + "epoch": 1.106609040158473, + "grad_norm": 0.09238299590671381, + "learning_rate": 7.975346457114034e-05, + "loss": 0.04439312219619751, + "step": 385 + }, + { + "epoch": 1.1210156672069151, + "grad_norm": 0.10940030307745394, + "learning_rate": 7.907712377168817e-05, + "loss": 0.051634716987609866, + "step": 390 + }, + { + "epoch": 1.1354222942553576, + "grad_norm": 0.09338579936215781, + "learning_rate": 7.839264624257712e-05, + "loss": 0.04415662288665771, + "step": 395 + }, + { + "epoch": 1.1498289213037998, + "grad_norm": 0.10999587309136662, + "learning_rate": 7.770022352299293e-05, + "loss": 0.047378170490264895, + "step": 400 + }, + { + "epoch": 1.164235548352242, + "grad_norm": 0.10109309983264758, + "learning_rate": 7.700004937544542e-05, + "loss": 0.04249417781829834, + "step": 405 + }, + { + "epoch": 1.1786421754006844, + "grad_norm": 0.10231496239314469, + "learning_rate": 7.629231973154725e-05, + "loss": 0.04593285918235779, + "step": 410 + }, + { + "epoch": 1.1930488024491266, + "grad_norm": 0.1000912342655061, + "learning_rate": 7.557723263718596e-05, + "loss": 0.05370241403579712, + "step": 415 + }, + { + "epoch": 1.2074554294975688, + "grad_norm": 0.08355578823714238, + "learning_rate": 7.485498819710417e-05, + "loss": 0.04612640142440796, + "step": 420 + }, + { + "epoch": 1.2218620565460112, + "grad_norm": 0.087036754767847, + "learning_rate": 7.412578851890384e-05, + "loss": 0.043773263692855835, + "step": 425 + }, + { + "epoch": 1.2362686835944534, + "grad_norm": 0.09341830589519805, + "learning_rate": 7.338983765648985e-05, + "loss": 0.046638333797454835, + "step": 430 + }, + { + "epoch": 1.2506753106428956, + "grad_norm": 0.09163918271970233, + "learning_rate": 7.264734155296912e-05, + "loss": 0.045640939474105836, + "step": 435 + }, + { + "epoch": 1.265081937691338, + "grad_norm": 0.09623135416486957, + "learning_rate": 7.189850798302099e-05, + "loss": 0.04710923135280609, + "step": 440 + }, + { + "epoch": 1.2794885647397802, + "grad_norm": 0.09010925699278292, + "learning_rate": 7.114354649475499e-05, + "loss": 0.04437531530857086, + "step": 445 + }, + { + "epoch": 1.2938951917882227, + "grad_norm": 0.09828854110045074, + "learning_rate": 7.038266835107257e-05, + "loss": 0.04155453443527222, + "step": 450 + }, + { + "epoch": 1.3083018188366649, + "grad_norm": 0.09261388252893078, + "learning_rate": 6.961608647054873e-05, + "loss": 0.04477185308933258, + "step": 455 + }, + { + "epoch": 1.322708445885107, + "grad_norm": 0.09199618999958105, + "learning_rate": 6.884401536785045e-05, + "loss": 0.045587533712387086, + "step": 460 + }, + { + "epoch": 1.3371150729335495, + "grad_norm": 0.10296954226773448, + "learning_rate": 6.806667109370853e-05, + "loss": 0.04496743679046631, + "step": 465 + }, + { + "epoch": 1.3515216999819917, + "grad_norm": 0.0991741419475408, + "learning_rate": 6.728427117445948e-05, + "loss": 0.04124987423419953, + "step": 470 + }, + { + "epoch": 1.365928327030434, + "grad_norm": 0.08767468242608127, + "learning_rate": 6.649703455117458e-05, + "loss": 0.044256627559661865, + "step": 475 + }, + { + "epoch": 1.3803349540788763, + "grad_norm": 0.08419233546507805, + "learning_rate": 6.5705181518393e-05, + "loss": 0.047923988103866576, + "step": 480 + }, + { + "epoch": 1.3947415811273185, + "grad_norm": 0.15529323580619178, + "learning_rate": 6.490893366247612e-05, + "loss": 0.040982422232627866, + "step": 485 + }, + { + "epoch": 1.409148208175761, + "grad_norm": 0.08719252163236856, + "learning_rate": 6.41085137996006e-05, + "loss": 0.0431306004524231, + "step": 490 + }, + { + "epoch": 1.4235548352242031, + "grad_norm": 0.09381117178448978, + "learning_rate": 6.330414591340689e-05, + "loss": 0.039784133434295654, + "step": 495 + }, + { + "epoch": 1.4379614622726455, + "grad_norm": 0.08334433128110437, + "learning_rate": 6.249605509232149e-05, + "loss": 0.04327746033668518, + "step": 500 + }, + { + "epoch": 1.4523680893210877, + "grad_norm": 0.09141409005562276, + "learning_rate": 6.168446746656973e-05, + "loss": 0.04065501093864441, + "step": 505 + }, + { + "epoch": 1.46677471636953, + "grad_norm": 0.10836927533553822, + "learning_rate": 6.0869610144897215e-05, + "loss": 0.040621763467788695, + "step": 510 + }, + { + "epoch": 1.4811813434179721, + "grad_norm": 0.11429670482454558, + "learning_rate": 6.005171115101735e-05, + "loss": 0.042708945274353025, + "step": 515 + }, + { + "epoch": 1.4955879704664146, + "grad_norm": 0.10265027708777795, + "learning_rate": 5.9230999359802784e-05, + "loss": 0.03845831751823425, + "step": 520 + }, + { + "epoch": 1.509994597514857, + "grad_norm": 0.0937825232136341, + "learning_rate": 5.84077044332389e-05, + "loss": 0.04369714856147766, + "step": 525 + }, + { + "epoch": 1.5244012245632992, + "grad_norm": 0.14710934296521627, + "learning_rate": 5.7582056756156665e-05, + "loss": 0.04057990908622742, + "step": 530 + }, + { + "epoch": 1.5388078516117414, + "grad_norm": 0.08557873748617338, + "learning_rate": 5.675428737176367e-05, + "loss": 0.03988811373710632, + "step": 535 + }, + { + "epoch": 1.5532144786601836, + "grad_norm": 0.08304731519894865, + "learning_rate": 5.5924627916990446e-05, + "loss": 0.040156081318855286, + "step": 540 + }, + { + "epoch": 1.567621105708626, + "grad_norm": 0.09009100140646863, + "learning_rate": 5.5093310557671074e-05, + "loss": 0.04313129186630249, + "step": 545 + }, + { + "epoch": 1.5820277327570682, + "grad_norm": 0.09229023810015868, + "learning_rate": 5.426056792357551e-05, + "loss": 0.04041691720485687, + "step": 550 + }, + { + "epoch": 1.5964343598055106, + "grad_norm": 0.08400211717158966, + "learning_rate": 5.342663304331211e-05, + "loss": 0.04093085825443268, + "step": 555 + }, + { + "epoch": 1.6108409868539528, + "grad_norm": 0.09614326424875454, + "learning_rate": 5.25917392791188e-05, + "loss": 0.039686673879623414, + "step": 560 + }, + { + "epoch": 1.625247613902395, + "grad_norm": 0.1067845470194038, + "learning_rate": 5.1756120261560446e-05, + "loss": 0.039973828196525577, + "step": 565 + }, + { + "epoch": 1.6396542409508372, + "grad_norm": 0.08943621090417164, + "learning_rate": 5.092000982415162e-05, + "loss": 0.03885244131088257, + "step": 570 + }, + { + "epoch": 1.6540608679992796, + "grad_norm": 0.08753082979407804, + "learning_rate": 5.0083641937922145e-05, + "loss": 0.03913732171058655, + "step": 575 + }, + { + "epoch": 1.668467495047722, + "grad_norm": 0.09803669811995008, + "learning_rate": 4.924725064594447e-05, + "loss": 0.038859084248542786, + "step": 580 + }, + { + "epoch": 1.6828741220961643, + "grad_norm": 0.08541143736458823, + "learning_rate": 4.8411069997840756e-05, + "loss": 0.037244629859924314, + "step": 585 + }, + { + "epoch": 1.6972807491446065, + "grad_norm": 0.08650694144802851, + "learning_rate": 4.757533398428812e-05, + "loss": 0.04225952625274658, + "step": 590 + }, + { + "epoch": 1.7116873761930487, + "grad_norm": 0.09490787276668022, + "learning_rate": 4.674027647154037e-05, + "loss": 0.03874731659889221, + "step": 595 + }, + { + "epoch": 1.726094003241491, + "grad_norm": 0.07772058542302925, + "learning_rate": 4.590613113598461e-05, + "loss": 0.03750569224357605, + "step": 600 + }, + { + "epoch": 1.7405006302899335, + "grad_norm": 0.07856101825582532, + "learning_rate": 4.507313139875102e-05, + "loss": 0.03765683174133301, + "step": 605 + }, + { + "epoch": 1.7549072573383757, + "grad_norm": 0.07088260858693515, + "learning_rate": 4.4241510360393804e-05, + "loss": 0.03841148316860199, + "step": 610 + }, + { + "epoch": 1.769313884386818, + "grad_norm": 0.08315598782355023, + "learning_rate": 4.341150073566227e-05, + "loss": 0.03978689610958099, + "step": 615 + }, + { + "epoch": 1.7837205114352601, + "grad_norm": 0.08933153255691949, + "learning_rate": 4.258333478837947e-05, + "loss": 0.038895291090011594, + "step": 620 + }, + { + "epoch": 1.7981271384837025, + "grad_norm": 0.08396668543385523, + "learning_rate": 4.1757244266447245e-05, + "loss": 0.04072596728801727, + "step": 625 + }, + { + "epoch": 1.8125337655321447, + "grad_norm": 0.07957802106126194, + "learning_rate": 4.093346033699557e-05, + "loss": 0.03865320086479187, + "step": 630 + }, + { + "epoch": 1.8269403925805872, + "grad_norm": 0.08958406118221353, + "learning_rate": 4.011221352169447e-05, + "loss": 0.04185936748981476, + "step": 635 + }, + { + "epoch": 1.8413470196290294, + "grad_norm": 0.08961676019198377, + "learning_rate": 3.9293733632246544e-05, + "loss": 0.04408974051475525, + "step": 640 + }, + { + "epoch": 1.8557536466774716, + "grad_norm": 0.07858278806552751, + "learning_rate": 3.847824970607797e-05, + "loss": 0.04014042019844055, + "step": 645 + }, + { + "epoch": 1.8701602737259138, + "grad_norm": 0.07419667584622487, + "learning_rate": 3.7665989942246625e-05, + "loss": 0.03581300973892212, + "step": 650 + }, + { + "epoch": 1.8845669007743562, + "grad_norm": 0.08037951897237189, + "learning_rate": 3.685718163758427e-05, + "loss": 0.04189331531524658, + "step": 655 + }, + { + "epoch": 1.8989735278227986, + "grad_norm": 0.08133067284522653, + "learning_rate": 3.6052051123091634e-05, + "loss": 0.03912949562072754, + "step": 660 + }, + { + "epoch": 1.9133801548712408, + "grad_norm": 0.08974888658045152, + "learning_rate": 3.5250823700603496e-05, + "loss": 0.03808005452156067, + "step": 665 + }, + { + "epoch": 1.927786781919683, + "grad_norm": 0.07193212698550007, + "learning_rate": 3.445372357974194e-05, + "loss": 0.03524368405342102, + "step": 670 + }, + { + "epoch": 1.9421934089681252, + "grad_norm": 0.07439568567213939, + "learning_rate": 3.3660973815175165e-05, + "loss": 0.03650209903717041, + "step": 675 + }, + { + "epoch": 1.9566000360165676, + "grad_norm": 0.07586041788325688, + "learning_rate": 3.287279624419945e-05, + "loss": 0.036546701192855836, + "step": 680 + }, + { + "epoch": 1.97100666306501, + "grad_norm": 0.08294122441026296, + "learning_rate": 3.208941142466187e-05, + "loss": 0.03591431975364685, + "step": 685 + }, + { + "epoch": 1.9854132901134522, + "grad_norm": 0.08528763303850583, + "learning_rate": 3.1311038573240975e-05, + "loss": 0.03485568761825562, + "step": 690 + }, + { + "epoch": 1.9998199171618944, + "grad_norm": 0.0756456466151007, + "learning_rate": 3.0537895504102874e-05, + "loss": 0.037538421154022214, + "step": 695 + }, + { + "epoch": 2.0115253016387538, + "grad_norm": 0.0987258257656567, + "learning_rate": 2.9770198567949546e-05, + "loss": 0.027647560834884642, + "step": 700 + }, + { + "epoch": 2.025931928687196, + "grad_norm": 0.10342059226496335, + "learning_rate": 2.900816259147705e-05, + "loss": 0.03239924311637878, + "step": 705 + }, + { + "epoch": 2.0403385557356386, + "grad_norm": 0.08947622183974005, + "learning_rate": 2.8252000817259837e-05, + "loss": 0.02974867820739746, + "step": 710 + }, + { + "epoch": 2.054745182784081, + "grad_norm": 0.07819720124564082, + "learning_rate": 2.7501924844078534e-05, + "loss": 0.027856966853141783, + "step": 715 + }, + { + "epoch": 2.069151809832523, + "grad_norm": 0.07255651027166257, + "learning_rate": 2.6758144567707754e-05, + "loss": 0.028209209442138672, + "step": 720 + }, + { + "epoch": 2.083558436880965, + "grad_norm": 0.0777676865315773, + "learning_rate": 2.6020868122180385e-05, + "loss": 0.02793322205543518, + "step": 725 + }, + { + "epoch": 2.0979650639294074, + "grad_norm": 0.08664972293238134, + "learning_rate": 2.5290301821544825e-05, + "loss": 0.02801375389099121, + "step": 730 + }, + { + "epoch": 2.1123716909778496, + "grad_norm": 0.08559466896073407, + "learning_rate": 2.4566650102131573e-05, + "loss": 0.02737850546836853, + "step": 735 + }, + { + "epoch": 2.1267783180262922, + "grad_norm": 0.07852535239386964, + "learning_rate": 2.3850115465345324e-05, + "loss": 0.030919501185417177, + "step": 740 + }, + { + "epoch": 2.1411849450747344, + "grad_norm": 0.08182892636530964, + "learning_rate": 2.3140898420998426e-05, + "loss": 0.028718733787536622, + "step": 745 + }, + { + "epoch": 2.1555915721231766, + "grad_norm": 0.07295529971805709, + "learning_rate": 2.2439197431201646e-05, + "loss": 0.028903046250343324, + "step": 750 + }, + { + "epoch": 2.169998199171619, + "grad_norm": 0.07624400365106067, + "learning_rate": 2.1745208854828058e-05, + "loss": 0.024923816323280334, + "step": 755 + }, + { + "epoch": 2.184404826220061, + "grad_norm": 0.07567603422035397, + "learning_rate": 2.105912689256533e-05, + "loss": 0.026013752818107604, + "step": 760 + }, + { + "epoch": 2.1988114532685037, + "grad_norm": 0.07427613549699529, + "learning_rate": 2.0381143532572082e-05, + "loss": 0.026708921790122984, + "step": 765 + }, + { + "epoch": 2.213218080316946, + "grad_norm": 0.0721068508797536, + "learning_rate": 1.9711448496753297e-05, + "loss": 0.02909781038761139, + "step": 770 + }, + { + "epoch": 2.227624707365388, + "grad_norm": 0.09841381262275949, + "learning_rate": 1.905022918766995e-05, + "loss": 0.027940624952316286, + "step": 775 + }, + { + "epoch": 2.2420313344138303, + "grad_norm": 0.0816958462956758, + "learning_rate": 1.8397670636097636e-05, + "loss": 0.026423072814941405, + "step": 780 + }, + { + "epoch": 2.2564379614622725, + "grad_norm": 0.07936813973695164, + "learning_rate": 1.775395544924885e-05, + "loss": 0.028386065363883974, + "step": 785 + }, + { + "epoch": 2.270844588510715, + "grad_norm": 0.07710097062295308, + "learning_rate": 1.7119263759673675e-05, + "loss": 0.02769894599914551, + "step": 790 + }, + { + "epoch": 2.2852512155591573, + "grad_norm": 0.08498281330072474, + "learning_rate": 1.6493773174852673e-05, + "loss": 0.02839537858963013, + "step": 795 + }, + { + "epoch": 2.2996578426075995, + "grad_norm": 0.07674813377075432, + "learning_rate": 1.587765872749649e-05, + "loss": 0.02569463849067688, + "step": 800 + }, + { + "epoch": 2.3140644696560417, + "grad_norm": 0.06662948325098497, + "learning_rate": 1.527109282656611e-05, + "loss": 0.028371796011924744, + "step": 805 + }, + { + "epoch": 2.328471096704484, + "grad_norm": 0.08015839069477317, + "learning_rate": 1.4674245209027066e-05, + "loss": 0.026229003071784975, + "step": 810 + }, + { + "epoch": 2.3428777237529266, + "grad_norm": 0.08019588118318016, + "learning_rate": 1.4087282892351623e-05, + "loss": 0.029995208978652953, + "step": 815 + }, + { + "epoch": 2.3572843508013688, + "grad_norm": 0.08221863155956374, + "learning_rate": 1.3510370127781635e-05, + "loss": 0.029001206159591675, + "step": 820 + }, + { + "epoch": 2.371690977849811, + "grad_norm": 0.07480678399512465, + "learning_rate": 1.2943668354365878e-05, + "loss": 0.02766028940677643, + "step": 825 + }, + { + "epoch": 2.386097604898253, + "grad_norm": 0.07477452302806815, + "learning_rate": 1.2387336153784018e-05, + "loss": 0.02593517005443573, + "step": 830 + }, + { + "epoch": 2.4005042319466954, + "grad_norm": 0.07081183958851973, + "learning_rate": 1.184152920597028e-05, + "loss": 0.026943469047546388, + "step": 835 + }, + { + "epoch": 2.4149108589951376, + "grad_norm": 0.07536754957279856, + "learning_rate": 1.1306400245549158e-05, + "loss": 0.024954386055469513, + "step": 840 + }, + { + "epoch": 2.42931748604358, + "grad_norm": 0.06344152496317775, + "learning_rate": 1.0782099019095238e-05, + "loss": 0.028272977471351622, + "step": 845 + }, + { + "epoch": 2.4437241130920224, + "grad_norm": 0.0644553682371491, + "learning_rate": 1.026877224322923e-05, + "loss": 0.02370927333831787, + "step": 850 + }, + { + "epoch": 2.4581307401404646, + "grad_norm": 0.07529675849595874, + "learning_rate": 9.766563563561799e-06, + "loss": 0.025498074293136597, + "step": 855 + }, + { + "epoch": 2.472537367188907, + "grad_norm": 0.08420954265091966, + "learning_rate": 9.275613514496977e-06, + "loss": 0.02770912051200867, + "step": 860 + }, + { + "epoch": 2.486943994237349, + "grad_norm": 0.0744332415489311, + "learning_rate": 8.7960594799059e-06, + "loss": 0.027615338563919067, + "step": 865 + }, + { + "epoch": 2.501350621285791, + "grad_norm": 0.07212967627396147, + "learning_rate": 8.328035654682325e-06, + "loss": 0.027428582310676575, + "step": 870 + }, + { + "epoch": 2.515757248334234, + "grad_norm": 0.08246547759863139, + "learning_rate": 7.871673007190599e-06, + "loss": 0.026888126134872438, + "step": 875 + }, + { + "epoch": 2.530163875382676, + "grad_norm": 0.06863337011207567, + "learning_rate": 7.427099242616348e-06, + "loss": 0.025411182641983034, + "step": 880 + }, + { + "epoch": 2.5445705024311183, + "grad_norm": 0.06777467806972155, + "learning_rate": 6.994438767230466e-06, + "loss": 0.024811127781867982, + "step": 885 + }, + { + "epoch": 2.5589771294795605, + "grad_norm": 0.07029495896606512, + "learning_rate": 6.573812653576062e-06, + "loss": 0.02613699436187744, + "step": 890 + }, + { + "epoch": 2.5733837565280027, + "grad_norm": 0.07134936463967867, + "learning_rate": 6.1653386065885165e-06, + "loss": 0.026964515447616577, + "step": 895 + }, + { + "epoch": 2.5877903835764453, + "grad_norm": 0.07711841632882044, + "learning_rate": 5.769130930657734e-06, + "loss": 0.028112486004829407, + "step": 900 + }, + { + "epoch": 2.6021970106248875, + "grad_norm": 0.08360128959008864, + "learning_rate": 5.38530049764206e-06, + "loss": 0.02626214623451233, + "step": 905 + }, + { + "epoch": 2.6166036376733297, + "grad_norm": 0.07456201121764428, + "learning_rate": 5.0139547158427e-06, + "loss": 0.02669944763183594, + "step": 910 + }, + { + "epoch": 2.631010264721772, + "grad_norm": 0.07740576081667884, + "learning_rate": 4.655197499947378e-06, + "loss": 0.029006192088127138, + "step": 915 + }, + { + "epoch": 2.645416891770214, + "grad_norm": 0.06845350619031464, + "learning_rate": 4.309129241951587e-06, + "loss": 0.02491077184677124, + "step": 920 + }, + { + "epoch": 2.6598235188186568, + "grad_norm": 0.07501903308333313, + "learning_rate": 3.975846783065662e-06, + "loss": 0.026326572895050047, + "step": 925 + }, + { + "epoch": 2.674230145867099, + "grad_norm": 0.07580375293031513, + "learning_rate": 3.6554433866154036e-06, + "loss": 0.026823589205741884, + "step": 930 + }, + { + "epoch": 2.688636772915541, + "grad_norm": 0.06969116474563261, + "learning_rate": 3.3480087119440063e-06, + "loss": 0.025913709402084352, + "step": 935 + }, + { + "epoch": 2.7030433999639834, + "grad_norm": 0.0714630826160477, + "learning_rate": 3.0536287893223604e-06, + "loss": 0.026928871870040894, + "step": 940 + }, + { + "epoch": 2.7174500270124256, + "grad_norm": 0.07358152299227637, + "learning_rate": 2.7723859958750486e-06, + "loss": 0.02748822569847107, + "step": 945 + }, + { + "epoch": 2.731856654060868, + "grad_norm": 0.06838564316740577, + "learning_rate": 2.5043590325285195e-06, + "loss": 0.025952500104904175, + "step": 950 + }, + { + "epoch": 2.7462632811093104, + "grad_norm": 0.07787109185214655, + "learning_rate": 2.249622901987963e-06, + "loss": 0.02589995265007019, + "step": 955 + }, + { + "epoch": 2.7606699081577526, + "grad_norm": 0.07156945963749864, + "learning_rate": 2.0082488877491033e-06, + "loss": 0.027577921748161316, + "step": 960 + }, + { + "epoch": 2.775076535206195, + "grad_norm": 0.06514188446012159, + "learning_rate": 1.7803045341507952e-06, + "loss": 0.025488072633743288, + "step": 965 + }, + { + "epoch": 2.789483162254637, + "grad_norm": 0.0712195602884753, + "learning_rate": 1.5658536274738621e-06, + "loss": 0.02348570078611374, + "step": 970 + }, + { + "epoch": 2.8038897893030796, + "grad_norm": 0.0680133235009968, + "learning_rate": 1.3649561780916199e-06, + "loss": 0.02316732406616211, + "step": 975 + }, + { + "epoch": 2.818296416351522, + "grad_norm": 0.0824565977146897, + "learning_rate": 1.1776684036770347e-06, + "loss": 0.02901957035064697, + "step": 980 + }, + { + "epoch": 2.832703043399964, + "grad_norm": 0.08111572063117606, + "learning_rate": 1.004042713471165e-06, + "loss": 0.02710677683353424, + "step": 985 + }, + { + "epoch": 2.8471096704484062, + "grad_norm": 0.07416113908713114, + "learning_rate": 8.441276936173193e-07, + "loss": 0.024537976086139678, + "step": 990 + }, + { + "epoch": 2.8615162974968484, + "grad_norm": 0.06645937685734804, + "learning_rate": 6.9796809356511e-07, + "loss": 0.025470972061157227, + "step": 995 + }, + { + "epoch": 2.875922924545291, + "grad_norm": 0.07056688302520532, + "learning_rate": 5.656048135480763e-07, + "loss": 0.025230163335800172, + "step": 1000 + }, + { + "epoch": 2.8903295515937333, + "grad_norm": 0.07480029198072068, + "learning_rate": 4.470748931384494e-07, + "loss": 0.026770299673080443, + "step": 1005 + }, + { + "epoch": 2.9047361786421755, + "grad_norm": 0.06476290220031579, + "learning_rate": 3.424115008822726e-07, + "loss": 0.026645660400390625, + "step": 1010 + }, + { + "epoch": 2.9191428056906177, + "grad_norm": 0.07374044092567203, + "learning_rate": 2.5164392501777487e-07, + "loss": 0.025820019841194152, + "step": 1015 + }, + { + "epoch": 2.93354943273906, + "grad_norm": 0.07098709082144111, + "learning_rate": 1.7479756527955527e-07, + "loss": 0.025720816850662232, + "step": 1020 + }, + { + "epoch": 2.9479560597875025, + "grad_norm": 0.07593395611493338, + "learning_rate": 1.1189392579090129e-07, + "loss": 0.024733534455299376, + "step": 1025 + }, + { + "epoch": 2.9623626868359443, + "grad_norm": 0.07179585283776127, + "learning_rate": 6.295060904623617e-08, + "loss": 0.02832019031047821, + "step": 1030 + }, + { + "epoch": 2.976769313884387, + "grad_norm": 0.06802635060193646, + "learning_rate": 2.7981310985369935e-08, + "loss": 0.025465887784957886, + "step": 1035 + }, + { + "epoch": 2.991175940932829, + "grad_norm": 0.0759224455019542, + "learning_rate": 6.995817160920792e-09, + "loss": 0.0264853298664093, + "step": 1040 + }, + { + "epoch": 3.0, + "step": 1044, + "total_flos": 1577088536150016.0, + "train_loss": 0.06165807318099385, + "train_runtime": 23128.4215, + "train_samples_per_second": 2.881, + "train_steps_per_second": 0.045 + } + ], + "logging_steps": 5, + "max_steps": 1044, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 207, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 1577088536150016.0, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/training_args.bin b/training_args.bin new file mode 100644 index 0000000..19bd051 --- /dev/null +++ b/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:056bbf8b34fba49a189cf35cd74634153c3dac1f62a037f81f91a5acb863863b +size 7377 diff --git a/training_loss.png b/training_loss.png new file mode 100644 index 0000000..ed8c2a6 Binary files /dev/null and b/training_loss.png differ