From 67fc8c25cd0d508ec4f877991d5c6cc81eee0018 Mon Sep 17 00:00:00 2001 From: ModelHub XC Date: Thu, 7 May 2026 17:06:05 +0800 Subject: [PATCH] =?UTF-8?q?=E5=88=9D=E5=A7=8B=E5=8C=96=E9=A1=B9=E7=9B=AE?= =?UTF-8?q?=EF=BC=8C=E7=94=B1ModelHub=20XC=E7=A4=BE=E5=8C=BA=E6=8F=90?= =?UTF-8?q?=E4=BE=9B=E6=A8=A1=E5=9E=8B?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Model: yuerxin/DeepSeek-R1-Distill-Qwen-1.5B Source: Original Platform --- .gitattributes | 36 ++ README.md | 59 ++ all_results.json | 9 + chat_template.jinja | 10 + config.json | 29 + generation_config.json | 9 + model.safetensors | 3 + special_tokens_map.json | 23 + tokenizer.json | 3 + tokenizer_config.json | 194 ++++++ train_results.json | 9 + trainer_state.json | 1366 +++++++++++++++++++++++++++++++++++++++ training_args.bin | 3 + 13 files changed, 1753 insertions(+) create mode 100644 .gitattributes create mode 100644 README.md create mode 100644 all_results.json create mode 100644 chat_template.jinja create mode 100644 config.json create mode 100644 generation_config.json create mode 100644 model.safetensors create mode 100644 special_tokens_map.json create mode 100644 tokenizer.json create mode 100644 tokenizer_config.json create mode 100644 train_results.json create mode 100644 trainer_state.json create mode 100644 training_args.bin diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 0000000..52373fe --- /dev/null +++ b/.gitattributes @@ -0,0 +1,36 @@ +*.7z filter=lfs diff=lfs merge=lfs -text +*.arrow filter=lfs diff=lfs merge=lfs -text +*.bin filter=lfs diff=lfs merge=lfs -text +*.bz2 filter=lfs diff=lfs merge=lfs -text +*.ckpt filter=lfs diff=lfs merge=lfs -text +*.ftz filter=lfs diff=lfs merge=lfs -text +*.gz filter=lfs diff=lfs merge=lfs -text +*.h5 filter=lfs diff=lfs merge=lfs -text +*.joblib filter=lfs diff=lfs merge=lfs -text +*.lfs.* filter=lfs diff=lfs merge=lfs -text +*.mlmodel filter=lfs diff=lfs merge=lfs -text +*.model filter=lfs diff=lfs merge=lfs -text +*.msgpack filter=lfs diff=lfs merge=lfs -text +*.npy filter=lfs diff=lfs merge=lfs -text +*.npz filter=lfs diff=lfs merge=lfs -text +*.onnx filter=lfs diff=lfs merge=lfs -text +*.ot filter=lfs diff=lfs merge=lfs -text +*.parquet filter=lfs diff=lfs merge=lfs -text +*.pb filter=lfs diff=lfs merge=lfs -text +*.pickle filter=lfs diff=lfs merge=lfs -text +*.pkl filter=lfs diff=lfs merge=lfs -text +*.pt filter=lfs diff=lfs merge=lfs -text +*.pth filter=lfs diff=lfs merge=lfs -text +*.rar filter=lfs diff=lfs merge=lfs -text +*.safetensors filter=lfs diff=lfs merge=lfs -text +saved_model/**/* filter=lfs diff=lfs merge=lfs -text +*.tar.* filter=lfs diff=lfs merge=lfs -text +*.tar filter=lfs diff=lfs merge=lfs -text +*.tflite filter=lfs diff=lfs merge=lfs -text +*.tgz filter=lfs diff=lfs merge=lfs -text +*.wasm filter=lfs diff=lfs merge=lfs -text +*.xz filter=lfs diff=lfs merge=lfs -text +*.zip filter=lfs diff=lfs merge=lfs -text +*.zst filter=lfs diff=lfs merge=lfs -text +*tfevents* filter=lfs diff=lfs merge=lfs -text +tokenizer.json filter=lfs diff=lfs merge=lfs -text diff --git a/README.md b/README.md new file mode 100644 index 0000000..e99c1d5 --- /dev/null +++ b/README.md @@ -0,0 +1,59 @@ +--- +library_name: transformers +license: mit +base_model: deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B +tags: +- generated_from_trainer +model-index: +- name: DeepSeek-R1-Distill-Qwen-1.5B + results: [] +--- + + + +# DeepSeek-R1-Distill-Qwen-1.5B + +This model is a fine-tuned version of [deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B](https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B) on an unknown dataset. + +## Model description + +More information needed + +## Intended uses & limitations + +More information needed + +## Training and evaluation data + +More information needed + +## Training procedure + +### Training hyperparameters + +The following hyperparameters were used during training: +- learning_rate: 1e-05 +- train_batch_size: 4 +- eval_batch_size: 1 +- seed: 42 +- distributed_type: multi-GPU +- num_devices: 2 +- gradient_accumulation_steps: 2 +- total_train_batch_size: 16 +- total_eval_batch_size: 2 +- optimizer: Use OptimizerNames.ADAMW_TORCH with betas=(0.9,0.999) and epsilon=1e-08 and optimizer_args=No additional optimizer arguments +- lr_scheduler_type: cosine_with_min_lr +- lr_scheduler_warmup_ratio: 0.03 +- num_epochs: 3 + +### Training results + + + +### Framework versions + +- Transformers 4.52.3 +- Pytorch 2.6.0+cu124 +- Datasets 4.4.2 +- Tokenizers 0.21.4 diff --git a/all_results.json b/all_results.json new file mode 100644 index 0000000..57cd7dc --- /dev/null +++ b/all_results.json @@ -0,0 +1,9 @@ +{ + "epoch": 3.0, + "total_flos": 1.847238457911296e+16, + "train_loss": 0.4201909417197818, + "train_runtime": 1415.4735, + "train_samples": 1, + "train_samples_per_second": 2.119, + "train_steps_per_second": 0.134 +} \ No newline at end of file diff --git a/chat_template.jinja b/chat_template.jinja new file mode 100644 index 0000000..b0c610e --- /dev/null +++ b/chat_template.jinja @@ -0,0 +1,10 @@ +{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% set ns = namespace(is_first=false, is_tool=false, is_output_first=true, system_prompt='') %}{%- for message in messages %}{%- if message['role'] == 'system' %}{% set ns.system_prompt = message['content'] %}{%- endif %}{%- endfor %}{{ bos_token }}{{ ns.system_prompt }}{%- for message in messages %}{%- if message['role'] == 'user' %}{%- set ns.is_tool = false -%}{{ '<|User|>' + message['content'] }}{%- endif %}{%- if message['role'] == 'assistant' and message['content'] is none %}{%- set ns.is_tool = false -%}{%- for tool in message['tool_calls'] %}{%- if not ns.is_first %}{{ '<|Assistant|><|tool▁calls▁begin|><|tool▁call▁begin|>' + tool['type'] + '<|tool▁sep|>' + tool['function']['name'] + ' +' + '```json' + ' +' + tool['function']['arguments'] + ' +' + '```' + '<|tool▁call▁end|>' }}{%- set ns.is_first = true -%}{%- else %}{{ ' +' + '<|tool▁call▁begin|>' + tool['type'] + '<|tool▁sep|>' + tool['function']['name'] + ' +' + '```json' + ' +' + tool['function']['arguments'] + ' +' + '```' + '<|tool▁call▁end|>' }}{{ '<|tool▁calls▁end|><|end▁of▁sentence|>' }}{%- endif %}{%- endfor %}{%- endif %}{%- if message['role'] == 'assistant' and message['content'] is not none %}{%- if ns.is_tool %}{{ '<|tool▁outputs▁end|>' + message['content'] + '<|end▁of▁sentence|>' }}{%- set ns.is_tool = false -%}{%- else %}{% set content = message['content'] %}{{ '<|Assistant|>' + content + '<|end▁of▁sentence|>' }}{%- endif %}{%- endif %}{%- if message['role'] == 'tool' %}{%- set ns.is_tool = true -%}{%- if ns.is_output_first %}{{ '<|tool▁outputs▁begin|><|tool▁output▁begin|>' + message['content'] + '<|tool▁output▁end|>' }}{%- set ns.is_output_first = false %}{%- else %}{{ ' +<|tool▁output▁begin|>' + message['content'] + '<|tool▁output▁end|>' }}{%- endif %}{%- endif %}{%- endfor -%}{% if ns.is_tool %}{{ '<|tool▁outputs▁end|>' }}{% endif %}{% if add_generation_prompt and not ns.is_tool %}{{ '<|Assistant|> +' }}{% endif %} \ No newline at end of file diff --git a/config.json b/config.json new file mode 100644 index 0000000..d97e3dc --- /dev/null +++ b/config.json @@ -0,0 +1,29 @@ +{ + "architectures": [ + "Qwen2ForCausalLM" + ], + "attention_dropout": 0.0, + "bos_token_id": 151643, + "eos_token_id": 151643, + "hidden_act": "silu", + "hidden_size": 1536, + "initializer_range": 0.02, + "intermediate_size": 8960, + "max_position_embeddings": 131072, + "max_window_layers": 21, + "model_type": "qwen2", + "num_attention_heads": 12, + "num_hidden_layers": 28, + "num_key_value_heads": 2, + "rms_norm_eps": 1e-06, + "rope_scaling": null, + "rope_theta": 10000, + "sliding_window": 4096, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.52.3", + "use_cache": false, + "use_mrope": false, + "use_sliding_window": false, + "vocab_size": 151936 +} diff --git a/generation_config.json b/generation_config.json new file mode 100644 index 0000000..6b5b266 --- /dev/null +++ b/generation_config.json @@ -0,0 +1,9 @@ +{ + "_from_model_config": true, + "bos_token_id": 151646, + "do_sample": true, + "eos_token_id": 151643, + "temperature": 0.6, + "top_p": 0.95, + "transformers_version": "4.52.3" +} diff --git a/model.safetensors b/model.safetensors new file mode 100644 index 0000000..e1ea911 --- /dev/null +++ b/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:300351799312bec6cf67bb55489243b6f3470bac6902bf3da3db4904fe346753 +size 3554214752 diff --git a/special_tokens_map.json b/special_tokens_map.json new file mode 100644 index 0000000..1d385d6 --- /dev/null +++ b/special_tokens_map.json @@ -0,0 +1,23 @@ +{ + "bos_token": { + "content": "<|begin▁of▁sentence|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|end▁of▁sentence|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": { + "content": "<|end▁of▁sentence|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/tokenizer.json b/tokenizer.json new file mode 100644 index 0000000..1a2db24 --- /dev/null +++ b/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e20ddafc659ba90242154b55275402edeca0715e5dbb30f56815a4ce081f4893 +size 11422778 diff --git a/tokenizer_config.json b/tokenizer_config.json new file mode 100644 index 0000000..d252dd4 --- /dev/null +++ b/tokenizer_config.json @@ -0,0 +1,194 @@ +{ + "add_bos_token": true, + "add_eos_token": false, + "add_prefix_space": null, + "added_tokens_decoder": { + "151643": { + "content": "<|end▁of▁sentence|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151644": { + "content": "<|User|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151645": { + "content": "<|Assistant|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151646": { + "content": "<|begin▁of▁sentence|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151647": { + "content": "<|EOT|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151648": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151649": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151650": { + "content": "<|quad_start|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151651": { + "content": "<|quad_end|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151652": { + "content": "<|vision_start|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151653": { + "content": "<|vision_end|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151654": { + "content": "<|vision_pad|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151655": { + "content": "<|image_pad|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151656": { + "content": "<|video_pad|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151657": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151658": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151659": { + "content": "<|fim_prefix|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151660": { + "content": "<|fim_middle|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151661": { + "content": "<|fim_suffix|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151662": { + "content": "<|fim_pad|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151663": { + "content": "<|repo_name|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151664": { + "content": "<|file_sep|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + } + }, + "bos_token": "<|begin▁of▁sentence|>", + "clean_up_tokenization_spaces": false, + "eos_token": "<|end▁of▁sentence|>", + "extra_special_tokens": {}, + "legacy": true, + "model_max_length": 16384, + "pad_token": "<|end▁of▁sentence|>", + "sp_model_kwargs": {}, + "tokenizer_class": "LlamaTokenizerFast", + "unk_token": null, + "use_default_system_prompt": false +} diff --git a/train_results.json b/train_results.json new file mode 100644 index 0000000..57cd7dc --- /dev/null +++ b/train_results.json @@ -0,0 +1,9 @@ +{ + "epoch": 3.0, + "total_flos": 1.847238457911296e+16, + "train_loss": 0.4201909417197818, + "train_runtime": 1415.4735, + "train_samples": 1, + "train_samples_per_second": 2.119, + "train_steps_per_second": 0.134 +} \ No newline at end of file diff --git a/trainer_state.json b/trainer_state.json new file mode 100644 index 0000000..569ab2e --- /dev/null +++ b/trainer_state.json @@ -0,0 +1,1366 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 3.0, + "eval_steps": 500, + "global_step": 189, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.016, + "grad_norm": 1.08417234214877, + "learning_rate": 0.0, + "loss": 0.5767, + "step": 1 + }, + { + "epoch": 0.032, + "grad_norm": 1.0469008257052523, + "learning_rate": 1.6666666666666667e-06, + "loss": 0.5511, + "step": 2 + }, + { + "epoch": 0.048, + "grad_norm": 0.983282752070103, + "learning_rate": 3.3333333333333333e-06, + "loss": 0.4992, + "step": 3 + }, + { + "epoch": 0.064, + "grad_norm": 0.872051375279908, + "learning_rate": 5e-06, + "loss": 0.4061, + "step": 4 + }, + { + "epoch": 0.08, + "grad_norm": 0.893228824528603, + "learning_rate": 6.666666666666667e-06, + "loss": 0.4181, + "step": 5 + }, + { + "epoch": 0.096, + "grad_norm": 0.9220468625068179, + "learning_rate": 8.333333333333334e-06, + "loss": 0.4488, + "step": 6 + }, + { + "epoch": 0.112, + "grad_norm": 0.7934650361234915, + "learning_rate": 1e-05, + "loss": 0.412, + "step": 7 + }, + { + "epoch": 0.128, + "grad_norm": 0.5840477320156379, + "learning_rate": 9.999336914672622e-06, + "loss": 0.417, + "step": 8 + }, + { + "epoch": 0.144, + "grad_norm": 0.505453502147165, + "learning_rate": 9.997347854104775e-06, + "loss": 0.5068, + "step": 9 + }, + { + "epoch": 0.16, + "grad_norm": 0.5244034547886003, + "learning_rate": 9.994033404481737e-06, + "loss": 0.5052, + "step": 10 + }, + { + "epoch": 0.176, + "grad_norm": 0.43986559011579357, + "learning_rate": 9.98939454258703e-06, + "loss": 0.498, + "step": 11 + }, + { + "epoch": 0.192, + "grad_norm": 0.6155944923795194, + "learning_rate": 9.98343263551454e-06, + "loss": 0.4757, + "step": 12 + }, + { + "epoch": 0.208, + "grad_norm": 0.6010488091076704, + "learning_rate": 9.97614944026565e-06, + "loss": 0.4733, + "step": 13 + }, + { + "epoch": 0.224, + "grad_norm": 0.6021634931994316, + "learning_rate": 9.967547103231432e-06, + "loss": 0.4258, + "step": 14 + }, + { + "epoch": 0.24, + "grad_norm": 0.6482124300684173, + "learning_rate": 9.957628159560088e-06, + "loss": 0.5263, + "step": 15 + }, + { + "epoch": 0.256, + "grad_norm": 0.554249479130026, + "learning_rate": 9.946395532409847e-06, + "loss": 0.4806, + "step": 16 + }, + { + "epoch": 0.272, + "grad_norm": 0.5973792849680928, + "learning_rate": 9.933852532087492e-06, + "loss": 0.4736, + "step": 17 + }, + { + "epoch": 0.288, + "grad_norm": 0.41453038423395644, + "learning_rate": 9.920002855072784e-06, + "loss": 0.4289, + "step": 18 + }, + { + "epoch": 0.304, + "grad_norm": 0.4620366674199325, + "learning_rate": 9.904850582929112e-06, + "loss": 0.4864, + "step": 19 + }, + { + "epoch": 0.32, + "grad_norm": 0.35504897921565515, + "learning_rate": 9.888400181100621e-06, + "loss": 0.4219, + "step": 20 + }, + { + "epoch": 0.336, + "grad_norm": 0.31658139671010693, + "learning_rate": 9.870656497596242e-06, + "loss": 0.3972, + "step": 21 + }, + { + "epoch": 0.352, + "grad_norm": 0.3650004342330343, + "learning_rate": 9.851624761560943e-06, + "loss": 0.5571, + "step": 22 + }, + { + "epoch": 0.368, + "grad_norm": 0.3257963763381936, + "learning_rate": 9.831310581734687e-06, + "loss": 0.4769, + "step": 23 + }, + { + "epoch": 0.384, + "grad_norm": 0.325775920332389, + "learning_rate": 9.809719944799512e-06, + "loss": 0.4961, + "step": 24 + }, + { + "epoch": 0.4, + "grad_norm": 0.31671335015470503, + "learning_rate": 9.786859213615222e-06, + "loss": 0.4357, + "step": 25 + }, + { + "epoch": 0.416, + "grad_norm": 0.3281761016809856, + "learning_rate": 9.762735125344227e-06, + "loss": 0.4502, + "step": 26 + }, + { + "epoch": 0.432, + "grad_norm": 0.3554086729723955, + "learning_rate": 9.737354789466068e-06, + "loss": 0.4768, + "step": 27 + }, + { + "epoch": 0.448, + "grad_norm": 0.3865446724293106, + "learning_rate": 9.710725685682222e-06, + "loss": 0.4515, + "step": 28 + }, + { + "epoch": 0.464, + "grad_norm": 0.37003410749958987, + "learning_rate": 9.682855661711803e-06, + "loss": 0.4432, + "step": 29 + }, + { + "epoch": 0.48, + "grad_norm": 0.31180238962223594, + "learning_rate": 9.653752930978794e-06, + "loss": 0.463, + "step": 30 + }, + { + "epoch": 0.496, + "grad_norm": 0.3252391994136198, + "learning_rate": 9.623426070191521e-06, + "loss": 0.4087, + "step": 31 + }, + { + "epoch": 0.512, + "grad_norm": 0.31056365314104256, + "learning_rate": 9.591884016815063e-06, + "loss": 0.4254, + "step": 32 + }, + { + "epoch": 0.528, + "grad_norm": 0.32735595897929803, + "learning_rate": 9.559136066437319e-06, + "loss": 0.5447, + "step": 33 + }, + { + "epoch": 0.544, + "grad_norm": 0.3100995737340131, + "learning_rate": 9.52519187002958e-06, + "loss": 0.4321, + "step": 34 + }, + { + "epoch": 0.56, + "grad_norm": 0.2936011999974949, + "learning_rate": 9.49006143110233e-06, + "loss": 0.4156, + "step": 35 + }, + { + "epoch": 0.576, + "grad_norm": 0.29737459640312214, + "learning_rate": 9.453755102757168e-06, + "loss": 0.493, + "step": 36 + }, + { + "epoch": 0.592, + "grad_norm": 0.2728926853035778, + "learning_rate": 9.4162835846357e-06, + "loss": 0.4392, + "step": 37 + }, + { + "epoch": 0.608, + "grad_norm": 0.3272565049177079, + "learning_rate": 9.377657919766307e-06, + "loss": 0.491, + "step": 38 + }, + { + "epoch": 0.624, + "grad_norm": 0.2822226280837269, + "learning_rate": 9.33788949130972e-06, + "loss": 0.4266, + "step": 39 + }, + { + "epoch": 0.64, + "grad_norm": 0.3438862449071104, + "learning_rate": 9.296990019204336e-06, + "loss": 0.4819, + "step": 40 + }, + { + "epoch": 0.656, + "grad_norm": 0.3322076503353557, + "learning_rate": 9.254971556712314e-06, + "loss": 0.4597, + "step": 41 + }, + { + "epoch": 0.672, + "grad_norm": 0.3262451955412676, + "learning_rate": 9.21184648686741e-06, + "loss": 0.4891, + "step": 42 + }, + { + "epoch": 0.688, + "grad_norm": 0.27720763525504155, + "learning_rate": 9.167627518825651e-06, + "loss": 0.4262, + "step": 43 + }, + { + "epoch": 0.704, + "grad_norm": 0.291234979530001, + "learning_rate": 9.122327684119883e-06, + "loss": 0.3755, + "step": 44 + }, + { + "epoch": 0.72, + "grad_norm": 0.2910628070235976, + "learning_rate": 9.075960332819314e-06, + "loss": 0.4294, + "step": 45 + }, + { + "epoch": 0.736, + "grad_norm": 0.3184275314698029, + "learning_rate": 9.028539129595199e-06, + "loss": 0.5769, + "step": 46 + }, + { + "epoch": 0.752, + "grad_norm": 0.2722946981234995, + "learning_rate": 8.980078049693785e-06, + "loss": 0.4019, + "step": 47 + }, + { + "epoch": 0.768, + "grad_norm": 0.29789872299343384, + "learning_rate": 8.930591374817757e-06, + "loss": 0.4095, + "step": 48 + }, + { + "epoch": 0.784, + "grad_norm": 0.3287733650721115, + "learning_rate": 8.88009368891734e-06, + "loss": 0.5409, + "step": 49 + }, + { + "epoch": 0.8, + "grad_norm": 0.30019238586389524, + "learning_rate": 8.828599873892351e-06, + "loss": 0.4375, + "step": 50 + }, + { + "epoch": 0.816, + "grad_norm": 0.31598561474060005, + "learning_rate": 8.776125105206433e-06, + "loss": 0.4902, + "step": 51 + }, + { + "epoch": 0.832, + "grad_norm": 0.31172751963714107, + "learning_rate": 8.722684847414771e-06, + "loss": 0.5176, + "step": 52 + }, + { + "epoch": 0.848, + "grad_norm": 0.2807830457433651, + "learning_rate": 8.668294849606626e-06, + "loss": 0.4707, + "step": 53 + }, + { + "epoch": 0.864, + "grad_norm": 0.2960941109569259, + "learning_rate": 8.612971140764e-06, + "loss": 0.4283, + "step": 54 + }, + { + "epoch": 0.88, + "grad_norm": 0.3299368286152574, + "learning_rate": 8.556730025037819e-06, + "loss": 0.4088, + "step": 55 + }, + { + "epoch": 0.896, + "grad_norm": 0.26428803479637797, + "learning_rate": 8.499588076943036e-06, + "loss": 0.3912, + "step": 56 + }, + { + "epoch": 0.912, + "grad_norm": 0.3009234600421325, + "learning_rate": 8.441562136474028e-06, + "loss": 0.4067, + "step": 57 + }, + { + "epoch": 0.928, + "grad_norm": 0.3609455923426209, + "learning_rate": 8.38266930414179e-06, + "loss": 0.4898, + "step": 58 + }, + { + "epoch": 0.944, + "grad_norm": 0.31387386515515087, + "learning_rate": 8.322926935934323e-06, + "loss": 0.4282, + "step": 59 + }, + { + "epoch": 0.96, + "grad_norm": 0.2845268045281065, + "learning_rate": 8.262352638201754e-06, + "loss": 0.4841, + "step": 60 + }, + { + "epoch": 0.976, + "grad_norm": 0.30381093535089976, + "learning_rate": 8.200964262467658e-06, + "loss": 0.5444, + "step": 61 + }, + { + "epoch": 0.992, + "grad_norm": 0.27310289060720006, + "learning_rate": 8.13877990016813e-06, + "loss": 0.341, + "step": 62 + }, + { + "epoch": 1.0, + "grad_norm": 0.27310289060720006, + "learning_rate": 8.075817877320167e-06, + "loss": 0.4322, + "step": 63 + }, + { + "epoch": 1.016, + "grad_norm": 0.48420660950568, + "learning_rate": 8.012096749120892e-06, + "loss": 0.4956, + "step": 64 + }, + { + "epoch": 1.032, + "grad_norm": 0.2856122575450543, + "learning_rate": 7.947635294479262e-06, + "loss": 0.3808, + "step": 65 + }, + { + "epoch": 1.048, + "grad_norm": 0.2959094703912616, + "learning_rate": 7.882452510481834e-06, + "loss": 0.4794, + "step": 66 + }, + { + "epoch": 1.064, + "grad_norm": 0.2587455831416144, + "learning_rate": 7.81656760679424e-06, + "loss": 0.4343, + "step": 67 + }, + { + "epoch": 1.08, + "grad_norm": 0.2789553742202286, + "learning_rate": 7.75e-06, + "loss": 0.4484, + "step": 68 + }, + { + "epoch": 1.096, + "grad_norm": 0.3039866797711843, + "learning_rate": 7.682769307878384e-06, + "loss": 0.4165, + "step": 69 + }, + { + "epoch": 1.112, + "grad_norm": 0.2588236721874104, + "learning_rate": 7.614895343622941e-06, + "loss": 0.4181, + "step": 70 + }, + { + "epoch": 1.1280000000000001, + "grad_norm": 0.27938933998248044, + "learning_rate": 7.546398110002477e-06, + "loss": 0.4349, + "step": 71 + }, + { + "epoch": 1.144, + "grad_norm": 0.30500409100487047, + "learning_rate": 7.477297793466137e-06, + "loss": 0.4195, + "step": 72 + }, + { + "epoch": 1.16, + "grad_norm": 0.2703316954052495, + "learning_rate": 7.407614758194375e-06, + "loss": 0.4593, + "step": 73 + }, + { + "epoch": 1.176, + "grad_norm": 0.29674738729978956, + "learning_rate": 7.337369540097521e-06, + "loss": 0.4018, + "step": 74 + }, + { + "epoch": 1.192, + "grad_norm": 0.3434929792222651, + "learning_rate": 7.266582840763774e-06, + "loss": 0.4633, + "step": 75 + }, + { + "epoch": 1.208, + "grad_norm": 0.32994268144478006, + "learning_rate": 7.195275521358334e-06, + "loss": 0.4982, + "step": 76 + }, + { + "epoch": 1.224, + "grad_norm": 0.2751594270204829, + "learning_rate": 7.123468596475526e-06, + "loss": 0.4717, + "step": 77 + }, + { + "epoch": 1.24, + "grad_norm": 0.3277135583956103, + "learning_rate": 7.051183227945703e-06, + "loss": 0.3922, + "step": 78 + }, + { + "epoch": 1.256, + "grad_norm": 0.3178890245565396, + "learning_rate": 6.978440718598757e-06, + "loss": 0.4359, + "step": 79 + }, + { + "epoch": 1.272, + "grad_norm": 0.2936671959796726, + "learning_rate": 6.905262505986076e-06, + "loss": 0.4482, + "step": 80 + }, + { + "epoch": 1.288, + "grad_norm": 0.30577342188899176, + "learning_rate": 6.8316701560628015e-06, + "loss": 0.3457, + "step": 81 + }, + { + "epoch": 1.304, + "grad_norm": 0.25812738212578895, + "learning_rate": 6.757685356832243e-06, + "loss": 0.4501, + "step": 82 + }, + { + "epoch": 1.32, + "grad_norm": 0.31439500562242606, + "learning_rate": 6.683329911954316e-06, + "loss": 0.5262, + "step": 83 + }, + { + "epoch": 1.336, + "grad_norm": 0.3508949566807093, + "learning_rate": 6.608625734319917e-06, + "loss": 0.3605, + "step": 84 + }, + { + "epoch": 1.3519999999999999, + "grad_norm": 0.29111139241084, + "learning_rate": 6.5335948395930815e-06, + "loss": 0.4049, + "step": 85 + }, + { + "epoch": 1.3679999999999999, + "grad_norm": 0.30291591445235805, + "learning_rate": 6.458259339722871e-06, + "loss": 0.4092, + "step": 86 + }, + { + "epoch": 1.384, + "grad_norm": 0.29313552915613017, + "learning_rate": 6.382641436426887e-06, + "loss": 0.3528, + "step": 87 + }, + { + "epoch": 1.4, + "grad_norm": 0.2583754481347982, + "learning_rate": 6.306763414648311e-06, + "loss": 0.3559, + "step": 88 + }, + { + "epoch": 1.416, + "grad_norm": 0.26863426430444226, + "learning_rate": 6.230647635988437e-06, + "loss": 0.4254, + "step": 89 + }, + { + "epoch": 1.432, + "grad_norm": 0.3093298693816597, + "learning_rate": 6.154316532116605e-06, + "loss": 0.3579, + "step": 90 + }, + { + "epoch": 1.448, + "grad_norm": 0.25998129570390127, + "learning_rate": 6.0777925981594795e-06, + "loss": 0.4028, + "step": 91 + }, + { + "epoch": 1.464, + "grad_norm": 0.3033921125695614, + "learning_rate": 6.00109838607164e-06, + "loss": 0.3805, + "step": 92 + }, + { + "epoch": 1.48, + "grad_norm": 0.2904725832178519, + "learning_rate": 5.924256497989411e-06, + "loss": 0.4323, + "step": 93 + }, + { + "epoch": 1.496, + "grad_norm": 0.30355923070799345, + "learning_rate": 5.84728957956991e-06, + "loss": 0.3297, + "step": 94 + }, + { + "epoch": 1.512, + "grad_norm": 0.2705231290116716, + "learning_rate": 5.770220313317269e-06, + "loss": 0.4177, + "step": 95 + }, + { + "epoch": 1.528, + "grad_norm": 0.337520528610411, + "learning_rate": 5.693071411897996e-06, + "loss": 0.3795, + "step": 96 + }, + { + "epoch": 1.544, + "grad_norm": 0.2741225861592298, + "learning_rate": 5.61586561144745e-06, + "loss": 0.414, + "step": 97 + }, + { + "epoch": 1.56, + "grad_norm": 0.3099099396586251, + "learning_rate": 5.538625664869393e-06, + "loss": 0.3715, + "step": 98 + }, + { + "epoch": 1.576, + "grad_norm": 0.28067657578779776, + "learning_rate": 5.46137433513061e-06, + "loss": 0.4295, + "step": 99 + }, + { + "epoch": 1.592, + "grad_norm": 0.2290691740751286, + "learning_rate": 5.384134388552552e-06, + "loss": 0.3913, + "step": 100 + }, + { + "epoch": 1.608, + "grad_norm": 0.292339568550721, + "learning_rate": 5.306928588102005e-06, + "loss": 0.4101, + "step": 101 + }, + { + "epoch": 1.624, + "grad_norm": 0.26499794509502683, + "learning_rate": 5.229779686682734e-06, + "loss": 0.4146, + "step": 102 + }, + { + "epoch": 1.6400000000000001, + "grad_norm": 0.3102018761949683, + "learning_rate": 5.152710420430092e-06, + "loss": 0.43, + "step": 103 + }, + { + "epoch": 1.6560000000000001, + "grad_norm": 0.28357637434683114, + "learning_rate": 5.0757435020105905e-06, + "loss": 0.4247, + "step": 104 + }, + { + "epoch": 1.6720000000000002, + "grad_norm": 0.2829919630781859, + "learning_rate": 4.998901613928361e-06, + "loss": 0.4038, + "step": 105 + }, + { + "epoch": 1.688, + "grad_norm": 0.29284800214525813, + "learning_rate": 4.922207401840521e-06, + "loss": 0.3709, + "step": 106 + }, + { + "epoch": 1.704, + "grad_norm": 0.3170342016576189, + "learning_rate": 4.845683467883396e-06, + "loss": 0.4447, + "step": 107 + }, + { + "epoch": 1.72, + "grad_norm": 0.3077243277320204, + "learning_rate": 4.7693523640115646e-06, + "loss": 0.3928, + "step": 108 + }, + { + "epoch": 1.736, + "grad_norm": 0.2778878775122821, + "learning_rate": 4.693236585351692e-06, + "loss": 0.4175, + "step": 109 + }, + { + "epoch": 1.752, + "grad_norm": 0.28795862892188867, + "learning_rate": 4.617358563573114e-06, + "loss": 0.3722, + "step": 110 + }, + { + "epoch": 1.768, + "grad_norm": 0.25342664678922827, + "learning_rate": 4.541740660277131e-06, + "loss": 0.39, + "step": 111 + }, + { + "epoch": 1.784, + "grad_norm": 0.29510553890778607, + "learning_rate": 4.466405160406922e-06, + "loss": 0.4833, + "step": 112 + }, + { + "epoch": 1.8, + "grad_norm": 0.2793848028342464, + "learning_rate": 4.391374265680084e-06, + "loss": 0.3726, + "step": 113 + }, + { + "epoch": 1.8159999999999998, + "grad_norm": 0.3026649324366544, + "learning_rate": 4.316670088045684e-06, + "loss": 0.4718, + "step": 114 + }, + { + "epoch": 1.8319999999999999, + "grad_norm": 0.27728466637557236, + "learning_rate": 4.242314643167759e-06, + "loss": 0.3371, + "step": 115 + }, + { + "epoch": 1.8479999999999999, + "grad_norm": 0.3168390463410005, + "learning_rate": 4.168329843937199e-06, + "loss": 0.3784, + "step": 116 + }, + { + "epoch": 1.8639999999999999, + "grad_norm": 0.23724238568345682, + "learning_rate": 4.094737494013925e-06, + "loss": 0.4107, + "step": 117 + }, + { + "epoch": 1.88, + "grad_norm": 0.3236361234128115, + "learning_rate": 4.0215592814012435e-06, + "loss": 0.4976, + "step": 118 + }, + { + "epoch": 1.896, + "grad_norm": 0.30334182027072204, + "learning_rate": 3.948816772054298e-06, + "loss": 0.472, + "step": 119 + }, + { + "epoch": 1.912, + "grad_norm": 0.2680853511924467, + "learning_rate": 3.876531403524476e-06, + "loss": 0.3274, + "step": 120 + }, + { + "epoch": 1.928, + "grad_norm": 0.30193319196237894, + "learning_rate": 3.804724478641667e-06, + "loss": 0.3816, + "step": 121 + }, + { + "epoch": 1.944, + "grad_norm": 0.26693316547906853, + "learning_rate": 3.733417159236228e-06, + "loss": 0.3598, + "step": 122 + }, + { + "epoch": 1.96, + "grad_norm": 0.2717103343613883, + "learning_rate": 3.6626304599024797e-06, + "loss": 0.3841, + "step": 123 + }, + { + "epoch": 1.976, + "grad_norm": 0.31425281419423284, + "learning_rate": 3.592385241805628e-06, + "loss": 0.4168, + "step": 124 + }, + { + "epoch": 1.992, + "grad_norm": 0.25599583818936533, + "learning_rate": 3.5227022065338623e-06, + "loss": 0.4434, + "step": 125 + }, + { + "epoch": 2.0, + "grad_norm": 0.531221874086192, + "learning_rate": 3.4536018899975255e-06, + "loss": 0.3939, + "step": 126 + }, + { + "epoch": 2.016, + "grad_norm": 0.26919931935198743, + "learning_rate": 3.3851046563770617e-06, + "loss": 0.3551, + "step": 127 + }, + { + "epoch": 2.032, + "grad_norm": 0.2820331296888726, + "learning_rate": 3.317230692121618e-06, + "loss": 0.3248, + "step": 128 + }, + { + "epoch": 2.048, + "grad_norm": 0.2560275313368148, + "learning_rate": 3.2500000000000015e-06, + "loss": 0.3168, + "step": 129 + }, + { + "epoch": 2.064, + "grad_norm": 0.26575624913648593, + "learning_rate": 3.1834323932057633e-06, + "loss": 0.3388, + "step": 130 + }, + { + "epoch": 2.08, + "grad_norm": 0.2882413409588101, + "learning_rate": 3.117547489518167e-06, + "loss": 0.416, + "step": 131 + }, + { + "epoch": 2.096, + "grad_norm": 0.26749471747908243, + "learning_rate": 3.0523647055207393e-06, + "loss": 0.4344, + "step": 132 + }, + { + "epoch": 2.112, + "grad_norm": 0.2770415005714329, + "learning_rate": 2.9879032508791096e-06, + "loss": 0.4068, + "step": 133 + }, + { + "epoch": 2.128, + "grad_norm": 0.25222155690032644, + "learning_rate": 2.9241821226798338e-06, + "loss": 0.3379, + "step": 134 + }, + { + "epoch": 2.144, + "grad_norm": 0.292188926393636, + "learning_rate": 2.86122009983187e-06, + "loss": 0.3774, + "step": 135 + }, + { + "epoch": 2.16, + "grad_norm": 0.2898156735833746, + "learning_rate": 2.799035737532344e-06, + "loss": 0.3845, + "step": 136 + }, + { + "epoch": 2.176, + "grad_norm": 0.27462927630805695, + "learning_rate": 2.7376473617982456e-06, + "loss": 0.3899, + "step": 137 + }, + { + "epoch": 2.192, + "grad_norm": 0.28037858694245654, + "learning_rate": 2.6770730640656784e-06, + "loss": 0.4171, + "step": 138 + }, + { + "epoch": 2.208, + "grad_norm": 0.2791430022438955, + "learning_rate": 2.6173306958582125e-06, + "loss": 0.3156, + "step": 139 + }, + { + "epoch": 2.224, + "grad_norm": 0.29137935108000546, + "learning_rate": 2.5584378635259733e-06, + "loss": 0.3753, + "step": 140 + }, + { + "epoch": 2.24, + "grad_norm": 0.2593660029080348, + "learning_rate": 2.5004119230569655e-06, + "loss": 0.3766, + "step": 141 + }, + { + "epoch": 2.2560000000000002, + "grad_norm": 0.2875413507661142, + "learning_rate": 2.4432699749621813e-06, + "loss": 0.3967, + "step": 142 + }, + { + "epoch": 2.2720000000000002, + "grad_norm": 0.2931955054483654, + "learning_rate": 2.387028859236002e-06, + "loss": 0.3498, + "step": 143 + }, + { + "epoch": 2.288, + "grad_norm": 0.26403010549011774, + "learning_rate": 2.3317051503933743e-06, + "loss": 0.3428, + "step": 144 + }, + { + "epoch": 2.304, + "grad_norm": 0.2777071852283896, + "learning_rate": 2.2773151525852313e-06, + "loss": 0.3777, + "step": 145 + }, + { + "epoch": 2.32, + "grad_norm": 0.29111968942296174, + "learning_rate": 2.223874894793569e-06, + "loss": 0.4267, + "step": 146 + }, + { + "epoch": 2.336, + "grad_norm": 0.30801800316293676, + "learning_rate": 2.17140012610765e-06, + "loss": 0.3731, + "step": 147 + }, + { + "epoch": 2.352, + "grad_norm": 0.2950618309605883, + "learning_rate": 2.119906311082662e-06, + "loss": 0.4385, + "step": 148 + }, + { + "epoch": 2.368, + "grad_norm": 0.24715278994212442, + "learning_rate": 2.069408625182244e-06, + "loss": 0.3629, + "step": 149 + }, + { + "epoch": 2.384, + "grad_norm": 0.2742573979067362, + "learning_rate": 2.019921950306216e-06, + "loss": 0.381, + "step": 150 + }, + { + "epoch": 2.4, + "grad_norm": 0.28689018960107904, + "learning_rate": 1.9714608704048036e-06, + "loss": 0.4096, + "step": 151 + }, + { + "epoch": 2.416, + "grad_norm": 0.2624247476439798, + "learning_rate": 1.924039667180687e-06, + "loss": 0.3014, + "step": 152 + }, + { + "epoch": 2.432, + "grad_norm": 0.269857690537919, + "learning_rate": 1.8776723158801188e-06, + "loss": 0.3713, + "step": 153 + }, + { + "epoch": 2.448, + "grad_norm": 0.3457149846408549, + "learning_rate": 1.8323724811743495e-06, + "loss": 0.4165, + "step": 154 + }, + { + "epoch": 2.464, + "grad_norm": 0.31737427991855693, + "learning_rate": 1.78815351313259e-06, + "loss": 0.4442, + "step": 155 + }, + { + "epoch": 2.48, + "grad_norm": 0.3148729894178642, + "learning_rate": 1.7450284432876873e-06, + "loss": 0.4802, + "step": 156 + }, + { + "epoch": 2.496, + "grad_norm": 0.26484117644858995, + "learning_rate": 1.7030099807956649e-06, + "loss": 0.3706, + "step": 157 + }, + { + "epoch": 2.512, + "grad_norm": 0.2923543941519516, + "learning_rate": 1.6621105086902822e-06, + "loss": 0.3906, + "step": 158 + }, + { + "epoch": 2.528, + "grad_norm": 0.2787856319176262, + "learning_rate": 1.6223420802336933e-06, + "loss": 0.3995, + "step": 159 + }, + { + "epoch": 2.544, + "grad_norm": 0.2648560287594022, + "learning_rate": 1.5837164153643014e-06, + "loss": 0.3437, + "step": 160 + }, + { + "epoch": 2.56, + "grad_norm": 0.28029815338389324, + "learning_rate": 1.5462448972428334e-06, + "loss": 0.4114, + "step": 161 + }, + { + "epoch": 2.576, + "grad_norm": 0.32764654451552816, + "learning_rate": 1.5099385688976695e-06, + "loss": 0.4176, + "step": 162 + }, + { + "epoch": 2.592, + "grad_norm": 0.26653675081715944, + "learning_rate": 1.474808129970421e-06, + "loss": 0.4394, + "step": 163 + }, + { + "epoch": 2.608, + "grad_norm": 0.30303023697633824, + "learning_rate": 1.4408639335626823e-06, + "loss": 0.446, + "step": 164 + }, + { + "epoch": 2.624, + "grad_norm": 0.30230020169875327, + "learning_rate": 1.4081159831849395e-06, + "loss": 0.3978, + "step": 165 + }, + { + "epoch": 2.64, + "grad_norm": 0.3069707267289378, + "learning_rate": 1.3765739298084792e-06, + "loss": 0.3623, + "step": 166 + }, + { + "epoch": 2.656, + "grad_norm": 0.27712650857341004, + "learning_rate": 1.346247069021208e-06, + "loss": 0.3382, + "step": 167 + }, + { + "epoch": 2.672, + "grad_norm": 0.2699015904710055, + "learning_rate": 1.3171443382881993e-06, + "loss": 0.4117, + "step": 168 + }, + { + "epoch": 2.6879999999999997, + "grad_norm": 0.29159348133947205, + "learning_rate": 1.2892743143177793e-06, + "loss": 0.3288, + "step": 169 + }, + { + "epoch": 2.7039999999999997, + "grad_norm": 0.24614166945136878, + "learning_rate": 1.262645210533934e-06, + "loss": 0.4084, + "step": 170 + }, + { + "epoch": 2.7199999999999998, + "grad_norm": 0.2758825153172464, + "learning_rate": 1.2372648746557742e-06, + "loss": 0.3513, + "step": 171 + }, + { + "epoch": 2.7359999999999998, + "grad_norm": 0.29799910395406354, + "learning_rate": 1.213140786384779e-06, + "loss": 0.386, + "step": 172 + }, + { + "epoch": 2.752, + "grad_norm": 0.32545648563171126, + "learning_rate": 1.190280055200489e-06, + "loss": 0.4041, + "step": 173 + }, + { + "epoch": 2.768, + "grad_norm": 0.28037781950139645, + "learning_rate": 1.1686894182653137e-06, + "loss": 0.4193, + "step": 174 + }, + { + "epoch": 2.784, + "grad_norm": 0.269297811870558, + "learning_rate": 1.1483752384390583e-06, + "loss": 0.3404, + "step": 175 + }, + { + "epoch": 2.8, + "grad_norm": 0.2621451208629847, + "learning_rate": 1.1293435024037592e-06, + "loss": 0.4476, + "step": 176 + }, + { + "epoch": 2.816, + "grad_norm": 0.2840465963543336, + "learning_rate": 1.1115998188993788e-06, + "loss": 0.3689, + "step": 177 + }, + { + "epoch": 2.832, + "grad_norm": 0.30328731810375564, + "learning_rate": 1.09514941707089e-06, + "loss": 0.4219, + "step": 178 + }, + { + "epoch": 2.848, + "grad_norm": 0.28062525330655147, + "learning_rate": 1.0799971449272174e-06, + "loss": 0.3932, + "step": 179 + }, + { + "epoch": 2.864, + "grad_norm": 0.28452098866262326, + "learning_rate": 1.0661474679125096e-06, + "loss": 0.4198, + "step": 180 + }, + { + "epoch": 2.88, + "grad_norm": 0.29354727351428, + "learning_rate": 1.0536044675901534e-06, + "loss": 0.3813, + "step": 181 + }, + { + "epoch": 2.896, + "grad_norm": 0.3093060506201193, + "learning_rate": 1.0423718404399139e-06, + "loss": 0.4137, + "step": 182 + }, + { + "epoch": 2.912, + "grad_norm": 0.2536429694083317, + "learning_rate": 1.0324528967685698e-06, + "loss": 0.4057, + "step": 183 + }, + { + "epoch": 2.928, + "grad_norm": 0.278656315909379, + "learning_rate": 1.0238505597343494e-06, + "loss": 0.3078, + "step": 184 + }, + { + "epoch": 2.944, + "grad_norm": 0.26173005919140585, + "learning_rate": 1.0165673644854601e-06, + "loss": 0.4326, + "step": 185 + }, + { + "epoch": 2.96, + "grad_norm": 0.29711263236623364, + "learning_rate": 1.0106054574129717e-06, + "loss": 0.4436, + "step": 186 + }, + { + "epoch": 2.976, + "grad_norm": 0.2412655286518132, + "learning_rate": 1.0059665955182629e-06, + "loss": 0.3899, + "step": 187 + }, + { + "epoch": 2.992, + "grad_norm": 0.2744645092683126, + "learning_rate": 1.0026521458952265e-06, + "loss": 0.3335, + "step": 188 + }, + { + "epoch": 3.0, + "grad_norm": 0.2744645092683126, + "learning_rate": 1.0006630853273791e-06, + "loss": 0.395, + "step": 189 + }, + { + "epoch": 3.0, + "step": 189, + "total_flos": 1.847238457911296e+16, + "train_loss": 0.4201909417197818, + "train_runtime": 1415.4735, + "train_samples_per_second": 2.119, + "train_steps_per_second": 0.134 + } + ], + "logging_steps": 1, + "max_steps": 189, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 0, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 1.847238457911296e+16, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +} diff --git a/training_args.bin b/training_args.bin new file mode 100644 index 0000000..26d0f86 --- /dev/null +++ b/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cbef41df8cde9c7001e2b13cc39db42c85e9f22a372b394fc3822a772698573c +size 9528