From 9e60c86959892ec2ecebb906d6d773f9beb96233 Mon Sep 17 00:00:00 2001 From: ModelHub XC Date: Wed, 22 Apr 2026 19:47:57 +0800 Subject: [PATCH] =?UTF-8?q?=E5=88=9D=E5=A7=8B=E5=8C=96=E9=A1=B9=E7=9B=AE?= =?UTF-8?q?=EF=BC=8C=E7=94=B1ModelHub=20XC=E7=A4=BE=E5=8C=BA=E6=8F=90?= =?UTF-8?q?=E4=BE=9B=E6=A8=A1=E5=9E=8B?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Model: phanviethoang1512/llama3.2-1b-deita-dpo-student_sft_init Source: Original Platform --- .gitattributes | 36 + README.md | 69 ++ all_results.json | 14 + chat_template.jinja | 5 + config.json | 35 + eval_results.json | 8 + generation_config.json | 9 + model.safetensors | 3 + special_tokens_map.json | 17 + tokenizer.json | 3 + tokenizer_config.json | 2063 ++++++++++++++++++++++++++++++++ train_results.json | 9 + trainer_state.json | 2461 +++++++++++++++++++++++++++++++++++++++ training_args.bin | 3 + 14 files changed, 4735 insertions(+) create mode 100644 .gitattributes create mode 100644 README.md create mode 100644 all_results.json create mode 100644 chat_template.jinja create mode 100644 config.json create mode 100644 eval_results.json create mode 100644 generation_config.json create mode 100644 model.safetensors create mode 100644 special_tokens_map.json create mode 100644 tokenizer.json create mode 100644 tokenizer_config.json create mode 100644 train_results.json create mode 100644 trainer_state.json create mode 100644 training_args.bin diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 0000000..52373fe --- /dev/null +++ b/.gitattributes @@ -0,0 +1,36 @@ +*.7z filter=lfs diff=lfs merge=lfs -text +*.arrow filter=lfs diff=lfs merge=lfs -text +*.bin filter=lfs diff=lfs merge=lfs -text +*.bz2 filter=lfs diff=lfs merge=lfs -text +*.ckpt filter=lfs diff=lfs merge=lfs -text +*.ftz filter=lfs diff=lfs merge=lfs -text +*.gz filter=lfs diff=lfs merge=lfs -text +*.h5 filter=lfs diff=lfs merge=lfs -text +*.joblib filter=lfs diff=lfs merge=lfs -text +*.lfs.* filter=lfs diff=lfs merge=lfs -text +*.mlmodel filter=lfs diff=lfs merge=lfs -text +*.model filter=lfs diff=lfs merge=lfs -text +*.msgpack filter=lfs diff=lfs merge=lfs -text +*.npy filter=lfs diff=lfs merge=lfs -text +*.npz filter=lfs diff=lfs merge=lfs -text +*.onnx filter=lfs diff=lfs merge=lfs -text +*.ot filter=lfs diff=lfs merge=lfs -text +*.parquet filter=lfs diff=lfs merge=lfs -text +*.pb filter=lfs diff=lfs merge=lfs -text +*.pickle filter=lfs diff=lfs merge=lfs -text +*.pkl filter=lfs diff=lfs merge=lfs -text +*.pt filter=lfs diff=lfs merge=lfs -text +*.pth filter=lfs diff=lfs merge=lfs -text +*.rar filter=lfs diff=lfs merge=lfs -text +*.safetensors filter=lfs diff=lfs merge=lfs -text +saved_model/**/* filter=lfs diff=lfs merge=lfs -text +*.tar.* filter=lfs diff=lfs merge=lfs -text +*.tar filter=lfs diff=lfs merge=lfs -text +*.tflite filter=lfs diff=lfs merge=lfs -text +*.tgz filter=lfs diff=lfs merge=lfs -text +*.wasm filter=lfs diff=lfs merge=lfs -text +*.xz filter=lfs diff=lfs merge=lfs -text +*.zip filter=lfs diff=lfs merge=lfs -text +*.zst filter=lfs diff=lfs merge=lfs -text +*tfevents* filter=lfs diff=lfs merge=lfs -text +tokenizer.json filter=lfs diff=lfs merge=lfs -text diff --git a/README.md b/README.md new file mode 100644 index 0000000..f2a84c5 --- /dev/null +++ b/README.md @@ -0,0 +1,69 @@ +--- +library_name: transformers +license: llama3.2 +base_model: meta-llama/Llama-3.2-1B +tags: +- alignment-handbook +- generated_from_trainer +datasets: +- HuggingFaceH4/deita-10k-v0-sft +model-index: +- name: student_sft_init + results: [] +--- + + + +# student_sft_init + +This model is a fine-tuned version of [meta-llama/Llama-3.2-1B](https://huggingface.co/meta-llama/Llama-3.2-1B) on the HuggingFaceH4/deita-10k-v0-sft dataset. +It achieves the following results on the evaluation set: +- Loss: 1.1767 + +## Model description + +More information needed + +## Intended uses & limitations + +More information needed + +## Training and evaluation data + +More information needed + +## Training procedure + +### Training hyperparameters + +The following hyperparameters were used during training: +- learning_rate: 2e-05 +- train_batch_size: 4 +- eval_batch_size: 1 +- seed: 42 +- distributed_type: multi-GPU +- num_devices: 4 +- gradient_accumulation_steps: 4 +- total_train_batch_size: 64 +- total_eval_batch_size: 4 +- optimizer: Use OptimizerNames.ADAMW_TORCH with betas=(0.9,0.999) and epsilon=1e-08 and optimizer_args=No additional optimizer arguments +- lr_scheduler_type: cosine +- lr_scheduler_warmup_ratio: 0.1 +- num_epochs: 3 + +### Training results + +| Training Loss | Epoch | Step | Validation Loss | +|:-------------:|:-----:|:----:|:---------------:| +| 1.1293 | 1.0 | 571 | 1.1598 | +| 0.9276 | 2.0 | 1142 | 1.1511 | +| 0.8279 | 3.0 | 1713 | 1.1767 | + + +### Framework versions + +- Transformers 4.52.4 +- Pytorch 2.10.0+cu126 +- Datasets 4.8.4 +- Tokenizers 0.21.4 diff --git a/all_results.json b/all_results.json new file mode 100644 index 0000000..1655c5b --- /dev/null +++ b/all_results.json @@ -0,0 +1,14 @@ +{ + "epoch": 3.0, + "eval_loss": 1.1766911745071411, + "eval_runtime": 199.3521, + "eval_samples": 500, + "eval_samples_per_second": 9.165, + "eval_steps_per_second": 2.292, + "total_flos": 90953314467840.0, + "train_loss": 0.9790556504583052, + "train_runtime": 12705.6497, + "train_samples": 9500, + "train_samples_per_second": 8.618, + "train_steps_per_second": 0.135 +} \ No newline at end of file diff --git a/chat_template.jinja b/chat_template.jinja new file mode 100644 index 0000000..39bd0c9 --- /dev/null +++ b/chat_template.jinja @@ -0,0 +1,5 @@ +{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|> + +'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|> + +' }}{% endif %} \ No newline at end of file diff --git a/config.json b/config.json new file mode 100644 index 0000000..74fdb86 --- /dev/null +++ b/config.json @@ -0,0 +1,35 @@ +{ + "architectures": [ + "LlamaForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "bos_token_id": 128000, + "eos_token_id": 128001, + "head_dim": 64, + "hidden_act": "silu", + "hidden_size": 2048, + "initializer_range": 0.02, + "intermediate_size": 8192, + "max_position_embeddings": 131072, + "mlp_bias": false, + "model_type": "llama", + "num_attention_heads": 32, + "num_hidden_layers": 16, + "num_key_value_heads": 8, + "pretraining_tp": 1, + "rms_norm_eps": 1e-05, + "rope_scaling": { + "factor": 32.0, + "high_freq_factor": 4.0, + "low_freq_factor": 1.0, + "original_max_position_embeddings": 8192, + "rope_type": "llama3" + }, + "rope_theta": 500000.0, + "tie_word_embeddings": true, + "torch_dtype": "bfloat16", + "transformers_version": "4.52.4", + "use_cache": true, + "vocab_size": 128256 +} diff --git a/eval_results.json b/eval_results.json new file mode 100644 index 0000000..6a81749 --- /dev/null +++ b/eval_results.json @@ -0,0 +1,8 @@ +{ + "epoch": 3.0, + "eval_loss": 1.1766911745071411, + "eval_runtime": 199.3521, + "eval_samples": 500, + "eval_samples_per_second": 9.165, + "eval_steps_per_second": 2.292 +} \ No newline at end of file diff --git a/generation_config.json b/generation_config.json new file mode 100644 index 0000000..27a9e4c --- /dev/null +++ b/generation_config.json @@ -0,0 +1,9 @@ +{ + "_from_model_config": true, + "bos_token_id": 128000, + "do_sample": true, + "eos_token_id": 128001, + "temperature": 0.6, + "top_p": 0.9, + "transformers_version": "4.52.4" +} diff --git a/model.safetensors b/model.safetensors new file mode 100644 index 0000000..1de0c84 --- /dev/null +++ b/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b5b18096646f1d5c4279558dee9861b4dd6c69e79a680a49399df75f29a9792b +size 2471645608 diff --git a/special_tokens_map.json b/special_tokens_map.json new file mode 100644 index 0000000..04829af --- /dev/null +++ b/special_tokens_map.json @@ -0,0 +1,17 @@ +{ + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "<|end_of_text|>" +} diff --git a/tokenizer.json b/tokenizer.json new file mode 100644 index 0000000..1c1d8d5 --- /dev/null +++ b/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6b9e4e7fb171f92fd137b777cc2714bf87d11576700a1dcd7a399e7bbe39537b +size 17209920 diff --git a/tokenizer_config.json b/tokenizer_config.json new file mode 100644 index 0000000..a0e9d22 --- /dev/null +++ b/tokenizer_config.json @@ -0,0 +1,2063 @@ +{ + "added_tokens_decoder": { + "128000": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128001": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128002": { + "content": "<|reserved_special_token_0|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128003": { + "content": "<|reserved_special_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128004": { + "content": "<|finetune_right_pad_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128005": { + "content": "<|reserved_special_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128006": { + "content": "<|start_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128007": { + "content": "<|end_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128008": { + "content": "<|eom_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128009": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128010": { + "content": "<|python_tag|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128011": { + "content": "<|reserved_special_token_3|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128012": { + "content": "<|reserved_special_token_4|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128013": { + "content": "<|reserved_special_token_5|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128014": { + "content": "<|reserved_special_token_6|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128015": { + "content": "<|reserved_special_token_7|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128016": { + "content": "<|reserved_special_token_8|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128017": { + "content": "<|reserved_special_token_9|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128018": { + "content": "<|reserved_special_token_10|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128019": { + "content": "<|reserved_special_token_11|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128020": { + "content": "<|reserved_special_token_12|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128021": { + "content": "<|reserved_special_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128022": { + "content": "<|reserved_special_token_14|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128023": { + "content": "<|reserved_special_token_15|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128024": { + "content": "<|reserved_special_token_16|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128025": { + "content": "<|reserved_special_token_17|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128026": { + "content": "<|reserved_special_token_18|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128027": { + "content": "<|reserved_special_token_19|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128028": { + "content": "<|reserved_special_token_20|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128029": { + "content": "<|reserved_special_token_21|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128030": { + "content": "<|reserved_special_token_22|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128031": { + "content": "<|reserved_special_token_23|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128032": { + "content": "<|reserved_special_token_24|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128033": { + "content": "<|reserved_special_token_25|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128034": { + "content": "<|reserved_special_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128035": { + "content": "<|reserved_special_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128036": { + "content": "<|reserved_special_token_28|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128037": { + "content": "<|reserved_special_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128038": { + "content": "<|reserved_special_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128039": { + "content": "<|reserved_special_token_31|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128040": { + "content": "<|reserved_special_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128041": { + "content": "<|reserved_special_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128042": { + "content": "<|reserved_special_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128043": { + "content": "<|reserved_special_token_35|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128044": { + "content": "<|reserved_special_token_36|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128045": { + "content": "<|reserved_special_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128046": { + "content": "<|reserved_special_token_38|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128047": { + "content": "<|reserved_special_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128048": { + "content": "<|reserved_special_token_40|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128049": { + "content": "<|reserved_special_token_41|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128050": { + "content": "<|reserved_special_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128051": { + "content": "<|reserved_special_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128052": { + "content": "<|reserved_special_token_44|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128053": { + "content": "<|reserved_special_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128054": { + "content": "<|reserved_special_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128055": { + "content": "<|reserved_special_token_47|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128056": { + "content": "<|reserved_special_token_48|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128057": { + "content": "<|reserved_special_token_49|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128058": { + "content": "<|reserved_special_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128059": { + "content": "<|reserved_special_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128060": { + "content": "<|reserved_special_token_52|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128061": { + "content": "<|reserved_special_token_53|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128062": { + "content": "<|reserved_special_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128063": { + "content": "<|reserved_special_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128064": { + "content": "<|reserved_special_token_56|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128065": { + "content": "<|reserved_special_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128066": { + "content": "<|reserved_special_token_58|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128067": { + "content": "<|reserved_special_token_59|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128068": { + "content": "<|reserved_special_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128069": { + "content": "<|reserved_special_token_61|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128070": { + "content": "<|reserved_special_token_62|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128071": { + "content": "<|reserved_special_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128072": { + "content": "<|reserved_special_token_64|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128073": { + "content": "<|reserved_special_token_65|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128074": { + "content": "<|reserved_special_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128075": { + "content": "<|reserved_special_token_67|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128076": { + "content": "<|reserved_special_token_68|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128077": { + "content": "<|reserved_special_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128078": { + "content": "<|reserved_special_token_70|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128079": { + "content": "<|reserved_special_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128080": { + "content": "<|reserved_special_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128081": { + "content": "<|reserved_special_token_73|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128082": { + "content": "<|reserved_special_token_74|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128083": { + "content": "<|reserved_special_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128084": { + "content": "<|reserved_special_token_76|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128085": { + "content": "<|reserved_special_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128086": { + "content": "<|reserved_special_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128087": { + "content": "<|reserved_special_token_79|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128088": { + "content": "<|reserved_special_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128089": { + "content": "<|reserved_special_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128090": { + "content": "<|reserved_special_token_82|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128091": { + "content": "<|reserved_special_token_83|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128092": { + "content": "<|reserved_special_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128093": { + "content": "<|reserved_special_token_85|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128094": { + "content": "<|reserved_special_token_86|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128095": { + "content": "<|reserved_special_token_87|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128096": { + "content": "<|reserved_special_token_88|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128097": { + "content": "<|reserved_special_token_89|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128098": { + "content": "<|reserved_special_token_90|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128099": { + "content": "<|reserved_special_token_91|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128100": { + "content": "<|reserved_special_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128101": { + "content": "<|reserved_special_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128102": { + "content": "<|reserved_special_token_94|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128103": { + "content": "<|reserved_special_token_95|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128104": { + "content": "<|reserved_special_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128105": { + "content": "<|reserved_special_token_97|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128106": { + "content": "<|reserved_special_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128107": { + "content": "<|reserved_special_token_99|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128108": { + "content": "<|reserved_special_token_100|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128109": { + "content": "<|reserved_special_token_101|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128110": { + "content": "<|reserved_special_token_102|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128111": { + "content": "<|reserved_special_token_103|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128112": { + "content": "<|reserved_special_token_104|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128113": { + "content": "<|reserved_special_token_105|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128114": { + "content": "<|reserved_special_token_106|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128115": { + "content": "<|reserved_special_token_107|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128116": { + "content": "<|reserved_special_token_108|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128117": { + "content": "<|reserved_special_token_109|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128118": { + "content": "<|reserved_special_token_110|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128119": { + "content": "<|reserved_special_token_111|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128120": { + "content": "<|reserved_special_token_112|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128121": { + "content": "<|reserved_special_token_113|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128122": { + "content": "<|reserved_special_token_114|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128123": { + "content": "<|reserved_special_token_115|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128124": { + "content": "<|reserved_special_token_116|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128125": { + "content": "<|reserved_special_token_117|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128126": { + "content": "<|reserved_special_token_118|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128127": { + "content": "<|reserved_special_token_119|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128128": { + "content": "<|reserved_special_token_120|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128129": { + "content": "<|reserved_special_token_121|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128130": { + "content": "<|reserved_special_token_122|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128131": { + "content": "<|reserved_special_token_123|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128132": { + "content": "<|reserved_special_token_124|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128133": { + "content": "<|reserved_special_token_125|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128134": { + "content": "<|reserved_special_token_126|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128135": { + "content": "<|reserved_special_token_127|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128136": { + "content": "<|reserved_special_token_128|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128137": { + "content": "<|reserved_special_token_129|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128138": { + "content": "<|reserved_special_token_130|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128139": { + "content": "<|reserved_special_token_131|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128140": { + "content": "<|reserved_special_token_132|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128141": { + "content": "<|reserved_special_token_133|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128142": { + "content": "<|reserved_special_token_134|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128143": { + "content": "<|reserved_special_token_135|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128144": { + "content": "<|reserved_special_token_136|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128145": { + "content": "<|reserved_special_token_137|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128146": { + "content": "<|reserved_special_token_138|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128147": { + "content": "<|reserved_special_token_139|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128148": { + "content": "<|reserved_special_token_140|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128149": { + "content": "<|reserved_special_token_141|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128150": { + "content": "<|reserved_special_token_142|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128151": { + "content": "<|reserved_special_token_143|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128152": { + "content": "<|reserved_special_token_144|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128153": { + "content": "<|reserved_special_token_145|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128154": { + "content": "<|reserved_special_token_146|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128155": { + "content": "<|reserved_special_token_147|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128156": { + "content": "<|reserved_special_token_148|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128157": { + "content": "<|reserved_special_token_149|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128158": { + "content": "<|reserved_special_token_150|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128159": { + "content": "<|reserved_special_token_151|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128160": { + "content": "<|reserved_special_token_152|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128161": { + "content": "<|reserved_special_token_153|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128162": { + "content": "<|reserved_special_token_154|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128163": { + "content": "<|reserved_special_token_155|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128164": { + "content": "<|reserved_special_token_156|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128165": { + "content": "<|reserved_special_token_157|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128166": { + "content": "<|reserved_special_token_158|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128167": { + "content": "<|reserved_special_token_159|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128168": { + "content": "<|reserved_special_token_160|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128169": { + "content": "<|reserved_special_token_161|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128170": { + "content": "<|reserved_special_token_162|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128171": { + "content": "<|reserved_special_token_163|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128172": { + "content": "<|reserved_special_token_164|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128173": { + "content": "<|reserved_special_token_165|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128174": { + "content": "<|reserved_special_token_166|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128175": { + "content": "<|reserved_special_token_167|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128176": { + "content": "<|reserved_special_token_168|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128177": { + "content": "<|reserved_special_token_169|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128178": { + "content": "<|reserved_special_token_170|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128179": { + "content": "<|reserved_special_token_171|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128180": { + "content": "<|reserved_special_token_172|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128181": { + "content": "<|reserved_special_token_173|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128182": { + "content": "<|reserved_special_token_174|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128183": { + "content": "<|reserved_special_token_175|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128184": { + "content": "<|reserved_special_token_176|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128185": { + "content": "<|reserved_special_token_177|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128186": { + "content": "<|reserved_special_token_178|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128187": { + "content": "<|reserved_special_token_179|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128188": { + "content": "<|reserved_special_token_180|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128189": { + "content": "<|reserved_special_token_181|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128190": { + "content": "<|reserved_special_token_182|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128191": { + "content": "<|reserved_special_token_183|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128192": { + "content": "<|reserved_special_token_184|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128193": { + "content": "<|reserved_special_token_185|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128194": { + "content": "<|reserved_special_token_186|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128195": { + "content": "<|reserved_special_token_187|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128196": { + "content": "<|reserved_special_token_188|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128197": { + "content": "<|reserved_special_token_189|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128198": { + "content": "<|reserved_special_token_190|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128199": { + "content": "<|reserved_special_token_191|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128200": { + "content": "<|reserved_special_token_192|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128201": { + "content": "<|reserved_special_token_193|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128202": { + "content": "<|reserved_special_token_194|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128203": { + "content": "<|reserved_special_token_195|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128204": { + "content": "<|reserved_special_token_196|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128205": { + "content": "<|reserved_special_token_197|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128206": { + "content": "<|reserved_special_token_198|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128207": { + "content": "<|reserved_special_token_199|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128208": { + "content": "<|reserved_special_token_200|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128209": { + "content": "<|reserved_special_token_201|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128210": { + "content": "<|reserved_special_token_202|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128211": { + "content": "<|reserved_special_token_203|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128212": { + "content": "<|reserved_special_token_204|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128213": { + "content": "<|reserved_special_token_205|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128214": { + "content": "<|reserved_special_token_206|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128215": { + "content": "<|reserved_special_token_207|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128216": { + "content": "<|reserved_special_token_208|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128217": { + "content": "<|reserved_special_token_209|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128218": { + "content": "<|reserved_special_token_210|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128219": { + "content": "<|reserved_special_token_211|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128220": { + "content": "<|reserved_special_token_212|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128221": { + "content": "<|reserved_special_token_213|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128222": { + "content": "<|reserved_special_token_214|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128223": { + "content": "<|reserved_special_token_215|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128224": { + "content": "<|reserved_special_token_216|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128225": { + "content": "<|reserved_special_token_217|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128226": { + "content": "<|reserved_special_token_218|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128227": { + "content": "<|reserved_special_token_219|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128228": { + "content": "<|reserved_special_token_220|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128229": { + "content": "<|reserved_special_token_221|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128230": { + "content": "<|reserved_special_token_222|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128231": { + "content": "<|reserved_special_token_223|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128232": { + "content": "<|reserved_special_token_224|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128233": { + "content": "<|reserved_special_token_225|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128234": { + "content": "<|reserved_special_token_226|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128235": { + "content": "<|reserved_special_token_227|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128236": { + "content": "<|reserved_special_token_228|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128237": { + "content": "<|reserved_special_token_229|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128238": { + "content": "<|reserved_special_token_230|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128239": { + "content": "<|reserved_special_token_231|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128240": { + "content": "<|reserved_special_token_232|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128241": { + "content": "<|reserved_special_token_233|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128242": { + "content": "<|reserved_special_token_234|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128243": { + "content": "<|reserved_special_token_235|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128244": { + "content": "<|reserved_special_token_236|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128245": { + "content": "<|reserved_special_token_237|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128246": { + "content": "<|reserved_special_token_238|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128247": { + "content": "<|reserved_special_token_239|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128248": { + "content": "<|reserved_special_token_240|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128249": { + "content": "<|reserved_special_token_241|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128250": { + "content": "<|reserved_special_token_242|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128251": { + "content": "<|reserved_special_token_243|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128252": { + "content": "<|reserved_special_token_244|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128253": { + "content": "<|reserved_special_token_245|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128254": { + "content": "<|reserved_special_token_246|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128255": { + "content": "<|reserved_special_token_247|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "<|begin_of_text|>", + "clean_up_tokenization_spaces": true, + "eos_token": "<|end_of_text|>", + "extra_special_tokens": {}, + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 2048, + "pad_token": "<|end_of_text|>", + "tokenizer_class": "PreTrainedTokenizer" +} diff --git a/train_results.json b/train_results.json new file mode 100644 index 0000000..4f778eb --- /dev/null +++ b/train_results.json @@ -0,0 +1,9 @@ +{ + "epoch": 3.0, + "total_flos": 90953314467840.0, + "train_loss": 0.9790556504583052, + "train_runtime": 12705.6497, + "train_samples": 9500, + "train_samples_per_second": 8.618, + "train_steps_per_second": 0.135 +} \ No newline at end of file diff --git a/trainer_state.json b/trainer_state.json new file mode 100644 index 0000000..9f5f2d4 --- /dev/null +++ b/trainer_state.json @@ -0,0 +1,2461 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 3.0, + "eval_steps": 500, + "global_step": 1713, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.008764241893076249, + "grad_norm": 5.367548942565918, + "learning_rate": 4.651162790697675e-07, + "loss": 1.491, + "step": 5 + }, + { + "epoch": 0.017528483786152498, + "grad_norm": 3.861912488937378, + "learning_rate": 1.0465116279069768e-06, + "loss": 1.4607, + "step": 10 + }, + { + "epoch": 0.026292725679228746, + "grad_norm": 2.542222738265991, + "learning_rate": 1.6279069767441862e-06, + "loss": 1.4704, + "step": 15 + }, + { + "epoch": 0.035056967572304996, + "grad_norm": 2.1328587532043457, + "learning_rate": 2.2093023255813954e-06, + "loss": 1.4085, + "step": 20 + }, + { + "epoch": 0.04382120946538125, + "grad_norm": 1.5579408407211304, + "learning_rate": 2.790697674418605e-06, + "loss": 1.3603, + "step": 25 + }, + { + "epoch": 0.05258545135845749, + "grad_norm": 1.6026604175567627, + "learning_rate": 3.372093023255814e-06, + "loss": 1.3568, + "step": 30 + }, + { + "epoch": 0.06134969325153374, + "grad_norm": 1.5183006525039673, + "learning_rate": 3.953488372093024e-06, + "loss": 1.3702, + "step": 35 + }, + { + "epoch": 0.07011393514460999, + "grad_norm": 1.416035532951355, + "learning_rate": 4.5348837209302326e-06, + "loss": 1.3288, + "step": 40 + }, + { + "epoch": 0.07887817703768624, + "grad_norm": 1.4895626306533813, + "learning_rate": 5.116279069767442e-06, + "loss": 1.3292, + "step": 45 + }, + { + "epoch": 0.0876424189307625, + "grad_norm": 1.3430354595184326, + "learning_rate": 5.697674418604652e-06, + "loss": 1.3227, + "step": 50 + }, + { + "epoch": 0.09640666082383874, + "grad_norm": 1.4117517471313477, + "learning_rate": 6.279069767441861e-06, + "loss": 1.2902, + "step": 55 + }, + { + "epoch": 0.10517090271691498, + "grad_norm": 1.3359665870666504, + "learning_rate": 6.86046511627907e-06, + "loss": 1.3327, + "step": 60 + }, + { + "epoch": 0.11393514460999124, + "grad_norm": 1.4718199968338013, + "learning_rate": 7.44186046511628e-06, + "loss": 1.2973, + "step": 65 + }, + { + "epoch": 0.12269938650306748, + "grad_norm": 1.2470380067825317, + "learning_rate": 8.023255813953488e-06, + "loss": 1.2706, + "step": 70 + }, + { + "epoch": 0.13146362839614373, + "grad_norm": 1.324803352355957, + "learning_rate": 8.604651162790698e-06, + "loss": 1.2178, + "step": 75 + }, + { + "epoch": 0.14022787028921999, + "grad_norm": 1.3574628829956055, + "learning_rate": 9.186046511627908e-06, + "loss": 1.2316, + "step": 80 + }, + { + "epoch": 0.14899211218229624, + "grad_norm": 1.3636841773986816, + "learning_rate": 9.767441860465117e-06, + "loss": 1.283, + "step": 85 + }, + { + "epoch": 0.15775635407537247, + "grad_norm": 1.7021708488464355, + "learning_rate": 1.0348837209302327e-05, + "loss": 1.2635, + "step": 90 + }, + { + "epoch": 0.16652059596844873, + "grad_norm": 1.243608832359314, + "learning_rate": 1.0930232558139535e-05, + "loss": 1.2079, + "step": 95 + }, + { + "epoch": 0.175284837861525, + "grad_norm": 1.8144162893295288, + "learning_rate": 1.1511627906976746e-05, + "loss": 1.2186, + "step": 100 + }, + { + "epoch": 0.18404907975460122, + "grad_norm": 1.1823457479476929, + "learning_rate": 1.2093023255813954e-05, + "loss": 1.2103, + "step": 105 + }, + { + "epoch": 0.19281332164767748, + "grad_norm": 1.198132872581482, + "learning_rate": 1.2674418604651164e-05, + "loss": 1.2044, + "step": 110 + }, + { + "epoch": 0.20157756354075373, + "grad_norm": 11.093875885009766, + "learning_rate": 1.3255813953488372e-05, + "loss": 1.1683, + "step": 115 + }, + { + "epoch": 0.21034180543382996, + "grad_norm": 1.0984971523284912, + "learning_rate": 1.3837209302325583e-05, + "loss": 1.2289, + "step": 120 + }, + { + "epoch": 0.21910604732690622, + "grad_norm": 1.2427825927734375, + "learning_rate": 1.441860465116279e-05, + "loss": 1.1449, + "step": 125 + }, + { + "epoch": 0.22787028921998248, + "grad_norm": 1.2608261108398438, + "learning_rate": 1.5000000000000002e-05, + "loss": 1.1524, + "step": 130 + }, + { + "epoch": 0.2366345311130587, + "grad_norm": 1.114823818206787, + "learning_rate": 1.558139534883721e-05, + "loss": 1.1769, + "step": 135 + }, + { + "epoch": 0.24539877300613497, + "grad_norm": 1.1239306926727295, + "learning_rate": 1.616279069767442e-05, + "loss": 1.1516, + "step": 140 + }, + { + "epoch": 0.2541630148992112, + "grad_norm": 1.1203042268753052, + "learning_rate": 1.674418604651163e-05, + "loss": 1.1243, + "step": 145 + }, + { + "epoch": 0.26292725679228746, + "grad_norm": 1.0693674087524414, + "learning_rate": 1.7325581395348837e-05, + "loss": 1.1574, + "step": 150 + }, + { + "epoch": 0.27169149868536374, + "grad_norm": 1.1013996601104736, + "learning_rate": 1.790697674418605e-05, + "loss": 1.1621, + "step": 155 + }, + { + "epoch": 0.28045574057843997, + "grad_norm": 1.1914992332458496, + "learning_rate": 1.8488372093023256e-05, + "loss": 1.1468, + "step": 160 + }, + { + "epoch": 0.2892199824715162, + "grad_norm": 1.1144826412200928, + "learning_rate": 1.9069767441860468e-05, + "loss": 1.153, + "step": 165 + }, + { + "epoch": 0.2979842243645925, + "grad_norm": 1.1576107740402222, + "learning_rate": 1.9651162790697676e-05, + "loss": 1.151, + "step": 170 + }, + { + "epoch": 0.3067484662576687, + "grad_norm": 1.037223219871521, + "learning_rate": 1.999991687649223e-05, + "loss": 1.1386, + "step": 175 + }, + { + "epoch": 0.31551270815074495, + "grad_norm": 1.1823593378067017, + "learning_rate": 1.999898175290004e-05, + "loss": 1.1368, + "step": 180 + }, + { + "epoch": 0.32427695004382123, + "grad_norm": 1.0528627634048462, + "learning_rate": 1.9997007698817558e-05, + "loss": 1.183, + "step": 185 + }, + { + "epoch": 0.33304119193689746, + "grad_norm": 1.1595643758773804, + "learning_rate": 1.9993994919356167e-05, + "loss": 1.1687, + "step": 190 + }, + { + "epoch": 0.3418054338299737, + "grad_norm": 1.0715525150299072, + "learning_rate": 1.9989943727554597e-05, + "loss": 1.1648, + "step": 195 + }, + { + "epoch": 0.35056967572305, + "grad_norm": 1.0216760635375977, + "learning_rate": 1.9984854544346367e-05, + "loss": 1.1587, + "step": 200 + }, + { + "epoch": 0.3593339176161262, + "grad_norm": 1.0617367029190063, + "learning_rate": 1.9978727898516087e-05, + "loss": 1.145, + "step": 205 + }, + { + "epoch": 0.36809815950920244, + "grad_norm": 1.0882047414779663, + "learning_rate": 1.997156442664449e-05, + "loss": 1.1652, + "step": 210 + }, + { + "epoch": 0.3768624014022787, + "grad_norm": 1.154795527458191, + "learning_rate": 1.9963364873042298e-05, + "loss": 1.135, + "step": 215 + }, + { + "epoch": 0.38562664329535495, + "grad_norm": 1.0904324054718018, + "learning_rate": 1.9954130089672893e-05, + "loss": 1.1262, + "step": 220 + }, + { + "epoch": 0.3943908851884312, + "grad_norm": 1.0021111965179443, + "learning_rate": 1.994386103606377e-05, + "loss": 1.1422, + "step": 225 + }, + { + "epoch": 0.40315512708150747, + "grad_norm": 1.0234806537628174, + "learning_rate": 1.9932558779206873e-05, + "loss": 1.1315, + "step": 230 + }, + { + "epoch": 0.4119193689745837, + "grad_norm": 1.10641610622406, + "learning_rate": 1.9920224493447702e-05, + "loss": 1.1824, + "step": 235 + }, + { + "epoch": 0.42068361086765993, + "grad_norm": 1.0238338708877563, + "learning_rate": 1.9906859460363307e-05, + "loss": 1.1442, + "step": 240 + }, + { + "epoch": 0.4294478527607362, + "grad_norm": 0.990487813949585, + "learning_rate": 1.989246506862913e-05, + "loss": 1.1276, + "step": 245 + }, + { + "epoch": 0.43821209465381245, + "grad_norm": 20.3802490234375, + "learning_rate": 1.9877042813874712e-05, + "loss": 1.1744, + "step": 250 + }, + { + "epoch": 0.4469763365468887, + "grad_norm": 1.1273601055145264, + "learning_rate": 1.9860594298528283e-05, + "loss": 1.1774, + "step": 255 + }, + { + "epoch": 0.45574057843996496, + "grad_norm": 1.014085292816162, + "learning_rate": 1.984312123165028e-05, + "loss": 1.162, + "step": 260 + }, + { + "epoch": 0.4645048203330412, + "grad_norm": 1.0875205993652344, + "learning_rate": 1.982462542875576e-05, + "loss": 1.1485, + "step": 265 + }, + { + "epoch": 0.4732690622261174, + "grad_norm": 1.0361530780792236, + "learning_rate": 1.9805108811625774e-05, + "loss": 1.1422, + "step": 270 + }, + { + "epoch": 0.4820333041191937, + "grad_norm": 1.0539902448654175, + "learning_rate": 1.9784573408107657e-05, + "loss": 1.0915, + "step": 275 + }, + { + "epoch": 0.49079754601226994, + "grad_norm": 1.05149245262146, + "learning_rate": 1.976302135190436e-05, + "loss": 1.1372, + "step": 280 + }, + { + "epoch": 0.49956178790534617, + "grad_norm": 1.0928102731704712, + "learning_rate": 1.9740454882352733e-05, + "loss": 1.1239, + "step": 285 + }, + { + "epoch": 0.5083260297984225, + "grad_norm": 1.0785322189331055, + "learning_rate": 1.971687634419086e-05, + "loss": 1.1429, + "step": 290 + }, + { + "epoch": 0.5170902716914987, + "grad_norm": 1.020357370376587, + "learning_rate": 1.9692288187314423e-05, + "loss": 1.1195, + "step": 295 + }, + { + "epoch": 0.5258545135845749, + "grad_norm": 0.9896298050880432, + "learning_rate": 1.9666692966522144e-05, + "loss": 1.1217, + "step": 300 + }, + { + "epoch": 0.5346187554776511, + "grad_norm": 0.9637587070465088, + "learning_rate": 1.9640093341250356e-05, + "loss": 1.1082, + "step": 305 + }, + { + "epoch": 0.5433829973707275, + "grad_norm": 1.2339686155319214, + "learning_rate": 1.961249207529665e-05, + "loss": 1.1459, + "step": 310 + }, + { + "epoch": 0.5521472392638037, + "grad_norm": 1.0626837015151978, + "learning_rate": 1.9583892036532726e-05, + "loss": 1.1257, + "step": 315 + }, + { + "epoch": 0.5609114811568799, + "grad_norm": 1.0179359912872314, + "learning_rate": 1.9554296196606395e-05, + "loss": 1.1111, + "step": 320 + }, + { + "epoch": 0.5696757230499562, + "grad_norm": 1.0226428508758545, + "learning_rate": 1.9523707630632834e-05, + "loss": 1.1673, + "step": 325 + }, + { + "epoch": 0.5784399649430324, + "grad_norm": 1.0737133026123047, + "learning_rate": 1.9492129516875055e-05, + "loss": 1.1325, + "step": 330 + }, + { + "epoch": 0.5872042068361086, + "grad_norm": 1.0531032085418701, + "learning_rate": 1.9459565136413667e-05, + "loss": 1.1478, + "step": 335 + }, + { + "epoch": 0.595968448729185, + "grad_norm": 1.0400668382644653, + "learning_rate": 1.942601787280598e-05, + "loss": 1.1403, + "step": 340 + }, + { + "epoch": 0.6047326906222612, + "grad_norm": 0.9359525442123413, + "learning_rate": 1.9391491211734426e-05, + "loss": 1.1298, + "step": 345 + }, + { + "epoch": 0.6134969325153374, + "grad_norm": 3.9531524181365967, + "learning_rate": 1.935598874064438e-05, + "loss": 1.1923, + "step": 350 + }, + { + "epoch": 0.6222611744084137, + "grad_norm": 1.0364443063735962, + "learning_rate": 1.9319514148371436e-05, + "loss": 1.1096, + "step": 355 + }, + { + "epoch": 0.6310254163014899, + "grad_norm": 1.0656158924102783, + "learning_rate": 1.9282071224758092e-05, + "loss": 1.1282, + "step": 360 + }, + { + "epoch": 0.6397896581945661, + "grad_norm": 1.0614289045333862, + "learning_rate": 1.9243663860259992e-05, + "loss": 1.1137, + "step": 365 + }, + { + "epoch": 0.6485539000876425, + "grad_norm": 1.002898931503296, + "learning_rate": 1.9204296045541686e-05, + "loss": 1.1091, + "step": 370 + }, + { + "epoch": 0.6573181419807187, + "grad_norm": 1.0451066493988037, + "learning_rate": 1.916397187106199e-05, + "loss": 1.0919, + "step": 375 + }, + { + "epoch": 0.6660823838737949, + "grad_norm": 1.192143201828003, + "learning_rate": 1.9122695526648968e-05, + "loss": 1.1581, + "step": 380 + }, + { + "epoch": 0.6748466257668712, + "grad_norm": 1.0061026811599731, + "learning_rate": 1.90804713010646e-05, + "loss": 1.116, + "step": 385 + }, + { + "epoch": 0.6836108676599474, + "grad_norm": 2.3462023735046387, + "learning_rate": 1.9037303581559143e-05, + "loss": 1.1323, + "step": 390 + }, + { + "epoch": 0.6923751095530236, + "grad_norm": 0.9700145125389099, + "learning_rate": 1.899319685341532e-05, + "loss": 1.1075, + "step": 395 + }, + { + "epoch": 0.7011393514461, + "grad_norm": 0.9761490821838379, + "learning_rate": 1.8948155699482243e-05, + "loss": 1.1291, + "step": 400 + }, + { + "epoch": 0.7099035933391762, + "grad_norm": 1.0112907886505127, + "learning_rate": 1.8902184799699265e-05, + "loss": 1.1087, + "step": 405 + }, + { + "epoch": 0.7186678352322524, + "grad_norm": 0.9741994738578796, + "learning_rate": 1.885528893060969e-05, + "loss": 1.1181, + "step": 410 + }, + { + "epoch": 0.7274320771253286, + "grad_norm": 0.9536153078079224, + "learning_rate": 1.8807472964864516e-05, + "loss": 1.114, + "step": 415 + }, + { + "epoch": 0.7361963190184049, + "grad_norm": 0.9664406180381775, + "learning_rate": 1.8758741870716093e-05, + "loss": 1.1474, + "step": 420 + }, + { + "epoch": 0.7449605609114811, + "grad_norm": 0.999437689781189, + "learning_rate": 1.8709100711501957e-05, + "loss": 1.1067, + "step": 425 + }, + { + "epoch": 0.7537248028045574, + "grad_norm": 1.0034998655319214, + "learning_rate": 1.865855464511869e-05, + "loss": 1.1409, + "step": 430 + }, + { + "epoch": 0.7624890446976337, + "grad_norm": 1.0300318002700806, + "learning_rate": 1.8607108923486025e-05, + "loss": 1.1289, + "step": 435 + }, + { + "epoch": 0.7712532865907099, + "grad_norm": 0.9994638562202454, + "learning_rate": 1.8554768892001137e-05, + "loss": 1.1093, + "step": 440 + }, + { + "epoch": 0.7800175284837861, + "grad_norm": 0.9193391799926758, + "learning_rate": 1.8501539988983234e-05, + "loss": 1.1377, + "step": 445 + }, + { + "epoch": 0.7887817703768624, + "grad_norm": 1.0165811777114868, + "learning_rate": 1.844742774510851e-05, + "loss": 1.1204, + "step": 450 + }, + { + "epoch": 0.7975460122699386, + "grad_norm": 0.9755986928939819, + "learning_rate": 1.8392437782835475e-05, + "loss": 1.0935, + "step": 455 + }, + { + "epoch": 0.8063102541630149, + "grad_norm": 0.977584183216095, + "learning_rate": 1.8336575815820764e-05, + "loss": 1.1064, + "step": 460 + }, + { + "epoch": 0.8150744960560912, + "grad_norm": 0.9432125687599182, + "learning_rate": 1.8279847648325478e-05, + "loss": 1.099, + "step": 465 + }, + { + "epoch": 0.8238387379491674, + "grad_norm": 1.0756127834320068, + "learning_rate": 1.822225917461208e-05, + "loss": 1.0926, + "step": 470 + }, + { + "epoch": 0.8326029798422436, + "grad_norm": 1.0018426179885864, + "learning_rate": 1.8163816378331983e-05, + "loss": 1.1292, + "step": 475 + }, + { + "epoch": 0.8413672217353199, + "grad_norm": 1.0097193717956543, + "learning_rate": 1.81045253319038e-05, + "loss": 1.0738, + "step": 480 + }, + { + "epoch": 0.8501314636283961, + "grad_norm": 0.9783721566200256, + "learning_rate": 1.8044392195882428e-05, + "loss": 1.1059, + "step": 485 + }, + { + "epoch": 0.8588957055214724, + "grad_norm": 0.9834737181663513, + "learning_rate": 1.7983423218318918e-05, + "loss": 1.1063, + "step": 490 + }, + { + "epoch": 0.8676599474145487, + "grad_norm": 1.0263484716415405, + "learning_rate": 1.7921624734111292e-05, + "loss": 1.1325, + "step": 495 + }, + { + "epoch": 0.8764241893076249, + "grad_norm": 0.9454247951507568, + "learning_rate": 1.7859003164346334e-05, + "loss": 1.0937, + "step": 500 + }, + { + "epoch": 0.8851884312007011, + "grad_norm": 1.006463646888733, + "learning_rate": 1.779556501563239e-05, + "loss": 1.0511, + "step": 505 + }, + { + "epoch": 0.8939526730937774, + "grad_norm": 6.430685043334961, + "learning_rate": 1.773131687942333e-05, + "loss": 1.0899, + "step": 510 + }, + { + "epoch": 0.9027169149868537, + "grad_norm": 1.3062087297439575, + "learning_rate": 1.7666265431333654e-05, + "loss": 1.1047, + "step": 515 + }, + { + "epoch": 0.9114811568799299, + "grad_norm": 1.0522316694259644, + "learning_rate": 1.76004174304449e-05, + "loss": 1.1009, + "step": 520 + }, + { + "epoch": 0.9202453987730062, + "grad_norm": 0.9894193410873413, + "learning_rate": 1.7533779718603315e-05, + "loss": 1.0761, + "step": 525 + }, + { + "epoch": 0.9290096406660824, + "grad_norm": 1.0116757154464722, + "learning_rate": 1.7466359219708987e-05, + "loss": 1.1305, + "step": 530 + }, + { + "epoch": 0.9377738825591586, + "grad_norm": 0.962745726108551, + "learning_rate": 1.739816293899642e-05, + "loss": 1.0758, + "step": 535 + }, + { + "epoch": 0.9465381244522348, + "grad_norm": 0.9733975529670715, + "learning_rate": 1.7329197962306666e-05, + "loss": 1.0752, + "step": 540 + }, + { + "epoch": 0.9553023663453112, + "grad_norm": 1.026983618736267, + "learning_rate": 1.7259471455351072e-05, + "loss": 1.0576, + "step": 545 + }, + { + "epoch": 0.9640666082383874, + "grad_norm": 0.9675541520118713, + "learning_rate": 1.718899066296675e-05, + "loss": 1.0759, + "step": 550 + }, + { + "epoch": 0.9728308501314636, + "grad_norm": 0.9842016100883484, + "learning_rate": 1.71177629083638e-05, + "loss": 1.0704, + "step": 555 + }, + { + "epoch": 0.9815950920245399, + "grad_norm": 0.9556295871734619, + "learning_rate": 1.7045795592364413e-05, + "loss": 1.1343, + "step": 560 + }, + { + "epoch": 0.9903593339176161, + "grad_norm": 1.033588171005249, + "learning_rate": 1.6973096192633884e-05, + "loss": 1.0947, + "step": 565 + }, + { + "epoch": 0.9991235758106923, + "grad_norm": 1.971771240234375, + "learning_rate": 1.6899672262903675e-05, + "loss": 1.1293, + "step": 570 + }, + { + "epoch": 1.0, + "eval_loss": 1.1597641706466675, + "eval_runtime": 199.4031, + "eval_samples_per_second": 9.162, + "eval_steps_per_second": 2.292, + "step": 571 + }, + { + "epoch": 1.007011393514461, + "grad_norm": 1.0958155393600464, + "learning_rate": 1.6825531432186545e-05, + "loss": 1.0193, + "step": 575 + }, + { + "epoch": 1.0157756354075373, + "grad_norm": 1.108912467956543, + "learning_rate": 1.6750681403983847e-05, + "loss": 0.9767, + "step": 580 + }, + { + "epoch": 1.0245398773006136, + "grad_norm": 1.0128231048583984, + "learning_rate": 1.6675129955485154e-05, + "loss": 0.9534, + "step": 585 + }, + { + "epoch": 1.0333041191936898, + "grad_norm": 0.9520951509475708, + "learning_rate": 1.659888493676013e-05, + "loss": 0.9388, + "step": 590 + }, + { + "epoch": 1.042068361086766, + "grad_norm": 0.98039710521698, + "learning_rate": 1.652195426994292e-05, + "loss": 0.97, + "step": 595 + }, + { + "epoch": 1.0508326029798423, + "grad_norm": 1.0683668851852417, + "learning_rate": 1.6444345948408985e-05, + "loss": 0.9539, + "step": 600 + }, + { + "epoch": 1.0595968448729185, + "grad_norm": 1.0525304079055786, + "learning_rate": 1.636606803594457e-05, + "loss": 0.9534, + "step": 605 + }, + { + "epoch": 1.0683610867659947, + "grad_norm": 0.999165415763855, + "learning_rate": 1.628712866590885e-05, + "loss": 0.9634, + "step": 610 + }, + { + "epoch": 1.077125328659071, + "grad_norm": 0.9724875688552856, + "learning_rate": 1.6207536040388844e-05, + "loss": 0.9559, + "step": 615 + }, + { + "epoch": 1.0858895705521472, + "grad_norm": 0.9775224924087524, + "learning_rate": 1.612729842934718e-05, + "loss": 0.9771, + "step": 620 + }, + { + "epoch": 1.0946538124452234, + "grad_norm": 1.2882146835327148, + "learning_rate": 1.604642416976283e-05, + "loss": 0.9027, + "step": 625 + }, + { + "epoch": 1.1034180543382996, + "grad_norm": 1.0088601112365723, + "learning_rate": 1.596492166476485e-05, + "loss": 0.9494, + "step": 630 + }, + { + "epoch": 1.112182296231376, + "grad_norm": 1.0667091608047485, + "learning_rate": 1.588279938275929e-05, + "loss": 0.9493, + "step": 635 + }, + { + "epoch": 1.1209465381244523, + "grad_norm": 1.0000181198120117, + "learning_rate": 1.580006585654927e-05, + "loss": 0.9609, + "step": 640 + }, + { + "epoch": 1.1297107800175286, + "grad_norm": 0.9999110102653503, + "learning_rate": 1.5716729682448392e-05, + "loss": 1.0068, + "step": 645 + }, + { + "epoch": 1.1384750219106048, + "grad_norm": 1.0657200813293457, + "learning_rate": 1.563279951938758e-05, + "loss": 0.9676, + "step": 650 + }, + { + "epoch": 1.147239263803681, + "grad_norm": 1.029891848564148, + "learning_rate": 1.5548284088015354e-05, + "loss": 0.9623, + "step": 655 + }, + { + "epoch": 1.1560035056967572, + "grad_norm": 1.015758752822876, + "learning_rate": 1.546319216979174e-05, + "loss": 0.9897, + "step": 660 + }, + { + "epoch": 1.1647677475898335, + "grad_norm": 0.9652720093727112, + "learning_rate": 1.537753260607584e-05, + "loss": 0.9607, + "step": 665 + }, + { + "epoch": 1.1735319894829097, + "grad_norm": 1.0845791101455688, + "learning_rate": 1.5291314297207177e-05, + "loss": 0.9783, + "step": 670 + }, + { + "epoch": 1.182296231375986, + "grad_norm": 1.0521138906478882, + "learning_rate": 1.520454620158093e-05, + "loss": 0.9836, + "step": 675 + }, + { + "epoch": 1.1910604732690622, + "grad_norm": 0.9807194471359253, + "learning_rate": 1.5117237334717117e-05, + "loss": 0.9443, + "step": 680 + }, + { + "epoch": 1.1998247151621384, + "grad_norm": 1.0408189296722412, + "learning_rate": 1.5029396768323847e-05, + "loss": 0.9755, + "step": 685 + }, + { + "epoch": 1.2085889570552146, + "grad_norm": 1.0140528678894043, + "learning_rate": 1.4941033629354735e-05, + "loss": 0.942, + "step": 690 + }, + { + "epoch": 1.2173531989482909, + "grad_norm": 1.028287649154663, + "learning_rate": 1.4852157099060595e-05, + "loss": 0.9362, + "step": 695 + }, + { + "epoch": 1.2261174408413673, + "grad_norm": 0.9807888269424438, + "learning_rate": 1.4762776412035455e-05, + "loss": 0.9752, + "step": 700 + }, + { + "epoch": 1.2348816827344435, + "grad_norm": 1.0794785022735596, + "learning_rate": 1.4672900855257056e-05, + "loss": 0.9508, + "step": 705 + }, + { + "epoch": 1.2436459246275198, + "grad_norm": 1.0464166402816772, + "learning_rate": 1.4582539767121904e-05, + "loss": 0.9519, + "step": 710 + }, + { + "epoch": 1.252410166520596, + "grad_norm": 0.9949556589126587, + "learning_rate": 1.449170253647498e-05, + "loss": 0.9188, + "step": 715 + }, + { + "epoch": 1.2611744084136722, + "grad_norm": 0.9590442180633545, + "learning_rate": 1.4400398601634189e-05, + "loss": 0.9686, + "step": 720 + }, + { + "epoch": 1.2699386503067485, + "grad_norm": 1.0098439455032349, + "learning_rate": 1.4308637449409705e-05, + "loss": 0.9848, + "step": 725 + }, + { + "epoch": 1.2787028921998247, + "grad_norm": 1.026219367980957, + "learning_rate": 1.4216428614118245e-05, + "loss": 0.9595, + "step": 730 + }, + { + "epoch": 1.287467134092901, + "grad_norm": 1.0277692079544067, + "learning_rate": 1.4123781676592418e-05, + "loss": 0.9773, + "step": 735 + }, + { + "epoch": 1.2962313759859772, + "grad_norm": 1.0222140550613403, + "learning_rate": 1.4030706263185248e-05, + "loss": 0.9399, + "step": 740 + }, + { + "epoch": 1.3049956178790534, + "grad_norm": 0.9892441630363464, + "learning_rate": 1.3937212044769957e-05, + "loss": 0.985, + "step": 745 + }, + { + "epoch": 1.3137598597721296, + "grad_norm": 1.0329406261444092, + "learning_rate": 1.384330873573513e-05, + "loss": 0.9369, + "step": 750 + }, + { + "epoch": 1.322524101665206, + "grad_norm": 0.9816661477088928, + "learning_rate": 1.3749006092975347e-05, + "loss": 0.9457, + "step": 755 + }, + { + "epoch": 1.331288343558282, + "grad_norm": 1.0054512023925781, + "learning_rate": 1.3654313914877414e-05, + "loss": 0.9087, + "step": 760 + }, + { + "epoch": 1.3400525854513585, + "grad_norm": 17.338027954101562, + "learning_rate": 1.3559242040302274e-05, + "loss": 0.9808, + "step": 765 + }, + { + "epoch": 1.3488168273444348, + "grad_norm": 1.0706207752227783, + "learning_rate": 1.3463800347562705e-05, + "loss": 0.9679, + "step": 770 + }, + { + "epoch": 1.357581069237511, + "grad_norm": 1.040747046470642, + "learning_rate": 1.3367998753396944e-05, + "loss": 0.9974, + "step": 775 + }, + { + "epoch": 1.3663453111305872, + "grad_norm": 0.9935981631278992, + "learning_rate": 1.3271847211938286e-05, + "loss": 0.9428, + "step": 780 + }, + { + "epoch": 1.3751095530236634, + "grad_norm": 1.0025993585586548, + "learning_rate": 1.317535571368082e-05, + "loss": 0.9462, + "step": 785 + }, + { + "epoch": 1.3838737949167397, + "grad_norm": 0.9988533854484558, + "learning_rate": 1.3078534284441382e-05, + "loss": 0.9734, + "step": 790 + }, + { + "epoch": 1.392638036809816, + "grad_norm": 1.0070812702178955, + "learning_rate": 1.2981392984317835e-05, + "loss": 0.9622, + "step": 795 + }, + { + "epoch": 1.4014022787028921, + "grad_norm": 1.0259467363357544, + "learning_rate": 1.2883941906643786e-05, + "loss": 0.9671, + "step": 800 + }, + { + "epoch": 1.4101665205959684, + "grad_norm": 1.0248597860336304, + "learning_rate": 1.2786191176939848e-05, + "loss": 0.9402, + "step": 805 + }, + { + "epoch": 1.4189307624890448, + "grad_norm": 1.008159875869751, + "learning_rate": 1.2688150951861582e-05, + "loss": 1.0111, + "step": 810 + }, + { + "epoch": 1.4276950043821208, + "grad_norm": 1.024697184562683, + "learning_rate": 1.2589831418144156e-05, + "loss": 0.9354, + "step": 815 + }, + { + "epoch": 1.4364592462751973, + "grad_norm": 0.9872326254844666, + "learning_rate": 1.2491242791543922e-05, + "loss": 0.9424, + "step": 820 + }, + { + "epoch": 1.4452234881682735, + "grad_norm": 1.0979632139205933, + "learning_rate": 1.2392395315776964e-05, + "loss": 0.9594, + "step": 825 + }, + { + "epoch": 1.4539877300613497, + "grad_norm": 0.9879066944122314, + "learning_rate": 1.2293299261454726e-05, + "loss": 0.9762, + "step": 830 + }, + { + "epoch": 1.462751971954426, + "grad_norm": 1.278245210647583, + "learning_rate": 1.2193964925016872e-05, + "loss": 0.9458, + "step": 835 + }, + { + "epoch": 1.4715162138475022, + "grad_norm": 1.4075230360031128, + "learning_rate": 1.2094402627661447e-05, + "loss": 0.9496, + "step": 840 + }, + { + "epoch": 1.4802804557405784, + "grad_norm": 1.059368371963501, + "learning_rate": 1.1994622714272448e-05, + "loss": 0.965, + "step": 845 + }, + { + "epoch": 1.4890446976336547, + "grad_norm": 0.9740917086601257, + "learning_rate": 1.1894635552344976e-05, + "loss": 0.939, + "step": 850 + }, + { + "epoch": 1.4978089395267309, + "grad_norm": 0.9713614583015442, + "learning_rate": 1.1794451530908011e-05, + "loss": 0.9256, + "step": 855 + }, + { + "epoch": 1.5065731814198071, + "grad_norm": 1.023720145225525, + "learning_rate": 1.1694081059444947e-05, + "loss": 0.9548, + "step": 860 + }, + { + "epoch": 1.5153374233128836, + "grad_norm": 1.1291546821594238, + "learning_rate": 1.159353456681201e-05, + "loss": 0.9512, + "step": 865 + }, + { + "epoch": 1.5241016652059596, + "grad_norm": 0.9696962833404541, + "learning_rate": 1.1492822500154668e-05, + "loss": 0.9715, + "step": 870 + }, + { + "epoch": 1.532865907099036, + "grad_norm": 1.0114858150482178, + "learning_rate": 1.1391955323822126e-05, + "loss": 0.9355, + "step": 875 + }, + { + "epoch": 1.541630148992112, + "grad_norm": 1.0963616371154785, + "learning_rate": 1.1290943518280058e-05, + "loss": 0.9779, + "step": 880 + }, + { + "epoch": 1.5503943908851885, + "grad_norm": 0.9969412684440613, + "learning_rate": 1.118979757902162e-05, + "loss": 0.9589, + "step": 885 + }, + { + "epoch": 1.5591586327782647, + "grad_norm": 1.022300362586975, + "learning_rate": 1.1088528015476965e-05, + "loss": 0.9656, + "step": 890 + }, + { + "epoch": 1.567922874671341, + "grad_norm": 0.974607527256012, + "learning_rate": 1.098714534992125e-05, + "loss": 0.9622, + "step": 895 + }, + { + "epoch": 1.5766871165644172, + "grad_norm": 1.0116885900497437, + "learning_rate": 1.088566011638134e-05, + "loss": 0.9343, + "step": 900 + }, + { + "epoch": 1.5854513584574934, + "grad_norm": 1.0116138458251953, + "learning_rate": 1.0784082859541291e-05, + "loss": 0.9383, + "step": 905 + }, + { + "epoch": 1.5942156003505696, + "grad_norm": 1.2108194828033447, + "learning_rate": 1.0682424133646712e-05, + "loss": 0.9171, + "step": 910 + }, + { + "epoch": 1.6029798422436459, + "grad_norm": 1.0214340686798096, + "learning_rate": 1.0580694501408138e-05, + "loss": 0.9675, + "step": 915 + }, + { + "epoch": 1.6117440841367223, + "grad_norm": 1.0362666845321655, + "learning_rate": 1.0478904532903535e-05, + "loss": 1.0028, + "step": 920 + }, + { + "epoch": 1.6205083260297983, + "grad_norm": 0.9954794049263, + "learning_rate": 1.0377064804480025e-05, + "loss": 0.9624, + "step": 925 + }, + { + "epoch": 1.6292725679228748, + "grad_norm": 1.0109649896621704, + "learning_rate": 1.0275185897654972e-05, + "loss": 0.9501, + "step": 930 + }, + { + "epoch": 1.6380368098159508, + "grad_norm": 1.0172914266586304, + "learning_rate": 1.0173278398016502e-05, + "loss": 0.9354, + "step": 935 + }, + { + "epoch": 1.6468010517090272, + "grad_norm": 0.9905017614364624, + "learning_rate": 1.0071352894123654e-05, + "loss": 0.9758, + "step": 940 + }, + { + "epoch": 1.6555652936021035, + "grad_norm": 0.9832938313484192, + "learning_rate": 9.969419976406166e-06, + "loss": 0.9737, + "step": 945 + }, + { + "epoch": 1.6643295354951797, + "grad_norm": 0.9569029808044434, + "learning_rate": 9.867490236064109e-06, + "loss": 0.9212, + "step": 950 + }, + { + "epoch": 1.673093777388256, + "grad_norm": 1.0192569494247437, + "learning_rate": 9.765574263967397e-06, + "loss": 0.9472, + "step": 955 + }, + { + "epoch": 1.6818580192813322, + "grad_norm": 0.9713300466537476, + "learning_rate": 9.663682649555389e-06, + "loss": 0.9644, + "step": 960 + }, + { + "epoch": 1.6906222611744084, + "grad_norm": 0.9462825655937195, + "learning_rate": 9.56182597973658e-06, + "loss": 0.9576, + "step": 965 + }, + { + "epoch": 1.6993865030674846, + "grad_norm": 0.9868680834770203, + "learning_rate": 9.460014837788605e-06, + "loss": 0.9667, + "step": 970 + }, + { + "epoch": 1.708150744960561, + "grad_norm": 1.0376570224761963, + "learning_rate": 9.358259802258582e-06, + "loss": 0.9452, + "step": 975 + }, + { + "epoch": 1.716914986853637, + "grad_norm": 1.0066869258880615, + "learning_rate": 9.256571445863972e-06, + "loss": 0.9534, + "step": 980 + }, + { + "epoch": 1.7256792287467135, + "grad_norm": 1.0090934038162231, + "learning_rate": 9.154960334394027e-06, + "loss": 0.955, + "step": 985 + }, + { + "epoch": 1.7344434706397895, + "grad_norm": 0.9518396854400635, + "learning_rate": 9.053437025611974e-06, + "loss": 0.9342, + "step": 990 + }, + { + "epoch": 1.743207712532866, + "grad_norm": 0.9992371797561646, + "learning_rate": 8.952012068158027e-06, + "loss": 0.9722, + "step": 995 + }, + { + "epoch": 1.751971954425942, + "grad_norm": 0.9554047584533691, + "learning_rate": 8.850696000453327e-06, + "loss": 0.9357, + "step": 1000 + }, + { + "epoch": 1.7607361963190185, + "grad_norm": 0.9989141225814819, + "learning_rate": 8.749499349604992e-06, + "loss": 0.9821, + "step": 1005 + }, + { + "epoch": 1.7695004382120947, + "grad_norm": 0.9504846334457397, + "learning_rate": 8.64843263031228e-06, + "loss": 0.9537, + "step": 1010 + }, + { + "epoch": 1.778264680105171, + "grad_norm": 0.9558340907096863, + "learning_rate": 8.547506343774097e-06, + "loss": 0.9289, + "step": 1015 + }, + { + "epoch": 1.7870289219982471, + "grad_norm": 1.0170910358428955, + "learning_rate": 8.446730976597877e-06, + "loss": 0.9501, + "step": 1020 + }, + { + "epoch": 1.7957931638913234, + "grad_norm": 0.9939414262771606, + "learning_rate": 8.346116999709975e-06, + "loss": 0.9472, + "step": 1025 + }, + { + "epoch": 1.8045574057843996, + "grad_norm": 0.9810356497764587, + "learning_rate": 8.245674867267724e-06, + "loss": 0.9491, + "step": 1030 + }, + { + "epoch": 1.8133216476774758, + "grad_norm": 0.9643825888633728, + "learning_rate": 8.145415015573183e-06, + "loss": 0.947, + "step": 1035 + }, + { + "epoch": 1.8220858895705523, + "grad_norm": 0.9195330739021301, + "learning_rate": 8.045347861988789e-06, + "loss": 0.876, + "step": 1040 + }, + { + "epoch": 1.8308501314636283, + "grad_norm": 0.9632524847984314, + "learning_rate": 7.945483803854937e-06, + "loss": 0.9173, + "step": 1045 + }, + { + "epoch": 1.8396143733567047, + "grad_norm": 0.9642296433448792, + "learning_rate": 7.845833217409677e-06, + "loss": 0.9233, + "step": 1050 + }, + { + "epoch": 1.8483786152497808, + "grad_norm": 0.9421396851539612, + "learning_rate": 7.746406456710564e-06, + "loss": 0.9187, + "step": 1055 + }, + { + "epoch": 1.8571428571428572, + "grad_norm": 0.9888685345649719, + "learning_rate": 7.64721385255886e-06, + "loss": 0.9289, + "step": 1060 + }, + { + "epoch": 1.8659070990359334, + "grad_norm": 0.9585088491439819, + "learning_rate": 7.548265711426105e-06, + "loss": 0.9291, + "step": 1065 + }, + { + "epoch": 1.8746713409290097, + "grad_norm": 0.9842194318771362, + "learning_rate": 7.449572314383237e-06, + "loss": 0.9521, + "step": 1070 + }, + { + "epoch": 1.883435582822086, + "grad_norm": 0.9899460077285767, + "learning_rate": 7.351143916032375e-06, + "loss": 0.9238, + "step": 1075 + }, + { + "epoch": 1.8921998247151621, + "grad_norm": 1.044392466545105, + "learning_rate": 7.252990743441293e-06, + "loss": 0.9354, + "step": 1080 + }, + { + "epoch": 1.9009640666082384, + "grad_norm": 0.9790138006210327, + "learning_rate": 7.155122995080826e-06, + "loss": 0.9527, + "step": 1085 + }, + { + "epoch": 1.9097283085013146, + "grad_norm": 1.002880334854126, + "learning_rate": 7.0575508397651885e-06, + "loss": 0.9471, + "step": 1090 + }, + { + "epoch": 1.918492550394391, + "grad_norm": 0.9698459506034851, + "learning_rate": 6.960284415595407e-06, + "loss": 0.9402, + "step": 1095 + }, + { + "epoch": 1.927256792287467, + "grad_norm": 0.9467353224754333, + "learning_rate": 6.863333828905929e-06, + "loss": 0.9409, + "step": 1100 + }, + { + "epoch": 1.9360210341805435, + "grad_norm": 0.965829074382782, + "learning_rate": 6.766709153214541e-06, + "loss": 0.9425, + "step": 1105 + }, + { + "epoch": 1.9447852760736195, + "grad_norm": 0.9571474194526672, + "learning_rate": 6.670420428175706e-06, + "loss": 0.9405, + "step": 1110 + }, + { + "epoch": 1.953549517966696, + "grad_norm": 0.9341493248939514, + "learning_rate": 6.574477658537375e-06, + "loss": 0.9145, + "step": 1115 + }, + { + "epoch": 1.962313759859772, + "grad_norm": 0.9990600943565369, + "learning_rate": 6.4788908131014995e-06, + "loss": 0.952, + "step": 1120 + }, + { + "epoch": 1.9710780017528484, + "grad_norm": 0.917290210723877, + "learning_rate": 6.383669823688191e-06, + "loss": 0.951, + "step": 1125 + }, + { + "epoch": 1.9798422436459246, + "grad_norm": 0.9599776864051819, + "learning_rate": 6.288824584103815e-06, + "loss": 0.936, + "step": 1130 + }, + { + "epoch": 1.9886064855390009, + "grad_norm": 0.9636255502700806, + "learning_rate": 6.194364949112952e-06, + "loss": 0.9582, + "step": 1135 + }, + { + "epoch": 1.997370727432077, + "grad_norm": 1.1487308740615845, + "learning_rate": 6.100300733414473e-06, + "loss": 0.9276, + "step": 1140 + }, + { + "epoch": 2.0, + "eval_loss": 1.151129961013794, + "eval_runtime": 199.4284, + "eval_samples_per_second": 9.161, + "eval_steps_per_second": 2.292, + "step": 1142 + }, + { + "epoch": 2.005258545135846, + "grad_norm": 1.1605374813079834, + "learning_rate": 6.006641710621746e-06, + "loss": 0.8479, + "step": 1145 + }, + { + "epoch": 2.014022787028922, + "grad_norm": 1.0491231679916382, + "learning_rate": 5.913397612247121e-06, + "loss": 0.8032, + "step": 1150 + }, + { + "epoch": 2.0227870289219982, + "grad_norm": 1.0855581760406494, + "learning_rate": 5.82057812669081e-06, + "loss": 0.8839, + "step": 1155 + }, + { + "epoch": 2.0315512708150747, + "grad_norm": 0.9942172169685364, + "learning_rate": 5.728192898234195e-06, + "loss": 0.7986, + "step": 1160 + }, + { + "epoch": 2.0403155127081507, + "grad_norm": 1.0435779094696045, + "learning_rate": 5.636251526037784e-06, + "loss": 0.8263, + "step": 1165 + }, + { + "epoch": 2.049079754601227, + "grad_norm": 1.0303524732589722, + "learning_rate": 5.544763563143794e-06, + "loss": 0.8188, + "step": 1170 + }, + { + "epoch": 2.057843996494303, + "grad_norm": 0.9739100933074951, + "learning_rate": 5.453738515483586e-06, + "loss": 0.8488, + "step": 1175 + }, + { + "epoch": 2.0666082383873796, + "grad_norm": 1.021791696548462, + "learning_rate": 5.363185840889935e-06, + "loss": 0.8666, + "step": 1180 + }, + { + "epoch": 2.0753724802804556, + "grad_norm": 0.9683573842048645, + "learning_rate": 5.273114948114346e-06, + "loss": 0.8276, + "step": 1185 + }, + { + "epoch": 2.084136722173532, + "grad_norm": 1.0052560567855835, + "learning_rate": 5.1835351958494515e-06, + "loss": 0.8089, + "step": 1190 + }, + { + "epoch": 2.092900964066608, + "grad_norm": 0.9584820866584778, + "learning_rate": 5.094455891756587e-06, + "loss": 0.8276, + "step": 1195 + }, + { + "epoch": 2.1016652059596845, + "grad_norm": 0.9803566932678223, + "learning_rate": 5.0058862914987204e-06, + "loss": 0.8256, + "step": 1200 + }, + { + "epoch": 2.1104294478527605, + "grad_norm": 0.9923965334892273, + "learning_rate": 4.917835597778731e-06, + "loss": 0.8241, + "step": 1205 + }, + { + "epoch": 2.119193689745837, + "grad_norm": 1.022495985031128, + "learning_rate": 4.830312959383238e-06, + "loss": 0.8074, + "step": 1210 + }, + { + "epoch": 2.127957931638913, + "grad_norm": 0.9760512709617615, + "learning_rate": 4.743327470231982e-06, + "loss": 0.8058, + "step": 1215 + }, + { + "epoch": 2.1367221735319895, + "grad_norm": 0.9603386521339417, + "learning_rate": 4.656888168432962e-06, + "loss": 0.8133, + "step": 1220 + }, + { + "epoch": 2.145486415425066, + "grad_norm": 1.034776210784912, + "learning_rate": 4.571004035343315e-06, + "loss": 0.818, + "step": 1225 + }, + { + "epoch": 2.154250657318142, + "grad_norm": 0.9763988256454468, + "learning_rate": 4.485683994636144e-06, + "loss": 0.8165, + "step": 1230 + }, + { + "epoch": 2.1630148992112184, + "grad_norm": 0.9729757905006409, + "learning_rate": 4.400936911373308e-06, + "loss": 0.808, + "step": 1235 + }, + { + "epoch": 2.1717791411042944, + "grad_norm": 1.0068873167037964, + "learning_rate": 4.316771591084297e-06, + "loss": 0.8038, + "step": 1240 + }, + { + "epoch": 2.180543382997371, + "grad_norm": 0.9344819188117981, + "learning_rate": 4.2331967788513295e-06, + "loss": 0.8335, + "step": 1245 + }, + { + "epoch": 2.189307624890447, + "grad_norm": 1.0315194129943848, + "learning_rate": 4.150221158400683e-06, + "loss": 0.8154, + "step": 1250 + }, + { + "epoch": 2.1980718667835233, + "grad_norm": 0.9959366321563721, + "learning_rate": 4.067853351200446e-06, + "loss": 0.8317, + "step": 1255 + }, + { + "epoch": 2.2068361086765993, + "grad_norm": 1.0919640064239502, + "learning_rate": 3.986101915564695e-06, + "loss": 0.8236, + "step": 1260 + }, + { + "epoch": 2.2156003505696757, + "grad_norm": 0.9548513293266296, + "learning_rate": 3.904975345764262e-06, + "loss": 0.849, + "step": 1265 + }, + { + "epoch": 2.224364592462752, + "grad_norm": 0.9864785075187683, + "learning_rate": 3.824482071144164e-06, + "loss": 0.8259, + "step": 1270 + }, + { + "epoch": 2.233128834355828, + "grad_norm": 1.014013648033142, + "learning_rate": 3.7446304552477387e-06, + "loss": 0.7696, + "step": 1275 + }, + { + "epoch": 2.2418930762489047, + "grad_norm": 0.95964115858078, + "learning_rate": 3.665428794947663e-06, + "loss": 0.7758, + "step": 1280 + }, + { + "epoch": 2.2506573181419807, + "grad_norm": 0.9974411725997925, + "learning_rate": 3.5868853195838582e-06, + "loss": 0.8512, + "step": 1285 + }, + { + "epoch": 2.259421560035057, + "grad_norm": 0.990260124206543, + "learning_rate": 3.509008190108453e-06, + "loss": 0.8096, + "step": 1290 + }, + { + "epoch": 2.268185801928133, + "grad_norm": 0.982060968875885, + "learning_rate": 3.431805498237808e-06, + "loss": 0.8259, + "step": 1295 + }, + { + "epoch": 2.2769500438212096, + "grad_norm": 0.9737572073936462, + "learning_rate": 3.355285265611784e-06, + "loss": 0.8368, + "step": 1300 + }, + { + "epoch": 2.2857142857142856, + "grad_norm": 0.9657383561134338, + "learning_rate": 3.2794554429602377e-06, + "loss": 0.8129, + "step": 1305 + }, + { + "epoch": 2.294478527607362, + "grad_norm": 0.9947619438171387, + "learning_rate": 3.204323909276924e-06, + "loss": 0.8034, + "step": 1310 + }, + { + "epoch": 2.303242769500438, + "grad_norm": 1.0247445106506348, + "learning_rate": 3.1298984710008483e-06, + "loss": 0.8267, + "step": 1315 + }, + { + "epoch": 2.3120070113935145, + "grad_norm": 0.9986540079116821, + "learning_rate": 3.056186861205136e-06, + "loss": 0.8233, + "step": 1320 + }, + { + "epoch": 2.3207712532865905, + "grad_norm": 0.9882351160049438, + "learning_rate": 2.983196738793547e-06, + "loss": 0.8097, + "step": 1325 + }, + { + "epoch": 2.329535495179667, + "grad_norm": 0.9737289547920227, + "learning_rate": 2.910935687704671e-06, + "loss": 0.8285, + "step": 1330 + }, + { + "epoch": 2.3382997370727434, + "grad_norm": 0.9512819647789001, + "learning_rate": 2.8394112161239606e-06, + "loss": 0.7998, + "step": 1335 + }, + { + "epoch": 2.3470639789658194, + "grad_norm": 0.980267345905304, + "learning_rate": 2.7686307557035684e-06, + "loss": 0.8364, + "step": 1340 + }, + { + "epoch": 2.355828220858896, + "grad_norm": 0.9798904061317444, + "learning_rate": 2.698601660790191e-06, + "loss": 0.8288, + "step": 1345 + }, + { + "epoch": 2.364592462751972, + "grad_norm": 0.9910169839859009, + "learning_rate": 2.629331207660931e-06, + "loss": 0.8182, + "step": 1350 + }, + { + "epoch": 2.3733567046450483, + "grad_norm": 1.0095982551574707, + "learning_rate": 2.560826593767244e-06, + "loss": 0.8651, + "step": 1355 + }, + { + "epoch": 2.3821209465381243, + "grad_norm": 1.0415823459625244, + "learning_rate": 2.4930949369871205e-06, + "loss": 0.7934, + "step": 1360 + }, + { + "epoch": 2.390885188431201, + "grad_norm": 0.9959484934806824, + "learning_rate": 2.426143274885493e-06, + "loss": 0.8131, + "step": 1365 + }, + { + "epoch": 2.399649430324277, + "grad_norm": 0.9777078628540039, + "learning_rate": 2.359978563983022e-06, + "loss": 0.8125, + "step": 1370 + }, + { + "epoch": 2.4084136722173533, + "grad_norm": 1.0206762552261353, + "learning_rate": 2.294607679033283e-06, + "loss": 0.7912, + "step": 1375 + }, + { + "epoch": 2.4171779141104293, + "grad_norm": 0.9738752245903015, + "learning_rate": 2.230037412308452e-06, + "loss": 0.8411, + "step": 1380 + }, + { + "epoch": 2.4259421560035057, + "grad_norm": 0.9954826831817627, + "learning_rate": 2.166274472893567e-06, + "loss": 0.8052, + "step": 1385 + }, + { + "epoch": 2.4347063978965817, + "grad_norm": 0.9861373901367188, + "learning_rate": 2.1033254859894224e-06, + "loss": 0.8041, + "step": 1390 + }, + { + "epoch": 2.443470639789658, + "grad_norm": 0.9600276947021484, + "learning_rate": 2.041196992224206e-06, + "loss": 0.8326, + "step": 1395 + }, + { + "epoch": 2.4522348816827346, + "grad_norm": 1.127557396888733, + "learning_rate": 1.9798954469738762e-06, + "loss": 0.8355, + "step": 1400 + }, + { + "epoch": 2.4609991235758106, + "grad_norm": 0.9988298416137695, + "learning_rate": 1.9194272196914533e-06, + "loss": 0.8473, + "step": 1405 + }, + { + "epoch": 2.469763365468887, + "grad_norm": 0.972212553024292, + "learning_rate": 1.8597985932451856e-06, + "loss": 0.816, + "step": 1410 + }, + { + "epoch": 2.478527607361963, + "grad_norm": 0.9716165065765381, + "learning_rate": 1.8010157632657544e-06, + "loss": 0.8157, + "step": 1415 + }, + { + "epoch": 2.4872918492550395, + "grad_norm": 0.9722920656204224, + "learning_rate": 1.7430848375025178e-06, + "loss": 0.8238, + "step": 1420 + }, + { + "epoch": 2.4960560911481156, + "grad_norm": 1.0044946670532227, + "learning_rate": 1.686011835188891e-06, + "loss": 0.8473, + "step": 1425 + }, + { + "epoch": 2.504820333041192, + "grad_norm": 0.9682095050811768, + "learning_rate": 1.6298026864169336e-06, + "loss": 0.8132, + "step": 1430 + }, + { + "epoch": 2.513584574934268, + "grad_norm": 0.9928619861602783, + "learning_rate": 1.5744632315211815e-06, + "loss": 0.837, + "step": 1435 + }, + { + "epoch": 2.5223488168273445, + "grad_norm": 0.9613544344902039, + "learning_rate": 1.5199992204718295e-06, + "loss": 0.8209, + "step": 1440 + }, + { + "epoch": 2.531113058720421, + "grad_norm": 1.0032097101211548, + "learning_rate": 1.466416312277269e-06, + "loss": 0.8303, + "step": 1445 + }, + { + "epoch": 2.539877300613497, + "grad_norm": 0.9630649089813232, + "learning_rate": 1.4137200743961189e-06, + "loss": 0.825, + "step": 1450 + }, + { + "epoch": 2.548641542506573, + "grad_norm": 0.9702491164207458, + "learning_rate": 1.3619159821587236e-06, + "loss": 0.8148, + "step": 1455 + }, + { + "epoch": 2.5574057843996494, + "grad_norm": 0.9509206414222717, + "learning_rate": 1.3110094181982657e-06, + "loss": 0.7695, + "step": 1460 + }, + { + "epoch": 2.566170026292726, + "grad_norm": 0.9589338302612305, + "learning_rate": 1.261005671891482e-06, + "loss": 0.8532, + "step": 1465 + }, + { + "epoch": 2.574934268185802, + "grad_norm": 0.9704285264015198, + "learning_rate": 1.2119099388090715e-06, + "loss": 0.797, + "step": 1470 + }, + { + "epoch": 2.5836985100788783, + "grad_norm": 1.0093833208084106, + "learning_rate": 1.1637273201758747e-06, + "loss": 0.8233, + "step": 1475 + }, + { + "epoch": 2.5924627519719543, + "grad_norm": 0.9612089991569519, + "learning_rate": 1.1164628223408169e-06, + "loss": 0.8489, + "step": 1480 + }, + { + "epoch": 2.6012269938650308, + "grad_norm": 0.9347235560417175, + "learning_rate": 1.0701213562567491e-06, + "loss": 0.7855, + "step": 1485 + }, + { + "epoch": 2.6099912357581068, + "grad_norm": 1.00240957736969, + "learning_rate": 1.0247077369701653e-06, + "loss": 0.8322, + "step": 1490 + }, + { + "epoch": 2.618755477651183, + "grad_norm": 0.99866783618927, + "learning_rate": 9.802266831209206e-07, + "loss": 0.8133, + "step": 1495 + }, + { + "epoch": 2.6275197195442592, + "grad_norm": 1.0041725635528564, + "learning_rate": 9.36682816451926e-07, + "loss": 0.8715, + "step": 1500 + }, + { + "epoch": 2.6362839614373357, + "grad_norm": 0.9615875482559204, + "learning_rate": 8.940806613289499e-07, + "loss": 0.8075, + "step": 1505 + }, + { + "epoch": 2.645048203330412, + "grad_norm": 0.9449265003204346, + "learning_rate": 8.524246442705153e-07, + "loss": 0.7974, + "step": 1510 + }, + { + "epoch": 2.653812445223488, + "grad_norm": 0.9578828811645508, + "learning_rate": 8.117190934879593e-07, + "loss": 0.8175, + "step": 1515 + }, + { + "epoch": 2.662576687116564, + "grad_norm": 0.9990285038948059, + "learning_rate": 7.719682384357308e-07, + "loss": 0.8147, + "step": 1520 + }, + { + "epoch": 2.6713409290096406, + "grad_norm": 0.9652912616729736, + "learning_rate": 7.33176209371923e-07, + "loss": 0.8429, + "step": 1525 + }, + { + "epoch": 2.680105170902717, + "grad_norm": 0.9373207092285156, + "learning_rate": 6.953470369291349e-07, + "loss": 0.825, + "step": 1530 + }, + { + "epoch": 2.688869412795793, + "grad_norm": 0.9682218432426453, + "learning_rate": 6.5848465169566e-07, + "loss": 0.7916, + "step": 1535 + }, + { + "epoch": 2.6976336546888695, + "grad_norm": 0.995035707950592, + "learning_rate": 6.225928838071016e-07, + "loss": 0.829, + "step": 1540 + }, + { + "epoch": 2.7063978965819455, + "grad_norm": 0.9676108956336975, + "learning_rate": 5.876754625483904e-07, + "loss": 0.8497, + "step": 1545 + }, + { + "epoch": 2.715162138475022, + "grad_norm": 0.9674281477928162, + "learning_rate": 5.537360159663107e-07, + "loss": 0.8126, + "step": 1550 + }, + { + "epoch": 2.7239263803680984, + "grad_norm": 0.9768509864807129, + "learning_rate": 5.207780704925314e-07, + "loss": 0.8432, + "step": 1555 + }, + { + "epoch": 2.7326906222611744, + "grad_norm": 0.9932735562324524, + "learning_rate": 4.888050505771869e-07, + "loss": 0.8293, + "step": 1560 + }, + { + "epoch": 2.7414548641542504, + "grad_norm": 0.9800174832344055, + "learning_rate": 4.5782027833307983e-07, + "loss": 0.7843, + "step": 1565 + }, + { + "epoch": 2.750219106047327, + "grad_norm": 0.9393450021743774, + "learning_rate": 4.2782697319048603e-07, + "loss": 0.8016, + "step": 1570 + }, + { + "epoch": 2.7589833479404033, + "grad_norm": 0.9714465737342834, + "learning_rate": 3.9882825156265846e-07, + "loss": 0.8264, + "step": 1575 + }, + { + "epoch": 2.7677475898334793, + "grad_norm": 0.975568950176239, + "learning_rate": 3.708271265220087e-07, + "loss": 0.802, + "step": 1580 + }, + { + "epoch": 2.776511831726556, + "grad_norm": 0.9788158535957336, + "learning_rate": 3.4382650748704173e-07, + "loss": 0.8374, + "step": 1585 + }, + { + "epoch": 2.785276073619632, + "grad_norm": 0.9417116641998291, + "learning_rate": 3.178291999200633e-07, + "loss": 0.8181, + "step": 1590 + }, + { + "epoch": 2.7940403155127083, + "grad_norm": 0.9802819490432739, + "learning_rate": 2.928379050356722e-07, + "loss": 0.8208, + "step": 1595 + }, + { + "epoch": 2.8028045574057843, + "grad_norm": 0.9727985858917236, + "learning_rate": 2.6885521952010105e-07, + "loss": 0.7862, + "step": 1600 + }, + { + "epoch": 2.8115687992988607, + "grad_norm": 0.9225666522979736, + "learning_rate": 2.458836352614069e-07, + "loss": 0.7791, + "step": 1605 + }, + { + "epoch": 2.8203330411919367, + "grad_norm": 1.038718342781067, + "learning_rate": 2.2392553909055813e-07, + "loss": 0.8164, + "step": 1610 + }, + { + "epoch": 2.829097283085013, + "grad_norm": 0.945773184299469, + "learning_rate": 2.029832125334319e-07, + "loss": 0.8277, + "step": 1615 + }, + { + "epoch": 2.8378615249780896, + "grad_norm": 0.9560094475746155, + "learning_rate": 1.8305883157375804e-07, + "loss": 0.7974, + "step": 1620 + }, + { + "epoch": 2.8466257668711656, + "grad_norm": 0.9896951913833618, + "learning_rate": 1.6415446642702337e-07, + "loss": 0.8084, + "step": 1625 + }, + { + "epoch": 2.8553900087642416, + "grad_norm": 0.9845879077911377, + "learning_rate": 1.4627208132536818e-07, + "loss": 0.8216, + "step": 1630 + }, + { + "epoch": 2.864154250657318, + "grad_norm": 0.9730380177497864, + "learning_rate": 1.2941353431350058e-07, + "loss": 0.7997, + "step": 1635 + }, + { + "epoch": 2.8729184925503946, + "grad_norm": 0.9810739159584045, + "learning_rate": 1.1358057705563641e-07, + "loss": 0.8212, + "step": 1640 + }, + { + "epoch": 2.8816827344434706, + "grad_norm": 0.9314019083976746, + "learning_rate": 9.877485465349057e-08, + "loss": 0.7794, + "step": 1645 + }, + { + "epoch": 2.890446976336547, + "grad_norm": 0.9651502966880798, + "learning_rate": 8.499790547535025e-08, + "loss": 0.8138, + "step": 1650 + }, + { + "epoch": 2.899211218229623, + "grad_norm": 0.966038167476654, + "learning_rate": 7.225116099623287e-08, + "loss": 0.8212, + "step": 1655 + }, + { + "epoch": 2.9079754601226995, + "grad_norm": 0.9493021965026855, + "learning_rate": 6.053594564914611e-08, + "loss": 0.832, + "step": 1660 + }, + { + "epoch": 2.9167397020157755, + "grad_norm": 0.9688047766685486, + "learning_rate": 4.985347668747809e-08, + "loss": 0.8239, + "step": 1665 + }, + { + "epoch": 2.925503943908852, + "grad_norm": 0.9778699278831482, + "learning_rate": 4.020486405852286e-08, + "loss": 0.7976, + "step": 1670 + }, + { + "epoch": 2.934268185801928, + "grad_norm": 0.9479379653930664, + "learning_rate": 3.15911102881461e-08, + "loss": 0.8375, + "step": 1675 + }, + { + "epoch": 2.9430324276950044, + "grad_norm": 1.0030702352523804, + "learning_rate": 2.4013110376623906e-08, + "loss": 0.8225, + "step": 1680 + }, + { + "epoch": 2.951796669588081, + "grad_norm": 1.0119658708572388, + "learning_rate": 1.747165170564724e-08, + "loss": 0.8276, + "step": 1685 + }, + { + "epoch": 2.960560911481157, + "grad_norm": 0.9981706738471985, + "learning_rate": 1.1967413956510687e-08, + "loss": 0.8661, + "step": 1690 + }, + { + "epoch": 2.969325153374233, + "grad_norm": 0.9298052787780762, + "learning_rate": 7.500969039491156e-09, + "loss": 0.8439, + "step": 1695 + }, + { + "epoch": 2.9780893952673093, + "grad_norm": 0.9936395287513733, + "learning_rate": 4.072781034425432e-09, + "loss": 0.8221, + "step": 1700 + }, + { + "epoch": 2.9868536371603858, + "grad_norm": 1.0040860176086426, + "learning_rate": 1.6832061424865155e-09, + "loss": 0.818, + "step": 1705 + }, + { + "epoch": 2.9956178790534618, + "grad_norm": 0.9950876235961914, + "learning_rate": 3.324926491787839e-10, + "loss": 0.8279, + "step": 1710 + }, + { + "epoch": 3.0, + "eval_loss": 1.1766911745071411, + "eval_runtime": 199.3045, + "eval_samples_per_second": 9.167, + "eval_steps_per_second": 2.293, + "step": 1713 + }, + { + "epoch": 3.0, + "step": 1713, + "total_flos": 90953314467840.0, + "train_loss": 0.9790556504583052, + "train_runtime": 12705.6497, + "train_samples_per_second": 8.618, + "train_steps_per_second": 0.135 + } + ], + "logging_steps": 5, + "max_steps": 1713, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": false, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 90953314467840.0, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +} diff --git a/training_args.bin b/training_args.bin new file mode 100644 index 0000000..51f5370 --- /dev/null +++ b/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:834f99a8a19a900dffb91f954b02371035fe8e1052d95a3792f28fe5c1c60d03 +size 7313