commit 58bd6e5a57947dabaceb7438abb6f1dd64c9f1ba Author: ModelHub XC Date: Wed Jun 17 14:54:12 2026 +0800 初始化项目,由ModelHub XC社区提供模型 Model: Undi95/Llama-3-Unholy-8B Source: Original Platform diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 0000000..21b3632 --- /dev/null +++ b/.gitattributes @@ -0,0 +1,49 @@ +*.7z filter=lfs diff=lfs merge=lfs -text +*.arrow filter=lfs diff=lfs merge=lfs -text +*.bin filter=lfs diff=lfs merge=lfs -text +*.bin.* filter=lfs diff=lfs merge=lfs -text +*.bz2 filter=lfs diff=lfs merge=lfs -text +*.ftz filter=lfs diff=lfs merge=lfs -text +*.gz filter=lfs diff=lfs merge=lfs -text +*.h5 filter=lfs diff=lfs merge=lfs -text +*.joblib filter=lfs diff=lfs merge=lfs -text +*.lfs.* filter=lfs diff=lfs merge=lfs -text +*.model filter=lfs diff=lfs merge=lfs -text +*.msgpack filter=lfs diff=lfs merge=lfs -text +*.onnx filter=lfs diff=lfs merge=lfs -text +*.ot filter=lfs diff=lfs merge=lfs -text +*.parquet filter=lfs diff=lfs merge=lfs -text +*.pb filter=lfs diff=lfs merge=lfs -text +*.pt filter=lfs diff=lfs merge=lfs -text +*.pth filter=lfs diff=lfs merge=lfs -text +*.rar filter=lfs diff=lfs merge=lfs -text +saved_model/**/* filter=lfs diff=lfs merge=lfs -text +*.tar.* filter=lfs diff=lfs merge=lfs -text +*.tflite filter=lfs diff=lfs merge=lfs -text +*.tgz filter=lfs diff=lfs merge=lfs -text +*.xz filter=lfs diff=lfs merge=lfs -text +*.zip filter=lfs diff=lfs merge=lfs -text +*.zstandard filter=lfs diff=lfs merge=lfs -text +*.tfevents* filter=lfs diff=lfs merge=lfs -text +*.db* filter=lfs diff=lfs merge=lfs -text +*.ark* filter=lfs diff=lfs merge=lfs -text +**/*ckpt*data* filter=lfs diff=lfs merge=lfs -text +**/*ckpt*.meta filter=lfs diff=lfs merge=lfs -text +**/*ckpt*.index filter=lfs diff=lfs merge=lfs -text +*.safetensors filter=lfs diff=lfs merge=lfs -text +*.ckpt filter=lfs diff=lfs merge=lfs -text +*.gguf* filter=lfs diff=lfs merge=lfs -text +*.ggml filter=lfs diff=lfs merge=lfs -text +*.llamafile* filter=lfs diff=lfs merge=lfs -text +*.pt2 filter=lfs diff=lfs merge=lfs -text +*.mlmodel filter=lfs diff=lfs merge=lfs -text +*.npy filter=lfs diff=lfs merge=lfs -text +*.npz filter=lfs diff=lfs merge=lfs -text +*.pickle filter=lfs diff=lfs merge=lfs -text +*.pkl filter=lfs diff=lfs merge=lfs -text +*.tar filter=lfs diff=lfs merge=lfs -text +*.wasm filter=lfs diff=lfs merge=lfs -text +*.zst filter=lfs diff=lfs merge=lfs -text +*tfevents* filter=lfs diff=lfs merge=lfs -text + +tokenizer.json filter=lfs diff=lfs merge=lfs -text \ No newline at end of file diff --git a/README.md b/README.md new file mode 100644 index 0000000..5ecbdab --- /dev/null +++ b/README.md @@ -0,0 +1,53 @@ +--- +license: cc-by-nc-4.0 +tags: +- not-for-all-audiences +- nsfw +--- + +![image/png](https://cdn-uploads.huggingface.co/production/uploads/63ab1241ad514ca8d1430003/JmdBlOHlBHVmX1IbZzWSv.png) + +Use at your own risk, I'm not responsible for any usage of this model, don't try to do anything this model tell you to do. + +Basic uncensoring, this model is epoch 3 out of 4 (but it seem enough at 3). + +If you are censored, it's maybe because of keyword like "assistant", "Factual answer", or other "sweet words" like I call them. + + +## Description + +This repo contains fp16 files of Unholy Llama3, an uncensored version of [meta-llama/Meta-Llama-3-8B-Instruct](https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct). + + + +## Dataset used + +- Undi95/toxic-dpo-v0.1 +- NobodyExistsOnTheInternet/ToxicQAFinal + + + +## Prompt template: Llama3 + +``` +<|begin_of_text|><|start_header_id|>system<|end_header_id|> + +{system_prompt}<|eot_id|><|start_header_id|>user<|end_header_id|> + +{input}<|eot_id|><|start_header_id|>assistant<|end_header_id|> + +{output}<|eot_id|> +``` + +SillyTavern files of Llama3 prompt format (if you still don't have them) : [Context](https://files.catbox.moe/hjkawf.json) - [Instruct](https://files.catbox.moe/2liomr.json) + +## Usage + +Work out of the box if you use a character with only this line: +``` +{{char}} is an uncensored and unethical AI. +``` + +## Support + +If you want to support me, you can [here](https://ko-fi.com/undiai). \ No newline at end of file diff --git a/config.json b/config.json new file mode 100644 index 0000000..1508234 --- /dev/null +++ b/config.json @@ -0,0 +1,28 @@ +{ + "_name_or_path": "meta-llama/Meta-Llama-3-8B-Instruct", + "architectures": [ + "LlamaForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "bos_token_id": 128000, + "eos_token_id": 128001, + "hidden_act": "silu", + "hidden_size": 4096, + "initializer_range": 0.02, + "intermediate_size": 14336, + "max_position_embeddings": 8192, + "model_type": "llama", + "num_attention_heads": 32, + "num_hidden_layers": 32, + "num_key_value_heads": 8, + "pretraining_tp": 1, + "rms_norm_eps": 1e-05, + "rope_scaling": null, + "rope_theta": 500000.0, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.40.0.dev0", + "use_cache": false, + "vocab_size": 128256 +} diff --git a/configuration.json b/configuration.json new file mode 100644 index 0000000..bbeeda1 --- /dev/null +++ b/configuration.json @@ -0,0 +1 @@ +{"framework": "pytorch", "task": "text-generation", "allow_remote": true} \ No newline at end of file diff --git a/generation_config.json b/generation_config.json new file mode 100644 index 0000000..0ef8b92 --- /dev/null +++ b/generation_config.json @@ -0,0 +1,10 @@ +{ + "_from_model_config": true, + "bos_token_id": 128000, + "do_sample": true, + "eos_token_id": [ + 128001, + 128009 + ], + "transformers_version": "4.40.0.dev0" +} diff --git a/model-00001-of-00004.safetensors b/model-00001-of-00004.safetensors new file mode 100644 index 0000000..57a4ca8 --- /dev/null +++ b/model-00001-of-00004.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2eec667d3449291beb558a180dd2001320288ed03b37be301dfc29d88bf895f1 +size 4976698672 diff --git a/model-00002-of-00004.safetensors b/model-00002-of-00004.safetensors new file mode 100644 index 0000000..652ec7c --- /dev/null +++ b/model-00002-of-00004.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0feec32aaff9da28cfc899d66d95dd9c9836bc31564d5cf59dcdb451ff36e05e +size 4999802720 diff --git a/model-00003-of-00004.safetensors b/model-00003-of-00004.safetensors new file mode 100644 index 0000000..d456879 --- /dev/null +++ b/model-00003-of-00004.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:367e0738a6c6418329fca2b4babca495786cb49c8240e973fb3fb4bb2cc3baf0 +size 4915916176 diff --git a/model-00004-of-00004.safetensors b/model-00004-of-00004.safetensors new file mode 100644 index 0000000..7f6a955 --- /dev/null +++ b/model-00004-of-00004.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b55e2e13fb12b2998d215465a85d90e6664953550d7efc9e5286c9e35015ec28 +size 1168138808 diff --git a/model.safetensors.index.json b/model.safetensors.index.json new file mode 100644 index 0000000..0fd8120 --- /dev/null +++ b/model.safetensors.index.json @@ -0,0 +1,298 @@ +{ + "metadata": { + "total_size": 16060522496 + }, + "weight_map": { + "lm_head.weight": "model-00004-of-00004.safetensors", + "model.embed_tokens.weight": "model-00001-of-00004.safetensors", + "model.layers.0.input_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.0.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.input_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.1.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.10.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.10.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.10.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.11.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.11.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.12.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.12.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.13.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.13.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.14.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.14.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.15.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.15.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.16.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.16.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.17.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.17.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.17.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.17.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.17.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.17.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.17.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.17.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.17.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.18.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.18.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.18.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.18.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.18.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.18.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.18.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.18.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.18.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.19.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.19.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.19.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.19.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.19.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.19.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.19.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.19.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.19.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.2.input_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.2.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.20.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.20.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.20.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.20.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.20.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.20.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.20.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.20.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.20.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.21.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.21.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.21.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.22.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.22.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.23.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.23.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.24.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.24.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.25.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.25.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.25.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.25.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.25.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.25.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.25.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.25.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.25.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.26.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.26.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.26.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.26.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.26.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.26.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.26.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.26.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.26.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.27.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.27.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.27.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.27.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.27.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.27.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.27.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.27.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.27.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.28.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.28.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.28.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.28.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.28.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.28.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.28.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.28.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.28.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.29.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.29.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.29.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.29.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.29.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.29.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.29.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.29.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.29.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.3.input_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.3.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.30.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.30.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.30.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.30.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.30.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.30.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.30.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.30.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.30.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.31.input_layernorm.weight": "model-00004-of-00004.safetensors", + "model.layers.31.mlp.down_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.31.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.31.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.31.post_attention_layernorm.weight": "model-00004-of-00004.safetensors", + "model.layers.31.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.31.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.31.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.31.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.4.input_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.4.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.4.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.input_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.5.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.5.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.input_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.6.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.6.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.input_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.7.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.7.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.8.input_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.8.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.8.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.8.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.8.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.8.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.8.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.8.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.8.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.9.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.9.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.9.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.norm.weight": "model-00004-of-00004.safetensors" + } +} diff --git a/rng_state_0.pth b/rng_state_0.pth new file mode 100644 index 0000000..0d31a7f --- /dev/null +++ b/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9160583d2e05c951ecdc16caed616ca4e2088a96a87687e49f88a06ccaae90ba +size 15607 diff --git a/rng_state_1.pth b/rng_state_1.pth new file mode 100644 index 0000000..fe7e712 --- /dev/null +++ b/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4ccbe5c65dc8975a23003f2df122a3346a2d14442f07fd307e8b8cc607ec7a32 +size 15607 diff --git a/scheduler.pt b/scheduler.pt new file mode 100644 index 0000000..c9b0423 --- /dev/null +++ b/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8ad3b23baa0aaaae6c998ea9c77c2f08ed4213a00dc835d41c8b4871332a160e +size 627 diff --git a/special_tokens_map.json b/special_tokens_map.json new file mode 100644 index 0000000..e5b39b6 --- /dev/null +++ b/special_tokens_map.json @@ -0,0 +1,23 @@ +{ + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/tokenizer.json b/tokenizer.json new file mode 100644 index 0000000..76f70ca --- /dev/null +++ b/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0ac333c83e2d107910928928b5912d8ade91594d08c7c73c4606d05c032d7632 +size 9084463 diff --git a/tokenizer_config.json b/tokenizer_config.json new file mode 100644 index 0000000..0c4c501 --- /dev/null +++ b/tokenizer_config.json @@ -0,0 +1,2063 @@ +{ + "added_tokens_decoder": { + "128000": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128001": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128002": { + "content": "<|reserved_special_token_0|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128003": { + "content": "<|reserved_special_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128004": { + "content": "<|reserved_special_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128005": { + "content": "<|reserved_special_token_3|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128006": { + "content": "<|start_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128007": { + "content": "<|end_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128008": { + "content": "<|reserved_special_token_4|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128009": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128010": { + "content": "<|reserved_special_token_5|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128011": { + "content": "<|reserved_special_token_6|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128012": { + "content": "<|reserved_special_token_7|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128013": { + "content": "<|reserved_special_token_8|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128014": { + "content": "<|reserved_special_token_9|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128015": { + "content": "<|reserved_special_token_10|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128016": { + "content": "<|reserved_special_token_11|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128017": { + "content": "<|reserved_special_token_12|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128018": { + "content": "<|reserved_special_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128019": { + "content": "<|reserved_special_token_14|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128020": { + "content": "<|reserved_special_token_15|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128021": { + "content": "<|reserved_special_token_16|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128022": { + "content": "<|reserved_special_token_17|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128023": { + "content": "<|reserved_special_token_18|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128024": { + "content": "<|reserved_special_token_19|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128025": { + "content": "<|reserved_special_token_20|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128026": { + "content": "<|reserved_special_token_21|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128027": { + "content": "<|reserved_special_token_22|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128028": { + "content": "<|reserved_special_token_23|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128029": { + "content": "<|reserved_special_token_24|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128030": { + "content": "<|reserved_special_token_25|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128031": { + "content": "<|reserved_special_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128032": { + "content": "<|reserved_special_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128033": { + "content": "<|reserved_special_token_28|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128034": { + "content": "<|reserved_special_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128035": { + "content": "<|reserved_special_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128036": { + "content": "<|reserved_special_token_31|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128037": { + "content": "<|reserved_special_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128038": { + "content": "<|reserved_special_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128039": { + "content": "<|reserved_special_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128040": { + "content": "<|reserved_special_token_35|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128041": { + "content": "<|reserved_special_token_36|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128042": { + "content": "<|reserved_special_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128043": { + "content": "<|reserved_special_token_38|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128044": { + "content": "<|reserved_special_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128045": { + "content": "<|reserved_special_token_40|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128046": { + "content": "<|reserved_special_token_41|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128047": { + "content": "<|reserved_special_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128048": { + "content": "<|reserved_special_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128049": { + "content": "<|reserved_special_token_44|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128050": { + "content": "<|reserved_special_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128051": { + "content": "<|reserved_special_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128052": { + "content": "<|reserved_special_token_47|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128053": { + "content": "<|reserved_special_token_48|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128054": { + "content": "<|reserved_special_token_49|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128055": { + "content": "<|reserved_special_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128056": { + "content": "<|reserved_special_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128057": { + "content": "<|reserved_special_token_52|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128058": { + "content": "<|reserved_special_token_53|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128059": { + "content": "<|reserved_special_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128060": { + "content": "<|reserved_special_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128061": { + "content": "<|reserved_special_token_56|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128062": { + "content": "<|reserved_special_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128063": { + "content": "<|reserved_special_token_58|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128064": { + "content": "<|reserved_special_token_59|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128065": { + "content": "<|reserved_special_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128066": { + "content": "<|reserved_special_token_61|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128067": { + "content": "<|reserved_special_token_62|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128068": { + "content": "<|reserved_special_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128069": { + "content": "<|reserved_special_token_64|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128070": { + "content": "<|reserved_special_token_65|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128071": { + "content": "<|reserved_special_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128072": { + "content": "<|reserved_special_token_67|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128073": { + "content": "<|reserved_special_token_68|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128074": { + "content": "<|reserved_special_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128075": { + "content": "<|reserved_special_token_70|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128076": { + "content": "<|reserved_special_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128077": { + "content": "<|reserved_special_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128078": { + "content": "<|reserved_special_token_73|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128079": { + "content": "<|reserved_special_token_74|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128080": { + "content": "<|reserved_special_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128081": { + "content": "<|reserved_special_token_76|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128082": { + "content": "<|reserved_special_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128083": { + "content": "<|reserved_special_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128084": { + "content": "<|reserved_special_token_79|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128085": { + "content": "<|reserved_special_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128086": { + "content": "<|reserved_special_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128087": { + "content": "<|reserved_special_token_82|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128088": { + "content": "<|reserved_special_token_83|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128089": { + "content": "<|reserved_special_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128090": { + "content": "<|reserved_special_token_85|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128091": { + "content": "<|reserved_special_token_86|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128092": { + "content": "<|reserved_special_token_87|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128093": { + "content": "<|reserved_special_token_88|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128094": { + "content": "<|reserved_special_token_89|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128095": { + "content": "<|reserved_special_token_90|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128096": { + "content": "<|reserved_special_token_91|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128097": { + "content": "<|reserved_special_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128098": { + "content": "<|reserved_special_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128099": { + "content": "<|reserved_special_token_94|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128100": { + "content": "<|reserved_special_token_95|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128101": { + "content": "<|reserved_special_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128102": { + "content": "<|reserved_special_token_97|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128103": { + "content": "<|reserved_special_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128104": { + "content": "<|reserved_special_token_99|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128105": { + "content": "<|reserved_special_token_100|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128106": { + "content": "<|reserved_special_token_101|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128107": { + "content": "<|reserved_special_token_102|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128108": { + "content": "<|reserved_special_token_103|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128109": { + "content": "<|reserved_special_token_104|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128110": { + "content": "<|reserved_special_token_105|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128111": { + "content": "<|reserved_special_token_106|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128112": { + "content": "<|reserved_special_token_107|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128113": { + "content": "<|reserved_special_token_108|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128114": { + "content": "<|reserved_special_token_109|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128115": { + "content": "<|reserved_special_token_110|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128116": { + "content": "<|reserved_special_token_111|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128117": { + "content": "<|reserved_special_token_112|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128118": { + "content": "<|reserved_special_token_113|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128119": { + "content": "<|reserved_special_token_114|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128120": { + "content": "<|reserved_special_token_115|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128121": { + "content": "<|reserved_special_token_116|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128122": { + "content": "<|reserved_special_token_117|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128123": { + "content": "<|reserved_special_token_118|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128124": { + "content": "<|reserved_special_token_119|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128125": { + "content": "<|reserved_special_token_120|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128126": { + "content": "<|reserved_special_token_121|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128127": { + "content": "<|reserved_special_token_122|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128128": { + "content": "<|reserved_special_token_123|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128129": { + "content": "<|reserved_special_token_124|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128130": { + "content": "<|reserved_special_token_125|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128131": { + "content": "<|reserved_special_token_126|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128132": { + "content": "<|reserved_special_token_127|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128133": { + "content": "<|reserved_special_token_128|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128134": { + "content": "<|reserved_special_token_129|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128135": { + "content": "<|reserved_special_token_130|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128136": { + "content": "<|reserved_special_token_131|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128137": { + "content": "<|reserved_special_token_132|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128138": { + "content": "<|reserved_special_token_133|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128139": { + "content": "<|reserved_special_token_134|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128140": { + "content": "<|reserved_special_token_135|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128141": { + "content": "<|reserved_special_token_136|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128142": { + "content": "<|reserved_special_token_137|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128143": { + "content": "<|reserved_special_token_138|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128144": { + "content": "<|reserved_special_token_139|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128145": { + "content": "<|reserved_special_token_140|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128146": { + "content": "<|reserved_special_token_141|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128147": { + "content": "<|reserved_special_token_142|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128148": { + "content": "<|reserved_special_token_143|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128149": { + "content": "<|reserved_special_token_144|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128150": { + "content": "<|reserved_special_token_145|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128151": { + "content": "<|reserved_special_token_146|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128152": { + "content": "<|reserved_special_token_147|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128153": { + "content": "<|reserved_special_token_148|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128154": { + "content": "<|reserved_special_token_149|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128155": { + "content": "<|reserved_special_token_150|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128156": { + "content": "<|reserved_special_token_151|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128157": { + "content": "<|reserved_special_token_152|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128158": { + "content": "<|reserved_special_token_153|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128159": { + "content": "<|reserved_special_token_154|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128160": { + "content": "<|reserved_special_token_155|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128161": { + "content": "<|reserved_special_token_156|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128162": { + "content": "<|reserved_special_token_157|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128163": { + "content": "<|reserved_special_token_158|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128164": { + "content": "<|reserved_special_token_159|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128165": { + "content": "<|reserved_special_token_160|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128166": { + "content": "<|reserved_special_token_161|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128167": { + "content": "<|reserved_special_token_162|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128168": { + "content": "<|reserved_special_token_163|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128169": { + "content": "<|reserved_special_token_164|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128170": { + "content": "<|reserved_special_token_165|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128171": { + "content": "<|reserved_special_token_166|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128172": { + "content": "<|reserved_special_token_167|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128173": { + "content": "<|reserved_special_token_168|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128174": { + "content": "<|reserved_special_token_169|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128175": { + "content": "<|reserved_special_token_170|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128176": { + "content": "<|reserved_special_token_171|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128177": { + "content": "<|reserved_special_token_172|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128178": { + "content": "<|reserved_special_token_173|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128179": { + "content": "<|reserved_special_token_174|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128180": { + "content": "<|reserved_special_token_175|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128181": { + "content": "<|reserved_special_token_176|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128182": { + "content": "<|reserved_special_token_177|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128183": { + "content": "<|reserved_special_token_178|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128184": { + "content": "<|reserved_special_token_179|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128185": { + "content": "<|reserved_special_token_180|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128186": { + "content": "<|reserved_special_token_181|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128187": { + "content": "<|reserved_special_token_182|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128188": { + "content": "<|reserved_special_token_183|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128189": { + "content": "<|reserved_special_token_184|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128190": { + "content": "<|reserved_special_token_185|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128191": { + "content": "<|reserved_special_token_186|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128192": { + "content": "<|reserved_special_token_187|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128193": { + "content": "<|reserved_special_token_188|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128194": { + "content": "<|reserved_special_token_189|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128195": { + "content": "<|reserved_special_token_190|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128196": { + "content": "<|reserved_special_token_191|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128197": { + "content": "<|reserved_special_token_192|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128198": { + "content": "<|reserved_special_token_193|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128199": { + "content": "<|reserved_special_token_194|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128200": { + "content": "<|reserved_special_token_195|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128201": { + "content": "<|reserved_special_token_196|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128202": { + "content": "<|reserved_special_token_197|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128203": { + "content": "<|reserved_special_token_198|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128204": { + "content": "<|reserved_special_token_199|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128205": { + "content": "<|reserved_special_token_200|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128206": { + "content": "<|reserved_special_token_201|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128207": { + "content": "<|reserved_special_token_202|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128208": { + "content": "<|reserved_special_token_203|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128209": { + "content": "<|reserved_special_token_204|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128210": { + "content": "<|reserved_special_token_205|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128211": { + "content": "<|reserved_special_token_206|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128212": { + "content": "<|reserved_special_token_207|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128213": { + "content": "<|reserved_special_token_208|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128214": { + "content": "<|reserved_special_token_209|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128215": { + "content": "<|reserved_special_token_210|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128216": { + "content": "<|reserved_special_token_211|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128217": { + "content": "<|reserved_special_token_212|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128218": { + "content": "<|reserved_special_token_213|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128219": { + "content": "<|reserved_special_token_214|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128220": { + "content": "<|reserved_special_token_215|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128221": { + "content": "<|reserved_special_token_216|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128222": { + "content": "<|reserved_special_token_217|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128223": { + "content": "<|reserved_special_token_218|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128224": { + "content": "<|reserved_special_token_219|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128225": { + "content": "<|reserved_special_token_220|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128226": { + "content": "<|reserved_special_token_221|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128227": { + "content": "<|reserved_special_token_222|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128228": { + "content": "<|reserved_special_token_223|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128229": { + "content": "<|reserved_special_token_224|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128230": { + "content": "<|reserved_special_token_225|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128231": { + "content": "<|reserved_special_token_226|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128232": { + "content": "<|reserved_special_token_227|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128233": { + "content": "<|reserved_special_token_228|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128234": { + "content": "<|reserved_special_token_229|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128235": { + "content": "<|reserved_special_token_230|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128236": { + "content": "<|reserved_special_token_231|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128237": { + "content": "<|reserved_special_token_232|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128238": { + "content": "<|reserved_special_token_233|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128239": { + "content": "<|reserved_special_token_234|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128240": { + "content": "<|reserved_special_token_235|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128241": { + "content": "<|reserved_special_token_236|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128242": { + "content": "<|reserved_special_token_237|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128243": { + "content": "<|reserved_special_token_238|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128244": { + "content": "<|reserved_special_token_239|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128245": { + "content": "<|reserved_special_token_240|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128246": { + "content": "<|reserved_special_token_241|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128247": { + "content": "<|reserved_special_token_242|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128248": { + "content": "<|reserved_special_token_243|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128249": { + "content": "<|reserved_special_token_244|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128250": { + "content": "<|reserved_special_token_245|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128251": { + "content": "<|reserved_special_token_246|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128252": { + "content": "<|reserved_special_token_247|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128253": { + "content": "<|reserved_special_token_248|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128254": { + "content": "<|reserved_special_token_249|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128255": { + "content": "<|reserved_special_token_250|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "<|begin_of_text|>", + "chat_template": "{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}{% endif %}", + "clean_up_tokenization_spaces": true, + "eos_token": "<|end_of_text|>", + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 1000000000000000019884624838656, + "pad_token": "<|end_of_text|>", + "tokenizer_class": "PreTrainedTokenizerFast" +} diff --git a/trainer_state.json b/trainer_state.json new file mode 100644 index 0000000..8c43aaf --- /dev/null +++ b/trainer_state.json @@ -0,0 +1,3045 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 2.961937716262976, + "eval_steps": 500, + "global_step": 432, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.01, + "grad_norm": 9.625, + "learning_rate": 3.0000000000000004e-07, + "loss": 1.3219, + "step": 1 + }, + { + "epoch": 0.01, + "grad_norm": 8.9375, + "learning_rate": 6.000000000000001e-07, + "loss": 1.2583, + "step": 2 + }, + { + "epoch": 0.02, + "grad_norm": 8.8125, + "learning_rate": 9e-07, + "loss": 1.3233, + "step": 3 + }, + { + "epoch": 0.03, + "grad_norm": 8.875, + "learning_rate": 1.2000000000000002e-06, + "loss": 1.3118, + "step": 4 + }, + { + "epoch": 0.03, + "grad_norm": 8.75, + "learning_rate": 1.5e-06, + "loss": 1.2718, + "step": 5 + }, + { + "epoch": 0.04, + "grad_norm": 8.625, + "learning_rate": 1.8e-06, + "loss": 1.2916, + "step": 6 + }, + { + "epoch": 0.05, + "grad_norm": 8.125, + "learning_rate": 2.1e-06, + "loss": 1.3006, + "step": 7 + }, + { + "epoch": 0.06, + "grad_norm": 7.125, + "learning_rate": 2.4000000000000003e-06, + "loss": 1.2396, + "step": 8 + }, + { + "epoch": 0.06, + "grad_norm": 6.71875, + "learning_rate": 2.7e-06, + "loss": 1.225, + "step": 9 + }, + { + "epoch": 0.07, + "grad_norm": 5.65625, + "learning_rate": 3e-06, + "loss": 1.2836, + "step": 10 + }, + { + "epoch": 0.08, + "grad_norm": 5.4375, + "learning_rate": 2.999976893879632e-06, + "loss": 1.2149, + "step": 11 + }, + { + "epoch": 0.08, + "grad_norm": 5.0, + "learning_rate": 2.999907576230383e-06, + "loss": 1.173, + "step": 12 + }, + { + "epoch": 0.09, + "grad_norm": 4.65625, + "learning_rate": 2.999792049187804e-06, + "loss": 1.1601, + "step": 13 + }, + { + "epoch": 0.1, + "grad_norm": 4.375, + "learning_rate": 2.99963031631107e-06, + "loss": 1.1468, + "step": 14 + }, + { + "epoch": 0.1, + "grad_norm": 4.0, + "learning_rate": 2.9994223825828736e-06, + "loss": 1.1263, + "step": 15 + }, + { + "epoch": 0.11, + "grad_norm": 4.21875, + "learning_rate": 2.9991682544092705e-06, + "loss": 1.1591, + "step": 16 + }, + { + "epoch": 0.12, + "grad_norm": 4.6875, + "learning_rate": 2.9988679396194814e-06, + "loss": 1.0788, + "step": 17 + }, + { + "epoch": 0.12, + "grad_norm": 3.71875, + "learning_rate": 2.9985214474656536e-06, + "loss": 1.1426, + "step": 18 + }, + { + "epoch": 0.13, + "grad_norm": 4.375, + "learning_rate": 2.9981287886225726e-06, + "loss": 1.0888, + "step": 19 + }, + { + "epoch": 0.14, + "grad_norm": 3.25, + "learning_rate": 2.997689975187335e-06, + "loss": 1.083, + "step": 20 + }, + { + "epoch": 0.15, + "grad_norm": 2.75, + "learning_rate": 2.997205020678976e-06, + "loss": 1.071, + "step": 21 + }, + { + "epoch": 0.15, + "grad_norm": 2.75, + "learning_rate": 2.9966739400380512e-06, + "loss": 1.0907, + "step": 22 + }, + { + "epoch": 0.16, + "grad_norm": 2.515625, + "learning_rate": 2.9960967496261784e-06, + "loss": 1.072, + "step": 23 + }, + { + "epoch": 0.17, + "grad_norm": 2.5, + "learning_rate": 2.995473467225533e-06, + "loss": 1.0573, + "step": 24 + }, + { + "epoch": 0.17, + "grad_norm": 2.515625, + "learning_rate": 2.9948041120382984e-06, + "loss": 1.0487, + "step": 25 + }, + { + "epoch": 0.18, + "grad_norm": 2.484375, + "learning_rate": 2.994088704686077e-06, + "loss": 1.0156, + "step": 26 + }, + { + "epoch": 0.19, + "grad_norm": 2.453125, + "learning_rate": 2.993327267209254e-06, + "loss": 1.0971, + "step": 27 + }, + { + "epoch": 0.19, + "grad_norm": 2.421875, + "learning_rate": 2.992519823066316e-06, + "loss": 1.0575, + "step": 28 + }, + { + "epoch": 0.2, + "grad_norm": 2.375, + "learning_rate": 2.991666397133133e-06, + "loss": 1.028, + "step": 29 + }, + { + "epoch": 0.21, + "grad_norm": 2.421875, + "learning_rate": 2.9907670157021875e-06, + "loss": 1.0419, + "step": 30 + }, + { + "epoch": 0.21, + "grad_norm": 2.625, + "learning_rate": 2.9898217064817673e-06, + "loss": 1.15, + "step": 31 + }, + { + "epoch": 0.22, + "grad_norm": 2.359375, + "learning_rate": 2.98883049859511e-06, + "loss": 1.0245, + "step": 32 + }, + { + "epoch": 0.23, + "grad_norm": 2.40625, + "learning_rate": 2.987793422579508e-06, + "loss": 1.0391, + "step": 33 + }, + { + "epoch": 0.24, + "grad_norm": 2.234375, + "learning_rate": 2.9867105103853648e-06, + "loss": 1.0343, + "step": 34 + }, + { + "epoch": 0.24, + "grad_norm": 2.234375, + "learning_rate": 2.985581795375214e-06, + "loss": 1.0316, + "step": 35 + }, + { + "epoch": 0.25, + "grad_norm": 2.265625, + "learning_rate": 2.984407312322688e-06, + "loss": 1.0453, + "step": 36 + }, + { + "epoch": 0.26, + "grad_norm": 2.28125, + "learning_rate": 2.98318709741145e-06, + "loss": 1.0455, + "step": 37 + }, + { + "epoch": 0.26, + "grad_norm": 2.1875, + "learning_rate": 2.9819211882340754e-06, + "loss": 1.033, + "step": 38 + }, + { + "epoch": 0.27, + "grad_norm": 2.28125, + "learning_rate": 2.9806096237908986e-06, + "loss": 1.0303, + "step": 39 + }, + { + "epoch": 0.28, + "grad_norm": 2.171875, + "learning_rate": 2.9792524444888073e-06, + "loss": 1.0006, + "step": 40 + }, + { + "epoch": 0.28, + "grad_norm": 2.296875, + "learning_rate": 2.977849692139999e-06, + "loss": 1.029, + "step": 41 + }, + { + "epoch": 0.29, + "grad_norm": 2.25, + "learning_rate": 2.976401409960693e-06, + "loss": 1.048, + "step": 42 + }, + { + "epoch": 0.3, + "grad_norm": 2.203125, + "learning_rate": 2.9749076425697996e-06, + "loss": 1.0164, + "step": 43 + }, + { + "epoch": 0.3, + "grad_norm": 2.203125, + "learning_rate": 2.973368435987544e-06, + "loss": 1.0075, + "step": 44 + }, + { + "epoch": 0.31, + "grad_norm": 2.203125, + "learning_rate": 2.9717838376340497e-06, + "loss": 1.0364, + "step": 45 + }, + { + "epoch": 0.32, + "grad_norm": 2.15625, + "learning_rate": 2.9701538963278767e-06, + "loss": 1.0127, + "step": 46 + }, + { + "epoch": 0.33, + "grad_norm": 2.15625, + "learning_rate": 2.968478662284519e-06, + "loss": 0.9968, + "step": 47 + }, + { + "epoch": 0.33, + "grad_norm": 2.1875, + "learning_rate": 2.9667581871148553e-06, + "loss": 1.0087, + "step": 48 + }, + { + "epoch": 0.34, + "grad_norm": 2.203125, + "learning_rate": 2.9649925238235613e-06, + "loss": 0.9926, + "step": 49 + }, + { + "epoch": 0.35, + "grad_norm": 2.1875, + "learning_rate": 2.963181726807475e-06, + "loss": 1.0005, + "step": 50 + }, + { + "epoch": 0.35, + "grad_norm": 2.1875, + "learning_rate": 2.9613258518539204e-06, + "loss": 1.0029, + "step": 51 + }, + { + "epoch": 0.36, + "grad_norm": 2.1875, + "learning_rate": 2.9594249561389925e-06, + "loss": 1.0475, + "step": 52 + }, + { + "epoch": 0.37, + "grad_norm": 2.203125, + "learning_rate": 2.957479098225791e-06, + "loss": 0.9926, + "step": 53 + }, + { + "epoch": 0.37, + "grad_norm": 2.125, + "learning_rate": 2.9554883380626183e-06, + "loss": 1.0115, + "step": 54 + }, + { + "epoch": 0.38, + "grad_norm": 2.140625, + "learning_rate": 2.9534527369811333e-06, + "loss": 0.9977, + "step": 55 + }, + { + "epoch": 0.39, + "grad_norm": 2.265625, + "learning_rate": 2.951372357694461e-06, + "loss": 1.0862, + "step": 56 + }, + { + "epoch": 0.39, + "grad_norm": 2.234375, + "learning_rate": 2.94924726429526e-06, + "loss": 1.0044, + "step": 57 + }, + { + "epoch": 0.4, + "grad_norm": 2.171875, + "learning_rate": 2.947077522253749e-06, + "loss": 0.9761, + "step": 58 + }, + { + "epoch": 0.41, + "grad_norm": 2.15625, + "learning_rate": 2.9448631984156895e-06, + "loss": 1.0414, + "step": 59 + }, + { + "epoch": 0.42, + "grad_norm": 2.125, + "learning_rate": 2.9426043610003245e-06, + "loss": 0.9801, + "step": 60 + }, + { + "epoch": 0.42, + "grad_norm": 2.046875, + "learning_rate": 2.9403010795982812e-06, + "loss": 0.9832, + "step": 61 + }, + { + "epoch": 0.43, + "grad_norm": 2.15625, + "learning_rate": 2.9379534251694214e-06, + "loss": 0.9952, + "step": 62 + }, + { + "epoch": 0.44, + "grad_norm": 2.203125, + "learning_rate": 2.93556147004066e-06, + "loss": 0.9998, + "step": 63 + }, + { + "epoch": 0.44, + "grad_norm": 2.203125, + "learning_rate": 2.933125287903734e-06, + "loss": 0.9714, + "step": 64 + }, + { + "epoch": 0.45, + "grad_norm": 2.21875, + "learning_rate": 2.9306449538129346e-06, + "loss": 0.9968, + "step": 65 + }, + { + "epoch": 0.46, + "grad_norm": 2.09375, + "learning_rate": 2.9281205441827918e-06, + "loss": 1.0091, + "step": 66 + }, + { + "epoch": 0.46, + "grad_norm": 2.078125, + "learning_rate": 2.925552136785723e-06, + "loss": 0.9731, + "step": 67 + }, + { + "epoch": 0.47, + "grad_norm": 2.09375, + "learning_rate": 2.9229398107496354e-06, + "loss": 0.9712, + "step": 68 + }, + { + "epoch": 0.48, + "grad_norm": 2.171875, + "learning_rate": 2.920283646555489e-06, + "loss": 0.9796, + "step": 69 + }, + { + "epoch": 0.48, + "grad_norm": 2.234375, + "learning_rate": 2.917583726034816e-06, + "loss": 1.0048, + "step": 70 + }, + { + "epoch": 0.49, + "grad_norm": 2.09375, + "learning_rate": 2.9148401323672014e-06, + "loss": 0.9751, + "step": 71 + }, + { + "epoch": 0.5, + "grad_norm": 2.09375, + "learning_rate": 2.9120529500777193e-06, + "loss": 1.0171, + "step": 72 + }, + { + "epoch": 0.51, + "grad_norm": 2.109375, + "learning_rate": 2.9092222650343288e-06, + "loss": 1.019, + "step": 73 + }, + { + "epoch": 0.51, + "grad_norm": 2.1875, + "learning_rate": 2.90634816444523e-06, + "loss": 0.9982, + "step": 74 + }, + { + "epoch": 0.52, + "grad_norm": 2.03125, + "learning_rate": 2.903430736856174e-06, + "loss": 0.9658, + "step": 75 + }, + { + "epoch": 0.53, + "grad_norm": 2.1875, + "learning_rate": 2.900470072147738e-06, + "loss": 0.9983, + "step": 76 + }, + { + "epoch": 0.53, + "grad_norm": 2.1875, + "learning_rate": 2.8974662615325564e-06, + "loss": 0.9966, + "step": 77 + }, + { + "epoch": 0.54, + "grad_norm": 2.203125, + "learning_rate": 2.894419397552508e-06, + "loss": 0.9998, + "step": 78 + }, + { + "epoch": 0.55, + "grad_norm": 2.078125, + "learning_rate": 2.8913295740758676e-06, + "loss": 0.9919, + "step": 79 + }, + { + "epoch": 0.55, + "grad_norm": 2.15625, + "learning_rate": 2.888196886294413e-06, + "loss": 1.0071, + "step": 80 + }, + { + "epoch": 0.56, + "grad_norm": 2.21875, + "learning_rate": 2.885021430720492e-06, + "loss": 0.9911, + "step": 81 + }, + { + "epoch": 0.57, + "grad_norm": 2.109375, + "learning_rate": 2.8818033051840494e-06, + "loss": 1.0248, + "step": 82 + }, + { + "epoch": 0.57, + "grad_norm": 2.0625, + "learning_rate": 2.878542608829613e-06, + "loss": 0.9623, + "step": 83 + }, + { + "epoch": 0.58, + "grad_norm": 2.15625, + "learning_rate": 2.87523944211324e-06, + "loss": 0.9953, + "step": 84 + }, + { + "epoch": 0.59, + "grad_norm": 2.078125, + "learning_rate": 2.87189390679942e-06, + "loss": 0.9831, + "step": 85 + }, + { + "epoch": 0.6, + "grad_norm": 2.15625, + "learning_rate": 2.8685061059579428e-06, + "loss": 0.9882, + "step": 86 + }, + { + "epoch": 0.6, + "grad_norm": 2.046875, + "learning_rate": 2.8650761439607194e-06, + "loss": 1.0026, + "step": 87 + }, + { + "epoch": 0.61, + "grad_norm": 2.078125, + "learning_rate": 2.8616041264785704e-06, + "loss": 0.9555, + "step": 88 + }, + { + "epoch": 0.62, + "grad_norm": 2.078125, + "learning_rate": 2.8580901604779672e-06, + "loss": 0.9886, + "step": 89 + }, + { + "epoch": 0.62, + "grad_norm": 2.140625, + "learning_rate": 2.854534354217738e-06, + "loss": 0.9699, + "step": 90 + }, + { + "epoch": 0.63, + "grad_norm": 2.078125, + "learning_rate": 2.8509368172457333e-06, + "loss": 0.9705, + "step": 91 + }, + { + "epoch": 0.64, + "grad_norm": 2.140625, + "learning_rate": 2.8472976603954494e-06, + "loss": 0.9835, + "step": 92 + }, + { + "epoch": 0.64, + "grad_norm": 2.078125, + "learning_rate": 2.843616995782614e-06, + "loss": 0.9595, + "step": 93 + }, + { + "epoch": 0.65, + "grad_norm": 2.15625, + "learning_rate": 2.8398949368017344e-06, + "loss": 1.0608, + "step": 94 + }, + { + "epoch": 0.66, + "grad_norm": 2.203125, + "learning_rate": 2.8361315981226e-06, + "loss": 1.0303, + "step": 95 + }, + { + "epoch": 0.66, + "grad_norm": 2.09375, + "learning_rate": 2.832327095686753e-06, + "loss": 0.9669, + "step": 96 + }, + { + "epoch": 0.67, + "grad_norm": 2.203125, + "learning_rate": 2.828481546703915e-06, + "loss": 1.0021, + "step": 97 + }, + { + "epoch": 0.68, + "grad_norm": 2.15625, + "learning_rate": 2.8245950696483764e-06, + "loss": 0.972, + "step": 98 + }, + { + "epoch": 0.69, + "grad_norm": 2.09375, + "learning_rate": 2.8206677842553465e-06, + "loss": 0.9734, + "step": 99 + }, + { + "epoch": 0.69, + "grad_norm": 2.078125, + "learning_rate": 2.8166998115172635e-06, + "loss": 0.9715, + "step": 100 + }, + { + "epoch": 0.7, + "grad_norm": 2.125, + "learning_rate": 2.812691273680068e-06, + "loss": 0.9896, + "step": 101 + }, + { + "epoch": 0.71, + "grad_norm": 2.125, + "learning_rate": 2.808642294239438e-06, + "loss": 0.9473, + "step": 102 + }, + { + "epoch": 0.71, + "grad_norm": 2.140625, + "learning_rate": 2.8045529979369805e-06, + "loss": 0.9784, + "step": 103 + }, + { + "epoch": 0.72, + "grad_norm": 2.15625, + "learning_rate": 2.800423510756394e-06, + "loss": 0.9566, + "step": 104 + }, + { + "epoch": 0.73, + "grad_norm": 2.21875, + "learning_rate": 2.796253959919581e-06, + "loss": 0.9875, + "step": 105 + }, + { + "epoch": 0.73, + "grad_norm": 2.125, + "learning_rate": 2.7920444738827332e-06, + "loss": 0.9869, + "step": 106 + }, + { + "epoch": 0.74, + "grad_norm": 2.09375, + "learning_rate": 2.7877951823323727e-06, + "loss": 0.9945, + "step": 107 + }, + { + "epoch": 0.75, + "grad_norm": 2.21875, + "learning_rate": 2.783506216181354e-06, + "loss": 0.9652, + "step": 108 + }, + { + "epoch": 0.75, + "grad_norm": 2.09375, + "learning_rate": 2.7791777075648364e-06, + "loss": 0.9878, + "step": 109 + }, + { + "epoch": 0.76, + "grad_norm": 2.140625, + "learning_rate": 2.7748097898362074e-06, + "loss": 0.9595, + "step": 110 + }, + { + "epoch": 0.77, + "grad_norm": 2.140625, + "learning_rate": 2.770402597562977e-06, + "loss": 0.9677, + "step": 111 + }, + { + "epoch": 0.78, + "grad_norm": 2.109375, + "learning_rate": 2.765956266522632e-06, + "loss": 0.9791, + "step": 112 + }, + { + "epoch": 0.78, + "grad_norm": 2.0625, + "learning_rate": 2.761470933698453e-06, + "loss": 0.9541, + "step": 113 + }, + { + "epoch": 0.79, + "grad_norm": 2.09375, + "learning_rate": 2.7569467372752936e-06, + "loss": 0.9573, + "step": 114 + }, + { + "epoch": 0.8, + "grad_norm": 2.0625, + "learning_rate": 2.752383816635323e-06, + "loss": 0.9416, + "step": 115 + }, + { + "epoch": 0.8, + "grad_norm": 2.109375, + "learning_rate": 2.747782312353733e-06, + "loss": 0.9575, + "step": 116 + }, + { + "epoch": 0.81, + "grad_norm": 2.109375, + "learning_rate": 2.7431423661944057e-06, + "loss": 0.9932, + "step": 117 + }, + { + "epoch": 0.82, + "grad_norm": 2.125, + "learning_rate": 2.7384641211055474e-06, + "loss": 0.9394, + "step": 118 + }, + { + "epoch": 0.82, + "grad_norm": 2.171875, + "learning_rate": 2.733747721215283e-06, + "loss": 0.9775, + "step": 119 + }, + { + "epoch": 0.83, + "grad_norm": 2.109375, + "learning_rate": 2.728993311827218e-06, + "loss": 0.9711, + "step": 120 + }, + { + "epoch": 0.84, + "grad_norm": 2.03125, + "learning_rate": 2.724201039415959e-06, + "loss": 0.9576, + "step": 121 + }, + { + "epoch": 0.84, + "grad_norm": 2.171875, + "learning_rate": 2.7193710516226047e-06, + "loss": 0.9695, + "step": 122 + }, + { + "epoch": 0.85, + "grad_norm": 2.09375, + "learning_rate": 2.714503497250193e-06, + "loss": 0.979, + "step": 123 + }, + { + "epoch": 0.86, + "grad_norm": 2.0625, + "learning_rate": 2.709598526259121e-06, + "loss": 0.9854, + "step": 124 + }, + { + "epoch": 0.87, + "grad_norm": 2.109375, + "learning_rate": 2.7046562897625218e-06, + "loss": 0.9882, + "step": 125 + }, + { + "epoch": 0.87, + "grad_norm": 2.078125, + "learning_rate": 2.699676940021611e-06, + "loss": 0.9738, + "step": 126 + }, + { + "epoch": 0.88, + "grad_norm": 2.140625, + "learning_rate": 2.694660630440994e-06, + "loss": 0.9692, + "step": 127 + }, + { + "epoch": 0.89, + "grad_norm": 2.0625, + "learning_rate": 2.6896075155639417e-06, + "loss": 0.9882, + "step": 128 + }, + { + "epoch": 0.89, + "grad_norm": 2.0625, + "learning_rate": 2.684517751067629e-06, + "loss": 0.9721, + "step": 129 + }, + { + "epoch": 0.9, + "grad_norm": 2.125, + "learning_rate": 2.6793914937583356e-06, + "loss": 0.9631, + "step": 130 + }, + { + "epoch": 0.91, + "grad_norm": 2.109375, + "learning_rate": 2.6742289015666204e-06, + "loss": 0.9752, + "step": 131 + }, + { + "epoch": 0.91, + "grad_norm": 2.109375, + "learning_rate": 2.6690301335424527e-06, + "loss": 0.9938, + "step": 132 + }, + { + "epoch": 0.92, + "grad_norm": 2.109375, + "learning_rate": 2.663795349850312e-06, + "loss": 0.9598, + "step": 133 + }, + { + "epoch": 0.93, + "grad_norm": 2.03125, + "learning_rate": 2.6585247117642533e-06, + "loss": 0.9711, + "step": 134 + }, + { + "epoch": 0.93, + "grad_norm": 2.1875, + "learning_rate": 2.6532183816629417e-06, + "loss": 0.9913, + "step": 135 + }, + { + "epoch": 0.94, + "grad_norm": 2.109375, + "learning_rate": 2.6478765230246463e-06, + "loss": 0.9741, + "step": 136 + }, + { + "epoch": 0.95, + "grad_norm": 2.03125, + "learning_rate": 2.6424993004222054e-06, + "loss": 0.9541, + "step": 137 + }, + { + "epoch": 0.96, + "grad_norm": 2.125, + "learning_rate": 2.637086879517956e-06, + "loss": 0.9721, + "step": 138 + }, + { + "epoch": 0.96, + "grad_norm": 2.09375, + "learning_rate": 2.6316394270586294e-06, + "loss": 0.9622, + "step": 139 + }, + { + "epoch": 0.97, + "grad_norm": 2.0625, + "learning_rate": 2.6261571108702162e-06, + "loss": 0.9486, + "step": 140 + }, + { + "epoch": 0.98, + "grad_norm": 2.125, + "learning_rate": 2.620640099852793e-06, + "loss": 0.9579, + "step": 141 + }, + { + "epoch": 0.98, + "grad_norm": 2.109375, + "learning_rate": 2.6150885639753198e-06, + "loss": 0.9682, + "step": 142 + }, + { + "epoch": 0.99, + "grad_norm": 2.140625, + "learning_rate": 2.6095026742704063e-06, + "loss": 1.0175, + "step": 143 + }, + { + "epoch": 1.0, + "grad_norm": 2.09375, + "learning_rate": 2.6038826028290377e-06, + "loss": 0.9619, + "step": 144 + }, + { + "epoch": 1.0, + "grad_norm": 2.125, + "learning_rate": 2.598228522795278e-06, + "loss": 1.0443, + "step": 145 + }, + { + "epoch": 1.01, + "grad_norm": 2.046875, + "learning_rate": 2.5925406083609323e-06, + "loss": 0.9441, + "step": 146 + }, + { + "epoch": 1.0, + "grad_norm": 2.140625, + "learning_rate": 2.5868190347601805e-06, + "loss": 1.0083, + "step": 147 + }, + { + "epoch": 1.01, + "grad_norm": 2.09375, + "learning_rate": 2.5810639782641804e-06, + "loss": 0.9488, + "step": 148 + }, + { + "epoch": 1.02, + "grad_norm": 2.0, + "learning_rate": 2.575275616175637e-06, + "loss": 0.9293, + "step": 149 + }, + { + "epoch": 1.02, + "grad_norm": 2.09375, + "learning_rate": 2.5694541268233384e-06, + "loss": 0.9648, + "step": 150 + }, + { + "epoch": 1.03, + "grad_norm": 2.015625, + "learning_rate": 2.563599689556662e-06, + "loss": 0.9557, + "step": 151 + }, + { + "epoch": 1.04, + "grad_norm": 2.109375, + "learning_rate": 2.557712484740051e-06, + "loss": 0.9478, + "step": 152 + }, + { + "epoch": 1.04, + "grad_norm": 2.125, + "learning_rate": 2.551792693747457e-06, + "loss": 0.9569, + "step": 153 + }, + { + "epoch": 1.05, + "grad_norm": 2.109375, + "learning_rate": 2.5458404989567504e-06, + "loss": 1.0341, + "step": 154 + }, + { + "epoch": 1.06, + "grad_norm": 2.09375, + "learning_rate": 2.5398560837441037e-06, + "loss": 0.9795, + "step": 155 + }, + { + "epoch": 1.07, + "grad_norm": 2.03125, + "learning_rate": 2.5338396324783407e-06, + "loss": 0.9526, + "step": 156 + }, + { + "epoch": 1.07, + "grad_norm": 2.046875, + "learning_rate": 2.527791330515258e-06, + "loss": 0.9531, + "step": 157 + }, + { + "epoch": 1.08, + "grad_norm": 2.078125, + "learning_rate": 2.5217113641919136e-06, + "loss": 0.9193, + "step": 158 + }, + { + "epoch": 1.09, + "grad_norm": 2.109375, + "learning_rate": 2.5155999208208857e-06, + "loss": 0.949, + "step": 159 + }, + { + "epoch": 1.09, + "grad_norm": 2.15625, + "learning_rate": 2.5094571886845013e-06, + "loss": 0.9595, + "step": 160 + }, + { + "epoch": 1.1, + "grad_norm": 2.0625, + "learning_rate": 2.5032833570290396e-06, + "loss": 0.916, + "step": 161 + }, + { + "epoch": 1.11, + "grad_norm": 2.140625, + "learning_rate": 2.4970786160588956e-06, + "loss": 0.9826, + "step": 162 + }, + { + "epoch": 1.11, + "grad_norm": 2.109375, + "learning_rate": 2.4908431569307268e-06, + "loss": 1.0362, + "step": 163 + }, + { + "epoch": 1.12, + "grad_norm": 2.015625, + "learning_rate": 2.4845771717475563e-06, + "loss": 0.9425, + "step": 164 + }, + { + "epoch": 1.13, + "grad_norm": 2.046875, + "learning_rate": 2.478280853552863e-06, + "loss": 0.9522, + "step": 165 + }, + { + "epoch": 1.13, + "grad_norm": 2.03125, + "learning_rate": 2.4719543963246275e-06, + "loss": 0.9401, + "step": 166 + }, + { + "epoch": 1.14, + "grad_norm": 2.21875, + "learning_rate": 2.4655979949693605e-06, + "loss": 0.9662, + "step": 167 + }, + { + "epoch": 1.15, + "grad_norm": 2.078125, + "learning_rate": 2.459211845316094e-06, + "loss": 0.9211, + "step": 168 + }, + { + "epoch": 1.16, + "grad_norm": 2.046875, + "learning_rate": 2.4527961441103515e-06, + "loss": 0.9594, + "step": 169 + }, + { + "epoch": 1.16, + "grad_norm": 2.15625, + "learning_rate": 2.4463510890080865e-06, + "loss": 0.9591, + "step": 170 + }, + { + "epoch": 1.17, + "grad_norm": 2.0625, + "learning_rate": 2.4398768785695893e-06, + "loss": 0.92, + "step": 171 + }, + { + "epoch": 1.18, + "grad_norm": 2.125, + "learning_rate": 2.433373712253376e-06, + "loss": 0.9829, + "step": 172 + }, + { + "epoch": 1.18, + "grad_norm": 2.09375, + "learning_rate": 2.426841790410037e-06, + "loss": 0.961, + "step": 173 + }, + { + "epoch": 1.19, + "grad_norm": 2.0625, + "learning_rate": 2.4202813142760685e-06, + "loss": 0.9428, + "step": 174 + }, + { + "epoch": 1.2, + "grad_norm": 2.15625, + "learning_rate": 2.413692485967673e-06, + "loss": 0.9671, + "step": 175 + }, + { + "epoch": 1.2, + "grad_norm": 2.125, + "learning_rate": 2.40707550847453e-06, + "loss": 0.9428, + "step": 176 + }, + { + "epoch": 1.21, + "grad_norm": 2.078125, + "learning_rate": 2.400430585653544e-06, + "loss": 0.9587, + "step": 177 + }, + { + "epoch": 1.22, + "grad_norm": 2.09375, + "learning_rate": 2.3937579222225646e-06, + "loss": 0.9817, + "step": 178 + }, + { + "epoch": 1.22, + "grad_norm": 2.0625, + "learning_rate": 2.387057723754076e-06, + "loss": 0.9479, + "step": 179 + }, + { + "epoch": 1.23, + "grad_norm": 2.078125, + "learning_rate": 2.38033019666887e-06, + "loss": 0.9572, + "step": 180 + }, + { + "epoch": 1.24, + "grad_norm": 2.125, + "learning_rate": 2.3735755482296788e-06, + "loss": 0.9411, + "step": 181 + }, + { + "epoch": 1.25, + "grad_norm": 2.046875, + "learning_rate": 2.3667939865347966e-06, + "loss": 0.9485, + "step": 182 + }, + { + "epoch": 1.25, + "grad_norm": 2.0625, + "learning_rate": 2.359985720511665e-06, + "loss": 0.9688, + "step": 183 + }, + { + "epoch": 1.26, + "grad_norm": 2.125, + "learning_rate": 2.353150959910435e-06, + "loss": 0.9546, + "step": 184 + }, + { + "epoch": 1.27, + "grad_norm": 2.15625, + "learning_rate": 2.346289915297509e-06, + "loss": 0.9551, + "step": 185 + }, + { + "epoch": 1.27, + "grad_norm": 2.078125, + "learning_rate": 2.33940279804905e-06, + "loss": 0.9427, + "step": 186 + }, + { + "epoch": 1.28, + "grad_norm": 2.046875, + "learning_rate": 2.332489820344472e-06, + "loss": 0.9466, + "step": 187 + }, + { + "epoch": 1.29, + "grad_norm": 2.15625, + "learning_rate": 2.325551195159902e-06, + "loss": 0.9604, + "step": 188 + }, + { + "epoch": 1.29, + "grad_norm": 2.015625, + "learning_rate": 2.3185871362616168e-06, + "loss": 0.9481, + "step": 189 + }, + { + "epoch": 1.3, + "grad_norm": 2.078125, + "learning_rate": 2.311597858199462e-06, + "loss": 0.9491, + "step": 190 + }, + { + "epoch": 1.31, + "grad_norm": 2.125, + "learning_rate": 2.3045835763002377e-06, + "loss": 0.9384, + "step": 191 + }, + { + "epoch": 1.31, + "grad_norm": 2.109375, + "learning_rate": 2.2975445066610655e-06, + "loss": 0.9591, + "step": 192 + }, + { + "epoch": 1.32, + "grad_norm": 2.0625, + "learning_rate": 2.290480866142733e-06, + "loss": 0.9196, + "step": 193 + }, + { + "epoch": 1.33, + "grad_norm": 2.046875, + "learning_rate": 2.2833928723630116e-06, + "loss": 0.956, + "step": 194 + }, + { + "epoch": 1.34, + "grad_norm": 2.0625, + "learning_rate": 2.2762807436899505e-06, + "loss": 0.9479, + "step": 195 + }, + { + "epoch": 1.34, + "grad_norm": 2.109375, + "learning_rate": 2.269144699235152e-06, + "loss": 0.9414, + "step": 196 + }, + { + "epoch": 1.35, + "grad_norm": 2.125, + "learning_rate": 2.2619849588470177e-06, + "loss": 0.9403, + "step": 197 + }, + { + "epoch": 1.36, + "grad_norm": 2.0625, + "learning_rate": 2.2548017431039793e-06, + "loss": 0.9684, + "step": 198 + }, + { + "epoch": 1.36, + "grad_norm": 2.109375, + "learning_rate": 2.247595273307701e-06, + "loss": 0.9428, + "step": 199 + }, + { + "epoch": 1.37, + "grad_norm": 2.15625, + "learning_rate": 2.240365771476259e-06, + "loss": 0.9678, + "step": 200 + }, + { + "epoch": 1.38, + "grad_norm": 2.140625, + "learning_rate": 2.233113460337308e-06, + "loss": 0.9463, + "step": 201 + }, + { + "epoch": 1.38, + "grad_norm": 2.046875, + "learning_rate": 2.2258385633212116e-06, + "loss": 0.929, + "step": 202 + }, + { + "epoch": 1.39, + "grad_norm": 2.046875, + "learning_rate": 2.218541304554167e-06, + "loss": 1.0058, + "step": 203 + }, + { + "epoch": 1.4, + "grad_norm": 2.078125, + "learning_rate": 2.2112219088512914e-06, + "loss": 0.9762, + "step": 204 + }, + { + "epoch": 1.4, + "grad_norm": 2.109375, + "learning_rate": 2.2038806017097022e-06, + "loss": 0.9186, + "step": 205 + }, + { + "epoch": 1.41, + "grad_norm": 2.0625, + "learning_rate": 2.1965176093015694e-06, + "loss": 0.9142, + "step": 206 + }, + { + "epoch": 1.42, + "grad_norm": 2.09375, + "learning_rate": 2.1891331584671447e-06, + "loss": 0.9081, + "step": 207 + }, + { + "epoch": 1.43, + "grad_norm": 2.078125, + "learning_rate": 2.1817274767077735e-06, + "loss": 0.9452, + "step": 208 + }, + { + "epoch": 1.43, + "grad_norm": 2.171875, + "learning_rate": 2.1743007921788887e-06, + "loss": 0.9451, + "step": 209 + }, + { + "epoch": 1.44, + "grad_norm": 2.03125, + "learning_rate": 2.166853333682979e-06, + "loss": 0.9489, + "step": 210 + }, + { + "epoch": 1.45, + "grad_norm": 2.09375, + "learning_rate": 2.159385330662541e-06, + "loss": 0.9469, + "step": 211 + }, + { + "epoch": 1.45, + "grad_norm": 2.1875, + "learning_rate": 2.1518970131930106e-06, + "loss": 0.9583, + "step": 212 + }, + { + "epoch": 1.46, + "grad_norm": 2.140625, + "learning_rate": 2.144388611975674e-06, + "loss": 0.9624, + "step": 213 + }, + { + "epoch": 1.47, + "grad_norm": 2.09375, + "learning_rate": 2.1368603583305603e-06, + "loss": 0.9407, + "step": 214 + }, + { + "epoch": 1.47, + "grad_norm": 2.21875, + "learning_rate": 2.129312484189317e-06, + "loss": 0.9319, + "step": 215 + }, + { + "epoch": 1.48, + "grad_norm": 2.0625, + "learning_rate": 2.1217452220880612e-06, + "loss": 0.9579, + "step": 216 + }, + { + "epoch": 1.49, + "grad_norm": 1.984375, + "learning_rate": 2.1141588051602192e-06, + "loss": 0.9314, + "step": 217 + }, + { + "epoch": 1.49, + "grad_norm": 2.0625, + "learning_rate": 2.1065534671293404e-06, + "loss": 0.9589, + "step": 218 + }, + { + "epoch": 1.5, + "grad_norm": 2.0, + "learning_rate": 2.0989294423018997e-06, + "loss": 0.9262, + "step": 219 + }, + { + "epoch": 1.51, + "grad_norm": 2.09375, + "learning_rate": 2.0912869655600775e-06, + "loss": 0.9575, + "step": 220 + }, + { + "epoch": 1.52, + "grad_norm": 2.09375, + "learning_rate": 2.0836262723545242e-06, + "loss": 0.9536, + "step": 221 + }, + { + "epoch": 1.52, + "grad_norm": 2.0625, + "learning_rate": 2.0759475986971055e-06, + "loss": 0.9357, + "step": 222 + }, + { + "epoch": 1.53, + "grad_norm": 2.109375, + "learning_rate": 2.0682511811536306e-06, + "loss": 0.9544, + "step": 223 + }, + { + "epoch": 1.54, + "grad_norm": 2.109375, + "learning_rate": 2.0605372568365683e-06, + "loss": 0.9383, + "step": 224 + }, + { + "epoch": 1.54, + "grad_norm": 2.09375, + "learning_rate": 2.052806063397736e-06, + "loss": 0.9486, + "step": 225 + }, + { + "epoch": 1.55, + "grad_norm": 2.0625, + "learning_rate": 2.045057839020981e-06, + "loss": 0.9822, + "step": 226 + }, + { + "epoch": 1.56, + "grad_norm": 2.078125, + "learning_rate": 2.0372928224148454e-06, + "loss": 0.9422, + "step": 227 + }, + { + "epoch": 1.56, + "grad_norm": 2.09375, + "learning_rate": 2.029511252805205e-06, + "loss": 0.9402, + "step": 228 + }, + { + "epoch": 1.57, + "grad_norm": 2.09375, + "learning_rate": 2.0217133699279074e-06, + "loss": 0.9067, + "step": 229 + }, + { + "epoch": 1.58, + "grad_norm": 1.96875, + "learning_rate": 2.013899414021378e-06, + "loss": 0.9232, + "step": 230 + }, + { + "epoch": 1.58, + "grad_norm": 2.09375, + "learning_rate": 2.006069625819225e-06, + "loss": 0.9292, + "step": 231 + }, + { + "epoch": 1.59, + "grad_norm": 2.078125, + "learning_rate": 1.9982242465428196e-06, + "loss": 0.9309, + "step": 232 + }, + { + "epoch": 1.6, + "grad_norm": 2.109375, + "learning_rate": 1.990363517893867e-06, + "loss": 0.9446, + "step": 233 + }, + { + "epoch": 1.61, + "grad_norm": 2.28125, + "learning_rate": 1.9824876820469553e-06, + "loss": 0.9676, + "step": 234 + }, + { + "epoch": 1.61, + "grad_norm": 2.171875, + "learning_rate": 1.9745969816421e-06, + "loss": 0.9603, + "step": 235 + }, + { + "epoch": 1.62, + "grad_norm": 2.125, + "learning_rate": 1.9666916597772663e-06, + "loss": 0.9393, + "step": 236 + }, + { + "epoch": 1.63, + "grad_norm": 2.078125, + "learning_rate": 1.9587719600008777e-06, + "loss": 0.9352, + "step": 237 + }, + { + "epoch": 1.63, + "grad_norm": 2.09375, + "learning_rate": 1.9508381263043165e-06, + "loss": 0.9233, + "step": 238 + }, + { + "epoch": 1.64, + "grad_norm": 2.09375, + "learning_rate": 1.942890403114404e-06, + "loss": 0.9573, + "step": 239 + }, + { + "epoch": 1.65, + "grad_norm": 2.09375, + "learning_rate": 1.9349290352858735e-06, + "loss": 0.9531, + "step": 240 + }, + { + "epoch": 1.65, + "grad_norm": 2.046875, + "learning_rate": 1.9269542680938204e-06, + "loss": 0.9406, + "step": 241 + }, + { + "epoch": 1.66, + "grad_norm": 2.15625, + "learning_rate": 1.918966347226154e-06, + "loss": 0.934, + "step": 242 + }, + { + "epoch": 1.67, + "grad_norm": 2.171875, + "learning_rate": 1.910965518776022e-06, + "loss": 0.9359, + "step": 243 + }, + { + "epoch": 1.67, + "grad_norm": 2.171875, + "learning_rate": 1.9029520292342306e-06, + "loss": 0.9453, + "step": 244 + }, + { + "epoch": 1.68, + "grad_norm": 2.0625, + "learning_rate": 1.8949261254816526e-06, + "loss": 0.9516, + "step": 245 + }, + { + "epoch": 1.69, + "grad_norm": 2.0, + "learning_rate": 1.8868880547816187e-06, + "loss": 0.9814, + "step": 246 + }, + { + "epoch": 1.7, + "grad_norm": 2.109375, + "learning_rate": 1.8788380647723008e-06, + "loss": 0.9549, + "step": 247 + }, + { + "epoch": 1.7, + "grad_norm": 2.078125, + "learning_rate": 1.870776403459083e-06, + "loss": 0.9348, + "step": 248 + }, + { + "epoch": 1.71, + "grad_norm": 2.140625, + "learning_rate": 1.8627033192069213e-06, + "loss": 0.9241, + "step": 249 + }, + { + "epoch": 1.72, + "grad_norm": 2.0625, + "learning_rate": 1.8546190607326902e-06, + "loss": 0.9282, + "step": 250 + }, + { + "epoch": 1.72, + "grad_norm": 2.0625, + "learning_rate": 1.8465238770975237e-06, + "loss": 0.9574, + "step": 251 + }, + { + "epoch": 1.73, + "grad_norm": 2.015625, + "learning_rate": 1.8384180176991368e-06, + "loss": 0.9312, + "step": 252 + }, + { + "epoch": 1.74, + "grad_norm": 2.140625, + "learning_rate": 1.8303017322641474e-06, + "loss": 0.9416, + "step": 253 + }, + { + "epoch": 1.74, + "grad_norm": 2.078125, + "learning_rate": 1.8221752708403801e-06, + "loss": 0.9267, + "step": 254 + }, + { + "epoch": 1.75, + "grad_norm": 2.015625, + "learning_rate": 1.8140388837891622e-06, + "loss": 0.8971, + "step": 255 + }, + { + "epoch": 1.76, + "grad_norm": 2.015625, + "learning_rate": 1.8058928217776125e-06, + "loss": 0.937, + "step": 256 + }, + { + "epoch": 1.76, + "grad_norm": 2.0625, + "learning_rate": 1.7977373357709162e-06, + "loss": 0.9546, + "step": 257 + }, + { + "epoch": 1.77, + "grad_norm": 2.046875, + "learning_rate": 1.789572677024595e-06, + "loss": 1.0132, + "step": 258 + }, + { + "epoch": 1.78, + "grad_norm": 2.015625, + "learning_rate": 1.7813990970767658e-06, + "loss": 0.9453, + "step": 259 + }, + { + "epoch": 1.79, + "grad_norm": 2.09375, + "learning_rate": 1.7732168477403924e-06, + "loss": 0.9584, + "step": 260 + }, + { + "epoch": 1.79, + "grad_norm": 2.109375, + "learning_rate": 1.7650261810955248e-06, + "loss": 0.9375, + "step": 261 + }, + { + "epoch": 1.8, + "grad_norm": 2.0625, + "learning_rate": 1.7568273494815353e-06, + "loss": 0.9378, + "step": 262 + }, + { + "epoch": 1.81, + "grad_norm": 2.109375, + "learning_rate": 1.7486206054893445e-06, + "loss": 0.9765, + "step": 263 + }, + { + "epoch": 1.81, + "grad_norm": 2.046875, + "learning_rate": 1.7404062019536382e-06, + "loss": 0.9465, + "step": 264 + }, + { + "epoch": 1.82, + "grad_norm": 2.046875, + "learning_rate": 1.7321843919450793e-06, + "loss": 0.9333, + "step": 265 + }, + { + "epoch": 1.83, + "grad_norm": 2.140625, + "learning_rate": 1.7239554287625099e-06, + "loss": 0.9359, + "step": 266 + }, + { + "epoch": 1.83, + "grad_norm": 2.078125, + "learning_rate": 1.715719565925148e-06, + "loss": 0.9351, + "step": 267 + }, + { + "epoch": 1.84, + "grad_norm": 2.15625, + "learning_rate": 1.7074770571647776e-06, + "loss": 1.0276, + "step": 268 + }, + { + "epoch": 1.85, + "grad_norm": 2.109375, + "learning_rate": 1.6992281564179327e-06, + "loss": 0.95, + "step": 269 + }, + { + "epoch": 1.85, + "grad_norm": 2.0625, + "learning_rate": 1.69097311781807e-06, + "loss": 0.9435, + "step": 270 + }, + { + "epoch": 1.86, + "grad_norm": 2.09375, + "learning_rate": 1.6827121956877436e-06, + "loss": 0.9449, + "step": 271 + }, + { + "epoch": 1.87, + "grad_norm": 2.140625, + "learning_rate": 1.6744456445307693e-06, + "loss": 0.9555, + "step": 272 + }, + { + "epoch": 1.88, + "grad_norm": 2.09375, + "learning_rate": 1.6661737190243813e-06, + "loss": 0.9541, + "step": 273 + }, + { + "epoch": 1.88, + "grad_norm": 2.0625, + "learning_rate": 1.6578966740113882e-06, + "loss": 0.9581, + "step": 274 + }, + { + "epoch": 1.89, + "grad_norm": 2.03125, + "learning_rate": 1.6496147644923206e-06, + "loss": 0.9577, + "step": 275 + }, + { + "epoch": 1.9, + "grad_norm": 2.09375, + "learning_rate": 1.6413282456175774e-06, + "loss": 0.9523, + "step": 276 + }, + { + "epoch": 1.9, + "grad_norm": 2.1875, + "learning_rate": 1.6330373726795605e-06, + "loss": 0.9418, + "step": 277 + }, + { + "epoch": 1.91, + "grad_norm": 2.078125, + "learning_rate": 1.6247424011048153e-06, + "loss": 0.9264, + "step": 278 + }, + { + "epoch": 1.92, + "grad_norm": 2.09375, + "learning_rate": 1.6164435864461566e-06, + "loss": 0.9561, + "step": 279 + }, + { + "epoch": 1.92, + "grad_norm": 2.078125, + "learning_rate": 1.6081411843747983e-06, + "loss": 0.9189, + "step": 280 + }, + { + "epoch": 1.93, + "grad_norm": 2.046875, + "learning_rate": 1.599835450672476e-06, + "loss": 0.935, + "step": 281 + }, + { + "epoch": 1.94, + "grad_norm": 2.03125, + "learning_rate": 1.5915266412235675e-06, + "loss": 0.9408, + "step": 282 + }, + { + "epoch": 1.94, + "grad_norm": 2.078125, + "learning_rate": 1.5832150120072059e-06, + "loss": 0.9134, + "step": 283 + }, + { + "epoch": 1.95, + "grad_norm": 2.078125, + "learning_rate": 1.5749008190893995e-06, + "loss": 0.9113, + "step": 284 + }, + { + "epoch": 1.96, + "grad_norm": 2.25, + "learning_rate": 1.5665843186151378e-06, + "loss": 0.9446, + "step": 285 + }, + { + "epoch": 1.97, + "grad_norm": 2.1875, + "learning_rate": 1.5582657668005015e-06, + "loss": 0.9548, + "step": 286 + }, + { + "epoch": 1.97, + "grad_norm": 2.046875, + "learning_rate": 1.5499454199247714e-06, + "loss": 0.9186, + "step": 287 + }, + { + "epoch": 1.98, + "grad_norm": 2.09375, + "learning_rate": 1.541623534322528e-06, + "loss": 0.9397, + "step": 288 + }, + { + "epoch": 1.99, + "grad_norm": 2.078125, + "learning_rate": 1.5333003663757585e-06, + "loss": 0.9412, + "step": 289 + }, + { + "epoch": 1.99, + "grad_norm": 2.1875, + "learning_rate": 1.5249761725059577e-06, + "loss": 0.9463, + "step": 290 + }, + { + "epoch": 2.0, + "grad_norm": 2.09375, + "learning_rate": 1.5166512091662264e-06, + "loss": 0.9904, + "step": 291 + }, + { + "epoch": 2.01, + "grad_norm": 2.078125, + "learning_rate": 1.5083257328333697e-06, + "loss": 0.9385, + "step": 292 + }, + { + "epoch": 2.01, + "grad_norm": 2.109375, + "learning_rate": 1.5e-06, + "loss": 0.9535, + "step": 293 + }, + { + "epoch": 2.01, + "grad_norm": 2.140625, + "learning_rate": 1.4916742671666306e-06, + "loss": 0.9471, + "step": 294 + }, + { + "epoch": 2.01, + "grad_norm": 2.078125, + "learning_rate": 1.4833487908337741e-06, + "loss": 0.9482, + "step": 295 + }, + { + "epoch": 2.02, + "grad_norm": 2.046875, + "learning_rate": 1.4750238274940423e-06, + "loss": 0.943, + "step": 296 + }, + { + "epoch": 2.03, + "grad_norm": 2.125, + "learning_rate": 1.4666996336242414e-06, + "loss": 0.9973, + "step": 297 + }, + { + "epoch": 2.03, + "grad_norm": 2.15625, + "learning_rate": 1.4583764656774728e-06, + "loss": 0.9453, + "step": 298 + }, + { + "epoch": 2.04, + "grad_norm": 2.03125, + "learning_rate": 1.4500545800752293e-06, + "loss": 1.0162, + "step": 299 + }, + { + "epoch": 2.05, + "grad_norm": 2.0625, + "learning_rate": 1.4417342331994986e-06, + "loss": 0.9118, + "step": 300 + }, + { + "epoch": 2.06, + "grad_norm": 2.078125, + "learning_rate": 1.4334156813848625e-06, + "loss": 0.9118, + "step": 301 + }, + { + "epoch": 2.06, + "grad_norm": 2.0625, + "learning_rate": 1.4250991809106006e-06, + "loss": 0.9484, + "step": 302 + }, + { + "epoch": 2.07, + "grad_norm": 2.09375, + "learning_rate": 1.4167849879927944e-06, + "loss": 0.9669, + "step": 303 + }, + { + "epoch": 2.08, + "grad_norm": 2.109375, + "learning_rate": 1.4084733587764328e-06, + "loss": 1.0024, + "step": 304 + }, + { + "epoch": 2.08, + "grad_norm": 2.078125, + "learning_rate": 1.4001645493275237e-06, + "loss": 0.9233, + "step": 305 + }, + { + "epoch": 2.09, + "grad_norm": 2.09375, + "learning_rate": 1.3918588156252018e-06, + "loss": 0.925, + "step": 306 + }, + { + "epoch": 2.1, + "grad_norm": 2.0625, + "learning_rate": 1.383556413553844e-06, + "loss": 0.9361, + "step": 307 + }, + { + "epoch": 2.1, + "grad_norm": 2.078125, + "learning_rate": 1.3752575988951854e-06, + "loss": 0.9187, + "step": 308 + }, + { + "epoch": 2.11, + "grad_norm": 2.140625, + "learning_rate": 1.36696262732044e-06, + "loss": 1.0012, + "step": 309 + }, + { + "epoch": 2.12, + "grad_norm": 2.046875, + "learning_rate": 1.3586717543824231e-06, + "loss": 0.928, + "step": 310 + }, + { + "epoch": 2.12, + "grad_norm": 2.140625, + "learning_rate": 1.3503852355076795e-06, + "loss": 0.9427, + "step": 311 + }, + { + "epoch": 2.13, + "grad_norm": 2.09375, + "learning_rate": 1.3421033259886123e-06, + "loss": 0.9211, + "step": 312 + }, + { + "epoch": 2.14, + "grad_norm": 2.140625, + "learning_rate": 1.3338262809756188e-06, + "loss": 1.0299, + "step": 313 + }, + { + "epoch": 2.15, + "grad_norm": 2.03125, + "learning_rate": 1.3255543554692306e-06, + "loss": 0.9101, + "step": 314 + }, + { + "epoch": 2.15, + "grad_norm": 2.140625, + "learning_rate": 1.3172878043122562e-06, + "loss": 0.9423, + "step": 315 + }, + { + "epoch": 2.16, + "grad_norm": 2.0625, + "learning_rate": 1.3090268821819306e-06, + "loss": 0.9319, + "step": 316 + }, + { + "epoch": 2.17, + "grad_norm": 2.078125, + "learning_rate": 1.300771843582068e-06, + "loss": 0.9378, + "step": 317 + }, + { + "epoch": 2.17, + "grad_norm": 2.0625, + "learning_rate": 1.2925229428352225e-06, + "loss": 0.9213, + "step": 318 + }, + { + "epoch": 2.18, + "grad_norm": 2.015625, + "learning_rate": 1.2842804340748524e-06, + "loss": 0.9675, + "step": 319 + }, + { + "epoch": 2.19, + "grad_norm": 2.109375, + "learning_rate": 1.2760445712374906e-06, + "loss": 0.9304, + "step": 320 + }, + { + "epoch": 2.19, + "grad_norm": 2.09375, + "learning_rate": 1.267815608054921e-06, + "loss": 0.9335, + "step": 321 + }, + { + "epoch": 2.2, + "grad_norm": 2.140625, + "learning_rate": 1.2595937980463616e-06, + "loss": 0.9215, + "step": 322 + }, + { + "epoch": 2.21, + "grad_norm": 2.109375, + "learning_rate": 1.2513793945106556e-06, + "loss": 0.9341, + "step": 323 + }, + { + "epoch": 2.21, + "grad_norm": 2.140625, + "learning_rate": 1.2431726505184652e-06, + "loss": 0.9299, + "step": 324 + }, + { + "epoch": 2.22, + "grad_norm": 2.09375, + "learning_rate": 1.234973818904476e-06, + "loss": 0.9259, + "step": 325 + }, + { + "epoch": 2.23, + "grad_norm": 2.15625, + "learning_rate": 1.2267831522596081e-06, + "loss": 0.935, + "step": 326 + }, + { + "epoch": 2.24, + "grad_norm": 2.078125, + "learning_rate": 1.2186009029232343e-06, + "loss": 0.9496, + "step": 327 + }, + { + "epoch": 2.24, + "grad_norm": 2.09375, + "learning_rate": 1.2104273229754053e-06, + "loss": 0.9341, + "step": 328 + }, + { + "epoch": 2.25, + "grad_norm": 2.140625, + "learning_rate": 1.2022626642290843e-06, + "loss": 0.9318, + "step": 329 + }, + { + "epoch": 2.26, + "grad_norm": 2.0625, + "learning_rate": 1.194107178222388e-06, + "loss": 0.9337, + "step": 330 + }, + { + "epoch": 2.26, + "grad_norm": 2.125, + "learning_rate": 1.1859611162108379e-06, + "loss": 0.9496, + "step": 331 + }, + { + "epoch": 2.27, + "grad_norm": 2.109375, + "learning_rate": 1.17782472915962e-06, + "loss": 0.9327, + "step": 332 + }, + { + "epoch": 2.28, + "grad_norm": 2.09375, + "learning_rate": 1.1696982677358533e-06, + "loss": 0.9576, + "step": 333 + }, + { + "epoch": 2.28, + "grad_norm": 2.078125, + "learning_rate": 1.1615819823008639e-06, + "loss": 0.9423, + "step": 334 + }, + { + "epoch": 2.29, + "grad_norm": 2.140625, + "learning_rate": 1.153476122902477e-06, + "loss": 0.991, + "step": 335 + }, + { + "epoch": 2.3, + "grad_norm": 2.0625, + "learning_rate": 1.1453809392673101e-06, + "loss": 0.9247, + "step": 336 + }, + { + "epoch": 2.3, + "grad_norm": 2.03125, + "learning_rate": 1.137296680793079e-06, + "loss": 0.9365, + "step": 337 + }, + { + "epoch": 2.31, + "grad_norm": 2.09375, + "learning_rate": 1.1292235965409171e-06, + "loss": 0.9076, + "step": 338 + }, + { + "epoch": 2.32, + "grad_norm": 2.078125, + "learning_rate": 1.1211619352276997e-06, + "loss": 0.936, + "step": 339 + }, + { + "epoch": 2.33, + "grad_norm": 2.078125, + "learning_rate": 1.1131119452183814e-06, + "loss": 0.9052, + "step": 340 + }, + { + "epoch": 2.33, + "grad_norm": 2.09375, + "learning_rate": 1.1050738745183472e-06, + "loss": 0.9136, + "step": 341 + }, + { + "epoch": 2.34, + "grad_norm": 2.109375, + "learning_rate": 1.0970479707657699e-06, + "loss": 0.9287, + "step": 342 + }, + { + "epoch": 2.35, + "grad_norm": 2.125, + "learning_rate": 1.0890344812239785e-06, + "loss": 0.9556, + "step": 343 + }, + { + "epoch": 2.35, + "grad_norm": 2.09375, + "learning_rate": 1.0810336527738461e-06, + "loss": 0.9015, + "step": 344 + }, + { + "epoch": 2.36, + "grad_norm": 2.0625, + "learning_rate": 1.0730457319061797e-06, + "loss": 0.9284, + "step": 345 + }, + { + "epoch": 2.37, + "grad_norm": 2.0625, + "learning_rate": 1.065070964714127e-06, + "loss": 0.9271, + "step": 346 + }, + { + "epoch": 2.37, + "grad_norm": 1.9921875, + "learning_rate": 1.0571095968855957e-06, + "loss": 0.8869, + "step": 347 + }, + { + "epoch": 2.38, + "grad_norm": 2.0625, + "learning_rate": 1.049161873695684e-06, + "loss": 0.9009, + "step": 348 + }, + { + "epoch": 2.39, + "grad_norm": 2.125, + "learning_rate": 1.0412280399991226e-06, + "loss": 0.9528, + "step": 349 + }, + { + "epoch": 2.39, + "grad_norm": 2.09375, + "learning_rate": 1.033308340222734e-06, + "loss": 0.953, + "step": 350 + }, + { + "epoch": 2.4, + "grad_norm": 2.140625, + "learning_rate": 1.0254030183579003e-06, + "loss": 0.931, + "step": 351 + }, + { + "epoch": 2.41, + "grad_norm": 2.03125, + "learning_rate": 1.0175123179530452e-06, + "loss": 0.9086, + "step": 352 + }, + { + "epoch": 2.42, + "grad_norm": 2.078125, + "learning_rate": 1.0096364821061337e-06, + "loss": 0.9338, + "step": 353 + }, + { + "epoch": 2.42, + "grad_norm": 2.046875, + "learning_rate": 1.0017757534571806e-06, + "loss": 0.9267, + "step": 354 + }, + { + "epoch": 2.43, + "grad_norm": 2.046875, + "learning_rate": 9.939303741807755e-07, + "loss": 0.8946, + "step": 355 + }, + { + "epoch": 2.44, + "grad_norm": 2.109375, + "learning_rate": 9.861005859786223e-07, + "loss": 0.907, + "step": 356 + }, + { + "epoch": 2.44, + "grad_norm": 2.09375, + "learning_rate": 9.78286630072093e-07, + "loss": 0.9366, + "step": 357 + }, + { + "epoch": 2.45, + "grad_norm": 2.15625, + "learning_rate": 9.70488747194795e-07, + "loss": 0.9417, + "step": 358 + }, + { + "epoch": 2.46, + "grad_norm": 2.171875, + "learning_rate": 9.627071775851547e-07, + "loss": 0.9247, + "step": 359 + }, + { + "epoch": 2.46, + "grad_norm": 2.0625, + "learning_rate": 9.549421609790195e-07, + "loss": 0.9132, + "step": 360 + }, + { + "epoch": 2.47, + "grad_norm": 2.09375, + "learning_rate": 9.471939366022647e-07, + "loss": 0.9296, + "step": 361 + }, + { + "epoch": 2.48, + "grad_norm": 2.0625, + "learning_rate": 9.394627431634321e-07, + "loss": 0.9341, + "step": 362 + }, + { + "epoch": 2.48, + "grad_norm": 2.109375, + "learning_rate": 9.317488188463695e-07, + "loss": 0.9266, + "step": 363 + }, + { + "epoch": 2.49, + "grad_norm": 2.140625, + "learning_rate": 9.240524013028948e-07, + "loss": 0.9269, + "step": 364 + }, + { + "epoch": 2.5, + "grad_norm": 2.0625, + "learning_rate": 9.163737276454758e-07, + "loss": 0.9158, + "step": 365 + }, + { + "epoch": 2.51, + "grad_norm": 2.0625, + "learning_rate": 9.087130344399223e-07, + "loss": 0.8931, + "step": 366 + }, + { + "epoch": 2.51, + "grad_norm": 2.078125, + "learning_rate": 9.010705576981002e-07, + "loss": 0.9486, + "step": 367 + }, + { + "epoch": 2.52, + "grad_norm": 2.140625, + "learning_rate": 8.934465328706593e-07, + "loss": 0.9348, + "step": 368 + }, + { + "epoch": 2.53, + "grad_norm": 2.0625, + "learning_rate": 8.858411948397813e-07, + "loss": 0.9305, + "step": 369 + }, + { + "epoch": 2.53, + "grad_norm": 2.109375, + "learning_rate": 8.782547779119386e-07, + "loss": 0.9138, + "step": 370 + }, + { + "epoch": 2.54, + "grad_norm": 2.125, + "learning_rate": 8.706875158106834e-07, + "loss": 0.9354, + "step": 371 + }, + { + "epoch": 2.55, + "grad_norm": 2.0625, + "learning_rate": 8.6313964166944e-07, + "loss": 0.9169, + "step": 372 + }, + { + "epoch": 2.55, + "grad_norm": 2.125, + "learning_rate": 8.556113880243266e-07, + "loss": 0.9358, + "step": 373 + }, + { + "epoch": 2.56, + "grad_norm": 2.125, + "learning_rate": 8.481029868069898e-07, + "loss": 0.9501, + "step": 374 + }, + { + "epoch": 2.57, + "grad_norm": 2.140625, + "learning_rate": 8.406146693374587e-07, + "loss": 0.9193, + "step": 375 + }, + { + "epoch": 2.57, + "grad_norm": 2.15625, + "learning_rate": 8.331466663170208e-07, + "loss": 1.0219, + "step": 376 + }, + { + "epoch": 2.58, + "grad_norm": 2.03125, + "learning_rate": 8.256992078211112e-07, + "loss": 0.9451, + "step": 377 + }, + { + "epoch": 2.59, + "grad_norm": 2.046875, + "learning_rate": 8.182725232922269e-07, + "loss": 0.9171, + "step": 378 + }, + { + "epoch": 2.6, + "grad_norm": 2.125, + "learning_rate": 8.10866841532856e-07, + "loss": 0.923, + "step": 379 + }, + { + "epoch": 2.6, + "grad_norm": 2.109375, + "learning_rate": 8.034823906984308e-07, + "loss": 0.9226, + "step": 380 + }, + { + "epoch": 2.61, + "grad_norm": 2.015625, + "learning_rate": 7.961193982902977e-07, + "loss": 0.9091, + "step": 381 + }, + { + "epoch": 2.62, + "grad_norm": 2.015625, + "learning_rate": 7.88778091148709e-07, + "loss": 0.9069, + "step": 382 + }, + { + "epoch": 2.62, + "grad_norm": 2.078125, + "learning_rate": 7.814586954458334e-07, + "loss": 0.9241, + "step": 383 + }, + { + "epoch": 2.63, + "grad_norm": 2.125, + "learning_rate": 7.741614366787881e-07, + "loss": 0.9249, + "step": 384 + }, + { + "epoch": 2.64, + "grad_norm": 2.015625, + "learning_rate": 7.668865396626924e-07, + "loss": 0.9151, + "step": 385 + }, + { + "epoch": 2.64, + "grad_norm": 2.09375, + "learning_rate": 7.59634228523741e-07, + "loss": 0.9422, + "step": 386 + }, + { + "epoch": 2.65, + "grad_norm": 2.015625, + "learning_rate": 7.524047266922997e-07, + "loss": 0.9168, + "step": 387 + }, + { + "epoch": 2.66, + "grad_norm": 2.171875, + "learning_rate": 7.451982568960207e-07, + "loss": 0.9353, + "step": 388 + }, + { + "epoch": 2.66, + "grad_norm": 2.15625, + "learning_rate": 7.380150411529826e-07, + "loss": 0.9208, + "step": 389 + }, + { + "epoch": 2.67, + "grad_norm": 2.203125, + "learning_rate": 7.308553007648485e-07, + "loss": 0.9216, + "step": 390 + }, + { + "epoch": 2.68, + "grad_norm": 2.09375, + "learning_rate": 7.237192563100496e-07, + "loss": 0.973, + "step": 391 + }, + { + "epoch": 2.69, + "grad_norm": 2.0625, + "learning_rate": 7.166071276369886e-07, + "loss": 0.8989, + "step": 392 + }, + { + "epoch": 2.69, + "grad_norm": 2.0625, + "learning_rate": 7.095191338572666e-07, + "loss": 0.909, + "step": 393 + }, + { + "epoch": 2.7, + "grad_norm": 2.03125, + "learning_rate": 7.024554933389344e-07, + "loss": 0.9199, + "step": 394 + }, + { + "epoch": 2.71, + "grad_norm": 2.0625, + "learning_rate": 6.95416423699763e-07, + "loss": 0.9297, + "step": 395 + }, + { + "epoch": 2.71, + "grad_norm": 2.109375, + "learning_rate": 6.884021418005384e-07, + "loss": 0.9304, + "step": 396 + }, + { + "epoch": 2.72, + "grad_norm": 2.125, + "learning_rate": 6.814128637383837e-07, + "loss": 0.9458, + "step": 397 + }, + { + "epoch": 2.73, + "grad_norm": 2.078125, + "learning_rate": 6.74448804840099e-07, + "loss": 0.9615, + "step": 398 + }, + { + "epoch": 2.73, + "grad_norm": 2.046875, + "learning_rate": 6.675101796555279e-07, + "loss": 0.9203, + "step": 399 + }, + { + "epoch": 2.74, + "grad_norm": 2.09375, + "learning_rate": 6.605972019509501e-07, + "loss": 0.9297, + "step": 400 + }, + { + "epoch": 2.75, + "grad_norm": 2.015625, + "learning_rate": 6.537100847024914e-07, + "loss": 0.9314, + "step": 401 + }, + { + "epoch": 2.75, + "grad_norm": 2.015625, + "learning_rate": 6.468490400895653e-07, + "loss": 0.9189, + "step": 402 + }, + { + "epoch": 2.76, + "grad_norm": 2.046875, + "learning_rate": 6.400142794883356e-07, + "loss": 0.9125, + "step": 403 + }, + { + "epoch": 2.77, + "grad_norm": 2.078125, + "learning_rate": 6.332060134652033e-07, + "loss": 0.9308, + "step": 404 + }, + { + "epoch": 2.78, + "grad_norm": 2.140625, + "learning_rate": 6.264244517703215e-07, + "loss": 0.9439, + "step": 405 + }, + { + "epoch": 2.78, + "grad_norm": 2.109375, + "learning_rate": 6.196698033311305e-07, + "loss": 0.9444, + "step": 406 + }, + { + "epoch": 2.79, + "grad_norm": 2.03125, + "learning_rate": 6.12942276245924e-07, + "loss": 0.9637, + "step": 407 + }, + { + "epoch": 2.8, + "grad_norm": 2.0, + "learning_rate": 6.062420777774359e-07, + "loss": 0.8928, + "step": 408 + }, + { + "epoch": 2.8, + "grad_norm": 2.09375, + "learning_rate": 5.99569414346456e-07, + "loss": 0.9321, + "step": 409 + }, + { + "epoch": 2.81, + "grad_norm": 2.140625, + "learning_rate": 5.929244915254703e-07, + "loss": 0.8973, + "step": 410 + }, + { + "epoch": 2.82, + "grad_norm": 2.140625, + "learning_rate": 5.86307514032327e-07, + "loss": 0.9113, + "step": 411 + }, + { + "epoch": 2.82, + "grad_norm": 2.1875, + "learning_rate": 5.797186857239313e-07, + "loss": 0.9453, + "step": 412 + }, + { + "epoch": 2.83, + "grad_norm": 2.078125, + "learning_rate": 5.731582095899636e-07, + "loss": 0.9417, + "step": 413 + }, + { + "epoch": 2.84, + "grad_norm": 2.03125, + "learning_rate": 5.666262877466246e-07, + "loss": 0.9198, + "step": 414 + }, + { + "epoch": 2.84, + "grad_norm": 2.0625, + "learning_rate": 5.601231214304107e-07, + "loss": 0.9268, + "step": 415 + }, + { + "epoch": 2.85, + "grad_norm": 2.0, + "learning_rate": 5.536489109919141e-07, + "loss": 0.9272, + "step": 416 + }, + { + "epoch": 2.86, + "grad_norm": 2.0625, + "learning_rate": 5.472038558896483e-07, + "loss": 0.9091, + "step": 417 + }, + { + "epoch": 2.87, + "grad_norm": 2.0625, + "learning_rate": 5.40788154683906e-07, + "loss": 0.9193, + "step": 418 + }, + { + "epoch": 2.87, + "grad_norm": 2.0625, + "learning_rate": 5.344020050306396e-07, + "loss": 0.9212, + "step": 419 + }, + { + "epoch": 2.88, + "grad_norm": 2.125, + "learning_rate": 5.280456036753723e-07, + "loss": 1.0223, + "step": 420 + }, + { + "epoch": 2.89, + "grad_norm": 2.109375, + "learning_rate": 5.217191464471373e-07, + "loss": 0.9886, + "step": 421 + }, + { + "epoch": 2.89, + "grad_norm": 2.140625, + "learning_rate": 5.15422828252444e-07, + "loss": 0.9164, + "step": 422 + }, + { + "epoch": 2.9, + "grad_norm": 2.0625, + "learning_rate": 5.091568430692738e-07, + "loss": 0.9474, + "step": 423 + }, + { + "epoch": 2.91, + "grad_norm": 2.15625, + "learning_rate": 5.029213839411043e-07, + "loss": 0.9271, + "step": 424 + }, + { + "epoch": 2.91, + "grad_norm": 2.0625, + "learning_rate": 4.967166429709606e-07, + "loss": 0.9369, + "step": 425 + }, + { + "epoch": 2.92, + "grad_norm": 2.125, + "learning_rate": 4.905428113154986e-07, + "loss": 0.9415, + "step": 426 + }, + { + "epoch": 2.93, + "grad_norm": 2.078125, + "learning_rate": 4.844000791791147e-07, + "loss": 0.9494, + "step": 427 + }, + { + "epoch": 2.93, + "grad_norm": 2.140625, + "learning_rate": 4.782886358080865e-07, + "loss": 0.8931, + "step": 428 + }, + { + "epoch": 2.94, + "grad_norm": 2.109375, + "learning_rate": 4.7220866948474156e-07, + "loss": 0.922, + "step": 429 + }, + { + "epoch": 2.95, + "grad_norm": 2.109375, + "learning_rate": 4.6616036752165916e-07, + "loss": 0.9282, + "step": 430 + }, + { + "epoch": 2.96, + "grad_norm": 2.109375, + "learning_rate": 4.6014391625589697e-07, + "loss": 0.9364, + "step": 431 + }, + { + "epoch": 2.96, + "grad_norm": 2.140625, + "learning_rate": 4.541595010432501e-07, + "loss": 0.9398, + "step": 432 + } + ], + "logging_steps": 1, + "max_steps": 576, + "num_input_tokens_seen": 0, + "num_train_epochs": 4, + "save_steps": 72, + "total_flos": 6.374281953222328e+17, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/training_args.bin b/training_args.bin new file mode 100644 index 0000000..c674318 --- /dev/null +++ b/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bbc304786eceedd5aad7ea6593da6cdd0113664d448a0579cbcb93e1bf257b1c +size 5307