commit 93cf55f1bd6d3f1ebf43eb1d1f24c95ca9583df3 Author: ModelHub XC Date: Sun Apr 12 08:19:54 2026 +0800 初始化项目,由ModelHub XC社区提供模型 Model: BSC-LT/Checkpoint_4epoch_rag Source: Original Platform diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 0000000..2c5702c --- /dev/null +++ b/.gitattributes @@ -0,0 +1,40 @@ +*.7z filter=lfs diff=lfs merge=lfs -text +*.arrow filter=lfs diff=lfs merge=lfs -text +*.bin filter=lfs diff=lfs merge=lfs -text +*.bz2 filter=lfs diff=lfs merge=lfs -text +*.ckpt filter=lfs diff=lfs merge=lfs -text +*.ftz filter=lfs diff=lfs merge=lfs -text +*.gz filter=lfs diff=lfs merge=lfs -text +*.h5 filter=lfs diff=lfs merge=lfs -text +*.joblib filter=lfs diff=lfs merge=lfs -text +*.lfs.* filter=lfs diff=lfs merge=lfs -text +*.mlmodel filter=lfs diff=lfs merge=lfs -text +*.model filter=lfs diff=lfs merge=lfs -text +*.msgpack filter=lfs diff=lfs merge=lfs -text +*.npy filter=lfs diff=lfs merge=lfs -text +*.npz filter=lfs diff=lfs merge=lfs -text +*.onnx filter=lfs diff=lfs merge=lfs -text +*.ot filter=lfs diff=lfs merge=lfs -text +*.parquet filter=lfs diff=lfs merge=lfs -text +*.pb filter=lfs diff=lfs merge=lfs -text +*.pickle filter=lfs diff=lfs merge=lfs -text +*.pkl filter=lfs diff=lfs merge=lfs -text +*.pt filter=lfs diff=lfs merge=lfs -text +*.pth filter=lfs diff=lfs merge=lfs -text +*.rar filter=lfs diff=lfs merge=lfs -text +*.safetensors filter=lfs diff=lfs merge=lfs -text +saved_model/**/* filter=lfs diff=lfs merge=lfs -text +*.tar.* filter=lfs diff=lfs merge=lfs -text +*.tar filter=lfs diff=lfs merge=lfs -text +*.tflite filter=lfs diff=lfs merge=lfs -text +*.tgz filter=lfs diff=lfs merge=lfs -text +*.wasm filter=lfs diff=lfs merge=lfs -text +*.xz filter=lfs diff=lfs merge=lfs -text +*.zip filter=lfs diff=lfs merge=lfs -text +*.zst filter=lfs diff=lfs merge=lfs -text +*tfevents* filter=lfs diff=lfs merge=lfs -text +model-00001-of-00004.safetensors filter=lfs diff=lfs merge=lfs -text +model-00002-of-00004.safetensors filter=lfs diff=lfs merge=lfs -text +model-00003-of-00004.safetensors filter=lfs diff=lfs merge=lfs -text +model-00004-of-00004.safetensors filter=lfs diff=lfs merge=lfs -text +tokenizer.json filter=lfs diff=lfs merge=lfs -text diff --git a/README.md b/README.md new file mode 100644 index 0000000..3d3eb11 --- /dev/null +++ b/README.md @@ -0,0 +1,10 @@ +--- +license: apache-2.0 +datasets: +- projecte-aina/RAG_Multilingual +language: +- es +- en +- ca +library_name: transformers +--- \ No newline at end of file diff --git a/config.json b/config.json new file mode 100644 index 0000000..28779df --- /dev/null +++ b/config.json @@ -0,0 +1,29 @@ +{ + "_name_or_path": "/gpfs/projects/bsc88/hf-models/fourth_epoch_bsc_7b_restart_mix1_all_fineweb_from_mix1_lr3e-5_lr3e-6_step68625_hf/", + "architectures": [ + "LlamaForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "bos_token_id": 1, + "eos_token_id": 2, + "hidden_act": "silu", + "hidden_size": 4096, + "initializer_range": 0.02, + "intermediate_size": 11008, + "max_position_embeddings": 8192, + "mlp_bias": false, + "model_type": "llama", + "num_attention_heads": 32, + "num_hidden_layers": 32, + "num_key_value_heads": 8, + "pretraining_tp": 1, + "rms_norm_eps": 1e-06, + "rope_scaling": null, + "rope_theta": 10000.0, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.44.0", + "use_cache": true, + "vocab_size": 256000 +} diff --git a/configuration.json b/configuration.json new file mode 100644 index 0000000..bbeeda1 --- /dev/null +++ b/configuration.json @@ -0,0 +1 @@ +{"framework": "pytorch", "task": "text-generation", "allow_remote": true} \ No newline at end of file diff --git a/generation_config.json b/generation_config.json new file mode 100644 index 0000000..619b676 --- /dev/null +++ b/generation_config.json @@ -0,0 +1,6 @@ +{ + "_from_model_config": true, + "bos_token_id": 1, + "eos_token_id": 2, + "transformers_version": "4.44.0" +} diff --git a/model-00001-of-00004.safetensors b/model-00001-of-00004.safetensors new file mode 100644 index 0000000..df4f2ee --- /dev/null +++ b/model-00001-of-00004.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de6cd55ac2cd9af0f8b5bf869e48e1bfc9b897d487fe980fbbbbdd1cba6345ae +size 4982973048 diff --git a/model-00002-of-00004.safetensors b/model-00002-of-00004.safetensors new file mode 100644 index 0000000..2c9900f --- /dev/null +++ b/model-00002-of-00004.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:adbc8a286a7f95213925072695848cbc1bef9c9c9215686b8ed62afc4f28900f +size 4995660232 diff --git a/model-00003-of-00004.safetensors b/model-00003-of-00004.safetensors new file mode 100644 index 0000000..7f14f25 --- /dev/null +++ b/model-00003-of-00004.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5eb79ac135e0333ec3da487bd73779d505090cac9069c61ddd10fcb3206ce67e +size 3460482936 diff --git a/model-00004-of-00004.safetensors b/model-00004-of-00004.safetensors new file mode 100644 index 0000000..feea6e3 --- /dev/null +++ b/model-00004-of-00004.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:69a355b8a5380f68bd5aa5d8fdc4b8c866e63ac5d1ed78d7fa8afcf59448e280 +size 2097152128 diff --git a/model.safetensors.index.json b/model.safetensors.index.json new file mode 100644 index 0000000..5cbd692 --- /dev/null +++ b/model.safetensors.index.json @@ -0,0 +1,298 @@ +{ + "metadata": { + "total_size": 15536234496 + }, + "weight_map": { + "lm_head.weight": "model-00004-of-00004.safetensors", + "model.embed_tokens.weight": "model-00001-of-00004.safetensors", + "model.layers.0.input_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.0.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.input_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.1.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.10.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.10.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.10.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.11.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.11.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.12.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.12.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.13.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.13.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.14.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.14.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.15.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.15.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.16.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.16.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.17.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.17.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.17.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.17.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.17.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.17.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.17.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.17.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.17.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.18.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.18.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.18.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.18.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.18.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.18.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.18.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.18.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.18.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.19.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.19.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.19.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.19.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.19.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.19.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.19.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.19.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.19.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.2.input_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.2.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.20.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.20.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.20.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.20.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.20.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.20.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.20.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.20.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.20.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.21.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.21.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.21.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.21.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.21.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.21.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.21.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.21.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.21.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.22.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.22.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.22.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.22.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.22.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.22.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.23.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.23.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.23.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.24.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.24.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.25.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.25.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.25.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.25.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.25.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.25.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.25.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.25.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.25.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.26.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.26.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.26.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.26.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.26.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.26.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.26.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.26.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.26.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.27.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.27.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.27.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.27.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.27.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.27.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.27.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.27.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.27.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.28.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.28.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.28.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.28.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.28.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.28.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.28.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.28.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.28.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.29.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.29.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.29.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.29.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.29.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.29.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.29.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.29.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.29.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.3.input_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.3.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.30.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.30.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.30.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.30.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.30.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.30.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.30.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.30.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.30.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.31.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.31.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.31.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.31.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.31.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.31.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.31.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.31.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.31.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.4.input_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.4.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.4.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.input_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.5.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.5.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.input_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.6.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.6.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.input_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.7.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.7.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.8.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.8.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.8.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.8.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.8.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.8.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.8.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.8.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.8.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.9.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.9.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.9.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.norm.weight": "model-00003-of-00004.safetensors" + } +} diff --git a/special_tokens_map.json b/special_tokens_map.json new file mode 100644 index 0000000..46e372a --- /dev/null +++ b/special_tokens_map.json @@ -0,0 +1,34 @@ +{ + "additional_special_tokens": [ + { + "content": "<|im_start|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + { + "content": "<|im_end|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } + ], + "bos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": "<|im_end|>", + "pad_token": "", + "unk_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/tokenizer.json b/tokenizer.json new file mode 100644 index 0000000..a09d021 --- /dev/null +++ b/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f055d86bd1a344221f15dcc85aaa96b6b80a616445504ef8fb5b74476e8140b4 +size 19092375 diff --git a/tokenizer.model b/tokenizer.model new file mode 100644 index 0000000..3307f8d --- /dev/null +++ b/tokenizer.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fa490e57cebce5cb1a0a5b1a5d3fa4de05aee53dc3a44791f1c3401db44d802d +size 4813274 diff --git a/tokenizer_config.json b/tokenizer_config.json new file mode 100644 index 0000000..6168846 --- /dev/null +++ b/tokenizer_config.json @@ -0,0 +1,1104 @@ +{ + "add_bos_token": true, + "add_eos_token": false, + "add_prefix_space": false, + "added_tokens_decoder": { + "0": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "3": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "4": { + "content": "<|im_start|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "5": { + "content": "<|im_end|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "6": { + "content": "<|reserved_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "7": { + "content": "<|reserved_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "8": { + "content": "<|reserved_token_3|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "9": { + "content": "<|reserved_token_4|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "10": { + "content": "<|reserved_token_5|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "11": { + "content": "<|reserved_token_6|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "12": { + "content": "<|reserved_token_7|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "13": { + "content": "<|reserved_token_8|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "14": { + "content": "<|reserved_token_9|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "15": { + "content": "<|reserved_token_10|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "16": { + "content": "<|reserved_token_11|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "17": { + "content": "<|reserved_token_12|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "18": { + "content": "<|reserved_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "19": { + "content": "<|reserved_token_14|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "20": { + "content": "<|reserved_token_15|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "21": { + "content": "<|reserved_token_16|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "22": { + "content": "<|reserved_token_17|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "23": { + "content": "<|reserved_token_18|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "24": { + "content": "<|reserved_token_19|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "25": { + "content": "<|reserved_token_20|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "26": { + "content": "<|reserved_token_21|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "27": { + "content": "<|reserved_token_22|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "28": { + "content": "<|reserved_token_23|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "29": { + "content": "<|reserved_token_24|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "30": { + "content": "<|reserved_token_25|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "31": { + "content": "<|reserved_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "32": { + "content": "<|reserved_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "33": { + "content": "<|reserved_token_28|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "34": { + "content": "<|reserved_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "35": { + "content": "<|reserved_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "36": { + "content": "<|reserved_token_31|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "37": { + "content": "<|reserved_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "38": { + "content": "<|reserved_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "39": { + "content": "<|reserved_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "40": { + "content": "<|reserved_token_35|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "41": { + "content": "<|reserved_token_36|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "42": { + "content": "<|reserved_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "43": { + "content": "<|reserved_token_38|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "44": { + "content": "<|reserved_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "45": { + "content": "<|reserved_token_40|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "46": { + "content": "<|reserved_token_41|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "47": { + "content": "<|reserved_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "48": { + "content": "<|reserved_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "49": { + "content": "<|reserved_token_44|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "50": { + "content": "<|reserved_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "51": { + "content": "<|reserved_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "52": { + "content": "<|reserved_token_47|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "53": { + "content": "<|reserved_token_48|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "54": { + "content": "<|reserved_token_49|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "55": { + "content": "<|reserved_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "56": { + "content": "<|reserved_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "57": { + "content": "<|reserved_token_52|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "58": { + "content": "<|reserved_token_53|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "59": { + "content": "<|reserved_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "60": { + "content": "<|reserved_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "61": { + "content": "<|reserved_token_56|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "62": { + "content": "<|reserved_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "63": { + "content": "<|reserved_token_58|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "64": { + "content": "<|reserved_token_59|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "65": { + "content": "<|reserved_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "66": { + "content": "<|reserved_token_61|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "67": { + "content": "<|reserved_token_62|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "68": { + "content": "<|reserved_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "69": { + "content": "<|reserved_token_64|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "70": { + "content": "<|reserved_token_65|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "71": { + "content": "<|reserved_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "72": { + "content": "<|reserved_token_67|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "73": { + "content": "<|reserved_token_68|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "74": { + "content": "<|reserved_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "75": { + "content": "<|reserved_token_70|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "76": { + "content": "<|reserved_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "77": { + "content": "<|reserved_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "78": { + "content": "<|reserved_token_73|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "79": { + "content": "<|reserved_token_74|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "80": { + "content": "<|reserved_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "81": { + "content": "<|reserved_token_76|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "82": { + "content": "<|reserved_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "83": { + "content": "<|reserved_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "84": { + "content": "<|reserved_token_79|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "85": { + "content": "<|reserved_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "86": { + "content": "<|reserved_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "87": { + "content": "<|reserved_token_82|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "88": { + "content": "<|reserved_token_83|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "89": { + "content": "<|reserved_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "90": { + "content": "<|reserved_token_85|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "91": { + "content": "<|reserved_token_86|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "92": { + "content": "<|reserved_token_87|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "93": { + "content": "<|reserved_token_88|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "94": { + "content": "<|reserved_token_89|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "95": { + "content": "<|reserved_token_90|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "96": { + "content": "<|reserved_token_91|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "97": { + "content": "<|reserved_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "98": { + "content": "<|reserved_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "99": { + "content": "<|reserved_token_94|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "100": { + "content": "<|reserved_token_95|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "101": { + "content": "<|reserved_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "102": { + "content": "<|reserved_token_97|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "103": { + "content": "<|reserved_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "104": { + "content": "\\r", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "105": { + "content": "▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "106": { + "content": "▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "107": { + "content": "▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "108": { + "content": "▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "109": { + "content": "▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "110": { + "content": "▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "111": { + "content": "▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "112": { + "content": "▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "113": { + "content": "▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "114": { + "content": "▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "115": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "116": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "117": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "118": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "119": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "120": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "121": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "122": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "123": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "124": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "125": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "126": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "127": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "128": { + "content": "\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "129": { + "content": "\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "130": { + "content": "\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "131": { + "content": "\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "132": { + "content": "\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "133": { + "content": "\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "134": { + "content": "\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + } + }, + "additional_special_tokens": [ + "<|im_start|>", + "<|im_end|>" + ], + "bos_token": "", + "chat_template": "{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] | trim + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}", + "clean_up_tokenization_spaces": false, + "eos_token": "<|im_end|>", + "legacy": true, + "model_max_length": 8192, + "pad_token": "", + "padding_side": "right", + "sp_model_kwargs": {}, + "spaces_between_special_tokens": false, + "tokenizer_class": "LlamaTokenizer", + "unk_token": "", + "use_default_system_prompt": false +} diff --git a/trainer_state.json b/trainer_state.json new file mode 100644 index 0000000..d377263 --- /dev/null +++ b/trainer_state.json @@ -0,0 +1,2376 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.9984871406959153, + "eval_steps": 83, + "global_step": 330, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0030257186081694403, + "grad_norm": 28.69647459181808, + "learning_rate": 2.0000000000000003e-06, + "loss": 1.1946, + "step": 1 + }, + { + "epoch": 0.006051437216338881, + "grad_norm": 46.535356376461294, + "learning_rate": 4.000000000000001e-06, + "loss": 1.2123, + "step": 2 + }, + { + "epoch": 0.009077155824508321, + "grad_norm": 26.580348193631924, + "learning_rate": 6e-06, + "loss": 1.1556, + "step": 3 + }, + { + "epoch": 0.012102874432677761, + "grad_norm": 42.77534282277984, + "learning_rate": 8.000000000000001e-06, + "loss": 1.0315, + "step": 4 + }, + { + "epoch": 0.015128593040847202, + "grad_norm": 16.275494364561833, + "learning_rate": 1e-05, + "loss": 0.876, + "step": 5 + }, + { + "epoch": 0.018154311649016642, + "grad_norm": 8.17885680137755, + "learning_rate": 1.2e-05, + "loss": 0.8625, + "step": 6 + }, + { + "epoch": 0.02118003025718608, + "grad_norm": 33.71594077895699, + "learning_rate": 1.4e-05, + "loss": 0.8587, + "step": 7 + }, + { + "epoch": 0.024205748865355523, + "grad_norm": 12.10712283931031, + "learning_rate": 1.6000000000000003e-05, + "loss": 0.8204, + "step": 8 + }, + { + "epoch": 0.02723146747352496, + "grad_norm": 14.797023078319215, + "learning_rate": 1.8e-05, + "loss": 0.8517, + "step": 9 + }, + { + "epoch": 0.030257186081694403, + "grad_norm": 5.808283122077986, + "learning_rate": 2e-05, + "loss": 0.8194, + "step": 10 + }, + { + "epoch": 0.03328290468986384, + "grad_norm": 3.6048514956689637, + "learning_rate": 1.999951808959328e-05, + "loss": 0.8072, + "step": 11 + }, + { + "epoch": 0.036308623298033284, + "grad_norm": 3.53002370438567, + "learning_rate": 1.9998072404820648e-05, + "loss": 0.7756, + "step": 12 + }, + { + "epoch": 0.039334341906202726, + "grad_norm": 2.905740802249514, + "learning_rate": 1.9995663085020215e-05, + "loss": 0.7423, + "step": 13 + }, + { + "epoch": 0.04236006051437216, + "grad_norm": 3.4168458120510192, + "learning_rate": 1.9992290362407232e-05, + "loss": 0.7363, + "step": 14 + }, + { + "epoch": 0.0453857791225416, + "grad_norm": 2.761712792781197, + "learning_rate": 1.9987954562051724e-05, + "loss": 0.7546, + "step": 15 + }, + { + "epoch": 0.048411497730711045, + "grad_norm": 3.0818637598025327, + "learning_rate": 1.998265610184716e-05, + "loss": 0.7589, + "step": 16 + }, + { + "epoch": 0.05143721633888049, + "grad_norm": 3.118071353231242, + "learning_rate": 1.997639549247016e-05, + "loss": 0.7511, + "step": 17 + }, + { + "epoch": 0.05446293494704992, + "grad_norm": 3.2183269397087835, + "learning_rate": 1.9969173337331283e-05, + "loss": 0.7413, + "step": 18 + }, + { + "epoch": 0.057488653555219364, + "grad_norm": 4.661245042116266, + "learning_rate": 1.9960990332516875e-05, + "loss": 0.769, + "step": 19 + }, + { + "epoch": 0.060514372163388806, + "grad_norm": 3.7325903495658737, + "learning_rate": 1.995184726672197e-05, + "loss": 0.7762, + "step": 20 + }, + { + "epoch": 0.06354009077155824, + "grad_norm": 4.816863844430095, + "learning_rate": 1.9941745021174284e-05, + "loss": 0.7492, + "step": 21 + }, + { + "epoch": 0.06656580937972768, + "grad_norm": 4.136202257254184, + "learning_rate": 1.9930684569549265e-05, + "loss": 0.7481, + "step": 22 + }, + { + "epoch": 0.06959152798789713, + "grad_norm": 3.9111680915408664, + "learning_rate": 1.991866697787626e-05, + "loss": 0.75, + "step": 23 + }, + { + "epoch": 0.07261724659606657, + "grad_norm": 3.0125682686675646, + "learning_rate": 1.990569340443577e-05, + "loss": 0.7363, + "step": 24 + }, + { + "epoch": 0.07564296520423601, + "grad_norm": 2.9756894328848924, + "learning_rate": 1.989176509964781e-05, + "loss": 0.7089, + "step": 25 + }, + { + "epoch": 0.07866868381240545, + "grad_norm": 2.927900220452882, + "learning_rate": 1.9876883405951378e-05, + "loss": 0.7189, + "step": 26 + }, + { + "epoch": 0.08169440242057488, + "grad_norm": 3.1957652828807026, + "learning_rate": 1.9861049757675087e-05, + "loss": 0.7228, + "step": 27 + }, + { + "epoch": 0.08472012102874432, + "grad_norm": 3.0414085114978477, + "learning_rate": 1.9844265680898917e-05, + "loss": 0.7329, + "step": 28 + }, + { + "epoch": 0.08774583963691376, + "grad_norm": 2.9039201895797597, + "learning_rate": 1.982653279330712e-05, + "loss": 0.7043, + "step": 29 + }, + { + "epoch": 0.0907715582450832, + "grad_norm": 3.1835496461858503, + "learning_rate": 1.9807852804032306e-05, + "loss": 0.699, + "step": 30 + }, + { + "epoch": 0.09379727685325265, + "grad_norm": 2.6119206153518224, + "learning_rate": 1.9788227513490724e-05, + "loss": 0.7274, + "step": 31 + }, + { + "epoch": 0.09682299546142209, + "grad_norm": 2.389474693597407, + "learning_rate": 1.9767658813208725e-05, + "loss": 0.7027, + "step": 32 + }, + { + "epoch": 0.09984871406959153, + "grad_norm": 3.6204577668426294, + "learning_rate": 1.974614868564045e-05, + "loss": 0.6499, + "step": 33 + }, + { + "epoch": 0.10287443267776097, + "grad_norm": 2.7807778849597056, + "learning_rate": 1.9723699203976768e-05, + "loss": 0.7215, + "step": 34 + }, + { + "epoch": 0.1059001512859304, + "grad_norm": 3.2488604572997435, + "learning_rate": 1.9700312531945444e-05, + "loss": 0.7381, + "step": 35 + }, + { + "epoch": 0.10892586989409984, + "grad_norm": 2.8180508499440635, + "learning_rate": 1.96759909236026e-05, + "loss": 0.7036, + "step": 36 + }, + { + "epoch": 0.11195158850226929, + "grad_norm": 2.8951625696679515, + "learning_rate": 1.9650736723115476e-05, + "loss": 0.6799, + "step": 37 + }, + { + "epoch": 0.11497730711043873, + "grad_norm": 2.8052448064970017, + "learning_rate": 1.9624552364536472e-05, + "loss": 0.7312, + "step": 38 + }, + { + "epoch": 0.11800302571860817, + "grad_norm": 2.658771948223404, + "learning_rate": 1.9597440371568576e-05, + "loss": 0.7064, + "step": 39 + }, + { + "epoch": 0.12102874432677761, + "grad_norm": 33.37825731362895, + "learning_rate": 1.956940335732209e-05, + "loss": 0.7025, + "step": 40 + }, + { + "epoch": 0.12405446293494705, + "grad_norm": 2.5519953709353147, + "learning_rate": 1.9540444024062807e-05, + "loss": 0.7037, + "step": 41 + }, + { + "epoch": 0.12708018154311648, + "grad_norm": 2.4926261964588714, + "learning_rate": 1.9510565162951538e-05, + "loss": 0.7398, + "step": 42 + }, + { + "epoch": 0.13010590015128592, + "grad_norm": 3.17581775275039, + "learning_rate": 1.9479769653775107e-05, + "loss": 0.6861, + "step": 43 + }, + { + "epoch": 0.13313161875945537, + "grad_norm": 2.8350832824191015, + "learning_rate": 1.944806046466878e-05, + "loss": 0.7409, + "step": 44 + }, + { + "epoch": 0.1361573373676248, + "grad_norm": 3.013396547202102, + "learning_rate": 1.941544065183021e-05, + "loss": 0.6992, + "step": 45 + }, + { + "epoch": 0.13918305597579425, + "grad_norm": 2.4991790227079274, + "learning_rate": 1.9381913359224844e-05, + "loss": 0.6754, + "step": 46 + }, + { + "epoch": 0.1422087745839637, + "grad_norm": 2.371137405420411, + "learning_rate": 1.9347481818282927e-05, + "loss": 0.6601, + "step": 47 + }, + { + "epoch": 0.14523449319213314, + "grad_norm": 2.5608484924777684, + "learning_rate": 1.9312149347588035e-05, + "loss": 0.7101, + "step": 48 + }, + { + "epoch": 0.14826021180030258, + "grad_norm": 2.4369577435382603, + "learning_rate": 1.9275919352557242e-05, + "loss": 0.7177, + "step": 49 + }, + { + "epoch": 0.15128593040847202, + "grad_norm": 2.1725421335195834, + "learning_rate": 1.9238795325112867e-05, + "loss": 0.7132, + "step": 50 + }, + { + "epoch": 0.15431164901664146, + "grad_norm": 2.8175635584619747, + "learning_rate": 1.920078084334595e-05, + "loss": 0.6798, + "step": 51 + }, + { + "epoch": 0.1573373676248109, + "grad_norm": 7.970119121422135, + "learning_rate": 1.916187957117136e-05, + "loss": 0.6955, + "step": 52 + }, + { + "epoch": 0.16036308623298035, + "grad_norm": 3.3508509400766857, + "learning_rate": 1.9122095257974676e-05, + "loss": 0.6843, + "step": 53 + }, + { + "epoch": 0.16338880484114976, + "grad_norm": 3.375521052312449, + "learning_rate": 1.9081431738250815e-05, + "loss": 0.691, + "step": 54 + }, + { + "epoch": 0.1664145234493192, + "grad_norm": 4.070807581379976, + "learning_rate": 1.9039892931234434e-05, + "loss": 0.6731, + "step": 55 + }, + { + "epoch": 0.16944024205748864, + "grad_norm": 2.268801481261902, + "learning_rate": 1.8997482840522218e-05, + "loss": 0.677, + "step": 56 + }, + { + "epoch": 0.17246596066565809, + "grad_norm": 2.457387820783625, + "learning_rate": 1.895420555368697e-05, + "loss": 0.6922, + "step": 57 + }, + { + "epoch": 0.17549167927382753, + "grad_norm": 2.0435275180676875, + "learning_rate": 1.891006524188368e-05, + "loss": 0.6752, + "step": 58 + }, + { + "epoch": 0.17851739788199697, + "grad_norm": 2.2528990754516385, + "learning_rate": 1.8865066159447468e-05, + "loss": 0.7021, + "step": 59 + }, + { + "epoch": 0.1815431164901664, + "grad_norm": 2.8238943942406105, + "learning_rate": 1.881921264348355e-05, + "loss": 0.6953, + "step": 60 + }, + { + "epoch": 0.18456883509833585, + "grad_norm": 1.9948330242242918, + "learning_rate": 1.8772509113449243e-05, + "loss": 0.6248, + "step": 61 + }, + { + "epoch": 0.1875945537065053, + "grad_norm": 3.7523229521508163, + "learning_rate": 1.8724960070727974e-05, + "loss": 0.7073, + "step": 62 + }, + { + "epoch": 0.19062027231467474, + "grad_norm": 2.1680071456762455, + "learning_rate": 1.8676570098195443e-05, + "loss": 0.6827, + "step": 63 + }, + { + "epoch": 0.19364599092284418, + "grad_norm": 2.042062162285346, + "learning_rate": 1.862734385977792e-05, + "loss": 0.69, + "step": 64 + }, + { + "epoch": 0.19667170953101362, + "grad_norm": 2.2165386400934515, + "learning_rate": 1.8577286100002723e-05, + "loss": 0.6922, + "step": 65 + }, + { + "epoch": 0.19969742813918306, + "grad_norm": 2.2231810005372243, + "learning_rate": 1.8526401643540924e-05, + "loss": 0.7022, + "step": 66 + }, + { + "epoch": 0.2027231467473525, + "grad_norm": 2.423903019156615, + "learning_rate": 1.8474695394742345e-05, + "loss": 0.7159, + "step": 67 + }, + { + "epoch": 0.20574886535552195, + "grad_norm": 2.198025834687471, + "learning_rate": 1.8422172337162865e-05, + "loss": 0.7089, + "step": 68 + }, + { + "epoch": 0.2087745839636914, + "grad_norm": 2.0031914139514, + "learning_rate": 1.8368837533084092e-05, + "loss": 0.6897, + "step": 69 + }, + { + "epoch": 0.2118003025718608, + "grad_norm": 3.0386359279568405, + "learning_rate": 1.8314696123025456e-05, + "loss": 0.6727, + "step": 70 + }, + { + "epoch": 0.21482602118003025, + "grad_norm": 2.261529681141294, + "learning_rate": 1.825975332524873e-05, + "loss": 0.7021, + "step": 71 + }, + { + "epoch": 0.2178517397881997, + "grad_norm": 2.0495889362586808, + "learning_rate": 1.8204014435255136e-05, + "loss": 0.6882, + "step": 72 + }, + { + "epoch": 0.22087745839636913, + "grad_norm": 2.296340383646409, + "learning_rate": 1.8147484825274895e-05, + "loss": 0.6454, + "step": 73 + }, + { + "epoch": 0.22390317700453857, + "grad_norm": 2.0304893758901197, + "learning_rate": 1.8090169943749477e-05, + "loss": 0.6804, + "step": 74 + }, + { + "epoch": 0.22692889561270801, + "grad_norm": 2.5133831251206344, + "learning_rate": 1.803207531480645e-05, + "loss": 0.7207, + "step": 75 + }, + { + "epoch": 0.22995461422087746, + "grad_norm": 2.1447445433471093, + "learning_rate": 1.797320653772707e-05, + "loss": 0.664, + "step": 76 + }, + { + "epoch": 0.2329803328290469, + "grad_norm": 2.028105828542751, + "learning_rate": 1.7913569286406606e-05, + "loss": 0.7106, + "step": 77 + }, + { + "epoch": 0.23600605143721634, + "grad_norm": 1.905594572089945, + "learning_rate": 1.785316930880745e-05, + "loss": 0.7199, + "step": 78 + }, + { + "epoch": 0.23903177004538578, + "grad_norm": 2.0681646724895195, + "learning_rate": 1.779201242640517e-05, + "loss": 0.6796, + "step": 79 + }, + { + "epoch": 0.24205748865355523, + "grad_norm": 1.8836218597127485, + "learning_rate": 1.773010453362737e-05, + "loss": 0.6715, + "step": 80 + }, + { + "epoch": 0.24508320726172467, + "grad_norm": 4.596541463949189, + "learning_rate": 1.7667451597285617e-05, + "loss": 0.6992, + "step": 81 + }, + { + "epoch": 0.2481089258698941, + "grad_norm": 2.0369793884589766, + "learning_rate": 1.7604059656000313e-05, + "loss": 0.7, + "step": 82 + }, + { + "epoch": 0.25113464447806355, + "grad_norm": 2.338455245767599, + "learning_rate": 1.7539934819618696e-05, + "loss": 0.6676, + "step": 83 + }, + { + "epoch": 0.25113464447806355, + "eval_loss": 0.6752901077270508, + "eval_runtime": 93.8585, + "eval_samples_per_second": 45.068, + "eval_steps_per_second": 0.714, + "step": 83 + }, + { + "epoch": 0.25416036308623297, + "grad_norm": 2.428048931609376, + "learning_rate": 1.747508326862597e-05, + "loss": 0.678, + "step": 84 + }, + { + "epoch": 0.25718608169440244, + "grad_norm": 2.0275496963886424, + "learning_rate": 1.7409511253549592e-05, + "loss": 0.695, + "step": 85 + }, + { + "epoch": 0.26021180030257185, + "grad_norm": 2.0508833034616156, + "learning_rate": 1.7343225094356857e-05, + "loss": 0.6928, + "step": 86 + }, + { + "epoch": 0.2632375189107413, + "grad_norm": 1.906095713213679, + "learning_rate": 1.727623117984575e-05, + "loss": 0.6578, + "step": 87 + }, + { + "epoch": 0.26626323751891073, + "grad_norm": 1.9381260380532637, + "learning_rate": 1.720853596702919e-05, + "loss": 0.6865, + "step": 88 + }, + { + "epoch": 0.2692889561270802, + "grad_norm": 2.044946626820364, + "learning_rate": 1.7140145980512684e-05, + "loss": 0.6637, + "step": 89 + }, + { + "epoch": 0.2723146747352496, + "grad_norm": 2.2577464480076745, + "learning_rate": 1.7071067811865477e-05, + "loss": 0.705, + "step": 90 + }, + { + "epoch": 0.2753403933434191, + "grad_norm": 2.073404675641156, + "learning_rate": 1.7001308118985237e-05, + "loss": 0.6879, + "step": 91 + }, + { + "epoch": 0.2783661119515885, + "grad_norm": 2.0173586400260413, + "learning_rate": 1.6930873625456362e-05, + "loss": 0.6843, + "step": 92 + }, + { + "epoch": 0.2813918305597579, + "grad_norm": 2.0019518111223538, + "learning_rate": 1.685977111990193e-05, + "loss": 0.685, + "step": 93 + }, + { + "epoch": 0.2844175491679274, + "grad_norm": 2.439404052212247, + "learning_rate": 1.678800745532942e-05, + "loss": 0.6769, + "step": 94 + }, + { + "epoch": 0.2874432677760968, + "grad_norm": 2.000348467180862, + "learning_rate": 1.6715589548470187e-05, + "loss": 0.6792, + "step": 95 + }, + { + "epoch": 0.29046898638426627, + "grad_norm": 2.0229640481669877, + "learning_rate": 1.664252437911282e-05, + "loss": 0.7093, + "step": 96 + }, + { + "epoch": 0.2934947049924357, + "grad_norm": 1.9446179261078531, + "learning_rate": 1.6568818989430416e-05, + "loss": 0.6671, + "step": 97 + }, + { + "epoch": 0.29652042360060515, + "grad_norm": 2.088319167066672, + "learning_rate": 1.6494480483301836e-05, + "loss": 0.6709, + "step": 98 + }, + { + "epoch": 0.29954614220877457, + "grad_norm": 2.0371738606680427, + "learning_rate": 1.641951602562703e-05, + "loss": 0.6565, + "step": 99 + }, + { + "epoch": 0.30257186081694404, + "grad_norm": 2.039051091974588, + "learning_rate": 1.6343932841636455e-05, + "loss": 0.6774, + "step": 100 + }, + { + "epoch": 0.30559757942511345, + "grad_norm": 1.89072521062054, + "learning_rate": 1.6267738216194698e-05, + "loss": 0.6855, + "step": 101 + }, + { + "epoch": 0.3086232980332829, + "grad_norm": 1.9373416372844936, + "learning_rate": 1.6190939493098344e-05, + "loss": 0.6613, + "step": 102 + }, + { + "epoch": 0.31164901664145234, + "grad_norm": 2.4850609752400095, + "learning_rate": 1.6113544074368166e-05, + "loss": 0.6693, + "step": 103 + }, + { + "epoch": 0.3146747352496218, + "grad_norm": 1.8893955211292324, + "learning_rate": 1.6035559419535714e-05, + "loss": 0.6769, + "step": 104 + }, + { + "epoch": 0.3177004538577912, + "grad_norm": 1.9633433755541898, + "learning_rate": 1.5956993044924334e-05, + "loss": 0.6623, + "step": 105 + }, + { + "epoch": 0.3207261724659607, + "grad_norm": 2.004985996706271, + "learning_rate": 1.5877852522924733e-05, + "loss": 0.641, + "step": 106 + }, + { + "epoch": 0.3237518910741301, + "grad_norm": 1.8629977634675294, + "learning_rate": 1.579814548126514e-05, + "loss": 0.6458, + "step": 107 + }, + { + "epoch": 0.3267776096822995, + "grad_norm": 1.9494867289539974, + "learning_rate": 1.5717879602276123e-05, + "loss": 0.6418, + "step": 108 + }, + { + "epoch": 0.329803328290469, + "grad_norm": 1.8622891514920465, + "learning_rate": 1.5637062622150168e-05, + "loss": 0.6313, + "step": 109 + }, + { + "epoch": 0.3328290468986384, + "grad_norm": 1.9827293450600925, + "learning_rate": 1.5555702330196024e-05, + "loss": 0.6712, + "step": 110 + }, + { + "epoch": 0.3358547655068079, + "grad_norm": 2.4176703750653386, + "learning_rate": 1.547380656808797e-05, + "loss": 0.654, + "step": 111 + }, + { + "epoch": 0.3388804841149773, + "grad_norm": 1.9338933323273004, + "learning_rate": 1.5391383229110005e-05, + "loss": 0.6856, + "step": 112 + }, + { + "epoch": 0.34190620272314676, + "grad_norm": 2.0232362850746797, + "learning_rate": 1.5308440257395095e-05, + "loss": 0.6765, + "step": 113 + }, + { + "epoch": 0.34493192133131617, + "grad_norm": 1.9568056443475894, + "learning_rate": 1.5224985647159489e-05, + "loss": 0.676, + "step": 114 + }, + { + "epoch": 0.34795763993948564, + "grad_norm": 1.8861676872802784, + "learning_rate": 1.5141027441932217e-05, + "loss": 0.6497, + "step": 115 + }, + { + "epoch": 0.35098335854765506, + "grad_norm": 2.2756500221824787, + "learning_rate": 1.5056573733779848e-05, + "loss": 0.6743, + "step": 116 + }, + { + "epoch": 0.3540090771558245, + "grad_norm": 1.9139552054178928, + "learning_rate": 1.4971632662526545e-05, + "loss": 0.6471, + "step": 117 + }, + { + "epoch": 0.35703479576399394, + "grad_norm": 1.945783892355681, + "learning_rate": 1.4886212414969551e-05, + "loss": 0.6616, + "step": 118 + }, + { + "epoch": 0.3600605143721634, + "grad_norm": 2.3636500104137808, + "learning_rate": 1.4800321224090114e-05, + "loss": 0.6661, + "step": 119 + }, + { + "epoch": 0.3630862329803328, + "grad_norm": 1.9824827085817427, + "learning_rate": 1.4713967368259981e-05, + "loss": 0.6713, + "step": 120 + }, + { + "epoch": 0.3661119515885023, + "grad_norm": 2.215021731887088, + "learning_rate": 1.4627159170443504e-05, + "loss": 0.6658, + "step": 121 + }, + { + "epoch": 0.3691376701966717, + "grad_norm": 10.527580401090502, + "learning_rate": 1.4539904997395468e-05, + "loss": 0.6626, + "step": 122 + }, + { + "epoch": 0.3721633888048411, + "grad_norm": 2.649768108207238, + "learning_rate": 1.4452213258854684e-05, + "loss": 0.6578, + "step": 123 + }, + { + "epoch": 0.3751891074130106, + "grad_norm": 2.060396896527891, + "learning_rate": 1.436409240673342e-05, + "loss": 0.6584, + "step": 124 + }, + { + "epoch": 0.37821482602118, + "grad_norm": 2.012679002194894, + "learning_rate": 1.4275550934302822e-05, + "loss": 0.6953, + "step": 125 + }, + { + "epoch": 0.3812405446293495, + "grad_norm": 9.375765884143409, + "learning_rate": 1.4186597375374283e-05, + "loss": 0.6594, + "step": 126 + }, + { + "epoch": 0.3842662632375189, + "grad_norm": 1.9968667010149235, + "learning_rate": 1.4097240303476955e-05, + "loss": 0.6657, + "step": 127 + }, + { + "epoch": 0.38729198184568836, + "grad_norm": 1.9638652973708315, + "learning_rate": 1.4007488331031409e-05, + "loss": 0.6402, + "step": 128 + }, + { + "epoch": 0.3903177004538578, + "grad_norm": 2.6974851402436326, + "learning_rate": 1.391735010851956e-05, + "loss": 0.6999, + "step": 129 + }, + { + "epoch": 0.39334341906202724, + "grad_norm": 2.3946120322541145, + "learning_rate": 1.3826834323650899e-05, + "loss": 0.6929, + "step": 130 + }, + { + "epoch": 0.39636913767019666, + "grad_norm": 2.606665967226635, + "learning_rate": 1.3735949700525164e-05, + "loss": 0.6406, + "step": 131 + }, + { + "epoch": 0.39939485627836613, + "grad_norm": 125.89612873667855, + "learning_rate": 1.3644704998791501e-05, + "loss": 0.6678, + "step": 132 + }, + { + "epoch": 0.40242057488653554, + "grad_norm": 3.514135273140995, + "learning_rate": 1.3553109012804162e-05, + "loss": 0.6687, + "step": 133 + }, + { + "epoch": 0.405446293494705, + "grad_norm": 4.240922776003794, + "learning_rate": 1.346117057077493e-05, + "loss": 0.658, + "step": 134 + }, + { + "epoch": 0.4084720121028744, + "grad_norm": 2.1855195011412922, + "learning_rate": 1.3368898533922202e-05, + "loss": 0.6677, + "step": 135 + }, + { + "epoch": 0.4114977307110439, + "grad_norm": 2.463546469392426, + "learning_rate": 1.3276301795616937e-05, + "loss": 0.6627, + "step": 136 + }, + { + "epoch": 0.4145234493192133, + "grad_norm": 2.133545623738071, + "learning_rate": 1.3183389280525497e-05, + "loss": 0.6928, + "step": 137 + }, + { + "epoch": 0.4175491679273828, + "grad_norm": 2.535405634645763, + "learning_rate": 1.3090169943749475e-05, + "loss": 0.644, + "step": 138 + }, + { + "epoch": 0.4205748865355522, + "grad_norm": 2.1936138643866223, + "learning_rate": 1.2996652769962567e-05, + "loss": 0.686, + "step": 139 + }, + { + "epoch": 0.4236006051437216, + "grad_norm": 1.9156629913412506, + "learning_rate": 1.2902846772544625e-05, + "loss": 0.6528, + "step": 140 + }, + { + "epoch": 0.4266263237518911, + "grad_norm": 1.902537792043501, + "learning_rate": 1.2808760992712923e-05, + "loss": 0.6542, + "step": 141 + }, + { + "epoch": 0.4296520423600605, + "grad_norm": 2.0509718747044197, + "learning_rate": 1.2714404498650743e-05, + "loss": 0.6259, + "step": 142 + }, + { + "epoch": 0.43267776096822996, + "grad_norm": 3.364345349502592, + "learning_rate": 1.2619786384633374e-05, + "loss": 0.6699, + "step": 143 + }, + { + "epoch": 0.4357034795763994, + "grad_norm": 3.6100731074556656, + "learning_rate": 1.252491577015158e-05, + "loss": 0.6583, + "step": 144 + }, + { + "epoch": 0.43872919818456885, + "grad_norm": 1.8986778970529314, + "learning_rate": 1.242980179903264e-05, + "loss": 0.6519, + "step": 145 + }, + { + "epoch": 0.44175491679273826, + "grad_norm": 2.0039061587882623, + "learning_rate": 1.2334453638559057e-05, + "loss": 0.6701, + "step": 146 + }, + { + "epoch": 0.44478063540090773, + "grad_norm": 2.0871498766669467, + "learning_rate": 1.2238880478584987e-05, + "loss": 0.6336, + "step": 147 + }, + { + "epoch": 0.44780635400907715, + "grad_norm": 2.260892934011929, + "learning_rate": 1.2143091530650508e-05, + "loss": 0.6655, + "step": 148 + }, + { + "epoch": 0.4508320726172466, + "grad_norm": 1.9595683172138556, + "learning_rate": 1.2047096027093798e-05, + "loss": 0.6342, + "step": 149 + }, + { + "epoch": 0.45385779122541603, + "grad_norm": 1.8964347487859918, + "learning_rate": 1.1950903220161286e-05, + "loss": 0.6447, + "step": 150 + }, + { + "epoch": 0.4568835098335855, + "grad_norm": 2.0465943957039325, + "learning_rate": 1.185452238111591e-05, + "loss": 0.6398, + "step": 151 + }, + { + "epoch": 0.4599092284417549, + "grad_norm": 2.2749337860012186, + "learning_rate": 1.1757962799343548e-05, + "loss": 0.6228, + "step": 152 + }, + { + "epoch": 0.4629349470499244, + "grad_norm": 1.9360234732478145, + "learning_rate": 1.1661233781457655e-05, + "loss": 0.6271, + "step": 153 + }, + { + "epoch": 0.4659606656580938, + "grad_norm": 6.391362683845854, + "learning_rate": 1.156434465040231e-05, + "loss": 0.6102, + "step": 154 + }, + { + "epoch": 0.4689863842662632, + "grad_norm": 2.6964140732000597, + "learning_rate": 1.1467304744553618e-05, + "loss": 0.6381, + "step": 155 + }, + { + "epoch": 0.4720121028744327, + "grad_norm": 2.040478374694881, + "learning_rate": 1.1370123416819683e-05, + "loss": 0.6891, + "step": 156 + }, + { + "epoch": 0.4750378214826021, + "grad_norm": 2.5024088180061996, + "learning_rate": 1.1272810033739134e-05, + "loss": 0.68, + "step": 157 + }, + { + "epoch": 0.47806354009077157, + "grad_norm": 5.034110912343636, + "learning_rate": 1.1175373974578378e-05, + "loss": 0.6357, + "step": 158 + }, + { + "epoch": 0.481089258698941, + "grad_norm": 1.8088504044607232, + "learning_rate": 1.1077824630427593e-05, + "loss": 0.6596, + "step": 159 + }, + { + "epoch": 0.48411497730711045, + "grad_norm": 1.9005713985258568, + "learning_rate": 1.098017140329561e-05, + "loss": 0.6198, + "step": 160 + }, + { + "epoch": 0.48714069591527986, + "grad_norm": 1.9342848652449698, + "learning_rate": 1.0882423705203698e-05, + "loss": 0.6396, + "step": 161 + }, + { + "epoch": 0.49016641452344933, + "grad_norm": 2.829885337118367, + "learning_rate": 1.0784590957278452e-05, + "loss": 0.6673, + "step": 162 + }, + { + "epoch": 0.49319213313161875, + "grad_norm": 1.9563381110322227, + "learning_rate": 1.0686682588843737e-05, + "loss": 0.6378, + "step": 163 + }, + { + "epoch": 0.4962178517397882, + "grad_norm": 2.456309042364747, + "learning_rate": 1.058870803651189e-05, + "loss": 0.6297, + "step": 164 + }, + { + "epoch": 0.49924357034795763, + "grad_norm": 1.929682148235842, + "learning_rate": 1.0490676743274181e-05, + "loss": 0.6579, + "step": 165 + }, + { + "epoch": 0.5022692889561271, + "grad_norm": 2.1866586377485624, + "learning_rate": 1.0392598157590687e-05, + "loss": 0.6704, + "step": 166 + }, + { + "epoch": 0.5022692889561271, + "eval_loss": 0.6389465928077698, + "eval_runtime": 89.0761, + "eval_samples_per_second": 47.488, + "eval_steps_per_second": 0.752, + "step": 166 + }, + { + "epoch": 0.5052950075642966, + "grad_norm": 2.7293134459227932, + "learning_rate": 1.0294481732479635e-05, + "loss": 0.6475, + "step": 167 + }, + { + "epoch": 0.5083207261724659, + "grad_norm": 2.1860750738467973, + "learning_rate": 1.0196336924606282e-05, + "loss": 0.6845, + "step": 168 + }, + { + "epoch": 0.5113464447806354, + "grad_norm": 1.9577090354755105, + "learning_rate": 1.0098173193371498e-05, + "loss": 0.6386, + "step": 169 + }, + { + "epoch": 0.5143721633888049, + "grad_norm": 1.7813398109730993, + "learning_rate": 1e-05, + "loss": 0.6397, + "step": 170 + }, + { + "epoch": 0.5173978819969742, + "grad_norm": 2.044986488285846, + "learning_rate": 9.901826806628505e-06, + "loss": 0.6212, + "step": 171 + }, + { + "epoch": 0.5204236006051437, + "grad_norm": 1.769516031150374, + "learning_rate": 9.80366307539372e-06, + "loss": 0.6136, + "step": 172 + }, + { + "epoch": 0.5234493192133132, + "grad_norm": 6.456975961949371, + "learning_rate": 9.705518267520369e-06, + "loss": 0.6348, + "step": 173 + }, + { + "epoch": 0.5264750378214826, + "grad_norm": 2.5315009637312036, + "learning_rate": 9.607401842409318e-06, + "loss": 0.6428, + "step": 174 + }, + { + "epoch": 0.529500756429652, + "grad_norm": 2.988285644385764, + "learning_rate": 9.50932325672582e-06, + "loss": 0.6335, + "step": 175 + }, + { + "epoch": 0.5325264750378215, + "grad_norm": 1.914633603140675, + "learning_rate": 9.41129196348811e-06, + "loss": 0.6446, + "step": 176 + }, + { + "epoch": 0.5355521936459909, + "grad_norm": 2.1586892262096753, + "learning_rate": 9.313317411156265e-06, + "loss": 0.6875, + "step": 177 + }, + { + "epoch": 0.5385779122541604, + "grad_norm": 2.09973624901808, + "learning_rate": 9.215409042721553e-06, + "loss": 0.6286, + "step": 178 + }, + { + "epoch": 0.5416036308623298, + "grad_norm": 2.1121991469118373, + "learning_rate": 9.117576294796307e-06, + "loss": 0.6086, + "step": 179 + }, + { + "epoch": 0.5446293494704992, + "grad_norm": 2.970101791108074, + "learning_rate": 9.019828596704394e-06, + "loss": 0.6201, + "step": 180 + }, + { + "epoch": 0.5476550680786687, + "grad_norm": 2.797557758822073, + "learning_rate": 8.922175369572407e-06, + "loss": 0.666, + "step": 181 + }, + { + "epoch": 0.5506807866868382, + "grad_norm": 1.9321335437019027, + "learning_rate": 8.824626025421625e-06, + "loss": 0.5977, + "step": 182 + }, + { + "epoch": 0.5537065052950075, + "grad_norm": 1.971967748577256, + "learning_rate": 8.72718996626087e-06, + "loss": 0.6201, + "step": 183 + }, + { + "epoch": 0.556732223903177, + "grad_norm": 3.0984688964644103, + "learning_rate": 8.629876583180322e-06, + "loss": 0.6142, + "step": 184 + }, + { + "epoch": 0.5597579425113465, + "grad_norm": 2.623594180878439, + "learning_rate": 8.532695255446384e-06, + "loss": 0.6123, + "step": 185 + }, + { + "epoch": 0.5627836611195158, + "grad_norm": 1.9333898094978612, + "learning_rate": 8.43565534959769e-06, + "loss": 0.6169, + "step": 186 + }, + { + "epoch": 0.5658093797276853, + "grad_norm": 1.765583174562486, + "learning_rate": 8.338766218542348e-06, + "loss": 0.5971, + "step": 187 + }, + { + "epoch": 0.5688350983358548, + "grad_norm": 2.0010679040329427, + "learning_rate": 8.242037200656455e-06, + "loss": 0.6574, + "step": 188 + }, + { + "epoch": 0.5718608169440242, + "grad_norm": 4.98264842883333, + "learning_rate": 8.145477618884092e-06, + "loss": 0.6182, + "step": 189 + }, + { + "epoch": 0.5748865355521936, + "grad_norm": 1.8354394720905292, + "learning_rate": 8.04909677983872e-06, + "loss": 0.6295, + "step": 190 + }, + { + "epoch": 0.5779122541603631, + "grad_norm": 1.7610448288238965, + "learning_rate": 7.952903972906205e-06, + "loss": 0.6034, + "step": 191 + }, + { + "epoch": 0.5809379727685325, + "grad_norm": 1.9152183775112974, + "learning_rate": 7.856908469349495e-06, + "loss": 0.6349, + "step": 192 + }, + { + "epoch": 0.583963691376702, + "grad_norm": 1.975212156731082, + "learning_rate": 7.761119521415017e-06, + "loss": 0.6311, + "step": 193 + }, + { + "epoch": 0.5869894099848714, + "grad_norm": 13.200664607058753, + "learning_rate": 7.66554636144095e-06, + "loss": 0.6415, + "step": 194 + }, + { + "epoch": 0.5900151285930408, + "grad_norm": 2.5875429359472633, + "learning_rate": 7.570198200967363e-06, + "loss": 0.6507, + "step": 195 + }, + { + "epoch": 0.5930408472012103, + "grad_norm": 2.593631327870592, + "learning_rate": 7.4750842298484205e-06, + "loss": 0.6313, + "step": 196 + }, + { + "epoch": 0.5960665658093798, + "grad_norm": 2.276110525495816, + "learning_rate": 7.380213615366627e-06, + "loss": 0.6519, + "step": 197 + }, + { + "epoch": 0.5990922844175491, + "grad_norm": 1.8935451273928605, + "learning_rate": 7.285595501349259e-06, + "loss": 0.6174, + "step": 198 + }, + { + "epoch": 0.6021180030257186, + "grad_norm": 2.2562062680826993, + "learning_rate": 7.191239007287082e-06, + "loss": 0.6373, + "step": 199 + }, + { + "epoch": 0.6051437216338881, + "grad_norm": 2.0887824983462786, + "learning_rate": 7.097153227455379e-06, + "loss": 0.6684, + "step": 200 + }, + { + "epoch": 0.6081694402420574, + "grad_norm": 2.067806827035956, + "learning_rate": 7.003347230037434e-06, + "loss": 0.6458, + "step": 201 + }, + { + "epoch": 0.6111951588502269, + "grad_norm": 2.2400710241102555, + "learning_rate": 6.909830056250527e-06, + "loss": 0.6224, + "step": 202 + }, + { + "epoch": 0.6142208774583964, + "grad_norm": 3.530109217003973, + "learning_rate": 6.816610719474503e-06, + "loss": 0.6029, + "step": 203 + }, + { + "epoch": 0.6172465960665658, + "grad_norm": 1.9437166356375082, + "learning_rate": 6.723698204383067e-06, + "loss": 0.6237, + "step": 204 + }, + { + "epoch": 0.6202723146747352, + "grad_norm": 4.601296861932386, + "learning_rate": 6.631101466077801e-06, + "loss": 0.6101, + "step": 205 + }, + { + "epoch": 0.6232980332829047, + "grad_norm": 1.7734035043229037, + "learning_rate": 6.538829429225068e-06, + "loss": 0.6144, + "step": 206 + }, + { + "epoch": 0.6263237518910741, + "grad_norm": 3.3875264361334065, + "learning_rate": 6.446890987195842e-06, + "loss": 0.6263, + "step": 207 + }, + { + "epoch": 0.6293494704992436, + "grad_norm": 1.837402458317683, + "learning_rate": 6.355295001208504e-06, + "loss": 0.651, + "step": 208 + }, + { + "epoch": 0.632375189107413, + "grad_norm": 1.8127137816199999, + "learning_rate": 6.2640502994748375e-06, + "loss": 0.6031, + "step": 209 + }, + { + "epoch": 0.6354009077155824, + "grad_norm": 2.222119352037505, + "learning_rate": 6.173165676349103e-06, + "loss": 0.6341, + "step": 210 + }, + { + "epoch": 0.6384266263237519, + "grad_norm": 2.0517413682681878, + "learning_rate": 6.082649891480441e-06, + "loss": 0.6012, + "step": 211 + }, + { + "epoch": 0.6414523449319214, + "grad_norm": 1.743000217931073, + "learning_rate": 5.9925116689685925e-06, + "loss": 0.612, + "step": 212 + }, + { + "epoch": 0.6444780635400907, + "grad_norm": 1.893932549286191, + "learning_rate": 5.902759696523046e-06, + "loss": 0.6139, + "step": 213 + }, + { + "epoch": 0.6475037821482602, + "grad_norm": 1.7655154916634883, + "learning_rate": 5.813402624625722e-06, + "loss": 0.6045, + "step": 214 + }, + { + "epoch": 0.6505295007564297, + "grad_norm": 2.3122722500829185, + "learning_rate": 5.724449065697182e-06, + "loss": 0.6283, + "step": 215 + }, + { + "epoch": 0.653555219364599, + "grad_norm": 2.07646899358201, + "learning_rate": 5.635907593266578e-06, + "loss": 0.6011, + "step": 216 + }, + { + "epoch": 0.6565809379727685, + "grad_norm": 1.974337195806504, + "learning_rate": 5.54778674114532e-06, + "loss": 0.6322, + "step": 217 + }, + { + "epoch": 0.659606656580938, + "grad_norm": 1.7797760085297933, + "learning_rate": 5.460095002604533e-06, + "loss": 0.6271, + "step": 218 + }, + { + "epoch": 0.6626323751891074, + "grad_norm": 1.8906339851795884, + "learning_rate": 5.3728408295565e-06, + "loss": 0.6392, + "step": 219 + }, + { + "epoch": 0.6656580937972768, + "grad_norm": 1.7408037225823225, + "learning_rate": 5.286032631740023e-06, + "loss": 0.6153, + "step": 220 + }, + { + "epoch": 0.6686838124054463, + "grad_norm": 1.7150384120846642, + "learning_rate": 5.199678775909889e-06, + "loss": 0.5994, + "step": 221 + }, + { + "epoch": 0.6717095310136157, + "grad_norm": 2.4913211552340067, + "learning_rate": 5.1137875850304545e-06, + "loss": 0.6101, + "step": 222 + }, + { + "epoch": 0.6747352496217852, + "grad_norm": 1.783893713744158, + "learning_rate": 5.0283673374734546e-06, + "loss": 0.6318, + "step": 223 + }, + { + "epoch": 0.6777609682299546, + "grad_norm": 1.7107399741480709, + "learning_rate": 4.943426266220156e-06, + "loss": 0.6129, + "step": 224 + }, + { + "epoch": 0.680786686838124, + "grad_norm": 1.836722334744728, + "learning_rate": 4.858972558067784e-06, + "loss": 0.5994, + "step": 225 + }, + { + "epoch": 0.6838124054462935, + "grad_norm": 2.108359381810314, + "learning_rate": 4.775014352840512e-06, + "loss": 0.615, + "step": 226 + }, + { + "epoch": 0.686838124054463, + "grad_norm": 2.4528528203456386, + "learning_rate": 4.691559742604906e-06, + "loss": 0.6218, + "step": 227 + }, + { + "epoch": 0.6898638426626323, + "grad_norm": 2.9562372553126677, + "learning_rate": 4.608616770889998e-06, + "loss": 0.6055, + "step": 228 + }, + { + "epoch": 0.6928895612708018, + "grad_norm": 2.23721294737496, + "learning_rate": 4.526193431912038e-06, + "loss": 0.6297, + "step": 229 + }, + { + "epoch": 0.6959152798789713, + "grad_norm": 2.948141809060743, + "learning_rate": 4.444297669803981e-06, + "loss": 0.6631, + "step": 230 + }, + { + "epoch": 0.6989409984871406, + "grad_norm": 2.7688260603953614, + "learning_rate": 4.362937377849832e-06, + "loss": 0.6086, + "step": 231 + }, + { + "epoch": 0.7019667170953101, + "grad_norm": 1.9975679057945175, + "learning_rate": 4.282120397723879e-06, + "loss": 0.6059, + "step": 232 + }, + { + "epoch": 0.7049924357034796, + "grad_norm": 2.0441528068066583, + "learning_rate": 4.2018545187348645e-06, + "loss": 0.612, + "step": 233 + }, + { + "epoch": 0.708018154311649, + "grad_norm": 1.7827953299318366, + "learning_rate": 4.12214747707527e-06, + "loss": 0.6223, + "step": 234 + }, + { + "epoch": 0.7110438729198184, + "grad_norm": 1.910631194940744, + "learning_rate": 4.043006955075667e-06, + "loss": 0.6216, + "step": 235 + }, + { + "epoch": 0.7140695915279879, + "grad_norm": 2.2327872483205664, + "learning_rate": 3.964440580464286e-06, + "loss": 0.6363, + "step": 236 + }, + { + "epoch": 0.7170953101361573, + "grad_norm": 1.8553908596509907, + "learning_rate": 3.8864559256318375e-06, + "loss": 0.6121, + "step": 237 + }, + { + "epoch": 0.7201210287443268, + "grad_norm": 1.8523636314789427, + "learning_rate": 3.8090605069016596e-06, + "loss": 0.6496, + "step": 238 + }, + { + "epoch": 0.7231467473524962, + "grad_norm": 1.9267223174964587, + "learning_rate": 3.7322617838053066e-06, + "loss": 0.6221, + "step": 239 + }, + { + "epoch": 0.7261724659606656, + "grad_norm": 2.5477420067834733, + "learning_rate": 3.6560671583635467e-06, + "loss": 0.6533, + "step": 240 + }, + { + "epoch": 0.7291981845688351, + "grad_norm": 2.086091207989975, + "learning_rate": 3.58048397437297e-06, + "loss": 0.6557, + "step": 241 + }, + { + "epoch": 0.7322239031770046, + "grad_norm": 1.7198590873048778, + "learning_rate": 3.505519516698165e-06, + "loss": 0.6089, + "step": 242 + }, + { + "epoch": 0.735249621785174, + "grad_norm": 1.7451800185500015, + "learning_rate": 3.4311810105695875e-06, + "loss": 0.5903, + "step": 243 + }, + { + "epoch": 0.7382753403933434, + "grad_norm": 1.827742556162237, + "learning_rate": 3.3574756208871862e-06, + "loss": 0.6189, + "step": 244 + }, + { + "epoch": 0.7413010590015129, + "grad_norm": 1.738634389252119, + "learning_rate": 3.284410451529816e-06, + "loss": 0.6098, + "step": 245 + }, + { + "epoch": 0.7443267776096822, + "grad_norm": 2.4650262506220617, + "learning_rate": 3.2119925446705824e-06, + "loss": 0.622, + "step": 246 + }, + { + "epoch": 0.7473524962178517, + "grad_norm": 1.9057562551451306, + "learning_rate": 3.140228880098074e-06, + "loss": 0.6213, + "step": 247 + }, + { + "epoch": 0.7503782148260212, + "grad_norm": 1.7767572926898476, + "learning_rate": 3.069126374543643e-06, + "loss": 0.6103, + "step": 248 + }, + { + "epoch": 0.7534039334341907, + "grad_norm": 1.7615575905509537, + "learning_rate": 2.998691881014765e-06, + "loss": 0.5892, + "step": 249 + }, + { + "epoch": 0.7534039334341907, + "eval_loss": 0.6087134480476379, + "eval_runtime": 88.9685, + "eval_samples_per_second": 47.545, + "eval_steps_per_second": 0.753, + "step": 249 + }, + { + "epoch": 0.75642965204236, + "grad_norm": 1.9054940488072203, + "learning_rate": 2.9289321881345257e-06, + "loss": 0.6151, + "step": 250 + }, + { + "epoch": 0.7594553706505295, + "grad_norm": 1.7961329037995988, + "learning_rate": 2.859854019487318e-06, + "loss": 0.6122, + "step": 251 + }, + { + "epoch": 0.762481089258699, + "grad_norm": 2.008011636688375, + "learning_rate": 2.791464032970812e-06, + "loss": 0.6134, + "step": 252 + }, + { + "epoch": 0.7655068078668684, + "grad_norm": 1.802889816029653, + "learning_rate": 2.723768820154251e-06, + "loss": 0.606, + "step": 253 + }, + { + "epoch": 0.7685325264750378, + "grad_norm": 1.7123288009118462, + "learning_rate": 2.656774905643147e-06, + "loss": 0.599, + "step": 254 + }, + { + "epoch": 0.7715582450832073, + "grad_norm": 1.872010012601884, + "learning_rate": 2.5904887464504115e-06, + "loss": 0.6122, + "step": 255 + }, + { + "epoch": 0.7745839636913767, + "grad_norm": 1.7931604063599083, + "learning_rate": 2.5249167313740307e-06, + "loss": 0.6184, + "step": 256 + }, + { + "epoch": 0.7776096822995462, + "grad_norm": 5.906281110948661, + "learning_rate": 2.4600651803813057e-06, + "loss": 0.6212, + "step": 257 + }, + { + "epoch": 0.7806354009077155, + "grad_norm": 1.796522835609395, + "learning_rate": 2.395940343999691e-06, + "loss": 0.6275, + "step": 258 + }, + { + "epoch": 0.783661119515885, + "grad_norm": 2.502567467661614, + "learning_rate": 2.332548402714385e-06, + "loss": 0.6131, + "step": 259 + }, + { + "epoch": 0.7866868381240545, + "grad_norm": 1.7721464863028324, + "learning_rate": 2.26989546637263e-06, + "loss": 0.6312, + "step": 260 + }, + { + "epoch": 0.789712556732224, + "grad_norm": 2.7446340546388903, + "learning_rate": 2.207987573594833e-06, + "loss": 0.6305, + "step": 261 + }, + { + "epoch": 0.7927382753403933, + "grad_norm": 1.8224754193860417, + "learning_rate": 2.146830691192553e-06, + "loss": 0.5943, + "step": 262 + }, + { + "epoch": 0.7957639939485628, + "grad_norm": 2.039467832689681, + "learning_rate": 2.086430713593397e-06, + "loss": 0.6301, + "step": 263 + }, + { + "epoch": 0.7987897125567323, + "grad_norm": 10.038334257050249, + "learning_rate": 2.02679346227293e-06, + "loss": 0.6251, + "step": 264 + }, + { + "epoch": 0.8018154311649016, + "grad_norm": 1.77925140677465, + "learning_rate": 1.967924685193552e-06, + "loss": 0.5986, + "step": 265 + }, + { + "epoch": 0.8048411497730711, + "grad_norm": 1.8193241378693117, + "learning_rate": 1.9098300562505266e-06, + "loss": 0.6426, + "step": 266 + }, + { + "epoch": 0.8078668683812406, + "grad_norm": 1.7632326770584787, + "learning_rate": 1.8525151747251058e-06, + "loss": 0.5756, + "step": 267 + }, + { + "epoch": 0.81089258698941, + "grad_norm": 1.7943311933481843, + "learning_rate": 1.7959855647448642e-06, + "loss": 0.6144, + "step": 268 + }, + { + "epoch": 0.8139183055975794, + "grad_norm": 1.7517176357017044, + "learning_rate": 1.7402466747512704e-06, + "loss": 0.6105, + "step": 269 + }, + { + "epoch": 0.8169440242057489, + "grad_norm": 2.0420002751411577, + "learning_rate": 1.6853038769745466e-06, + "loss": 0.6167, + "step": 270 + }, + { + "epoch": 0.8199697428139183, + "grad_norm": 1.675277078717734, + "learning_rate": 1.6311624669159064e-06, + "loss": 0.5983, + "step": 271 + }, + { + "epoch": 0.8229954614220878, + "grad_norm": 1.8699956812574887, + "learning_rate": 1.577827662837136e-06, + "loss": 0.5887, + "step": 272 + }, + { + "epoch": 0.8260211800302572, + "grad_norm": 1.942186575025359, + "learning_rate": 1.5253046052576559e-06, + "loss": 0.6227, + "step": 273 + }, + { + "epoch": 0.8290468986384266, + "grad_norm": 1.7140913453292885, + "learning_rate": 1.4735983564590784e-06, + "loss": 0.5889, + "step": 274 + }, + { + "epoch": 0.8320726172465961, + "grad_norm": 1.8602199384237519, + "learning_rate": 1.4227138999972801e-06, + "loss": 0.6156, + "step": 275 + }, + { + "epoch": 0.8350983358547656, + "grad_norm": 1.7357383944516909, + "learning_rate": 1.3726561402220818e-06, + "loss": 0.6231, + "step": 276 + }, + { + "epoch": 0.8381240544629349, + "grad_norm": 2.049034834900203, + "learning_rate": 1.3234299018045615e-06, + "loss": 0.5762, + "step": 277 + }, + { + "epoch": 0.8411497730711044, + "grad_norm": 1.8899426483696566, + "learning_rate": 1.2750399292720284e-06, + "loss": 0.6271, + "step": 278 + }, + { + "epoch": 0.8441754916792739, + "grad_norm": 1.7902118508514344, + "learning_rate": 1.2274908865507595e-06, + "loss": 0.5865, + "step": 279 + }, + { + "epoch": 0.8472012102874432, + "grad_norm": 1.7796598756657644, + "learning_rate": 1.1807873565164507e-06, + "loss": 0.5966, + "step": 280 + }, + { + "epoch": 0.8502269288956127, + "grad_norm": 1.998204889741657, + "learning_rate": 1.1349338405525368e-06, + "loss": 0.6121, + "step": 281 + }, + { + "epoch": 0.8532526475037822, + "grad_norm": 1.8591997510503189, + "learning_rate": 1.0899347581163222e-06, + "loss": 0.5947, + "step": 282 + }, + { + "epoch": 0.8562783661119516, + "grad_norm": 2.091403547509914, + "learning_rate": 1.045794446313031e-06, + "loss": 0.6099, + "step": 283 + }, + { + "epoch": 0.859304084720121, + "grad_norm": 5.552258074388298, + "learning_rate": 1.0025171594777872e-06, + "loss": 0.6001, + "step": 284 + }, + { + "epoch": 0.8623298033282905, + "grad_norm": 1.984667974599296, + "learning_rate": 9.601070687655667e-07, + "loss": 0.6214, + "step": 285 + }, + { + "epoch": 0.8653555219364599, + "grad_norm": 1.9108222703571809, + "learning_rate": 9.185682617491865e-07, + "loss": 0.6326, + "step": 286 + }, + { + "epoch": 0.8683812405446294, + "grad_norm": 1.7777131621422688, + "learning_rate": 8.779047420253239e-07, + "loss": 0.6071, + "step": 287 + }, + { + "epoch": 0.8714069591527988, + "grad_norm": 1.7623805284409448, + "learning_rate": 8.381204288286415e-07, + "loss": 0.6189, + "step": 288 + }, + { + "epoch": 0.8744326777609682, + "grad_norm": 1.7371506833039154, + "learning_rate": 7.992191566540519e-07, + "loss": 0.5895, + "step": 289 + }, + { + "epoch": 0.8774583963691377, + "grad_norm": 1.8174334539956483, + "learning_rate": 7.612046748871327e-07, + "loss": 0.6241, + "step": 290 + }, + { + "epoch": 0.8804841149773072, + "grad_norm": 2.152708215011433, + "learning_rate": 7.240806474427598e-07, + "loss": 0.6081, + "step": 291 + }, + { + "epoch": 0.8835098335854765, + "grad_norm": 1.9524519551038517, + "learning_rate": 6.878506524119644e-07, + "loss": 0.5908, + "step": 292 + }, + { + "epoch": 0.886535552193646, + "grad_norm": 4.280045389112553, + "learning_rate": 6.525181817170756e-07, + "loss": 0.593, + "step": 293 + }, + { + "epoch": 0.8895612708018155, + "grad_norm": 1.7830376473459097, + "learning_rate": 6.180866407751595e-07, + "loss": 0.6492, + "step": 294 + }, + { + "epoch": 0.8925869894099848, + "grad_norm": 1.7849324700948386, + "learning_rate": 5.845593481697931e-07, + "loss": 0.5877, + "step": 295 + }, + { + "epoch": 0.8956127080181543, + "grad_norm": 1.7200906987170332, + "learning_rate": 5.519395353312195e-07, + "loss": 0.5976, + "step": 296 + }, + { + "epoch": 0.8986384266263238, + "grad_norm": 2.523328519492762, + "learning_rate": 5.20230346224897e-07, + "loss": 0.6417, + "step": 297 + }, + { + "epoch": 0.9016641452344932, + "grad_norm": 1.7332890626025772, + "learning_rate": 4.894348370484648e-07, + "loss": 0.6123, + "step": 298 + }, + { + "epoch": 0.9046898638426626, + "grad_norm": 1.7313495120885292, + "learning_rate": 4.5955597593719593e-07, + "loss": 0.6123, + "step": 299 + }, + { + "epoch": 0.9077155824508321, + "grad_norm": 1.9089514092546682, + "learning_rate": 4.305966426779118e-07, + "loss": 0.6209, + "step": 300 + }, + { + "epoch": 0.9107413010590015, + "grad_norm": 1.7209932552420357, + "learning_rate": 4.025596284314259e-07, + "loss": 0.5757, + "step": 301 + }, + { + "epoch": 0.913767019667171, + "grad_norm": 1.875857005840312, + "learning_rate": 3.7544763546352834e-07, + "loss": 0.597, + "step": 302 + }, + { + "epoch": 0.9167927382753404, + "grad_norm": 2.170147189258584, + "learning_rate": 3.492632768845261e-07, + "loss": 0.6199, + "step": 303 + }, + { + "epoch": 0.9198184568835098, + "grad_norm": 1.932298321660113, + "learning_rate": 3.2400907639740243e-07, + "loss": 0.5874, + "step": 304 + }, + { + "epoch": 0.9228441754916793, + "grad_norm": 1.9060668531244895, + "learning_rate": 2.996874680545603e-07, + "loss": 0.602, + "step": 305 + }, + { + "epoch": 0.9258698940998488, + "grad_norm": 2.876936994230681, + "learning_rate": 2.7630079602323447e-07, + "loss": 0.6068, + "step": 306 + }, + { + "epoch": 0.9288956127080181, + "grad_norm": 1.6979020112569279, + "learning_rate": 2.5385131435955e-07, + "loss": 0.59, + "step": 307 + }, + { + "epoch": 0.9319213313161876, + "grad_norm": 1.7392856626246644, + "learning_rate": 2.3234118679127615e-07, + "loss": 0.5997, + "step": 308 + }, + { + "epoch": 0.9349470499243571, + "grad_norm": 1.805002057886788, + "learning_rate": 2.117724865092774e-07, + "loss": 0.6114, + "step": 309 + }, + { + "epoch": 0.9379727685325264, + "grad_norm": 1.8063850245594506, + "learning_rate": 1.921471959676957e-07, + "loss": 0.6309, + "step": 310 + }, + { + "epoch": 0.9409984871406959, + "grad_norm": 1.7781982924325161, + "learning_rate": 1.734672066928822e-07, + "loss": 0.6006, + "step": 311 + }, + { + "epoch": 0.9440242057488654, + "grad_norm": 1.8021342311499136, + "learning_rate": 1.5573431910108404e-07, + "loss": 0.5949, + "step": 312 + }, + { + "epoch": 0.9470499243570348, + "grad_norm": 1.6024335443693254, + "learning_rate": 1.3895024232491338e-07, + "loss": 0.5801, + "step": 313 + }, + { + "epoch": 0.9500756429652042, + "grad_norm": 1.9581264928414572, + "learning_rate": 1.231165940486234e-07, + "loss": 0.6268, + "step": 314 + }, + { + "epoch": 0.9531013615733737, + "grad_norm": 1.7315183379261287, + "learning_rate": 1.0823490035218986e-07, + "loss": 0.6055, + "step": 315 + }, + { + "epoch": 0.9561270801815431, + "grad_norm": 2.2785089633330364, + "learning_rate": 9.43065955642275e-08, + "loss": 0.6048, + "step": 316 + }, + { + "epoch": 0.9591527987897126, + "grad_norm": 2.1203864893122013, + "learning_rate": 8.133302212373961e-08, + "loss": 0.618, + "step": 317 + }, + { + "epoch": 0.962178517397882, + "grad_norm": 1.7790483418175567, + "learning_rate": 6.931543045073708e-08, + "loss": 0.5928, + "step": 318 + }, + { + "epoch": 0.9652042360060514, + "grad_norm": 1.787018555262936, + "learning_rate": 5.8254978825718065e-08, + "loss": 0.5936, + "step": 319 + }, + { + "epoch": 0.9682299546142209, + "grad_norm": 2.023165603430636, + "learning_rate": 4.815273327803183e-08, + "loss": 0.5966, + "step": 320 + }, + { + "epoch": 0.9712556732223904, + "grad_norm": 1.744400598145924, + "learning_rate": 3.900966748312862e-08, + "loss": 0.6196, + "step": 321 + }, + { + "epoch": 0.9742813918305597, + "grad_norm": 2.6734084131681017, + "learning_rate": 3.082666266872036e-08, + "loss": 0.6032, + "step": 322 + }, + { + "epoch": 0.9773071104387292, + "grad_norm": 2.5457151249929115, + "learning_rate": 2.3604507529843e-08, + "loss": 0.6207, + "step": 323 + }, + { + "epoch": 0.9803328290468987, + "grad_norm": 48.615098546656974, + "learning_rate": 1.7343898152841765e-08, + "loss": 0.6019, + "step": 324 + }, + { + "epoch": 0.983358547655068, + "grad_norm": 1.7393804404319622, + "learning_rate": 1.2045437948275952e-08, + "loss": 0.5871, + "step": 325 + }, + { + "epoch": 0.9863842662632375, + "grad_norm": 1.934884937544193, + "learning_rate": 7.70963759277099e-09, + "loss": 0.5849, + "step": 326 + }, + { + "epoch": 0.989409984871407, + "grad_norm": 1.7879449709638233, + "learning_rate": 4.336914979787832e-09, + "loss": 0.625, + "step": 327 + }, + { + "epoch": 0.9924357034795764, + "grad_norm": 1.778813518283478, + "learning_rate": 1.9275951793518154e-09, + "loss": 0.6118, + "step": 328 + }, + { + "epoch": 0.9954614220877458, + "grad_norm": 3.4972680132091467, + "learning_rate": 4.819104067199653e-10, + "loss": 0.6033, + "step": 329 + }, + { + "epoch": 0.9984871406959153, + "grad_norm": 1.8469038074218784, + "learning_rate": 0.0, + "loss": 0.6178, + "step": 330 + }, + { + "epoch": 0.9984871406959153, + "step": 330, + "total_flos": 551924772372480.0, + "train_loss": 0.14928891153046578, + "train_runtime": 940.1371, + "train_samples_per_second": 44.997, + "train_steps_per_second": 0.351 + } + ], + "logging_steps": 1.0, + "max_steps": 330, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 83, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 551924772372480.0, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/training_args.bin b/training_args.bin new file mode 100644 index 0000000..fe8baaa --- /dev/null +++ b/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2df6a2f148bfe6db0dcca9fbb16b2761f7b4a59ca98eff55f49482d830e93860 +size 7032