From 9e3d4b3f8bde71fafe39c02176263b4579bdc248 Mon Sep 17 00:00:00 2001 From: ModelHub XC Date: Wed, 13 May 2026 01:56:34 +0800 Subject: [PATCH] =?UTF-8?q?=E5=88=9D=E5=A7=8B=E5=8C=96=E9=A1=B9=E7=9B=AE?= =?UTF-8?q?=EF=BC=8C=E7=94=B1ModelHub=20XC=E7=A4=BE=E5=8C=BA=E6=8F=90?= =?UTF-8?q?=E4=BE=9B=E6=A8=A1=E5=9E=8B?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Model: luohy/SAIL-7b Source: Original Platform --- .gitattributes | 35 ++ README.md | 14 + config.json | 26 ++ generation_config.json | 10 + pytorch_model-00001-of-00002.bin | 3 + pytorch_model-00002-of-00002.bin | 3 + pytorch_model.bin.index.json | 298 +++++++++++++ special_tokens_map.json | 24 ++ tokenizer.model | 3 + tokenizer_config.json | 37 ++ trainer_state.json | 718 +++++++++++++++++++++++++++++++ training_args.bin | 3 + 12 files changed, 1174 insertions(+) create mode 100644 .gitattributes create mode 100644 README.md create mode 100644 config.json create mode 100644 generation_config.json create mode 100644 pytorch_model-00001-of-00002.bin create mode 100644 pytorch_model-00002-of-00002.bin create mode 100644 pytorch_model.bin.index.json create mode 100644 special_tokens_map.json create mode 100644 tokenizer.model create mode 100644 tokenizer_config.json create mode 100644 trainer_state.json create mode 100644 training_args.bin diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 0000000..a6344aa --- /dev/null +++ b/.gitattributes @@ -0,0 +1,35 @@ +*.7z filter=lfs diff=lfs merge=lfs -text +*.arrow filter=lfs diff=lfs merge=lfs -text +*.bin filter=lfs diff=lfs merge=lfs -text +*.bz2 filter=lfs diff=lfs merge=lfs -text +*.ckpt filter=lfs diff=lfs merge=lfs -text +*.ftz filter=lfs diff=lfs merge=lfs -text +*.gz filter=lfs diff=lfs merge=lfs -text +*.h5 filter=lfs diff=lfs merge=lfs -text +*.joblib filter=lfs diff=lfs merge=lfs -text +*.lfs.* filter=lfs diff=lfs merge=lfs -text +*.mlmodel filter=lfs diff=lfs merge=lfs -text +*.model filter=lfs diff=lfs merge=lfs -text +*.msgpack filter=lfs diff=lfs merge=lfs -text +*.npy filter=lfs diff=lfs merge=lfs -text +*.npz filter=lfs diff=lfs merge=lfs -text +*.onnx filter=lfs diff=lfs merge=lfs -text +*.ot filter=lfs diff=lfs merge=lfs -text +*.parquet filter=lfs diff=lfs merge=lfs -text +*.pb filter=lfs diff=lfs merge=lfs -text +*.pickle filter=lfs diff=lfs merge=lfs -text +*.pkl filter=lfs diff=lfs merge=lfs -text +*.pt filter=lfs diff=lfs merge=lfs -text +*.pth filter=lfs diff=lfs merge=lfs -text +*.rar filter=lfs diff=lfs merge=lfs -text +*.safetensors filter=lfs diff=lfs merge=lfs -text +saved_model/**/* filter=lfs diff=lfs merge=lfs -text +*.tar.* filter=lfs diff=lfs merge=lfs -text +*.tar filter=lfs diff=lfs merge=lfs -text +*.tflite filter=lfs diff=lfs merge=lfs -text +*.tgz filter=lfs diff=lfs merge=lfs -text +*.wasm filter=lfs diff=lfs merge=lfs -text +*.xz filter=lfs diff=lfs merge=lfs -text +*.zip filter=lfs diff=lfs merge=lfs -text +*.zst filter=lfs diff=lfs merge=lfs -text +*tfevents* filter=lfs diff=lfs merge=lfs -text diff --git a/README.md b/README.md new file mode 100644 index 0000000..73454e9 --- /dev/null +++ b/README.md @@ -0,0 +1,14 @@ +--- +license: apache-2.0 +language: +- en +pipeline_tag: text-generation +--- + +# Search Augmented Instruction Learning (SAIL-7b) LLM + +The language model is fine-tuned to use retrieval model and search engines. + +- Paper: [SAIL: Search Augmented Instruction Learning](https://arxiv.org/pdf/2305.15225.pdf). +- Github: [Code](https://github.com/luohongyin/SAIL) +- Try the model: [Demo](https://huggingface.co/spaces/luohy/SAIL-7B) \ No newline at end of file diff --git a/config.json b/config.json new file mode 100644 index 0000000..c66db71 --- /dev/null +++ b/config.json @@ -0,0 +1,26 @@ +{ + "_name_or_path": "/data/sls/d/llm/llama2-7b", + "architectures": [ + "LlamaForCausalLM" + ], + "bos_token_id": 1, + "eos_token_id": 2, + "hidden_act": "silu", + "hidden_size": 4096, + "initializer_range": 0.02, + "intermediate_size": 11008, + "max_position_embeddings": 4096, + "model_type": "llama", + "num_attention_heads": 32, + "num_hidden_layers": 32, + "num_key_value_heads": 32, + "pretraining_tp": 1, + "rms_norm_eps": 1e-05, + "rope_scaling": null, + "rope_theta": 10000.0, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.33.1", + "use_cache": false, + "vocab_size": 32000 +} diff --git a/generation_config.json b/generation_config.json new file mode 100644 index 0000000..64d0e71 --- /dev/null +++ b/generation_config.json @@ -0,0 +1,10 @@ +{ + "bos_token_id": 1, + "do_sample": true, + "eos_token_id": 2, + "max_length": 4096, + "pad_token_id": 0, + "temperature": 0.6, + "top_p": 0.9, + "transformers_version": "4.33.1" +} diff --git a/pytorch_model-00001-of-00002.bin b/pytorch_model-00001-of-00002.bin new file mode 100644 index 0000000..1cfc18d --- /dev/null +++ b/pytorch_model-00001-of-00002.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:afd283fcf100b717b8c17e80ce6fb3b43116d4b9a6de66887881e7dd7c009afa +size 9976620122 diff --git a/pytorch_model-00002-of-00002.bin b/pytorch_model-00002-of-00002.bin new file mode 100644 index 0000000..d94880b --- /dev/null +++ b/pytorch_model-00002-of-00002.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a604583769f7e56c5a21ac59b563173c7649cc8c18377363afd5ee3591c020a0 +size 3500310787 diff --git a/pytorch_model.bin.index.json b/pytorch_model.bin.index.json new file mode 100644 index 0000000..6cb539a --- /dev/null +++ b/pytorch_model.bin.index.json @@ -0,0 +1,298 @@ +{ + "metadata": { + "total_size": 13476831232 + }, + "weight_map": { + "lm_head.weight": "pytorch_model-00002-of-00002.bin", + "model.embed_tokens.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.0.input_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.0.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.0.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.0.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.0.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.0.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.0.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.0.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.0.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.1.input_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.1.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.1.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.1.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.1.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.1.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.1.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.1.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.1.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.10.input_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.10.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.10.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.10.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.10.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.10.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.10.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.10.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.10.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.11.input_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.11.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.11.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.11.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.11.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.11.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.11.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.11.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.11.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.12.input_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.12.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.12.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.12.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.12.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.12.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.12.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.12.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.12.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.13.input_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.13.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.13.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.13.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.13.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.13.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.13.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.13.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.13.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.14.input_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.14.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.14.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.14.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.14.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.14.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.14.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.14.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.14.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.15.input_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.15.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.15.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.15.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.15.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.15.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.15.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.15.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.15.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.16.input_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.16.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.16.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.16.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.16.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.16.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.16.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.16.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.16.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.17.input_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.17.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.17.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.17.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.17.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.17.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.17.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.17.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.17.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.18.input_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.18.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.18.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.18.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.18.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.18.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.18.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.18.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.18.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.19.input_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.19.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.19.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.19.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.19.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.19.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.19.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.19.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.19.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.2.input_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.2.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.2.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.2.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.2.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.2.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.2.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.2.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.2.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.20.input_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.20.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.20.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.20.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.20.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.20.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.20.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.20.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.20.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.21.input_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.21.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.21.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.21.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.21.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.21.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.21.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.21.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.21.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.22.input_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.22.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.22.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.22.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.22.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.22.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.22.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.22.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.22.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.23.input_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.23.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.23.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.23.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.23.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.23.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.23.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.23.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.23.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.24.input_layernorm.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.24.mlp.down_proj.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.24.mlp.gate_proj.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.24.mlp.up_proj.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.24.post_attention_layernorm.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.24.self_attn.k_proj.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.24.self_attn.o_proj.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.24.self_attn.q_proj.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.24.self_attn.v_proj.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.25.input_layernorm.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.25.mlp.down_proj.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.25.mlp.gate_proj.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.25.mlp.up_proj.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.25.post_attention_layernorm.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.25.self_attn.k_proj.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.25.self_attn.o_proj.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.25.self_attn.q_proj.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.25.self_attn.v_proj.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.26.input_layernorm.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.26.mlp.down_proj.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.26.mlp.gate_proj.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.26.mlp.up_proj.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.26.post_attention_layernorm.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.26.self_attn.k_proj.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.26.self_attn.o_proj.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.26.self_attn.q_proj.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.26.self_attn.v_proj.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.27.input_layernorm.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.27.mlp.down_proj.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.27.mlp.gate_proj.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.27.mlp.up_proj.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.27.post_attention_layernorm.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.27.self_attn.k_proj.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.27.self_attn.o_proj.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.27.self_attn.q_proj.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.27.self_attn.v_proj.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.28.input_layernorm.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.28.mlp.down_proj.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.28.mlp.gate_proj.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.28.mlp.up_proj.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.28.post_attention_layernorm.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.28.self_attn.k_proj.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.28.self_attn.o_proj.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.28.self_attn.q_proj.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.28.self_attn.v_proj.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.29.input_layernorm.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.29.mlp.down_proj.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.29.mlp.gate_proj.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.29.mlp.up_proj.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.29.post_attention_layernorm.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.29.self_attn.k_proj.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.29.self_attn.o_proj.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.29.self_attn.q_proj.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.29.self_attn.v_proj.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.3.input_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.3.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.3.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.3.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.3.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.3.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.3.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.3.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.3.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.30.input_layernorm.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.30.mlp.down_proj.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.30.mlp.gate_proj.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.30.mlp.up_proj.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.30.post_attention_layernorm.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.30.self_attn.k_proj.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.30.self_attn.o_proj.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.30.self_attn.q_proj.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.30.self_attn.v_proj.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.31.input_layernorm.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.31.mlp.down_proj.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.31.mlp.gate_proj.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.31.mlp.up_proj.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.31.post_attention_layernorm.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.31.self_attn.k_proj.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.31.self_attn.o_proj.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.31.self_attn.q_proj.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.31.self_attn.v_proj.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.4.input_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.4.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.4.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.4.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.4.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.4.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.4.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.4.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.4.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.5.input_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.5.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.5.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.5.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.5.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.5.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.5.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.5.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.5.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.6.input_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.6.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.6.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.6.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.6.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.6.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.6.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.6.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.6.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.7.input_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.7.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.7.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.7.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.7.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.7.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.7.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.7.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.7.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.8.input_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.8.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.8.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.8.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.8.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.8.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.8.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.8.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.8.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.9.input_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.9.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.9.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.9.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.9.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.9.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.9.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.9.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.9.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.norm.weight": "pytorch_model-00002-of-00002.bin" + } +} diff --git a/special_tokens_map.json b/special_tokens_map.json new file mode 100644 index 0000000..14761dc --- /dev/null +++ b/special_tokens_map.json @@ -0,0 +1,24 @@ +{ + "bos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "", + "unk_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/tokenizer.model b/tokenizer.model new file mode 100644 index 0000000..6c00c74 --- /dev/null +++ b/tokenizer.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347 +size 499723 diff --git a/tokenizer_config.json b/tokenizer_config.json new file mode 100644 index 0000000..6b9985a --- /dev/null +++ b/tokenizer_config.json @@ -0,0 +1,37 @@ +{ + "add_bos_token": true, + "add_eos_token": false, + "bos_token": { + "__type": "AddedToken", + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "clean_up_tokenization_spaces": false, + "eos_token": { + "__type": "AddedToken", + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "legacy": false, + "model_max_length": 1800, + "pad_token": null, + "padding_side": "right", + "sp_model_kwargs": {}, + "spaces_between_special_tokens": false, + "tokenizer_class": "LlamaTokenizer", + "unk_token": { + "__type": "AddedToken", + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "use_default_system_prompt": true +} diff --git a/trainer_state.json b/trainer_state.json new file mode 100644 index 0000000..f2ef0b9 --- /dev/null +++ b/trainer_state.json @@ -0,0 +1,718 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 2.9998381091144566, + "eval_steps": 500, + "global_step": 1158, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.03, + "learning_rate": 5.7142857142857145e-06, + "loss": 1.3326, + "step": 10 + }, + { + "epoch": 0.05, + "learning_rate": 1.1428571428571429e-05, + "loss": 0.841, + "step": 20 + }, + { + "epoch": 0.08, + "learning_rate": 1.7142857142857142e-05, + "loss": 0.7663, + "step": 30 + }, + { + "epoch": 0.1, + "learning_rate": 1.9999021765072576e-05, + "loss": 0.7506, + "step": 40 + }, + { + "epoch": 0.13, + "learning_rate": 1.999119703394801e-05, + "loss": 0.7395, + "step": 50 + }, + { + "epoch": 0.16, + "learning_rate": 1.9975553694939593e-05, + "loss": 0.7412, + "step": 60 + }, + { + "epoch": 0.18, + "learning_rate": 1.9952103989737013e-05, + "loss": 0.7374, + "step": 70 + }, + { + "epoch": 0.21, + "learning_rate": 1.9920866268899203e-05, + "loss": 0.7292, + "step": 80 + }, + { + "epoch": 0.23, + "learning_rate": 1.988186497749411e-05, + "loss": 0.7117, + "step": 90 + }, + { + "epoch": 0.26, + "learning_rate": 1.9835130635969228e-05, + "loss": 0.7154, + "step": 100 + }, + { + "epoch": 0.28, + "learning_rate": 1.978069981626783e-05, + "loss": 0.7063, + "step": 110 + }, + { + "epoch": 0.31, + "learning_rate": 1.9718615113209588e-05, + "loss": 0.7022, + "step": 120 + }, + { + "epoch": 0.34, + "learning_rate": 1.9648925111158037e-05, + "loss": 0.6856, + "step": 130 + }, + { + "epoch": 0.36, + "learning_rate": 1.9571684346000877e-05, + "loss": 0.6846, + "step": 140 + }, + { + "epoch": 0.39, + "learning_rate": 1.9486953262472945e-05, + "loss": 0.6944, + "step": 150 + }, + { + "epoch": 0.41, + "learning_rate": 1.9394798166855205e-05, + "loss": 0.6608, + "step": 160 + }, + { + "epoch": 0.44, + "learning_rate": 1.9295291175086782e-05, + "loss": 0.6902, + "step": 170 + }, + { + "epoch": 0.47, + "learning_rate": 1.9188510156330676e-05, + "loss": 0.6819, + "step": 180 + }, + { + "epoch": 0.49, + "learning_rate": 1.9074538672037254e-05, + "loss": 0.6637, + "step": 190 + }, + { + "epoch": 0.52, + "learning_rate": 1.8953465910553274e-05, + "loss": 0.6487, + "step": 200 + }, + { + "epoch": 0.54, + "learning_rate": 1.8825386617327586e-05, + "loss": 0.6593, + "step": 210 + }, + { + "epoch": 0.57, + "learning_rate": 1.869040102076809e-05, + "loss": 0.6442, + "step": 220 + }, + { + "epoch": 0.6, + "learning_rate": 1.8548614753808046e-05, + "loss": 0.648, + "step": 230 + }, + { + "epoch": 0.62, + "learning_rate": 1.8400138771243056e-05, + "loss": 0.6753, + "step": 240 + }, + { + "epoch": 0.65, + "learning_rate": 1.8245089262903407e-05, + "loss": 0.6119, + "step": 250 + }, + { + "epoch": 0.67, + "learning_rate": 1.8083587562729796e-05, + "loss": 0.6314, + "step": 260 + }, + { + "epoch": 0.7, + "learning_rate": 1.7915760053823454e-05, + "loss": 0.6128, + "step": 270 + }, + { + "epoch": 0.73, + "learning_rate": 1.7741738069545126e-05, + "loss": 0.6201, + "step": 280 + }, + { + "epoch": 0.75, + "learning_rate": 1.756165779074016e-05, + "loss": 0.6087, + "step": 290 + }, + { + "epoch": 0.78, + "learning_rate": 1.7375660139170276e-05, + "loss": 0.5939, + "step": 300 + }, + { + "epoch": 0.8, + "learning_rate": 1.7183890667235245e-05, + "loss": 0.5884, + "step": 310 + }, + { + "epoch": 0.83, + "learning_rate": 1.698649944407093e-05, + "loss": 0.6041, + "step": 320 + }, + { + "epoch": 0.85, + "learning_rate": 1.6783640938112742e-05, + "loss": 0.5939, + "step": 330 + }, + { + "epoch": 0.88, + "learning_rate": 1.6575473896216393e-05, + "loss": 0.5867, + "step": 340 + }, + { + "epoch": 0.91, + "learning_rate": 1.6362161219430618e-05, + "loss": 0.5946, + "step": 350 + }, + { + "epoch": 0.93, + "learning_rate": 1.6143869835519003e-05, + "loss": 0.5657, + "step": 360 + }, + { + "epoch": 0.96, + "learning_rate": 1.592077056833073e-05, + "loss": 0.5826, + "step": 370 + }, + { + "epoch": 0.98, + "learning_rate": 1.5693038004122415e-05, + "loss": 0.5883, + "step": 380 + }, + { + "epoch": 1.01, + "learning_rate": 1.546085035493571e-05, + "loss": 0.5367, + "step": 390 + }, + { + "epoch": 1.04, + "learning_rate": 1.522438931913751e-05, + "loss": 0.4847, + "step": 400 + }, + { + "epoch": 1.06, + "learning_rate": 1.4983839939231986e-05, + "loss": 0.487, + "step": 410 + }, + { + "epoch": 1.09, + "learning_rate": 1.4739390457055593e-05, + "loss": 0.4939, + "step": 420 + }, + { + "epoch": 1.11, + "learning_rate": 1.4491232166468532e-05, + "loss": 0.4791, + "step": 430 + }, + { + "epoch": 1.14, + "learning_rate": 1.4239559263657764e-05, + "loss": 0.499, + "step": 440 + }, + { + "epoch": 1.17, + "learning_rate": 1.3984568695168877e-05, + "loss": 0.4845, + "step": 450 + }, + { + "epoch": 1.19, + "learning_rate": 1.3726460003785609e-05, + "loss": 0.4752, + "step": 460 + }, + { + "epoch": 1.22, + "learning_rate": 1.3465435172377733e-05, + "loss": 0.4911, + "step": 470 + }, + { + "epoch": 1.24, + "learning_rate": 1.3201698465839414e-05, + "loss": 0.471, + "step": 480 + }, + { + "epoch": 1.27, + "learning_rate": 1.2935456271241784e-05, + "loss": 0.4681, + "step": 490 + }, + { + "epoch": 1.3, + "learning_rate": 1.266691693632483e-05, + "loss": 0.4596, + "step": 500 + }, + { + "epoch": 1.32, + "learning_rate": 1.2396290606454912e-05, + "loss": 0.4635, + "step": 510 + }, + { + "epoch": 1.35, + "learning_rate": 1.2123789060175593e-05, + "loss": 0.477, + "step": 520 + }, + { + "epoch": 1.37, + "learning_rate": 1.1849625543480402e-05, + "loss": 0.4493, + "step": 530 + }, + { + "epoch": 1.4, + "learning_rate": 1.1574014602937274e-05, + "loss": 0.4637, + "step": 540 + }, + { + "epoch": 1.42, + "learning_rate": 1.129717191779517e-05, + "loss": 0.4229, + "step": 550 + }, + { + "epoch": 1.45, + "learning_rate": 1.1019314131204394e-05, + "loss": 0.4225, + "step": 560 + }, + { + "epoch": 1.48, + "learning_rate": 1.0740658680682523e-05, + "loss": 0.4398, + "step": 570 + }, + { + "epoch": 1.5, + "learning_rate": 1.0461423627958803e-05, + "loss": 0.4286, + "step": 580 + }, + { + "epoch": 1.53, + "learning_rate": 1.0181827488329975e-05, + "loss": 0.428, + "step": 590 + }, + { + "epoch": 1.55, + "learning_rate": 9.902089059661206e-06, + "loss": 0.4289, + "step": 600 + }, + { + "epoch": 1.58, + "learning_rate": 9.622427251165921e-06, + "loss": 0.4253, + "step": 610 + }, + { + "epoch": 1.61, + "learning_rate": 9.34306091209845e-06, + "loss": 0.413, + "step": 620 + }, + { + "epoch": 1.63, + "learning_rate": 9.064208660493614e-06, + "loss": 0.436, + "step": 630 + }, + { + "epoch": 1.66, + "learning_rate": 8.786088712087269e-06, + "loss": 0.4113, + "step": 640 + }, + { + "epoch": 1.68, + "learning_rate": 8.508918709551643e-06, + "loss": 0.4073, + "step": 650 + }, + { + "epoch": 1.71, + "learning_rate": 8.232915552179152e-06, + "loss": 0.4055, + "step": 660 + }, + { + "epoch": 1.74, + "learning_rate": 7.958295226147911e-06, + "loss": 0.4133, + "step": 670 + }, + { + "epoch": 1.76, + "learning_rate": 7.685272635501836e-06, + "loss": 0.3992, + "step": 680 + }, + { + "epoch": 1.79, + "learning_rate": 7.414061433977566e-06, + "loss": 0.3877, + "step": 690 + }, + { + "epoch": 1.81, + "learning_rate": 7.144873857809769e-06, + "loss": 0.3814, + "step": 700 + }, + { + "epoch": 1.84, + "learning_rate": 6.8779205596457885e-06, + "loss": 0.3643, + "step": 710 + }, + { + "epoch": 1.86, + "learning_rate": 6.613410443699453e-06, + "loss": 0.3764, + "step": 720 + }, + { + "epoch": 1.89, + "learning_rate": 6.3515505022731805e-06, + "loss": 0.3554, + "step": 730 + }, + { + "epoch": 1.92, + "learning_rate": 6.092545653776194e-06, + "loss": 0.3686, + "step": 740 + }, + { + "epoch": 1.94, + "learning_rate": 5.836598582365717e-06, + "loss": 0.3535, + "step": 750 + }, + { + "epoch": 1.97, + "learning_rate": 5.583909579336528e-06, + "loss": 0.3883, + "step": 760 + }, + { + "epoch": 1.99, + "learning_rate": 5.334676386383093e-06, + "loss": 0.3548, + "step": 770 + }, + { + "epoch": 2.02, + "learning_rate": 5.089094040856877e-06, + "loss": 0.334, + "step": 780 + }, + { + "epoch": 2.05, + "learning_rate": 4.847354723139893e-06, + "loss": 0.3223, + "step": 790 + }, + { + "epoch": 2.07, + "learning_rate": 4.609647606254046e-06, + "loss": 0.315, + "step": 800 + }, + { + "epoch": 2.1, + "learning_rate": 5.7142857142857145e-06, + "loss": 0.3239, + "step": 810 + }, + { + "epoch": 2.12, + "learning_rate": 1.1428571428571429e-05, + "loss": 0.3096, + "step": 820 + }, + { + "epoch": 2.15, + "learning_rate": 1.7142857142857142e-05, + "loss": 0.2924, + "step": 830 + }, + { + "epoch": 2.18, + "learning_rate": 1.9999021765072576e-05, + "loss": 0.2998, + "step": 840 + }, + { + "epoch": 2.2, + "learning_rate": 1.999119703394801e-05, + "loss": 0.291, + "step": 850 + }, + { + "epoch": 2.23, + "learning_rate": 1.9975553694939593e-05, + "loss": 0.3037, + "step": 860 + }, + { + "epoch": 2.25, + "learning_rate": 1.9952103989737013e-05, + "loss": 0.2889, + "step": 870 + }, + { + "epoch": 2.28, + "learning_rate": 1.9920866268899203e-05, + "loss": 0.3032, + "step": 880 + }, + { + "epoch": 2.31, + "learning_rate": 1.988186497749411e-05, + "loss": 0.3041, + "step": 890 + }, + { + "epoch": 2.33, + "learning_rate": 1.9835130635969228e-05, + "loss": 0.2982, + "step": 900 + }, + { + "epoch": 2.36, + "learning_rate": 1.978069981626783e-05, + "loss": 0.271, + "step": 910 + }, + { + "epoch": 2.38, + "learning_rate": 1.9718615113209588e-05, + "loss": 0.2896, + "step": 920 + }, + { + "epoch": 2.41, + "learning_rate": 1.9648925111158037e-05, + "loss": 0.284, + "step": 930 + }, + { + "epoch": 2.44, + "learning_rate": 1.9571684346000877e-05, + "loss": 0.2854, + "step": 940 + }, + { + "epoch": 2.46, + "learning_rate": 1.9486953262472945e-05, + "loss": 0.2717, + "step": 950 + }, + { + "epoch": 2.49, + "learning_rate": 1.9394798166855205e-05, + "loss": 0.2885, + "step": 960 + }, + { + "epoch": 2.51, + "learning_rate": 1.9295291175086782e-05, + "loss": 0.2797, + "step": 970 + }, + { + "epoch": 2.54, + "learning_rate": 1.9188510156330676e-05, + "loss": 0.2627, + "step": 980 + }, + { + "epoch": 2.56, + "learning_rate": 1.9074538672037254e-05, + "loss": 0.2887, + "step": 990 + }, + { + "epoch": 2.59, + "learning_rate": 1.8953465910553274e-05, + "loss": 0.2671, + "step": 1000 + }, + { + "epoch": 2.62, + "learning_rate": 1.8825386617327586e-05, + "loss": 0.2786, + "step": 1010 + }, + { + "epoch": 2.64, + "learning_rate": 1.869040102076809e-05, + "loss": 0.2705, + "step": 1020 + }, + { + "epoch": 2.67, + "learning_rate": 1.8548614753808046e-05, + "loss": 0.2637, + "step": 1030 + }, + { + "epoch": 2.69, + "learning_rate": 1.8400138771243056e-05, + "loss": 0.2599, + "step": 1040 + }, + { + "epoch": 2.72, + "learning_rate": 1.8245089262903407e-05, + "loss": 0.2632, + "step": 1050 + }, + { + "epoch": 2.75, + "learning_rate": 1.8083587562729796e-05, + "loss": 0.2529, + "step": 1060 + }, + { + "epoch": 2.77, + "learning_rate": 1.7915760053823454e-05, + "loss": 0.2387, + "step": 1070 + }, + { + "epoch": 2.8, + "learning_rate": 1.7741738069545126e-05, + "loss": 0.2451, + "step": 1080 + }, + { + "epoch": 2.82, + "learning_rate": 1.756165779074016e-05, + "loss": 0.2542, + "step": 1090 + }, + { + "epoch": 2.85, + "learning_rate": 1.7375660139170276e-05, + "loss": 0.2492, + "step": 1100 + }, + { + "epoch": 2.88, + "learning_rate": 1.7183890667235245e-05, + "loss": 0.2344, + "step": 1110 + }, + { + "epoch": 2.9, + "learning_rate": 1.698649944407093e-05, + "loss": 0.249, + "step": 1120 + }, + { + "epoch": 2.93, + "learning_rate": 1.6783640938112742e-05, + "loss": 0.2457, + "step": 1130 + }, + { + "epoch": 2.95, + "learning_rate": 1.6575473896216393e-05, + "loss": 0.2497, + "step": 1140 + }, + { + "epoch": 2.98, + "learning_rate": 1.6362161219430618e-05, + "loss": 0.2449, + "step": 1150 + }, + { + "epoch": 3.0, + "step": 1158, + "total_flos": 1.0577162556235842e+19, + "train_loss": 0.08461039424560231, + "train_runtime": 32175.0491, + "train_samples_per_second": 4.607, + "train_steps_per_second": 0.036 + } + ], + "logging_steps": 10, + "max_steps": 1158, + "num_train_epochs": 3, + "save_steps": 400, + "total_flos": 1.0577162556235842e+19, + "trial_name": null, + "trial_params": null +} diff --git a/training_args.bin b/training_args.bin new file mode 100644 index 0000000..ea8ed06 --- /dev/null +++ b/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dec8d89422f3813e06b81a1e6d7cf08c8459b5d0b1542275c6d5e34bc59f5528 +size 4987