commit ab771acdea8304e8fb036272b6e655abe3aeb093 Author: ModelHub XC Date: Wed Jun 3 05:09:13 2026 +0800 初始化项目,由ModelHub XC社区提供模型 Model: mlfoundations-dev/d1_science_gpt_0.3k Source: Original Platform diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 0000000..b821ec1 --- /dev/null +++ b/.gitattributes @@ -0,0 +1,56 @@ +*.7z filter=lfs diff=lfs merge=lfs -text +*.arrow filter=lfs diff=lfs merge=lfs -text + + +*.bz2 filter=lfs diff=lfs merge=lfs -text +*.ftz filter=lfs diff=lfs merge=lfs -text +*.gz filter=lfs diff=lfs merge=lfs -text +*.h5 filter=lfs diff=lfs merge=lfs -text +*.joblib filter=lfs diff=lfs merge=lfs -text +*.lfs.* filter=lfs diff=lfs merge=lfs -text +*.model filter=lfs diff=lfs merge=lfs -text +*.msgpack filter=lfs diff=lfs merge=lfs -text +*.onnx filter=lfs diff=lfs merge=lfs -text +*.ot filter=lfs diff=lfs merge=lfs -text +*.parquet filter=lfs diff=lfs merge=lfs -text +*.pb filter=lfs diff=lfs merge=lfs -text +*.pt filter=lfs diff=lfs merge=lfs -text +*.pth filter=lfs diff=lfs merge=lfs -text +*.rar filter=lfs diff=lfs merge=lfs -text +saved_model/**/* filter=lfs diff=lfs merge=lfs -text +*.tar.* filter=lfs diff=lfs merge=lfs -text +*.tflite filter=lfs diff=lfs merge=lfs -text +*.tgz filter=lfs diff=lfs merge=lfs -text +*.xz filter=lfs diff=lfs merge=lfs -text +*.zip filter=lfs diff=lfs merge=lfs -text +*.zstandard filter=lfs diff=lfs merge=lfs -text +*.tfevents* filter=lfs diff=lfs merge=lfs -text +*.db* filter=lfs diff=lfs merge=lfs -text +*.ark* filter=lfs diff=lfs merge=lfs -text +**/*ckpt*data* filter=lfs diff=lfs merge=lfs -text +**/*ckpt*.meta filter=lfs diff=lfs merge=lfs -text +**/*ckpt*.index filter=lfs diff=lfs merge=lfs -text + +*.ckpt filter=lfs diff=lfs merge=lfs -text +*.gguf* filter=lfs diff=lfs merge=lfs -text +*.ggml filter=lfs diff=lfs merge=lfs -text +*.llamafile* filter=lfs diff=lfs merge=lfs -text +*.pt2 filter=lfs diff=lfs merge=lfs -text +*.mlmodel filter=lfs diff=lfs merge=lfs -text +*.npy filter=lfs 
diff=lfs merge=lfs -text +*.npz filter=lfs diff=lfs merge=lfs -text +*.pickle filter=lfs diff=lfs merge=lfs -text +*.pkl filter=lfs diff=lfs merge=lfs -text +*.tar filter=lfs diff=lfs merge=lfs -text +*.wasm filter=lfs diff=lfs merge=lfs -text +*.zst filter=lfs diff=lfs merge=lfs -text +*tfevents* filter=lfs diff=lfs merge=lfs -text + +training_args.bin filter=lfs diff=lfs merge=lfs -text +model-00003-of-00004.safetensors filter=lfs diff=lfs merge=lfs -text +merges.txt filter=lfs diff=lfs merge=lfs -text +model-00002-of-00004.safetensors filter=lfs diff=lfs merge=lfs -text +tokenizer.json filter=lfs diff=lfs merge=lfs -text +model-00004-of-00004.safetensors filter=lfs diff=lfs merge=lfs -text +model-00001-of-00004.safetensors filter=lfs diff=lfs merge=lfs -text +vocab.json filter=lfs diff=lfs merge=lfs -text \ No newline at end of file diff --git a/README.md b/README.md new file mode 100644 index 0000000..bb83fab --- /dev/null +++ b/README.md @@ -0,0 +1,61 @@ +--- +library_name: transformers +license: other +base_model: Qwen/Qwen2.5-7B-Instruct +tags: +- llama-factory +- full +- generated_from_trainer +model-index: +- name: d1_science_gpt_0.3k + results: [] +--- + + + +# d1_science_gpt_0.3k + +This model is a fine-tuned version of [Qwen/Qwen2.5-7B-Instruct](https://huggingface.co/Qwen/Qwen2.5-7B-Instruct) on the mlfoundations-dev/d1_science_gpt_0.3k dataset. 
+ +## Model description + +More information needed + +## Intended uses & limitations + +More information needed + +## Training and evaluation data + +More information needed + +## Training procedure + +### Training hyperparameters + +The following hyperparameters were used during training: +- learning_rate: 1e-05 +- train_batch_size: 1 +- eval_batch_size: 8 +- seed: 42 +- distributed_type: multi-GPU +- num_devices: 16 +- gradient_accumulation_steps: 2 +- total_train_batch_size: 32 +- total_eval_batch_size: 128 +- optimizer: Use OptimizerNames.ADAMW_TORCH with betas=(0.9,0.999) and epsilon=1e-08 and optimizer_args=No additional optimizer arguments +- lr_scheduler_type: cosine +- lr_scheduler_warmup_ratio: 0.1 +- num_epochs: 13.0 + +### Training results + + + +### Framework versions + +- Transformers 4.46.1 +- Pytorch 2.6.0a0+ecf3bae40a.nv25.01 +- Datasets 3.5.0 +- Tokenizers 0.20.3 diff --git a/added_tokens.json b/added_tokens.json new file mode 100644 index 0000000..482ced4 --- /dev/null +++ b/added_tokens.json @@ -0,0 +1,24 @@ +{ + "</tool_call>": 151658, + "<tool_call>": 151657, + "<|box_end|>": 151649, + "<|box_start|>": 151648, + "<|endoftext|>": 151643, + "<|file_sep|>": 151664, + "<|fim_middle|>": 151660, + "<|fim_pad|>": 151662, + "<|fim_prefix|>": 151659, + "<|fim_suffix|>": 151661, + "<|im_end|>": 151645, + "<|im_start|>": 151644, + "<|image_pad|>": 151655, + "<|object_ref_end|>": 151647, + "<|object_ref_start|>": 151646, + "<|quad_end|>": 151651, + "<|quad_start|>": 151650, + "<|repo_name|>": 151663, + "<|video_pad|>": 151656, + "<|vision_end|>": 151653, + "<|vision_pad|>": 151654, + "<|vision_start|>": 151652 +} diff --git a/all_results.json b/all_results.json new file mode 100644 index 0000000..fd7c9b7 --- /dev/null +++ b/all_results.json @@ -0,0 +1,8 @@ +{ + "epoch": 13.0, + "total_flos": 3.2858174247665664e+16, + "train_loss": 0.2939408891905959, + "train_runtime": 1300.2962, + "train_samples_per_second": 3.159, + "train_steps_per_second": 0.1 +} \ No newline at end of file 
diff --git a/config.json b/config.json new file mode 100644 index 0000000..a916f0b --- /dev/null +++ b/config.json @@ -0,0 +1,29 @@ +{ + "_name_or_path": "/p/data1/mmlaion/dcft/hub/models--Qwen--Qwen2.5-7B-Instruct/snapshots/a09a35458c702b33eeacc393d103063234e8bc28", + "architectures": [ + "Qwen2ForCausalLM" + ], + "attention_dropout": 0.0, + "bos_token_id": 151643, + "eos_token_id": 151645, + "hidden_act": "silu", + "hidden_size": 3584, + "initializer_range": 0.02, + "intermediate_size": 18944, + "max_position_embeddings": 32768, + "max_window_layers": 28, + "model_type": "qwen2", + "num_attention_heads": 28, + "num_hidden_layers": 28, + "num_key_value_heads": 4, + "rms_norm_eps": 1e-06, + "rope_scaling": null, + "rope_theta": 1000000.0, + "sliding_window": null, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.46.1", + "use_cache": false, + "use_sliding_window": false, + "vocab_size": 152064 +} diff --git a/configuration.json b/configuration.json new file mode 100644 index 0000000..bbeeda1 --- /dev/null +++ b/configuration.json @@ -0,0 +1 @@ +{"framework": "pytorch", "task": "text-generation", "allow_remote": true} \ No newline at end of file diff --git a/generation_config.json b/generation_config.json new file mode 100644 index 0000000..a753841 --- /dev/null +++ b/generation_config.json @@ -0,0 +1,14 @@ +{ + "bos_token_id": 151643, + "do_sample": true, + "eos_token_id": [ + 151645, + 151643 + ], + "pad_token_id": 151643, + "repetition_penalty": 1.05, + "temperature": 0.7, + "top_k": 20, + "top_p": 0.8, + "transformers_version": "4.46.1" +} diff --git a/merges.txt b/merges.txt new file mode 100644 index 0000000..80c1a19 --- /dev/null +++ b/merges.txt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8831e4f1a044471340f7c0a83d7bd71306a5b867e95fd870f74d0c5308a904d5 +size 1671853 diff --git a/model-00001-of-00004.safetensors b/model-00001-of-00004.safetensors new file mode 100644 index 0000000..c635f89 
--- /dev/null +++ b/model-00001-of-00004.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ab6808a8560795ff03974cddfb6c1056bca89b2ea102e77321c6e09e675831ca +size 4877660776 diff --git a/model-00002-of-00004.safetensors b/model-00002-of-00004.safetensors new file mode 100644 index 0000000..2904b7e --- /dev/null +++ b/model-00002-of-00004.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:49f85741168a2d9331fb0826b5c31d3b2f4f44ae2a5e3ab47db1aeaf780bb116 +size 4932751008 diff --git a/model-00003-of-00004.safetensors b/model-00003-of-00004.safetensors new file mode 100644 index 0000000..f008203 --- /dev/null +++ b/model-00003-of-00004.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b9b3a479cd7dc8376d2623cfbfc5a9d8fc7910a8166894902c58dc70483fe289 +size 4330865200 diff --git a/model-00004-of-00004.safetensors b/model-00004-of-00004.safetensors new file mode 100644 index 0000000..f127f07 --- /dev/null +++ b/model-00004-of-00004.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:eaf402a623f48e0a991bc807f69eea57e1754e94bc3086bdc6d871b8a85dc798 +size 1089994880 diff --git a/model.safetensors.index.json b/model.safetensors.index.json new file mode 100644 index 0000000..6ca5084 --- /dev/null +++ b/model.safetensors.index.json @@ -0,0 +1,346 @@ +{ + "metadata": { + "total_size": 15231233024 + }, + "weight_map": { + "lm_head.weight": "model-00004-of-00004.safetensors", + "model.embed_tokens.weight": "model-00001-of-00004.safetensors", + "model.layers.0.input_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.0.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + 
"model.layers.0.self_attn.k_proj.bias": "model-00001-of-00004.safetensors", + "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.self_attn.q_proj.bias": "model-00001-of-00004.safetensors", + "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.self_attn.v_proj.bias": "model-00001-of-00004.safetensors", + "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.input_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.1.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.1.self_attn.k_proj.bias": "model-00001-of-00004.safetensors", + "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.self_attn.q_proj.bias": "model-00001-of-00004.safetensors", + "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.self_attn.v_proj.bias": "model-00001-of-00004.safetensors", + "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.10.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.10.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.10.self_attn.k_proj.bias": "model-00002-of-00004.safetensors", + 
"model.layers.10.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.self_attn.q_proj.bias": "model-00002-of-00004.safetensors", + "model.layers.10.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.self_attn.v_proj.bias": "model-00002-of-00004.safetensors", + "model.layers.10.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.11.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.11.self_attn.k_proj.bias": "model-00002-of-00004.safetensors", + "model.layers.11.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.self_attn.q_proj.bias": "model-00002-of-00004.safetensors", + "model.layers.11.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.self_attn.v_proj.bias": "model-00002-of-00004.safetensors", + "model.layers.11.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.12.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.12.self_attn.k_proj.bias": "model-00002-of-00004.safetensors", + "model.layers.12.self_attn.k_proj.weight": 
"model-00002-of-00004.safetensors", + "model.layers.12.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.self_attn.q_proj.bias": "model-00002-of-00004.safetensors", + "model.layers.12.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.self_attn.v_proj.bias": "model-00002-of-00004.safetensors", + "model.layers.12.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.13.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.13.self_attn.k_proj.bias": "model-00002-of-00004.safetensors", + "model.layers.13.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.self_attn.q_proj.bias": "model-00002-of-00004.safetensors", + "model.layers.13.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.self_attn.v_proj.bias": "model-00002-of-00004.safetensors", + "model.layers.13.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.14.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.14.self_attn.k_proj.bias": "model-00002-of-00004.safetensors", + "model.layers.14.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + 
"model.layers.14.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.self_attn.q_proj.bias": "model-00002-of-00004.safetensors", + "model.layers.14.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.self_attn.v_proj.bias": "model-00002-of-00004.safetensors", + "model.layers.14.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.15.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.15.self_attn.k_proj.bias": "model-00002-of-00004.safetensors", + "model.layers.15.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.self_attn.q_proj.bias": "model-00002-of-00004.safetensors", + "model.layers.15.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.self_attn.v_proj.bias": "model-00002-of-00004.safetensors", + "model.layers.15.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.16.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.16.self_attn.k_proj.bias": "model-00002-of-00004.safetensors", + "model.layers.16.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.self_attn.o_proj.weight": 
"model-00002-of-00004.safetensors", + "model.layers.16.self_attn.q_proj.bias": "model-00002-of-00004.safetensors", + "model.layers.16.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.self_attn.v_proj.bias": "model-00002-of-00004.safetensors", + "model.layers.16.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.17.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.17.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.17.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.17.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.17.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.17.self_attn.k_proj.bias": "model-00002-of-00004.safetensors", + "model.layers.17.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.17.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.17.self_attn.q_proj.bias": "model-00002-of-00004.safetensors", + "model.layers.17.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.17.self_attn.v_proj.bias": "model-00002-of-00004.safetensors", + "model.layers.17.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.18.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.18.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.18.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.18.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.18.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.18.self_attn.k_proj.bias": "model-00002-of-00004.safetensors", + "model.layers.18.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.18.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + 
"model.layers.18.self_attn.q_proj.bias": "model-00002-of-00004.safetensors", + "model.layers.18.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.18.self_attn.v_proj.bias": "model-00002-of-00004.safetensors", + "model.layers.18.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.19.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.19.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.19.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.19.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.19.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.19.self_attn.k_proj.bias": "model-00003-of-00004.safetensors", + "model.layers.19.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.19.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.19.self_attn.q_proj.bias": "model-00003-of-00004.safetensors", + "model.layers.19.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.19.self_attn.v_proj.bias": "model-00003-of-00004.safetensors", + "model.layers.19.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.2.input_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.2.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.2.self_attn.k_proj.bias": "model-00001-of-00004.safetensors", + "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.self_attn.q_proj.bias": "model-00001-of-00004.safetensors", + 
"model.layers.2.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.self_attn.v_proj.bias": "model-00001-of-00004.safetensors", + "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.20.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.20.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.20.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.20.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.20.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.20.self_attn.k_proj.bias": "model-00003-of-00004.safetensors", + "model.layers.20.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.20.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.20.self_attn.q_proj.bias": "model-00003-of-00004.safetensors", + "model.layers.20.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.20.self_attn.v_proj.bias": "model-00003-of-00004.safetensors", + "model.layers.20.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.21.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.21.self_attn.k_proj.bias": "model-00003-of-00004.safetensors", + "model.layers.21.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.self_attn.q_proj.bias": "model-00003-of-00004.safetensors", + "model.layers.21.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", 
+ "model.layers.21.self_attn.v_proj.bias": "model-00003-of-00004.safetensors", + "model.layers.21.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.22.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.22.self_attn.k_proj.bias": "model-00003-of-00004.safetensors", + "model.layers.22.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.self_attn.q_proj.bias": "model-00003-of-00004.safetensors", + "model.layers.22.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.self_attn.v_proj.bias": "model-00003-of-00004.safetensors", + "model.layers.22.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.23.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.23.self_attn.k_proj.bias": "model-00003-of-00004.safetensors", + "model.layers.23.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.self_attn.q_proj.bias": "model-00003-of-00004.safetensors", + "model.layers.23.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.self_attn.v_proj.bias": 
"model-00003-of-00004.safetensors", + "model.layers.23.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.24.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.24.self_attn.k_proj.bias": "model-00003-of-00004.safetensors", + "model.layers.24.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.self_attn.q_proj.bias": "model-00003-of-00004.safetensors", + "model.layers.24.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.self_attn.v_proj.bias": "model-00003-of-00004.safetensors", + "model.layers.24.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.25.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.25.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.25.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.25.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.25.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.25.self_attn.k_proj.bias": "model-00003-of-00004.safetensors", + "model.layers.25.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.25.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.25.self_attn.q_proj.bias": "model-00003-of-00004.safetensors", + "model.layers.25.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.25.self_attn.v_proj.bias": "model-00003-of-00004.safetensors", + 
"model.layers.25.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.26.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.26.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.26.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.26.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.26.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.26.self_attn.k_proj.bias": "model-00003-of-00004.safetensors", + "model.layers.26.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.26.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.26.self_attn.q_proj.bias": "model-00003-of-00004.safetensors", + "model.layers.26.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.26.self_attn.v_proj.bias": "model-00003-of-00004.safetensors", + "model.layers.26.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.27.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.27.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.27.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.27.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.27.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.27.self_attn.k_proj.bias": "model-00003-of-00004.safetensors", + "model.layers.27.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.27.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.27.self_attn.q_proj.bias": "model-00003-of-00004.safetensors", + "model.layers.27.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.27.self_attn.v_proj.bias": "model-00003-of-00004.safetensors", + "model.layers.27.self_attn.v_proj.weight": 
"model-00003-of-00004.safetensors", + "model.layers.3.input_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.3.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.3.self_attn.k_proj.bias": "model-00001-of-00004.safetensors", + "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.self_attn.q_proj.bias": "model-00001-of-00004.safetensors", + "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.self_attn.v_proj.bias": "model-00001-of-00004.safetensors", + "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.input_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.4.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.4.self_attn.k_proj.bias": "model-00001-of-00004.safetensors", + "model.layers.4.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.self_attn.q_proj.bias": "model-00001-of-00004.safetensors", + "model.layers.4.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.self_attn.v_proj.bias": "model-00001-of-00004.safetensors", + "model.layers.4.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.input_layernorm.weight": 
"model-00001-of-00004.safetensors", + "model.layers.5.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.5.self_attn.k_proj.bias": "model-00001-of-00004.safetensors", + "model.layers.5.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.self_attn.q_proj.bias": "model-00001-of-00004.safetensors", + "model.layers.5.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.self_attn.v_proj.bias": "model-00001-of-00004.safetensors", + "model.layers.5.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.input_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.6.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.6.self_attn.k_proj.bias": "model-00001-of-00004.safetensors", + "model.layers.6.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.self_attn.q_proj.bias": "model-00001-of-00004.safetensors", + "model.layers.6.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.self_attn.v_proj.bias": "model-00001-of-00004.safetensors", + "model.layers.6.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.input_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.7.mlp.down_proj.weight": 
"model-00001-of-00004.safetensors", + "model.layers.7.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.7.self_attn.k_proj.bias": "model-00001-of-00004.safetensors", + "model.layers.7.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.self_attn.q_proj.bias": "model-00001-of-00004.safetensors", + "model.layers.7.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.self_attn.v_proj.bias": "model-00001-of-00004.safetensors", + "model.layers.7.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.8.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.8.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.8.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.8.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.8.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.8.self_attn.k_proj.bias": "model-00001-of-00004.safetensors", + "model.layers.8.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.8.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.8.self_attn.q_proj.bias": "model-00001-of-00004.safetensors", + "model.layers.8.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.8.self_attn.v_proj.bias": "model-00001-of-00004.safetensors", + "model.layers.8.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.9.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.9.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.mlp.gate_proj.weight": 
"model-00002-of-00004.safetensors", + "model.layers.9.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.9.self_attn.k_proj.bias": "model-00002-of-00004.safetensors", + "model.layers.9.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.self_attn.q_proj.bias": "model-00002-of-00004.safetensors", + "model.layers.9.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.self_attn.v_proj.bias": "model-00002-of-00004.safetensors", + "model.layers.9.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.norm.weight": "model-00003-of-00004.safetensors" + } +} diff --git a/runs/Apr28_06-15-44_jpbot-001-41.jupiter.internal/events.out.tfevents.1745813765.jpbot-001-41.jupiter.internal.2338209.0 b/runs/Apr28_06-15-44_jpbot-001-41.jupiter.internal/events.out.tfevents.1745813765.jpbot-001-41.jupiter.internal.2338209.0 new file mode 100644 index 0000000..1b6df3d --- /dev/null +++ b/runs/Apr28_06-15-44_jpbot-001-41.jupiter.internal/events.out.tfevents.1745813765.jpbot-001-41.jupiter.internal.2338209.0 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6be7a45dc41f7f874dc9070ae761f2023f17057617edda0497b61250d48a24f4 +size 32803 diff --git a/special_tokens_map.json b/special_tokens_map.json new file mode 100644 index 0000000..17305b3 --- /dev/null +++ b/special_tokens_map.json @@ -0,0 +1,31 @@ +{ + "additional_special_tokens": [ + "<|im_start|>", + "<|im_end|>", + "<|object_ref_start|>", + "<|object_ref_end|>", + "<|box_start|>", + "<|box_end|>", + "<|quad_start|>", + "<|quad_end|>", + "<|vision_start|>", + "<|vision_end|>", + "<|vision_pad|>", + "<|image_pad|>", + "<|video_pad|>" + ], + "eos_token": { + "content": "<|endoftext|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": 
false + }, + "pad_token": { + "content": "<|endoftext|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/start_end.json b/start_end.json new file mode 100644 index 0000000..6e8ca8c --- /dev/null +++ b/start_end.json @@ -0,0 +1 @@ +{"start_time": "2025-04-28 06:15:44", "end_time": "2025-04-28 06:37:52"} \ No newline at end of file diff --git a/tokenizer.json b/tokenizer.json new file mode 100644 index 0000000..51ebb3b --- /dev/null +++ b/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9c5ae00e602b8860cbd784ba82a8aa14e8feecec692e7076590d014d7b7fdafa +size 11421896 diff --git a/tokenizer_config.json b/tokenizer_config.json new file mode 100644 index 0000000..b84f53a --- /dev/null +++ b/tokenizer_config.json @@ -0,0 +1,208 @@ +{ + "add_bos_token": false, + "add_prefix_space": false, + "added_tokens_decoder": { + "151643": { + "content": "<|endoftext|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151644": { + "content": "<|im_start|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151645": { + "content": "<|im_end|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151646": { + "content": "<|object_ref_start|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151647": { + "content": "<|object_ref_end|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151648": { + "content": "<|box_start|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151649": { + "content": "<|box_end|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151650": { + 
"content": "<|quad_start|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151651": { + "content": "<|quad_end|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151652": { + "content": "<|vision_start|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151653": { + "content": "<|vision_end|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151654": { + "content": "<|vision_pad|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151655": { + "content": "<|image_pad|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151656": { + "content": "<|video_pad|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151657": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151658": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151659": { + "content": "<|fim_prefix|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151660": { + "content": "<|fim_middle|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151661": { + "content": "<|fim_suffix|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151662": { + "content": "<|fim_pad|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151663": { + "content": "<|repo_name|>", + 
"lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151664": { + "content": "<|file_sep|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + } + }, + "additional_special_tokens": [ + "<|im_start|>", + "<|im_end|>", + "<|object_ref_start|>", + "<|object_ref_end|>", + "<|box_start|>", + "<|box_end|>", + "<|quad_start|>", + "<|quad_end|>", + "<|vision_start|>", + "<|vision_end|>", + "<|vision_pad|>", + "<|image_pad|>", + "<|video_pad|>" + ], + "bos_token": null, + "chat_template": "{%- if tools %}\n {{- '<|im_start|>system\\n' }}\n {%- if messages[0]['role'] == 'system' %}\n {{- messages[0]['content'] }}\n {%- else %}\n {{- 'You are Qwen, created by Alibaba Cloud. You are a helpful assistant.' }}\n {%- endif %}\n {{- \"\\n\\n# Tools\\n\\nYou may call one or more functions to assist with the user query.\\n\\nYou are provided with function signatures within XML tags:\\n\" }}\n {%- for tool in tools %}\n {{- \"\\n\" }}\n {{- tool | tojson }}\n {%- endfor %}\n {{- \"\\n\\n\\nFor each function call, return a json object with function name and arguments within XML tags:\\n\\n{\\\"name\\\": , \\\"arguments\\\": }\\n<|im_end|>\\n\" }}\n{%- else %}\n {%- if messages[0]['role'] == 'system' %}\n {{- '<|im_start|>system\\n' + messages[0]['content'] + '<|im_end|>\\n' }}\n {%- else %}\n {{- '<|im_start|>system\\nYou are Qwen, created by Alibaba Cloud. 
You are a helpful assistant.<|im_end|>\\n' }}\n {%- endif %}\n{%- endif %}\n{%- for message in messages %}\n {%- if (message.role == \"user\") or (message.role == \"system\" and not loop.first) or (message.role == \"assistant\" and not message.tool_calls) %}\n {{- '<|im_start|>' + message.role + '\\n' + message.content + '<|im_end|>' + '\\n' }}\n {%- elif message.role == \"assistant\" %}\n {{- '<|im_start|>' + message.role }}\n {%- if message.content %}\n {{- '\\n' + message.content }}\n {%- endif %}\n {%- for tool_call in message.tool_calls %}\n {%- if tool_call.function is defined %}\n {%- set tool_call = tool_call.function %}\n {%- endif %}\n {{- '\\n\\n{\"name\": \"' }}\n {{- tool_call.name }}\n {{- '\", \"arguments\": ' }}\n {{- tool_call.arguments | tojson }}\n {{- '}\\n' }}\n {%- endfor %}\n {{- '<|im_end|>\\n' }}\n {%- elif message.role == \"tool\" %}\n {%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != \"tool\") %}\n {{- '<|im_start|>user' }}\n {%- endif %}\n {{- '\\n\\n' }}\n {{- message.content }}\n {{- '\\n' }}\n {%- if loop.last or (messages[loop.index0 + 1].role != \"tool\") %}\n {{- '<|im_end|>\\n' }}\n {%- endif %}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|im_start|>assistant\\n' }}\n{%- endif %}\n", + "clean_up_tokenization_spaces": false, + "eos_token": "<|endoftext|>", + "errors": "replace", + "model_max_length": 131072, + "pad_token": "<|endoftext|>", + "padding_side": "right", + "split_special_tokens": false, + "tokenizer_class": "Qwen2Tokenizer", + "unk_token": null +} diff --git a/train_results.json b/train_results.json new file mode 100644 index 0000000..fd7c9b7 --- /dev/null +++ b/train_results.json @@ -0,0 +1,8 @@ +{ + "epoch": 13.0, + "total_flos": 3.2858174247665664e+16, + "train_loss": 0.2939408891905959, + "train_runtime": 1300.2962, + "train_samples_per_second": 3.159, + "train_steps_per_second": 0.1 +} \ No newline at end of file diff --git a/trainer_log.jsonl b/trainer_log.jsonl new file 
mode 100644 index 0000000..85e3404 --- /dev/null +++ b/trainer_log.jsonl @@ -0,0 +1,131 @@ +{"current_steps": 1, "total_steps": 130, "loss": 1.0626, "lr": 7.692307692307694e-07, "epoch": 0.1, "percentage": 0.77, "elapsed_time": "0:00:15", "remaining_time": "0:32:34"} +{"current_steps": 2, "total_steps": 130, "loss": 1.0435, "lr": 1.5384615384615387e-06, "epoch": 0.2, "percentage": 1.54, "elapsed_time": "0:00:24", "remaining_time": "0:26:26"} +{"current_steps": 3, "total_steps": 130, "loss": 1.0193, "lr": 2.307692307692308e-06, "epoch": 0.3, "percentage": 2.31, "elapsed_time": "0:00:31", "remaining_time": "0:22:07"} +{"current_steps": 4, "total_steps": 130, "loss": 1.02, "lr": 3.0769230769230774e-06, "epoch": 0.4, "percentage": 3.08, "elapsed_time": "0:00:38", "remaining_time": "0:20:27"} +{"current_steps": 5, "total_steps": 130, "loss": 1.0348, "lr": 3.846153846153847e-06, "epoch": 0.5, "percentage": 3.85, "elapsed_time": "0:00:47", "remaining_time": "0:19:45"} +{"current_steps": 6, "total_steps": 130, "loss": 0.954, "lr": 4.615384615384616e-06, "epoch": 0.6, "percentage": 4.62, "elapsed_time": "0:00:54", "remaining_time": "0:18:40"} +{"current_steps": 7, "total_steps": 130, "loss": 0.9604, "lr": 5.384615384615385e-06, "epoch": 0.7, "percentage": 5.38, "elapsed_time": "0:01:01", "remaining_time": "0:18:02"} +{"current_steps": 8, "total_steps": 130, "loss": 0.8856, "lr": 6.153846153846155e-06, "epoch": 0.8, "percentage": 6.15, "elapsed_time": "0:01:08", "remaining_time": "0:17:21"} +{"current_steps": 9, "total_steps": 130, "loss": 0.8593, "lr": 6.923076923076923e-06, "epoch": 0.9, "percentage": 6.92, "elapsed_time": "0:01:17", "remaining_time": "0:17:17"} +{"current_steps": 10, "total_steps": 130, "loss": 0.8837, "lr": 7.692307692307694e-06, "epoch": 1.0, "percentage": 7.69, "elapsed_time": "0:01:27", "remaining_time": "0:17:27"} +{"current_steps": 11, "total_steps": 130, "loss": 0.8455, "lr": 8.461538461538462e-06, "epoch": 1.1, "percentage": 8.46, "elapsed_time": 
"0:01:58", "remaining_time": "0:21:23"} +{"current_steps": 12, "total_steps": 130, "loss": 0.7795, "lr": 9.230769230769232e-06, "epoch": 1.2, "percentage": 9.23, "elapsed_time": "0:02:06", "remaining_time": "0:20:45"} +{"current_steps": 13, "total_steps": 130, "loss": 0.8492, "lr": 1e-05, "epoch": 1.3, "percentage": 10.0, "elapsed_time": "0:02:14", "remaining_time": "0:20:10"} +{"current_steps": 14, "total_steps": 130, "loss": 0.8569, "lr": 9.998197638354428e-06, "epoch": 1.4, "percentage": 10.77, "elapsed_time": "0:02:21", "remaining_time": "0:19:31"} +{"current_steps": 15, "total_steps": 130, "loss": 0.771, "lr": 9.992791852820709e-06, "epoch": 1.5, "percentage": 11.54, "elapsed_time": "0:02:28", "remaining_time": "0:18:55"} +{"current_steps": 16, "total_steps": 130, "loss": 0.7751, "lr": 9.983786540671052e-06, "epoch": 1.6, "percentage": 12.31, "elapsed_time": "0:02:36", "remaining_time": "0:18:33"} +{"current_steps": 17, "total_steps": 130, "loss": 0.7374, "lr": 9.971188194237141e-06, "epoch": 1.7, "percentage": 13.08, "elapsed_time": "0:02:43", "remaining_time": "0:18:07"} +{"current_steps": 18, "total_steps": 130, "loss": 0.7825, "lr": 9.955005896229543e-06, "epoch": 1.8, "percentage": 13.85, "elapsed_time": "0:02:49", "remaining_time": "0:17:33"} +{"current_steps": 19, "total_steps": 130, "loss": 0.7741, "lr": 9.935251313189564e-06, "epoch": 1.9, "percentage": 14.62, "elapsed_time": "0:02:58", "remaining_time": "0:17:25"} +{"current_steps": 20, "total_steps": 130, "loss": 0.7026, "lr": 9.911938687078324e-06, "epoch": 2.0, "percentage": 15.38, "elapsed_time": "0:03:06", "remaining_time": "0:17:04"} +{"current_steps": 21, "total_steps": 130, "loss": 0.6487, "lr": 9.885084825009085e-06, "epoch": 2.1, "percentage": 16.15, "elapsed_time": "0:03:28", "remaining_time": "0:18:02"} +{"current_steps": 22, "total_steps": 130, "loss": 0.6709, "lr": 9.854709087130261e-06, "epoch": 2.2, "percentage": 16.92, "elapsed_time": "0:03:36", "remaining_time": "0:17:42"} 
+{"current_steps": 23, "total_steps": 130, "loss": 0.6251, "lr": 9.820833372667813e-06, "epoch": 2.3, "percentage": 17.69, "elapsed_time": "0:03:41", "remaining_time": "0:17:11"} +{"current_steps": 24, "total_steps": 130, "loss": 0.5849, "lr": 9.783482104137127e-06, "epoch": 2.4, "percentage": 18.46, "elapsed_time": "0:03:50", "remaining_time": "0:17:00"} +{"current_steps": 25, "total_steps": 130, "loss": 0.6776, "lr": 9.742682209735727e-06, "epoch": 2.5, "percentage": 19.23, "elapsed_time": "0:04:00", "remaining_time": "0:16:49"} +{"current_steps": 26, "total_steps": 130, "loss": 0.6056, "lr": 9.698463103929542e-06, "epoch": 2.6, "percentage": 20.0, "elapsed_time": "0:04:05", "remaining_time": "0:16:23"} +{"current_steps": 27, "total_steps": 130, "loss": 0.6272, "lr": 9.650856666246693e-06, "epoch": 2.7, "percentage": 20.77, "elapsed_time": "0:04:11", "remaining_time": "0:15:59"} +{"current_steps": 28, "total_steps": 130, "loss": 0.6117, "lr": 9.599897218294122e-06, "epoch": 2.8, "percentage": 21.54, "elapsed_time": "0:04:21", "remaining_time": "0:15:53"} +{"current_steps": 29, "total_steps": 130, "loss": 0.6137, "lr": 9.54562149901362e-06, "epoch": 2.9, "percentage": 22.31, "elapsed_time": "0:04:31", "remaining_time": "0:15:47"} +{"current_steps": 30, "total_steps": 130, "loss": 0.5514, "lr": 9.488068638195072e-06, "epoch": 3.0, "percentage": 23.08, "elapsed_time": "0:04:40", "remaining_time": "0:15:35"} +{"current_steps": 31, "total_steps": 130, "loss": 0.4937, "lr": 9.427280128266049e-06, "epoch": 3.1, "percentage": 23.85, "elapsed_time": "0:04:59", "remaining_time": "0:15:54"} +{"current_steps": 32, "total_steps": 130, "loss": 0.5146, "lr": 9.363299794378072e-06, "epoch": 3.2, "percentage": 24.62, "elapsed_time": "0:05:08", "remaining_time": "0:15:45"} +{"current_steps": 33, "total_steps": 130, "loss": 0.4297, "lr": 9.296173762811084e-06, "epoch": 3.3, "percentage": 25.38, "elapsed_time": "0:05:15", "remaining_time": "0:15:26"} +{"current_steps": 34, 
"total_steps": 130, "loss": 0.4366, "lr": 9.225950427718974e-06, "epoch": 3.4, "percentage": 26.15, "elapsed_time": "0:05:24", "remaining_time": "0:15:17"} +{"current_steps": 35, "total_steps": 130, "loss": 0.4638, "lr": 9.152680416240059e-06, "epoch": 3.5, "percentage": 26.92, "elapsed_time": "0:05:33", "remaining_time": "0:15:04"} +{"current_steps": 36, "total_steps": 130, "loss": 0.4385, "lr": 9.076416551997721e-06, "epoch": 3.6, "percentage": 27.69, "elapsed_time": "0:05:42", "remaining_time": "0:14:55"} +{"current_steps": 37, "total_steps": 130, "loss": 0.4564, "lr": 8.997213817017508e-06, "epoch": 3.7, "percentage": 28.46, "elapsed_time": "0:05:49", "remaining_time": "0:14:39"} +{"current_steps": 38, "total_steps": 130, "loss": 0.4638, "lr": 8.915129312088112e-06, "epoch": 3.8, "percentage": 29.23, "elapsed_time": "0:05:54", "remaining_time": "0:14:18"} +{"current_steps": 39, "total_steps": 130, "loss": 0.4355, "lr": 8.83022221559489e-06, "epoch": 3.9, "percentage": 30.0, "elapsed_time": "0:06:03", "remaining_time": "0:14:09"} +{"current_steps": 40, "total_steps": 130, "loss": 0.4131, "lr": 8.742553740855507e-06, "epoch": 4.0, "percentage": 30.77, "elapsed_time": "0:06:11", "remaining_time": "0:13:54"} +{"current_steps": 41, "total_steps": 130, "loss": 0.3543, "lr": 8.652187091988516e-06, "epoch": 4.1, "percentage": 31.54, "elapsed_time": "0:06:32", "remaining_time": "0:14:11"} +{"current_steps": 42, "total_steps": 130, "loss": 0.3948, "lr": 8.559187418346703e-06, "epoch": 4.2, "percentage": 32.31, "elapsed_time": "0:06:40", "remaining_time": "0:13:59"} +{"current_steps": 43, "total_steps": 130, "loss": 0.3216, "lr": 8.463621767547998e-06, "epoch": 4.3, "percentage": 33.08, "elapsed_time": "0:06:47", "remaining_time": "0:13:43"} +{"current_steps": 44, "total_steps": 130, "loss": 0.3338, "lr": 8.36555903713785e-06, "epoch": 4.4, "percentage": 33.85, "elapsed_time": "0:06:52", "remaining_time": "0:13:27"} +{"current_steps": 45, "total_steps": 130, "loss": 
0.3533, "lr": 8.265069924917925e-06, "epoch": 4.5, "percentage": 34.62, "elapsed_time": "0:06:58", "remaining_time": "0:13:11"} +{"current_steps": 46, "total_steps": 130, "loss": 0.3303, "lr": 8.162226877976886e-06, "epoch": 4.6, "percentage": 35.38, "elapsed_time": "0:07:08", "remaining_time": "0:13:01"} +{"current_steps": 47, "total_steps": 130, "loss": 0.3076, "lr": 8.057104040460062e-06, "epoch": 4.7, "percentage": 36.15, "elapsed_time": "0:07:17", "remaining_time": "0:12:53"} +{"current_steps": 48, "total_steps": 130, "loss": 0.2905, "lr": 7.949777200115617e-06, "epoch": 4.8, "percentage": 36.92, "elapsed_time": "0:07:27", "remaining_time": "0:12:44"} +{"current_steps": 49, "total_steps": 130, "loss": 0.3255, "lr": 7.84032373365578e-06, "epoch": 4.9, "percentage": 37.69, "elapsed_time": "0:07:34", "remaining_time": "0:12:32"} +{"current_steps": 50, "total_steps": 130, "loss": 0.2633, "lr": 7.728822550972523e-06, "epoch": 5.0, "percentage": 38.46, "elapsed_time": "0:07:41", "remaining_time": "0:12:18"} +{"current_steps": 51, "total_steps": 130, "loss": 0.2282, "lr": 7.615354038247889e-06, "epoch": 5.1, "percentage": 39.23, "elapsed_time": "0:08:05", "remaining_time": "0:12:31"} +{"current_steps": 52, "total_steps": 130, "loss": 0.2057, "lr": 7.500000000000001e-06, "epoch": 5.2, "percentage": 40.0, "elapsed_time": "0:08:09", "remaining_time": "0:12:13"} +{"current_steps": 53, "total_steps": 130, "loss": 0.2344, "lr": 7.382843600106539e-06, "epoch": 5.3, "percentage": 40.77, "elapsed_time": "0:08:16", "remaining_time": "0:12:01"} +{"current_steps": 54, "total_steps": 130, "loss": 0.2055, "lr": 7.263969301848188e-06, "epoch": 5.4, "percentage": 41.54, "elapsed_time": "0:08:23", "remaining_time": "0:11:49"} +{"current_steps": 55, "total_steps": 130, "loss": 0.2188, "lr": 7.143462807015271e-06, "epoch": 5.5, "percentage": 42.31, "elapsed_time": "0:08:32", "remaining_time": "0:11:39"} +{"current_steps": 56, "total_steps": 130, "loss": 0.2022, "lr": 
7.021410994121525e-06, "epoch": 5.6, "percentage": 43.08, "elapsed_time": "0:08:40", "remaining_time": "0:11:28"} +{"current_steps": 57, "total_steps": 130, "loss": 0.2313, "lr": 6.897901855769483e-06, "epoch": 5.7, "percentage": 43.85, "elapsed_time": "0:08:51", "remaining_time": "0:11:20"} +{"current_steps": 58, "total_steps": 130, "loss": 0.1914, "lr": 6.773024435212678e-06, "epoch": 5.8, "percentage": 44.62, "elapsed_time": "0:08:57", "remaining_time": "0:11:06"} +{"current_steps": 59, "total_steps": 130, "loss": 0.2626, "lr": 6.646868762160399e-06, "epoch": 5.9, "percentage": 45.38, "elapsed_time": "0:09:06", "remaining_time": "0:10:57"} +{"current_steps": 60, "total_steps": 130, "loss": 0.2043, "lr": 6.519525787871235e-06, "epoch": 6.0, "percentage": 46.15, "elapsed_time": "0:09:14", "remaining_time": "0:10:46"} +{"current_steps": 61, "total_steps": 130, "loss": 0.1684, "lr": 6.391087319582264e-06, "epoch": 6.1, "percentage": 46.92, "elapsed_time": "0:09:35", "remaining_time": "0:10:51"} +{"current_steps": 62, "total_steps": 130, "loss": 0.1655, "lr": 6.261645954321109e-06, "epoch": 6.2, "percentage": 47.69, "elapsed_time": "0:09:43", "remaining_time": "0:10:40"} +{"current_steps": 63, "total_steps": 130, "loss": 0.1035, "lr": 6.131295012148613e-06, "epoch": 6.3, "percentage": 48.46, "elapsed_time": "0:09:49", "remaining_time": "0:10:27"} +{"current_steps": 64, "total_steps": 130, "loss": 0.1257, "lr": 6.000128468880223e-06, "epoch": 6.4, "percentage": 49.23, "elapsed_time": "0:09:58", "remaining_time": "0:10:16"} +{"current_steps": 65, "total_steps": 130, "loss": 0.1591, "lr": 5.8682408883346535e-06, "epoch": 6.5, "percentage": 50.0, "elapsed_time": "0:10:08", "remaining_time": "0:10:08"} +{"current_steps": 66, "total_steps": 130, "loss": 0.1463, "lr": 5.735727354158581e-06, "epoch": 6.6, "percentage": 50.77, "elapsed_time": "0:10:14", "remaining_time": "0:09:55"} +{"current_steps": 67, "total_steps": 130, "loss": 0.157, "lr": 5.6026834012766155e-06, 
"epoch": 6.7, "percentage": 51.54, "elapsed_time": "0:10:21", "remaining_time": "0:09:44"} +{"current_steps": 68, "total_steps": 130, "loss": 0.1524, "lr": 5.469204947015897e-06, "epoch": 6.8, "percentage": 52.31, "elapsed_time": "0:10:29", "remaining_time": "0:09:34"} +{"current_steps": 69, "total_steps": 130, "loss": 0.1188, "lr": 5.335388221955012e-06, "epoch": 6.9, "percentage": 53.08, "elapsed_time": "0:10:40", "remaining_time": "0:09:26"} +{"current_steps": 70, "total_steps": 130, "loss": 0.1194, "lr": 5.201329700547077e-06, "epoch": 7.0, "percentage": 53.85, "elapsed_time": "0:10:50", "remaining_time": "0:09:17"} +{"current_steps": 71, "total_steps": 130, "loss": 0.1055, "lr": 5.067126031566988e-06, "epoch": 7.1, "percentage": 54.62, "elapsed_time": "0:11:11", "remaining_time": "0:09:18"} +{"current_steps": 72, "total_steps": 130, "loss": 0.0691, "lr": 4.932873968433014e-06, "epoch": 7.2, "percentage": 55.38, "elapsed_time": "0:11:16", "remaining_time": "0:09:04"} +{"current_steps": 73, "total_steps": 130, "loss": 0.1127, "lr": 4.798670299452926e-06, "epoch": 7.3, "percentage": 56.15, "elapsed_time": "0:11:24", "remaining_time": "0:08:54"} +{"current_steps": 74, "total_steps": 130, "loss": 0.1064, "lr": 4.664611778044988e-06, "epoch": 7.4, "percentage": 56.92, "elapsed_time": "0:11:33", "remaining_time": "0:08:44"} +{"current_steps": 75, "total_steps": 130, "loss": 0.0528, "lr": 4.530795052984104e-06, "epoch": 7.5, "percentage": 57.69, "elapsed_time": "0:11:41", "remaining_time": "0:08:34"} +{"current_steps": 76, "total_steps": 130, "loss": 0.1128, "lr": 4.397316598723385e-06, "epoch": 7.6, "percentage": 58.46, "elapsed_time": "0:11:50", "remaining_time": "0:08:25"} +{"current_steps": 77, "total_steps": 130, "loss": 0.0804, "lr": 4.264272645841419e-06, "epoch": 7.7, "percentage": 59.23, "elapsed_time": "0:11:58", "remaining_time": "0:08:14"} +{"current_steps": 78, "total_steps": 130, "loss": 0.1166, "lr": 4.131759111665349e-06, "epoch": 7.8, "percentage": 
60.0, "elapsed_time": "0:12:06", "remaining_time": "0:08:04"} +{"current_steps": 79, "total_steps": 130, "loss": 0.0954, "lr": 3.999871531119779e-06, "epoch": 7.9, "percentage": 60.77, "elapsed_time": "0:12:13", "remaining_time": "0:07:53"} +{"current_steps": 80, "total_steps": 130, "loss": 0.0996, "lr": 3.86870498785139e-06, "epoch": 8.0, "percentage": 61.54, "elapsed_time": "0:12:21", "remaining_time": "0:07:43"} +{"current_steps": 81, "total_steps": 130, "loss": 0.0643, "lr": 3.7383540456788915e-06, "epoch": 8.1, "percentage": 62.31, "elapsed_time": "0:12:43", "remaining_time": "0:07:41"} +{"current_steps": 82, "total_steps": 130, "loss": 0.0923, "lr": 3.6089126804177373e-06, "epoch": 8.2, "percentage": 63.08, "elapsed_time": "0:12:51", "remaining_time": "0:07:31"} +{"current_steps": 83, "total_steps": 130, "loss": 0.0444, "lr": 3.480474212128766e-06, "epoch": 8.3, "percentage": 63.85, "elapsed_time": "0:12:58", "remaining_time": "0:07:20"} +{"current_steps": 84, "total_steps": 130, "loss": 0.0742, "lr": 3.3531312378396026e-06, "epoch": 8.4, "percentage": 64.62, "elapsed_time": "0:13:08", "remaining_time": "0:07:11"} +{"current_steps": 85, "total_steps": 130, "loss": 0.0588, "lr": 3.226975564787322e-06, "epoch": 8.5, "percentage": 65.38, "elapsed_time": "0:13:18", "remaining_time": "0:07:02"} +{"current_steps": 86, "total_steps": 130, "loss": 0.0505, "lr": 3.1020981442305187e-06, "epoch": 8.6, "percentage": 66.15, "elapsed_time": "0:13:24", "remaining_time": "0:06:51"} +{"current_steps": 87, "total_steps": 130, "loss": 0.0595, "lr": 2.978589005878476e-06, "epoch": 8.7, "percentage": 66.92, "elapsed_time": "0:13:32", "remaining_time": "0:06:41"} +{"current_steps": 88, "total_steps": 130, "loss": 0.0828, "lr": 2.8565371929847286e-06, "epoch": 8.8, "percentage": 67.69, "elapsed_time": "0:13:41", "remaining_time": "0:06:32"} +{"current_steps": 89, "total_steps": 130, "loss": 0.0546, "lr": 2.736030698151815e-06, "epoch": 8.9, "percentage": 68.46, "elapsed_time": 
"0:13:49", "remaining_time": "0:06:22"} +{"current_steps": 90, "total_steps": 130, "loss": 0.0663, "lr": 2.6171563998934605e-06, "epoch": 9.0, "percentage": 69.23, "elapsed_time": "0:13:57", "remaining_time": "0:06:12"} +{"current_steps": 91, "total_steps": 130, "loss": 0.0448, "lr": 2.5000000000000015e-06, "epoch": 9.1, "percentage": 70.0, "elapsed_time": "0:14:26", "remaining_time": "0:06:11"} +{"current_steps": 92, "total_steps": 130, "loss": 0.0459, "lr": 2.384645961752113e-06, "epoch": 9.2, "percentage": 70.77, "elapsed_time": "0:14:34", "remaining_time": "0:06:01"} +{"current_steps": 93, "total_steps": 130, "loss": 0.0434, "lr": 2.2711774490274767e-06, "epoch": 9.3, "percentage": 71.54, "elapsed_time": "0:14:42", "remaining_time": "0:05:51"} +{"current_steps": 94, "total_steps": 130, "loss": 0.0515, "lr": 2.159676266344222e-06, "epoch": 9.4, "percentage": 72.31, "elapsed_time": "0:14:51", "remaining_time": "0:05:41"} +{"current_steps": 95, "total_steps": 130, "loss": 0.0382, "lr": 2.050222799884387e-06, "epoch": 9.5, "percentage": 73.08, "elapsed_time": "0:14:57", "remaining_time": "0:05:30"} +{"current_steps": 96, "total_steps": 130, "loss": 0.0463, "lr": 1.942895959539939e-06, "epoch": 9.6, "percentage": 73.85, "elapsed_time": "0:15:01", "remaining_time": "0:05:19"} +{"current_steps": 97, "total_steps": 130, "loss": 0.0447, "lr": 1.8377731220231144e-06, "epoch": 9.7, "percentage": 74.62, "elapsed_time": "0:15:11", "remaining_time": "0:05:10"} +{"current_steps": 98, "total_steps": 130, "loss": 0.0517, "lr": 1.7349300750820758e-06, "epoch": 9.8, "percentage": 75.38, "elapsed_time": "0:15:18", "remaining_time": "0:05:00"} +{"current_steps": 99, "total_steps": 130, "loss": 0.0585, "lr": 1.6344409628621482e-06, "epoch": 9.9, "percentage": 76.15, "elapsed_time": "0:15:28", "remaining_time": "0:04:50"} +{"current_steps": 100, "total_steps": 130, "loss": 0.0242, "lr": 1.5363782324520033e-06, "epoch": 10.0, "percentage": 76.92, "elapsed_time": "0:15:32", 
"remaining_time": "0:04:39"} +{"current_steps": 101, "total_steps": 130, "loss": 0.0357, "lr": 1.4408125816532981e-06, "epoch": 10.1, "percentage": 77.69, "elapsed_time": "0:16:18", "remaining_time": "0:04:41"} +{"current_steps": 102, "total_steps": 130, "loss": 0.0214, "lr": 1.347812908011485e-06, "epoch": 10.2, "percentage": 78.46, "elapsed_time": "0:16:28", "remaining_time": "0:04:31"} +{"current_steps": 103, "total_steps": 130, "loss": 0.0267, "lr": 1.257446259144494e-06, "epoch": 10.3, "percentage": 79.23, "elapsed_time": "0:16:35", "remaining_time": "0:04:20"} +{"current_steps": 104, "total_steps": 130, "loss": 0.0665, "lr": 1.1697777844051105e-06, "epoch": 10.4, "percentage": 80.0, "elapsed_time": "0:16:43", "remaining_time": "0:04:10"} +{"current_steps": 105, "total_steps": 130, "loss": 0.0258, "lr": 1.0848706879118893e-06, "epoch": 10.5, "percentage": 80.77, "elapsed_time": "0:16:49", "remaining_time": "0:04:00"} +{"current_steps": 106, "total_steps": 130, "loss": 0.0171, "lr": 1.0027861829824953e-06, "epoch": 10.6, "percentage": 81.54, "elapsed_time": "0:16:53", "remaining_time": "0:03:49"} +{"current_steps": 107, "total_steps": 130, "loss": 0.0422, "lr": 9.235834480022788e-07, "epoch": 10.7, "percentage": 82.31, "elapsed_time": "0:17:02", "remaining_time": "0:03:39"} +{"current_steps": 108, "total_steps": 130, "loss": 0.0474, "lr": 8.473195837599419e-07, "epoch": 10.8, "percentage": 83.08, "elapsed_time": "0:17:09", "remaining_time": "0:03:29"} +{"current_steps": 109, "total_steps": 130, "loss": 0.0387, "lr": 7.740495722810271e-07, "epoch": 10.9, "percentage": 83.85, "elapsed_time": "0:17:18", "remaining_time": "0:03:20"} +{"current_steps": 110, "total_steps": 130, "loss": 0.0265, "lr": 7.03826237188916e-07, "epoch": 11.0, "percentage": 84.62, "elapsed_time": "0:17:25", "remaining_time": "0:03:10"} +{"current_steps": 111, "total_steps": 130, "loss": 0.0206, "lr": 6.367002056219285e-07, "epoch": 11.1, "percentage": 85.38, "elapsed_time": "0:18:05", 
"remaining_time": "0:03:05"} +{"current_steps": 112, "total_steps": 130, "loss": 0.0346, "lr": 5.727198717339511e-07, "epoch": 11.2, "percentage": 86.15, "elapsed_time": "0:18:13", "remaining_time": "0:02:55"} +{"current_steps": 113, "total_steps": 130, "loss": 0.0263, "lr": 5.119313618049309e-07, "epoch": 11.3, "percentage": 86.92, "elapsed_time": "0:18:22", "remaining_time": "0:02:45"} +{"current_steps": 114, "total_steps": 130, "loss": 0.0262, "lr": 4.54378500986381e-07, "epoch": 11.4, "percentage": 87.69, "elapsed_time": "0:18:27", "remaining_time": "0:02:35"} +{"current_steps": 115, "total_steps": 130, "loss": 0.045, "lr": 4.001027817058789e-07, "epoch": 11.5, "percentage": 88.46, "elapsed_time": "0:18:34", "remaining_time": "0:02:25"} +{"current_steps": 116, "total_steps": 130, "loss": 0.0283, "lr": 3.49143333753309e-07, "epoch": 11.6, "percentage": 89.23, "elapsed_time": "0:18:40", "remaining_time": "0:02:15"} +{"current_steps": 117, "total_steps": 130, "loss": 0.0185, "lr": 3.015368960704584e-07, "epoch": 11.7, "percentage": 90.0, "elapsed_time": "0:18:47", "remaining_time": "0:02:05"} +{"current_steps": 118, "total_steps": 130, "loss": 0.0422, "lr": 2.573177902642726e-07, "epoch": 11.8, "percentage": 90.77, "elapsed_time": "0:18:56", "remaining_time": "0:01:55"} +{"current_steps": 119, "total_steps": 130, "loss": 0.0444, "lr": 2.1651789586287442e-07, "epoch": 11.9, "percentage": 91.54, "elapsed_time": "0:19:07", "remaining_time": "0:01:46"} +{"current_steps": 120, "total_steps": 130, "loss": 0.0162, "lr": 1.7916662733218848e-07, "epoch": 12.0, "percentage": 92.31, "elapsed_time": "0:19:14", "remaining_time": "0:01:36"} +{"current_steps": 121, "total_steps": 130, "loss": 0.0111, "lr": 1.4529091286973994e-07, "epoch": 12.1, "percentage": 93.08, "elapsed_time": "0:19:41", "remaining_time": "0:01:27"} +{"current_steps": 122, "total_steps": 130, "loss": 0.0211, "lr": 1.1491517499091498e-07, "epoch": 12.2, "percentage": 93.85, "elapsed_time": "0:19:46", 
"remaining_time": "0:01:17"} +{"current_steps": 123, "total_steps": 130, "loss": 0.0489, "lr": 8.80613129216762e-08, "epoch": 12.3, "percentage": 94.62, "elapsed_time": "0:19:56", "remaining_time": "0:01:08"} +{"current_steps": 124, "total_steps": 130, "loss": 0.0338, "lr": 6.474868681043578e-08, "epoch": 12.4, "percentage": 95.38, "elapsed_time": "0:20:03", "remaining_time": "0:00:58"} +{"current_steps": 125, "total_steps": 130, "loss": 0.0136, "lr": 4.499410377045765e-08, "epoch": 12.5, "percentage": 96.15, "elapsed_time": "0:20:08", "remaining_time": "0:00:48"} +{"current_steps": 126, "total_steps": 130, "loss": 0.0236, "lr": 2.8811805762860578e-08, "epoch": 12.6, "percentage": 96.92, "elapsed_time": "0:20:19", "remaining_time": "0:00:38"} +{"current_steps": 127, "total_steps": 130, "loss": 0.0334, "lr": 1.6213459328950355e-08, "epoch": 12.7, "percentage": 97.69, "elapsed_time": "0:20:28", "remaining_time": "0:00:29"} +{"current_steps": 128, "total_steps": 130, "loss": 0.0412, "lr": 7.2081471792911914e-09, "epoch": 12.8, "percentage": 98.46, "elapsed_time": "0:20:37", "remaining_time": "0:00:19"} +{"current_steps": 129, "total_steps": 130, "loss": 0.0253, "lr": 1.8023616455731253e-09, "epoch": 12.9, "percentage": 99.23, "elapsed_time": "0:20:43", "remaining_time": "0:00:09"} +{"current_steps": 130, "total_steps": 130, "loss": 0.0263, "lr": 0.0, "epoch": 13.0, "percentage": 100.0, "elapsed_time": "0:20:49", "remaining_time": "0:00:00"} +{"current_steps": 130, "total_steps": 130, "epoch": 13.0, "percentage": 100.0, "elapsed_time": "0:21:39", "remaining_time": "0:00:00"} diff --git a/trainer_state.json b/trainer_state.json new file mode 100644 index 0000000..be21cca --- /dev/null +++ b/trainer_state.json @@ -0,0 +1,952 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 13.0, + "eval_steps": 500, + "global_step": 130, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + 
"epoch": 0.1, + "grad_norm": 6.733106782508708, + "learning_rate": 7.692307692307694e-07, + "loss": 1.0626, + "step": 1 + }, + { + "epoch": 0.2, + "grad_norm": 6.149285803125056, + "learning_rate": 1.5384615384615387e-06, + "loss": 1.0435, + "step": 2 + }, + { + "epoch": 0.3, + "grad_norm": 6.152320728736608, + "learning_rate": 2.307692307692308e-06, + "loss": 1.0193, + "step": 3 + }, + { + "epoch": 0.4, + "grad_norm": 6.517614170207162, + "learning_rate": 3.0769230769230774e-06, + "loss": 1.02, + "step": 4 + }, + { + "epoch": 0.5, + "grad_norm": 5.305198033715196, + "learning_rate": 3.846153846153847e-06, + "loss": 1.0348, + "step": 5 + }, + { + "epoch": 0.6, + "grad_norm": 3.8592003927171907, + "learning_rate": 4.615384615384616e-06, + "loss": 0.954, + "step": 6 + }, + { + "epoch": 0.7, + "grad_norm": 2.800854752532226, + "learning_rate": 5.384615384615385e-06, + "loss": 0.9604, + "step": 7 + }, + { + "epoch": 0.8, + "grad_norm": 3.5023896682586497, + "learning_rate": 6.153846153846155e-06, + "loss": 0.8856, + "step": 8 + }, + { + "epoch": 0.9, + "grad_norm": 4.1684273728378685, + "learning_rate": 6.923076923076923e-06, + "loss": 0.8593, + "step": 9 + }, + { + "epoch": 1.0, + "grad_norm": 4.8032937097662, + "learning_rate": 7.692307692307694e-06, + "loss": 0.8837, + "step": 10 + }, + { + "epoch": 1.1, + "grad_norm": 4.082576868292361, + "learning_rate": 8.461538461538462e-06, + "loss": 0.8455, + "step": 11 + }, + { + "epoch": 1.2, + "grad_norm": 3.907890503246759, + "learning_rate": 9.230769230769232e-06, + "loss": 0.7795, + "step": 12 + }, + { + "epoch": 1.3, + "grad_norm": 3.5914196704208172, + "learning_rate": 1e-05, + "loss": 0.8492, + "step": 13 + }, + { + "epoch": 1.4, + "grad_norm": 2.942570514389922, + "learning_rate": 9.998197638354428e-06, + "loss": 0.8569, + "step": 14 + }, + { + "epoch": 1.5, + "grad_norm": 2.2999462111794804, + "learning_rate": 9.992791852820709e-06, + "loss": 0.771, + "step": 15 + }, + { + "epoch": 1.6, + "grad_norm": 
2.6948422206665636, + "learning_rate": 9.983786540671052e-06, + "loss": 0.7751, + "step": 16 + }, + { + "epoch": 1.7, + "grad_norm": 2.681033931109484, + "learning_rate": 9.971188194237141e-06, + "loss": 0.7374, + "step": 17 + }, + { + "epoch": 1.8, + "grad_norm": 2.742683686105219, + "learning_rate": 9.955005896229543e-06, + "loss": 0.7825, + "step": 18 + }, + { + "epoch": 1.9, + "grad_norm": 2.071739361645055, + "learning_rate": 9.935251313189564e-06, + "loss": 0.7741, + "step": 19 + }, + { + "epoch": 2.0, + "grad_norm": 1.8767253510635002, + "learning_rate": 9.911938687078324e-06, + "loss": 0.7026, + "step": 20 + }, + { + "epoch": 2.1, + "grad_norm": 1.7974374045731076, + "learning_rate": 9.885084825009085e-06, + "loss": 0.6487, + "step": 21 + }, + { + "epoch": 2.2, + "grad_norm": 1.7364077012351282, + "learning_rate": 9.854709087130261e-06, + "loss": 0.6709, + "step": 22 + }, + { + "epoch": 2.3, + "grad_norm": 1.3747907551026266, + "learning_rate": 9.820833372667813e-06, + "loss": 0.6251, + "step": 23 + }, + { + "epoch": 2.4, + "grad_norm": 1.2086788755323006, + "learning_rate": 9.783482104137127e-06, + "loss": 0.5849, + "step": 24 + }, + { + "epoch": 2.5, + "grad_norm": 1.2212416207318195, + "learning_rate": 9.742682209735727e-06, + "loss": 0.6776, + "step": 25 + }, + { + "epoch": 2.6, + "grad_norm": 1.3324486836498435, + "learning_rate": 9.698463103929542e-06, + "loss": 0.6056, + "step": 26 + }, + { + "epoch": 2.7, + "grad_norm": 1.2812363796358743, + "learning_rate": 9.650856666246693e-06, + "loss": 0.6272, + "step": 27 + }, + { + "epoch": 2.8, + "grad_norm": 1.3121250186867432, + "learning_rate": 9.599897218294122e-06, + "loss": 0.6117, + "step": 28 + }, + { + "epoch": 2.9, + "grad_norm": 1.1416700701522662, + "learning_rate": 9.54562149901362e-06, + "loss": 0.6137, + "step": 29 + }, + { + "epoch": 3.0, + "grad_norm": 0.8638570875229467, + "learning_rate": 9.488068638195072e-06, + "loss": 0.5514, + "step": 30 + }, + { + "epoch": 3.1, + "grad_norm": 
1.1680224672840247, + "learning_rate": 9.427280128266049e-06, + "loss": 0.4937, + "step": 31 + }, + { + "epoch": 3.2, + "grad_norm": 0.9985846334069853, + "learning_rate": 9.363299794378072e-06, + "loss": 0.5146, + "step": 32 + }, + { + "epoch": 3.3, + "grad_norm": 0.993554703522936, + "learning_rate": 9.296173762811084e-06, + "loss": 0.4297, + "step": 33 + }, + { + "epoch": 3.4, + "grad_norm": 1.0979981540776833, + "learning_rate": 9.225950427718974e-06, + "loss": 0.4366, + "step": 34 + }, + { + "epoch": 3.5, + "grad_norm": 0.9947660773449873, + "learning_rate": 9.152680416240059e-06, + "loss": 0.4638, + "step": 35 + }, + { + "epoch": 3.6, + "grad_norm": 1.22170267986729, + "learning_rate": 9.076416551997721e-06, + "loss": 0.4385, + "step": 36 + }, + { + "epoch": 3.7, + "grad_norm": 1.2669424656892199, + "learning_rate": 8.997213817017508e-06, + "loss": 0.4564, + "step": 37 + }, + { + "epoch": 3.8, + "grad_norm": 1.1385305262473597, + "learning_rate": 8.915129312088112e-06, + "loss": 0.4638, + "step": 38 + }, + { + "epoch": 3.9, + "grad_norm": 0.85752544440622, + "learning_rate": 8.83022221559489e-06, + "loss": 0.4355, + "step": 39 + }, + { + "epoch": 4.0, + "grad_norm": 0.9350416528577298, + "learning_rate": 8.742553740855507e-06, + "loss": 0.4131, + "step": 40 + }, + { + "epoch": 4.1, + "grad_norm": 1.0044872736721613, + "learning_rate": 8.652187091988516e-06, + "loss": 0.3543, + "step": 41 + }, + { + "epoch": 4.2, + "grad_norm": 0.9158552416245116, + "learning_rate": 8.559187418346703e-06, + "loss": 0.3948, + "step": 42 + }, + { + "epoch": 4.3, + "grad_norm": 0.9500005615720003, + "learning_rate": 8.463621767547998e-06, + "loss": 0.3216, + "step": 43 + }, + { + "epoch": 4.4, + "grad_norm": 1.1593134176446123, + "learning_rate": 8.36555903713785e-06, + "loss": 0.3338, + "step": 44 + }, + { + "epoch": 4.5, + "grad_norm": 1.036997277407171, + "learning_rate": 8.265069924917925e-06, + "loss": 0.3533, + "step": 45 + }, + { + "epoch": 4.6, + "grad_norm": 
0.9081359993684659, + "learning_rate": 8.162226877976886e-06, + "loss": 0.3303, + "step": 46 + }, + { + "epoch": 4.7, + "grad_norm": 1.0357743695490003, + "learning_rate": 8.057104040460062e-06, + "loss": 0.3076, + "step": 47 + }, + { + "epoch": 4.8, + "grad_norm": 1.003725556135553, + "learning_rate": 7.949777200115617e-06, + "loss": 0.2905, + "step": 48 + }, + { + "epoch": 4.9, + "grad_norm": 0.8306334207263184, + "learning_rate": 7.84032373365578e-06, + "loss": 0.3255, + "step": 49 + }, + { + "epoch": 5.0, + "grad_norm": 0.9674484368296132, + "learning_rate": 7.728822550972523e-06, + "loss": 0.2633, + "step": 50 + }, + { + "epoch": 5.1, + "grad_norm": 0.8974490960860493, + "learning_rate": 7.615354038247889e-06, + "loss": 0.2282, + "step": 51 + }, + { + "epoch": 5.2, + "grad_norm": 0.8365328600849284, + "learning_rate": 7.500000000000001e-06, + "loss": 0.2057, + "step": 52 + }, + { + "epoch": 5.3, + "grad_norm": 1.0261442314495612, + "learning_rate": 7.382843600106539e-06, + "loss": 0.2344, + "step": 53 + }, + { + "epoch": 5.4, + "grad_norm": 1.0045141958077353, + "learning_rate": 7.263969301848188e-06, + "loss": 0.2055, + "step": 54 + }, + { + "epoch": 5.5, + "grad_norm": 0.791279076653458, + "learning_rate": 7.143462807015271e-06, + "loss": 0.2188, + "step": 55 + }, + { + "epoch": 5.6, + "grad_norm": 0.8526401510959523, + "learning_rate": 7.021410994121525e-06, + "loss": 0.2022, + "step": 56 + }, + { + "epoch": 5.7, + "grad_norm": 0.8913810238298835, + "learning_rate": 6.897901855769483e-06, + "loss": 0.2313, + "step": 57 + }, + { + "epoch": 5.8, + "grad_norm": 0.9650702324016643, + "learning_rate": 6.773024435212678e-06, + "loss": 0.1914, + "step": 58 + }, + { + "epoch": 5.9, + "grad_norm": 0.7872208893856325, + "learning_rate": 6.646868762160399e-06, + "loss": 0.2626, + "step": 59 + }, + { + "epoch": 6.0, + "grad_norm": 0.9217811368898755, + "learning_rate": 6.519525787871235e-06, + "loss": 0.2043, + "step": 60 + }, + { + "epoch": 6.1, + "grad_norm": 
0.6948999282360269, + "learning_rate": 6.391087319582264e-06, + "loss": 0.1684, + "step": 61 + }, + { + "epoch": 6.2, + "grad_norm": 0.8075198651452326, + "learning_rate": 6.261645954321109e-06, + "loss": 0.1655, + "step": 62 + }, + { + "epoch": 6.3, + "grad_norm": 0.8528235582977988, + "learning_rate": 6.131295012148613e-06, + "loss": 0.1035, + "step": 63 + }, + { + "epoch": 6.4, + "grad_norm": 0.8642283154141255, + "learning_rate": 6.000128468880223e-06, + "loss": 0.1257, + "step": 64 + }, + { + "epoch": 6.5, + "grad_norm": 0.8562595202564899, + "learning_rate": 5.8682408883346535e-06, + "loss": 0.1591, + "step": 65 + }, + { + "epoch": 6.6, + "grad_norm": 0.7667412263031629, + "learning_rate": 5.735727354158581e-06, + "loss": 0.1463, + "step": 66 + }, + { + "epoch": 6.7, + "grad_norm": 0.758115793445322, + "learning_rate": 5.6026834012766155e-06, + "loss": 0.157, + "step": 67 + }, + { + "epoch": 6.8, + "grad_norm": 0.7474467056534099, + "learning_rate": 5.469204947015897e-06, + "loss": 0.1524, + "step": 68 + }, + { + "epoch": 6.9, + "grad_norm": 0.7093244081518412, + "learning_rate": 5.335388221955012e-06, + "loss": 0.1188, + "step": 69 + }, + { + "epoch": 7.0, + "grad_norm": 0.6945048553836589, + "learning_rate": 5.201329700547077e-06, + "loss": 0.1194, + "step": 70 + }, + { + "epoch": 7.1, + "grad_norm": 0.632963012825732, + "learning_rate": 5.067126031566988e-06, + "loss": 0.1055, + "step": 71 + }, + { + "epoch": 7.2, + "grad_norm": 0.535098574774597, + "learning_rate": 4.932873968433014e-06, + "loss": 0.0691, + "step": 72 + }, + { + "epoch": 7.3, + "grad_norm": 0.6563557436695596, + "learning_rate": 4.798670299452926e-06, + "loss": 0.1127, + "step": 73 + }, + { + "epoch": 7.4, + "grad_norm": 0.6600862624801698, + "learning_rate": 4.664611778044988e-06, + "loss": 0.1064, + "step": 74 + }, + { + "epoch": 7.5, + "grad_norm": 0.6077118042914474, + "learning_rate": 4.530795052984104e-06, + "loss": 0.0528, + "step": 75 + }, + { + "epoch": 7.6, + "grad_norm": 
0.7605226736684684, + "learning_rate": 4.397316598723385e-06, + "loss": 0.1128, + "step": 76 + }, + { + "epoch": 7.7, + "grad_norm": 0.618480612781212, + "learning_rate": 4.264272645841419e-06, + "loss": 0.0804, + "step": 77 + }, + { + "epoch": 7.8, + "grad_norm": 0.6591734340885593, + "learning_rate": 4.131759111665349e-06, + "loss": 0.1166, + "step": 78 + }, + { + "epoch": 7.9, + "grad_norm": 0.706476292562821, + "learning_rate": 3.999871531119779e-06, + "loss": 0.0954, + "step": 79 + }, + { + "epoch": 8.0, + "grad_norm": 0.6511060587514005, + "learning_rate": 3.86870498785139e-06, + "loss": 0.0996, + "step": 80 + }, + { + "epoch": 8.1, + "grad_norm": 0.5877512031587164, + "learning_rate": 3.7383540456788915e-06, + "loss": 0.0643, + "step": 81 + }, + { + "epoch": 8.2, + "grad_norm": 0.5458773628267196, + "learning_rate": 3.6089126804177373e-06, + "loss": 0.0923, + "step": 82 + }, + { + "epoch": 8.3, + "grad_norm": 0.5072513687549686, + "learning_rate": 3.480474212128766e-06, + "loss": 0.0444, + "step": 83 + }, + { + "epoch": 8.4, + "grad_norm": 0.6566938015455519, + "learning_rate": 3.3531312378396026e-06, + "loss": 0.0742, + "step": 84 + }, + { + "epoch": 8.5, + "grad_norm": 0.5801962223341335, + "learning_rate": 3.226975564787322e-06, + "loss": 0.0588, + "step": 85 + }, + { + "epoch": 8.6, + "grad_norm": 0.6460146970130528, + "learning_rate": 3.1020981442305187e-06, + "loss": 0.0505, + "step": 86 + }, + { + "epoch": 8.7, + "grad_norm": 0.5597931785794817, + "learning_rate": 2.978589005878476e-06, + "loss": 0.0595, + "step": 87 + }, + { + "epoch": 8.8, + "grad_norm": 0.736772650278511, + "learning_rate": 2.8565371929847286e-06, + "loss": 0.0828, + "step": 88 + }, + { + "epoch": 8.9, + "grad_norm": 0.6093005863730493, + "learning_rate": 2.736030698151815e-06, + "loss": 0.0546, + "step": 89 + }, + { + "epoch": 9.0, + "grad_norm": 0.5542138970933609, + "learning_rate": 2.6171563998934605e-06, + "loss": 0.0663, + "step": 90 + }, + { + "epoch": 9.1, + "grad_norm": 
0.5234184327196851, + "learning_rate": 2.5000000000000015e-06, + "loss": 0.0448, + "step": 91 + }, + { + "epoch": 9.2, + "grad_norm": 0.5305920522906186, + "learning_rate": 2.384645961752113e-06, + "loss": 0.0459, + "step": 92 + }, + { + "epoch": 9.3, + "grad_norm": 0.4197505273523392, + "learning_rate": 2.2711774490274767e-06, + "loss": 0.0434, + "step": 93 + }, + { + "epoch": 9.4, + "grad_norm": 0.3934591059055063, + "learning_rate": 2.159676266344222e-06, + "loss": 0.0515, + "step": 94 + }, + { + "epoch": 9.5, + "grad_norm": 0.43218959356908915, + "learning_rate": 2.050222799884387e-06, + "loss": 0.0382, + "step": 95 + }, + { + "epoch": 9.6, + "grad_norm": 0.4622104651280975, + "learning_rate": 1.942895959539939e-06, + "loss": 0.0463, + "step": 96 + }, + { + "epoch": 9.7, + "grad_norm": 0.5535724362191721, + "learning_rate": 1.8377731220231144e-06, + "loss": 0.0447, + "step": 97 + }, + { + "epoch": 9.8, + "grad_norm": 0.4640259982574777, + "learning_rate": 1.7349300750820758e-06, + "loss": 0.0517, + "step": 98 + }, + { + "epoch": 9.9, + "grad_norm": 0.5266934894357415, + "learning_rate": 1.6344409628621482e-06, + "loss": 0.0585, + "step": 99 + }, + { + "epoch": 10.0, + "grad_norm": 0.4676281163285699, + "learning_rate": 1.5363782324520033e-06, + "loss": 0.0242, + "step": 100 + }, + { + "epoch": 10.1, + "grad_norm": 0.35967933721052214, + "learning_rate": 1.4408125816532981e-06, + "loss": 0.0357, + "step": 101 + }, + { + "epoch": 10.2, + "grad_norm": 0.27447636992783964, + "learning_rate": 1.347812908011485e-06, + "loss": 0.0214, + "step": 102 + }, + { + "epoch": 10.3, + "grad_norm": 0.3014381976097687, + "learning_rate": 1.257446259144494e-06, + "loss": 0.0267, + "step": 103 + }, + { + "epoch": 10.4, + "grad_norm": 0.38294713899905675, + "learning_rate": 1.1697777844051105e-06, + "loss": 0.0665, + "step": 104 + }, + { + "epoch": 10.5, + "grad_norm": 0.3684690274136007, + "learning_rate": 1.0848706879118893e-06, + "loss": 0.0258, + "step": 105 + }, + { + "epoch": 
10.6, + "grad_norm": 0.31478908256541077, + "learning_rate": 1.0027861829824953e-06, + "loss": 0.0171, + "step": 106 + }, + { + "epoch": 10.7, + "grad_norm": 0.34994824968823507, + "learning_rate": 9.235834480022788e-07, + "loss": 0.0422, + "step": 107 + }, + { + "epoch": 10.8, + "grad_norm": 0.4015273337656147, + "learning_rate": 8.473195837599419e-07, + "loss": 0.0474, + "step": 108 + }, + { + "epoch": 10.9, + "grad_norm": 0.3512467702811788, + "learning_rate": 7.740495722810271e-07, + "loss": 0.0387, + "step": 109 + }, + { + "epoch": 11.0, + "grad_norm": 0.2819446830081912, + "learning_rate": 7.03826237188916e-07, + "loss": 0.0265, + "step": 110 + }, + { + "epoch": 11.1, + "grad_norm": 0.26544732968956203, + "learning_rate": 6.367002056219285e-07, + "loss": 0.0206, + "step": 111 + }, + { + "epoch": 11.2, + "grad_norm": 0.28547087305675023, + "learning_rate": 5.727198717339511e-07, + "loss": 0.0346, + "step": 112 + }, + { + "epoch": 11.3, + "grad_norm": 0.29019462565393106, + "learning_rate": 5.119313618049309e-07, + "loss": 0.0263, + "step": 113 + }, + { + "epoch": 11.4, + "grad_norm": 0.2587759144580898, + "learning_rate": 4.54378500986381e-07, + "loss": 0.0262, + "step": 114 + }, + { + "epoch": 11.5, + "grad_norm": 0.2964554848018806, + "learning_rate": 4.001027817058789e-07, + "loss": 0.045, + "step": 115 + }, + { + "epoch": 11.6, + "grad_norm": 0.286044305327069, + "learning_rate": 3.49143333753309e-07, + "loss": 0.0283, + "step": 116 + }, + { + "epoch": 11.7, + "grad_norm": 0.2675893029381175, + "learning_rate": 3.015368960704584e-07, + "loss": 0.0185, + "step": 117 + }, + { + "epoch": 11.8, + "grad_norm": 0.3300518406278238, + "learning_rate": 2.573177902642726e-07, + "loss": 0.0422, + "step": 118 + }, + { + "epoch": 11.9, + "grad_norm": 0.32684186571188295, + "learning_rate": 2.1651789586287442e-07, + "loss": 0.0444, + "step": 119 + }, + { + "epoch": 12.0, + "grad_norm": 0.23225636748688117, + "learning_rate": 1.7916662733218848e-07, + "loss": 0.0162, + 
"step": 120 + }, + { + "epoch": 12.1, + "grad_norm": 0.18555282856615898, + "learning_rate": 1.4529091286973994e-07, + "loss": 0.0111, + "step": 121 + }, + { + "epoch": 12.2, + "grad_norm": 0.2272995406967833, + "learning_rate": 1.1491517499091498e-07, + "loss": 0.0211, + "step": 122 + }, + { + "epoch": 12.3, + "grad_norm": 0.3170863977174301, + "learning_rate": 8.80613129216762e-08, + "loss": 0.0489, + "step": 123 + }, + { + "epoch": 12.4, + "grad_norm": 0.2869870238374455, + "learning_rate": 6.474868681043578e-08, + "loss": 0.0338, + "step": 124 + }, + { + "epoch": 12.5, + "grad_norm": 0.19605392462041799, + "learning_rate": 4.499410377045765e-08, + "loss": 0.0136, + "step": 125 + }, + { + "epoch": 12.6, + "grad_norm": 0.24074331934174073, + "learning_rate": 2.8811805762860578e-08, + "loss": 0.0236, + "step": 126 + }, + { + "epoch": 12.7, + "grad_norm": 0.2555618190181957, + "learning_rate": 1.6213459328950355e-08, + "loss": 0.0334, + "step": 127 + }, + { + "epoch": 12.8, + "grad_norm": 0.3138210130616997, + "learning_rate": 7.2081471792911914e-09, + "loss": 0.0412, + "step": 128 + }, + { + "epoch": 12.9, + "grad_norm": 0.26393331869796904, + "learning_rate": 1.8023616455731253e-09, + "loss": 0.0253, + "step": 129 + }, + { + "epoch": 13.0, + "grad_norm": 0.24454986517667737, + "learning_rate": 0.0, + "loss": 0.0263, + "step": 130 + }, + { + "epoch": 13.0, + "step": 130, + "total_flos": 3.2858174247665664e+16, + "train_loss": 0.2939408891905959, + "train_runtime": 1300.2962, + "train_samples_per_second": 3.159, + "train_steps_per_second": 0.1 + } + ], + "logging_steps": 1, + "max_steps": 130, + "num_input_tokens_seen": 0, + "num_train_epochs": 13, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 3.2858174247665664e+16, + "train_batch_size": 1, + 
"trial_name": null, + "trial_params": null +} diff --git a/training_args.bin b/training_args.bin new file mode 100644 index 0000000..dd0405a --- /dev/null +++ b/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e05f95f21c2b30bea8841f84a280b04f3d4c16d74fa1ee4753b3ddf2141cfbcb +size 7352 diff --git a/training_loss.png b/training_loss.png new file mode 100644 index 0000000..073a52a Binary files /dev/null and b/training_loss.png differ diff --git a/vocab.json b/vocab.json new file mode 100644 index 0000000..6c49fc6 --- /dev/null +++ b/vocab.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ca10d7e9fb3ed18575dd1e277a2579c16d108e32f27439684afa0e10b1440910 +size 2776833