commit 031ff14420de3928f09ee5feea5652cc08386dbb Author: ModelHub XC Date: Wed Apr 29 12:28:53 2026 +0800 初始化项目,由ModelHub XC社区提供模型 Model: israel/AfriqueQwen-14B-multiturn Source: Original Platform diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 0000000..eb4c97b --- /dev/null +++ b/.gitattributes @@ -0,0 +1,37 @@ +*.7z filter=lfs diff=lfs merge=lfs -text +*.arrow filter=lfs diff=lfs merge=lfs -text +*.bin filter=lfs diff=lfs merge=lfs -text +*.bz2 filter=lfs diff=lfs merge=lfs -text +*.ckpt filter=lfs diff=lfs merge=lfs -text +*.ftz filter=lfs diff=lfs merge=lfs -text +*.gz filter=lfs diff=lfs merge=lfs -text +*.h5 filter=lfs diff=lfs merge=lfs -text +*.joblib filter=lfs diff=lfs merge=lfs -text +*.lfs.* filter=lfs diff=lfs merge=lfs -text +*.mlmodel filter=lfs diff=lfs merge=lfs -text +*.model filter=lfs diff=lfs merge=lfs -text +*.msgpack filter=lfs diff=lfs merge=lfs -text +*.npy filter=lfs diff=lfs merge=lfs -text +*.npz filter=lfs diff=lfs merge=lfs -text +*.onnx filter=lfs diff=lfs merge=lfs -text +*.ot filter=lfs diff=lfs merge=lfs -text +*.parquet filter=lfs diff=lfs merge=lfs -text +*.pb filter=lfs diff=lfs merge=lfs -text +*.pickle filter=lfs diff=lfs merge=lfs -text +*.pkl filter=lfs diff=lfs merge=lfs -text +*.pt filter=lfs diff=lfs merge=lfs -text +*.pth filter=lfs diff=lfs merge=lfs -text +*.rar filter=lfs diff=lfs merge=lfs -text +*.safetensors filter=lfs diff=lfs merge=lfs -text +saved_model/**/* filter=lfs diff=lfs merge=lfs -text +*.tar.* filter=lfs diff=lfs merge=lfs -text +*.tar filter=lfs diff=lfs merge=lfs -text +*.tflite filter=lfs diff=lfs merge=lfs -text +*.tgz filter=lfs diff=lfs merge=lfs -text +*.wasm filter=lfs diff=lfs merge=lfs -text +*.xz filter=lfs diff=lfs merge=lfs -text +*.zip filter=lfs diff=lfs merge=lfs -text +*.zst filter=lfs diff=lfs merge=lfs -text +*tfevents* filter=lfs diff=lfs merge=lfs -text +tokenizer.json filter=lfs diff=lfs merge=lfs -text +checkpoint-34265/tokenizer.json filter=lfs diff=lfs 
merge=lfs -text diff --git a/README.md b/README.md new file mode 100644 index 0000000..5975d18 --- /dev/null +++ b/README.md @@ -0,0 +1,61 @@ +--- +library_name: transformers +license: other +base_model: McGill-NLP/AfriqueQwen-14B +tags: +- llama-factory +- full +- generated_from_trainer +model-index: +- name: AfriqueQwen-14B-multiturn + results: [] +--- + + + +# AfriqueQwen-14B-multiturn + +This model is a fine-tuned version of [McGill-NLP/AfriqueQwen-14B](https://huggingface.co/McGill-NLP/AfriqueQwen-14B) on the afri_multiturn dataset. + +## Model description + +More information needed + +## Intended uses & limitations + +More information needed + +## Training and evaluation data + +More information needed + +## Training procedure + +### Training hyperparameters + +The following hyperparameters were used during training: +- learning_rate: 1e-05 +- train_batch_size: 1 +- eval_batch_size: 8 +- seed: 42 +- distributed_type: multi-GPU +- num_devices: 4 +- gradient_accumulation_steps: 2 +- total_train_batch_size: 8 +- total_eval_batch_size: 32 +- optimizer: Use OptimizerNames.ADAMW_TORCH_FUSED with betas=(0.9,0.999) and epsilon=1e-08 and optimizer_args=No additional optimizer arguments +- lr_scheduler_type: cosine +- lr_scheduler_warmup_steps: 0.1 +- num_epochs: 5.0 + +### Training results + + + +### Framework versions + +- Transformers 5.2.0 +- Pytorch 2.10.0+cu128 +- Datasets 4.0.0 +- Tokenizers 0.22.2 diff --git a/all_results.json b/all_results.json new file mode 100644 index 0000000..c38e8bb --- /dev/null +++ b/all_results.json @@ -0,0 +1,8 @@ +{ + "epoch": 5.0, + "total_flos": 1091733654863872.0, + "train_loss": 0.42485430567312915, + "train_runtime": 114160.2816, + "train_samples_per_second": 2.401, + "train_steps_per_second": 0.3 +} \ No newline at end of file diff --git a/chat_template.jinja b/chat_template.jinja new file mode 100644 index 0000000..699ff8d --- /dev/null +++ b/chat_template.jinja @@ -0,0 +1,85 @@ +{%- if tools %} + {{- '<|im_start|>system\n' }} 
+ {%- if messages[0].role == 'system' %} + {{- messages[0].content + '\n\n' }} + {%- endif %} + {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }} + {%- for tool in tools %} + {{- "\n" }} + {{- tool | tojson }} + {%- endfor %} + {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }} +{%- else %} + {%- if messages[0].role == 'system' %} + {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }} + {%- endif %} +{%- endif %} +{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %} +{%- for message in messages[::-1] %} + {%- set index = (messages|length - 1) - loop.index0 %} + {%- if ns.multi_step_tool and message.role == "user" and not(message.content.startswith('<tool_response>') and message.content.endswith('</tool_response>')) %} + {%- set ns.multi_step_tool = false %} + {%- set ns.last_query_index = index %} + {%- endif %} +{%- endfor %} +{%- for message in messages %} + {%- if (message.role == "user") or (message.role == "system" and not loop.first) %} + {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }} + {%- elif message.role == "assistant" %} + {%- set content = message.content %} + {%- set reasoning_content = '' %} + {%- if message.reasoning_content is defined and message.reasoning_content is not none %} + {%- set reasoning_content = message.reasoning_content %} + {%- else %} + {%- if '</think>' in message.content %} + {%- set content = message.content.split('</think>')[-1].lstrip('\n') %} + {%- set reasoning_content = message.content.split('</think>')[0].rstrip('\n').split('<think>')[-1].lstrip('\n') %} + {%- endif %} + {%- endif %} + {%- if loop.index0 > ns.last_query_index %} + {%- if loop.last or (not loop.last and reasoning_content) %} + {{- '<|im_start|>' + message.role + '\n<think>\n' + reasoning_content.strip('\n') + '\n</think>\n\n' + content.lstrip('\n')
}} + {%- else %} + {{- '<|im_start|>' + message.role + '\n' + content }} + {%- endif %} + {%- else %} + {{- '<|im_start|>' + message.role + '\n' + content }} + {%- endif %} + {%- if message.tool_calls %} + {%- for tool_call in message.tool_calls %} + {%- if (loop.first and content) or (not loop.first) %} + {{- '\n' }} + {%- endif %} + {%- if tool_call.function %} + {%- set tool_call = tool_call.function %} + {%- endif %} + {{- '<tool_call>\n{"name": "' }} + {{- tool_call.name }} + {{- '", "arguments": ' }} + {%- if tool_call.arguments is string %} + {{- tool_call.arguments }} + {%- else %} + {{- tool_call.arguments | tojson }} + {%- endif %} + {{- '}\n</tool_call>' }} + {%- endfor %} + {%- endif %} + {{- '<|im_end|>\n' }} + {%- elif message.role == "tool" %} + {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %} + {{- '<|im_start|>user' }} + {%- endif %} + {{- '\n<tool_response>\n' }} + {{- message.content }} + {{- '\n</tool_response>' }} + {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %} + {{- '<|im_end|>\n' }} + {%- endif %} + {%- endif %} +{%- endfor %} +{%- if add_generation_prompt %} + {{- '<|im_start|>assistant\n' }} + {%- if enable_thinking is defined and enable_thinking is false %} + {{- '<think>\n\n</think>\n\n' }} + {%- endif %} +{%- endif %} \ No newline at end of file diff --git a/config.json b/config.json new file mode 100644 index 0000000..7026c3c --- /dev/null +++ b/config.json @@ -0,0 +1,75 @@ +{ + "architectures": [ + "Qwen3ForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "bos_token_id": null, + "dtype": "bfloat16", + "eos_token_id": 151645, + "head_dim": 128, + "hidden_act": "silu", + "hidden_size": 5120, + "initializer_range": 0.02, + "intermediate_size": 17408, + "layer_types": [ + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", +
"full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention" + ], + "max_position_embeddings": 32768, + "max_window_layers": 40, + "model_type": "qwen3", + "num_attention_heads": 40, + "num_hidden_layers": 40, + "num_key_value_heads": 8, + "pad_token_id": 151643, + "rms_norm_eps": 1e-06, + "rope_parameters": { + "rope_theta": 1000000, + "rope_type": "default" + }, + "sliding_window": null, + "tie_word_embeddings": false, + "transformers_version": "5.2.0", + "use_cache": false, + "use_sliding_window": false, + "vocab_size": 151936 +} diff --git a/generation_config.json b/generation_config.json new file mode 100644 index 0000000..39331ca --- /dev/null +++ b/generation_config.json @@ -0,0 +1,9 @@ +{ + "_from_model_config": true, + "eos_token_id": [ + 151645, + 151643 + ], + "pad_token_id": 151643, + "transformers_version": "5.2.0" +} diff --git a/model.safetensors b/model.safetensors new file mode 100644 index 0000000..cd61518 --- /dev/null +++ b/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c00c3e05d9310da4dcdc3b1e919ba7c1a445da8eeff56950f0a74ca8b2306a96 +size 29536666272 diff --git a/tokenizer.json b/tokenizer.json new file mode 100644 index 0000000..c7afbed --- /dev/null +++ b/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:be75606093db2094d7cd20f3c2f385c212750648bd6ea4fb2bf507a6a4c55506 +size 11422650 diff --git a/tokenizer_config.json b/tokenizer_config.json new file mode 100644 index 0000000..ea4d101 --- /dev/null +++ 
b/tokenizer_config.json @@ -0,0 +1,15 @@ +{ + "add_prefix_space": false, + "backend": "tokenizers", + "bos_token": null, + "clean_up_tokenization_spaces": false, + "eos_token": "<|im_end|>", + "errors": "replace", + "is_local": false, + "model_max_length": 131072, + "pad_token": "<|endoftext|>", + "padding_side": "right", + "split_special_tokens": false, + "tokenizer_class": "Qwen2Tokenizer", + "unk_token": null +} diff --git a/train_results.json b/train_results.json new file mode 100644 index 0000000..c38e8bb --- /dev/null +++ b/train_results.json @@ -0,0 +1,8 @@ +{ + "epoch": 5.0, + "total_flos": 1091733654863872.0, + "train_loss": 0.42485430567312915, + "train_runtime": 114160.2816, + "train_samples_per_second": 2.401, + "train_steps_per_second": 0.3 +} \ No newline at end of file diff --git a/trainer_log.jsonl b/trainer_log.jsonl new file mode 100644 index 0000000..212f1e2 --- /dev/null +++ b/trainer_log.jsonl @@ -0,0 +1,35 @@ +{"current_steps": 1000, "total_steps": 34265, "loss": 1.091951904296875, "lr": 2.915086081120514e-06, "epoch": 0.1459321415541773, "percentage": 2.92, "elapsed_time": "0:55:22", "remaining_time": "1 day, 6:42:13"} +{"current_steps": 2000, "total_steps": 34265, "loss": 0.9010311279296875, "lr": 5.833090166326233e-06, "epoch": 0.2918642831083546, "percentage": 5.84, "elapsed_time": "1:50:40", "remaining_time": "1 day, 5:45:32"} +{"current_steps": 3000, "total_steps": 34265, "loss": 0.8496372680664063, "lr": 8.751094251531953e-06, "epoch": 0.43779642466253194, "percentage": 8.76, "elapsed_time": "2:46:03", "remaining_time": "1 day, 4:50:33"} +{"current_steps": 4000, "total_steps": 34265, "loss": 0.8293939208984376, "lr": 9.991513345767592e-06, "epoch": 0.5837285662167092, "percentage": 11.67, "elapsed_time": "3:41:25", "remaining_time": "1 day, 3:55:21"} +{"current_steps": 5000, "total_steps": 34265, "loss": 0.8073157348632812, "lr": 9.936020028278053e-06, "epoch": 0.7296607077708865, "percentage": 14.59, "elapsed_time": "4:36:43", 
"remaining_time": "1 day, 2:59:41"} +{"current_steps": 6000, "total_steps": 34265, "loss": 0.7829708251953125, "lr": 9.829343371836088e-06, "epoch": 0.8755928493250639, "percentage": 17.51, "elapsed_time": "5:32:07", "remaining_time": "1 day, 2:04:34"} +{"current_steps": 7000, "total_steps": 34265, "loss": 0.7496401977539062, "lr": 9.672589544454328e-06, "epoch": 1.021452024808464, "percentage": 20.43, "elapsed_time": "6:27:24", "remaining_time": "1 day, 1:08:55"} +{"current_steps": 8000, "total_steps": 34265, "loss": 0.6121725463867187, "lr": 9.46738398205746e-06, "epoch": 1.1673841663626414, "percentage": 23.35, "elapsed_time": "7:22:45", "remaining_time": "1 day, 0:13:38"} +{"current_steps": 9000, "total_steps": 34265, "loss": 0.611597412109375, "lr": 9.215854533761766e-06, "epoch": 1.3133163079168186, "percentage": 26.27, "elapsed_time": "8:18:11", "remaining_time": "23:18:31"} +{"current_steps": 10000, "total_steps": 34265, "loss": 0.6123408203125, "lr": 8.920609397454381e-06, "epoch": 1.459248449470996, "percentage": 29.18, "elapsed_time": "9:13:31", "remaining_time": "22:23:07"} +{"current_steps": 11000, "total_steps": 34265, "loss": 0.609525390625, "lr": 8.584710074466158e-06, "epoch": 1.6051805910251733, "percentage": 32.1, "elapsed_time": "10:08:58", "remaining_time": "21:27:58"} +{"current_steps": 12000, "total_steps": 34265, "loss": 0.6133668823242188, "lr": 8.211639623780629e-06, "epoch": 1.7511127325793505, "percentage": 35.02, "elapsed_time": "11:04:20", "remaining_time": "20:32:37"} +{"current_steps": 13000, "total_steps": 34265, "loss": 0.615207763671875, "lr": 7.805266544962458e-06, "epoch": 1.897044874133528, "percentage": 37.94, "elapsed_time": "11:59:35", "remaining_time": "19:37:05"} +{"current_steps": 14000, "total_steps": 34265, "loss": 0.5413321533203125, "lr": 7.3698046643160645e-06, "epoch": 2.042904049616928, "percentage": 40.86, "elapsed_time": "12:54:57", "remaining_time": "18:41:45"} +{"current_steps": 15000, "total_steps": 34265, 
"loss": 0.38200238037109374, "lr": 6.909769440229038e-06, "epoch": 2.1888361911711054, "percentage": 43.78, "elapsed_time": "13:50:21", "remaining_time": "17:46:27"} +{"current_steps": 16000, "total_steps": 34265, "loss": 0.38497344970703123, "lr": 6.4299311407857035e-06, "epoch": 2.334768332725283, "percentage": 46.69, "elapsed_time": "14:45:41", "remaining_time": "16:51:03"} +{"current_steps": 17000, "total_steps": 34265, "loss": 0.3850032653808594, "lr": 5.935265379168761e-06, "epoch": 2.48070047427946, "percentage": 49.61, "elapsed_time": "15:40:54", "remaining_time": "15:55:34"} +{"current_steps": 18000, "total_steps": 34265, "loss": 0.38166015625, "lr": 5.430901519764892e-06, "epoch": 2.6266326158336373, "percentage": 52.53, "elapsed_time": "16:36:17", "remaining_time": "15:00:15"} +{"current_steps": 19000, "total_steps": 34265, "loss": 0.3862608642578125, "lr": 4.9220694899697185e-06, "epoch": 2.7725647573878147, "percentage": 55.45, "elapsed_time": "17:31:33", "remaining_time": "14:04:50"} +{"current_steps": 20000, "total_steps": 34265, "loss": 0.38137054443359375, "lr": 4.414045549219315e-06, "epoch": 2.918496898941992, "percentage": 58.37, "elapsed_time": "18:26:50", "remaining_time": "13:09:27"} +{"current_steps": 21000, "total_steps": 34265, "loss": 0.2938824462890625, "lr": 3.912097577588397e-06, "epoch": 3.064356074425392, "percentage": 61.29, "elapsed_time": "19:22:11", "remaining_time": "12:14:06"} +{"current_steps": 22000, "total_steps": 34265, "loss": 0.19125808715820314, "lr": 3.4214304512770823e-06, "epoch": 3.2102882159795696, "percentage": 64.21, "elapsed_time": "20:17:28", "remaining_time": "11:18:44"} +{"current_steps": 23000, "total_steps": 34265, "loss": 0.18847378540039061, "lr": 2.9471320714071095e-06, "epoch": 3.356220357533747, "percentage": 67.12, "elapsed_time": "21:12:42", "remaining_time": "10:23:20"} +{"current_steps": 24000, "total_steps": 34265, "loss": 0.19000143432617186, "lr": 2.4941206057740675e-06, "epoch": 
3.502152499087924, "percentage": 70.04, "elapsed_time": "22:08:12", "remaining_time": "9:28:04"} +{"current_steps": 25000, "total_steps": 34265, "loss": 0.18782159423828124, "lr": 2.06709349062457e-06, "epoch": 3.6480846406421015, "percentage": 72.96, "elapsed_time": "23:03:30", "remaining_time": "8:32:43"} +{"current_steps": 26000, "total_steps": 34265, "loss": 0.18233416748046874, "lr": 1.6704787212769829e-06, "epoch": 3.7940167821962785, "percentage": 75.88, "elapsed_time": "23:58:48", "remaining_time": "7:37:22"} +{"current_steps": 27000, "total_steps": 34265, "loss": 0.17911607360839843, "lr": 1.3083889366705216e-06, "epoch": 3.939948923750456, "percentage": 78.8, "elapsed_time": "1 day, 0:54:08", "remaining_time": "6:42:01"} +{"current_steps": 28000, "total_steps": 34265, "loss": 0.12317549133300781, "lr": 9.845787739562829e-07, "epoch": 4.085808099233856, "percentage": 81.72, "elapsed_time": "1 day, 1:49:35", "remaining_time": "5:46:43"} +{"current_steps": 29000, "total_steps": 34265, "loss": 0.08010615539550782, "lr": 7.024059353355333e-07, "epoch": 4.231740240788033, "percentage": 84.63, "elapsed_time": "1 day, 2:45:03", "remaining_time": "4:51:24"} +{"current_steps": 30000, "total_steps": 34265, "loss": 0.08023818969726562, "lr": 4.64796370857008e-07, "epoch": 4.377672382342211, "percentage": 87.55, "elapsed_time": "1 day, 3:40:23", "remaining_time": "3:56:03"} +{"current_steps": 31000, "total_steps": 34265, "loss": 0.07947054290771484, "lr": 2.7421393820510846e-07, "epoch": 4.523604523896388, "percentage": 90.47, "elapsed_time": "1 day, 4:35:53", "remaining_time": "3:00:43"} +{"current_steps": 32000, "total_steps": 34265, "loss": 0.07735189056396484, "lr": 1.326348540874095e-07, "epoch": 4.669536665450566, "percentage": 93.39, "elapsed_time": "1 day, 5:31:22", "remaining_time": "2:05:22"} +{"current_steps": 33000, "total_steps": 34265, "loss": 0.07909833526611328, "lr": 4.152720214406214e-08, "epoch": 4.815468807004743, "percentage": 96.31, 
"elapsed_time": "1 day, 6:26:45", "remaining_time": "1:10:01"} +{"current_steps": 34000, "total_steps": 34265, "loss": 0.07906644439697266, "lr": 1.8357098688476238e-09, "epoch": 4.96140094855892, "percentage": 99.23, "elapsed_time": "1 day, 7:22:10", "remaining_time": "0:14:40"} +{"current_steps": 34265, "total_steps": 34265, "epoch": 5.0, "percentage": 100.0, "elapsed_time": "1 day, 7:42:40", "remaining_time": "0:00:00"} diff --git a/trainer_state.json b/trainer_state.json new file mode 100644 index 0000000..48df15c --- /dev/null +++ b/trainer_state.json @@ -0,0 +1,281 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 5.0, + "eval_steps": 500, + "global_step": 34265, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.1459321415541773, + "grad_norm": 2.2002170436333888, + "learning_rate": 2.915086081120514e-06, + "loss": 1.091951904296875, + "step": 1000 + }, + { + "epoch": 0.2918642831083546, + "grad_norm": 2.3899237660554205, + "learning_rate": 5.833090166326233e-06, + "loss": 0.9010311279296875, + "step": 2000 + }, + { + "epoch": 0.43779642466253194, + "grad_norm": 1.7207455491966865, + "learning_rate": 8.751094251531953e-06, + "loss": 0.8496372680664063, + "step": 3000 + }, + { + "epoch": 0.5837285662167092, + "grad_norm": 1.49997561253202, + "learning_rate": 9.991513345767592e-06, + "loss": 0.8293939208984376, + "step": 4000 + }, + { + "epoch": 0.7296607077708865, + "grad_norm": 1.43919873990257, + "learning_rate": 9.936020028278053e-06, + "loss": 0.8073157348632812, + "step": 5000 + }, + { + "epoch": 0.8755928493250639, + "grad_norm": 1.384519444393501, + "learning_rate": 9.829343371836088e-06, + "loss": 0.7829708251953125, + "step": 6000 + }, + { + "epoch": 1.021452024808464, + "grad_norm": 1.457416645249146, + "learning_rate": 9.672589544454328e-06, + "loss": 0.7496401977539062, + "step": 7000 + }, + { + "epoch": 
1.1673841663626414, + "grad_norm": 1.5133027310610572, + "learning_rate": 9.46738398205746e-06, + "loss": 0.6121725463867187, + "step": 8000 + }, + { + "epoch": 1.3133163079168186, + "grad_norm": 1.3855612557611925, + "learning_rate": 9.215854533761766e-06, + "loss": 0.611597412109375, + "step": 9000 + }, + { + "epoch": 1.459248449470996, + "grad_norm": 1.5802706606106216, + "learning_rate": 8.920609397454381e-06, + "loss": 0.6123408203125, + "step": 10000 + }, + { + "epoch": 1.6051805910251733, + "grad_norm": 1.4685873474216664, + "learning_rate": 8.584710074466158e-06, + "loss": 0.609525390625, + "step": 11000 + }, + { + "epoch": 1.7511127325793505, + "grad_norm": 1.219859265372452, + "learning_rate": 8.211639623780629e-06, + "loss": 0.6133668823242188, + "step": 12000 + }, + { + "epoch": 1.897044874133528, + "grad_norm": 1.2535196330309815, + "learning_rate": 7.805266544962458e-06, + "loss": 0.615207763671875, + "step": 13000 + }, + { + "epoch": 2.042904049616928, + "grad_norm": 1.5681754211680612, + "learning_rate": 7.3698046643160645e-06, + "loss": 0.5413321533203125, + "step": 14000 + }, + { + "epoch": 2.1888361911711054, + "grad_norm": 1.2447962924993787, + "learning_rate": 6.909769440229038e-06, + "loss": 0.38200238037109374, + "step": 15000 + }, + { + "epoch": 2.334768332725283, + "grad_norm": 1.4067667074111327, + "learning_rate": 6.4299311407857035e-06, + "loss": 0.38497344970703123, + "step": 16000 + }, + { + "epoch": 2.48070047427946, + "grad_norm": 1.326059120500027, + "learning_rate": 5.935265379168761e-06, + "loss": 0.3850032653808594, + "step": 17000 + }, + { + "epoch": 2.6266326158336373, + "grad_norm": 1.6923974019115828, + "learning_rate": 5.430901519764892e-06, + "loss": 0.38166015625, + "step": 18000 + }, + { + "epoch": 2.7725647573878147, + "grad_norm": 1.5367782861850559, + "learning_rate": 4.9220694899697185e-06, + "loss": 0.3862608642578125, + "step": 19000 + }, + { + "epoch": 2.918496898941992, + "grad_norm": 1.3054321737811847, + 
"learning_rate": 4.414045549219315e-06, + "loss": 0.38137054443359375, + "step": 20000 + }, + { + "epoch": 3.064356074425392, + "grad_norm": 1.69275953279231, + "learning_rate": 3.912097577588397e-06, + "loss": 0.2938824462890625, + "step": 21000 + }, + { + "epoch": 3.2102882159795696, + "grad_norm": 1.5240170572509457, + "learning_rate": 3.4214304512770823e-06, + "loss": 0.19125808715820314, + "step": 22000 + }, + { + "epoch": 3.356220357533747, + "grad_norm": 1.5774125509908914, + "learning_rate": 2.9471320714071095e-06, + "loss": 0.18847378540039061, + "step": 23000 + }, + { + "epoch": 3.502152499087924, + "grad_norm": 1.8421166872943977, + "learning_rate": 2.4941206057740675e-06, + "loss": 0.19000143432617186, + "step": 24000 + }, + { + "epoch": 3.6480846406421015, + "grad_norm": 1.5694212047061644, + "learning_rate": 2.06709349062457e-06, + "loss": 0.18782159423828124, + "step": 25000 + }, + { + "epoch": 3.7940167821962785, + "grad_norm": 1.496800606594422, + "learning_rate": 1.6704787212769829e-06, + "loss": 0.18233416748046874, + "step": 26000 + }, + { + "epoch": 3.939948923750456, + "grad_norm": 1.984744582152809, + "learning_rate": 1.3083889366705216e-06, + "loss": 0.17911607360839843, + "step": 27000 + }, + { + "epoch": 4.085808099233856, + "grad_norm": 1.2567953809414274, + "learning_rate": 9.845787739562829e-07, + "loss": 0.12317549133300781, + "step": 28000 + }, + { + "epoch": 4.231740240788033, + "grad_norm": 1.091077915720754, + "learning_rate": 7.024059353355333e-07, + "loss": 0.08010615539550782, + "step": 29000 + }, + { + "epoch": 4.377672382342211, + "grad_norm": 1.2460581955478622, + "learning_rate": 4.64796370857008e-07, + "loss": 0.08023818969726562, + "step": 30000 + }, + { + "epoch": 4.523604523896388, + "grad_norm": 1.7839106451867877, + "learning_rate": 2.7421393820510846e-07, + "loss": 0.07947054290771484, + "step": 31000 + }, + { + "epoch": 4.669536665450566, + "grad_norm": 1.1265268481087043, + "learning_rate": 1.326348540874095e-07, + 
"loss": 0.07735189056396484, + "step": 32000 + }, + { + "epoch": 4.815468807004743, + "grad_norm": 1.0896372619502366, + "learning_rate": 4.152720214406214e-08, + "loss": 0.07909833526611328, + "step": 33000 + }, + { + "epoch": 4.96140094855892, + "grad_norm": 1.2750658260191035, + "learning_rate": 1.8357098688476238e-09, + "loss": 0.07906644439697266, + "step": 34000 + }, + { + "epoch": 5.0, + "step": 34265, + "total_flos": 1091733654863872.0, + "train_loss": 0.42485430567312915, + "train_runtime": 114160.2816, + "train_samples_per_second": 2.401, + "train_steps_per_second": 0.3 + } + ], + "logging_steps": 1000, + "max_steps": 34265, + "num_input_tokens_seen": 0, + "num_train_epochs": 5, + "save_steps": 50000, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 1091733654863872.0, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/training_args.bin b/training_args.bin new file mode 100644 index 0000000..e3f36cf --- /dev/null +++ b/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7a4cc99de1a10fd6c19d45d4b68f0e6d1e0fb1309394ed1c123226c8b71056c8 +size 7377 diff --git a/training_loss.png b/training_loss.png new file mode 100644 index 0000000..360f84a Binary files /dev/null and b/training_loss.png differ