commit be2d6658b9b175b875d5a578d55a0f1d06435263 Author: ModelHub XC Date: Tue Jun 2 18:22:12 2026 +0800 初始化项目,由ModelHub XC社区提供模型 Model: burtenshaw/Qwen3-4B-SFT-Codeforces Source: Original Platform diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 0000000..325f86a --- /dev/null +++ b/.gitattributes @@ -0,0 +1,73 @@ +*.7z filter=lfs diff=lfs merge=lfs -text +*.arrow filter=lfs diff=lfs merge=lfs -text + + +*.bz2 filter=lfs diff=lfs merge=lfs -text +*.ftz filter=lfs diff=lfs merge=lfs -text +*.gz filter=lfs diff=lfs merge=lfs -text +*.h5 filter=lfs diff=lfs merge=lfs -text +*.joblib filter=lfs diff=lfs merge=lfs -text +*.lfs.* filter=lfs diff=lfs merge=lfs -text +*.model filter=lfs diff=lfs merge=lfs -text +*.msgpack filter=lfs diff=lfs merge=lfs -text +*.onnx filter=lfs diff=lfs merge=lfs -text +*.ot filter=lfs diff=lfs merge=lfs -text + +*.pb filter=lfs diff=lfs merge=lfs -text + + +*.rar filter=lfs diff=lfs merge=lfs -text +saved_model/**/* filter=lfs diff=lfs merge=lfs -text +*.tar.* filter=lfs diff=lfs merge=lfs -text +*.tflite filter=lfs diff=lfs merge=lfs -text +*.tgz filter=lfs diff=lfs merge=lfs -text +*.xz filter=lfs diff=lfs merge=lfs -text +*.zip filter=lfs diff=lfs merge=lfs -text +*.zstandard filter=lfs diff=lfs merge=lfs -text +*.tfevents* filter=lfs diff=lfs merge=lfs -text +*.db* filter=lfs diff=lfs merge=lfs -text +*.ark* filter=lfs diff=lfs merge=lfs -text +**/*ckpt*data* filter=lfs diff=lfs merge=lfs -text +**/*ckpt*.meta filter=lfs diff=lfs merge=lfs -text +**/*ckpt*.index filter=lfs diff=lfs merge=lfs -text + +*.ckpt filter=lfs diff=lfs merge=lfs -text +*.gguf* filter=lfs diff=lfs merge=lfs -text +*.ggml filter=lfs diff=lfs merge=lfs -text +*.llamafile* filter=lfs diff=lfs merge=lfs -text +*.pt2 filter=lfs diff=lfs merge=lfs -text +*.mlmodel filter=lfs diff=lfs merge=lfs -text +*.npy filter=lfs diff=lfs merge=lfs -text +*.npz filter=lfs diff=lfs merge=lfs -text +*.pickle filter=lfs diff=lfs merge=lfs -text +*.pkl filter=lfs 
diff=lfs merge=lfs -text +*.tar filter=lfs diff=lfs merge=lfs -text +*.wasm filter=lfs diff=lfs merge=lfs -text +*.zst filter=lfs diff=lfs merge=lfs -text +*tfevents* filter=lfs diff=lfs merge=lfs -text + +training_args.bin filter=lfs diff=lfs merge=lfs -text +vocab.json filter=lfs diff=lfs merge=lfs -text +model-00002-of-00002.safetensors filter=lfs diff=lfs merge=lfs -text +merges.txt filter=lfs diff=lfs merge=lfs -text +model-00001-of-00002.safetensors filter=lfs diff=lfs merge=lfs -text +tokenizer.json filter=lfs diff=lfs merge=lfs -text + +checkpoint-300/rng_state_6.pth filter=lfs diff=lfs merge=lfs -text +checkpoint-300/rng_state_3.pth filter=lfs diff=lfs merge=lfs -text +checkpoint-300/merges.txt filter=lfs diff=lfs merge=lfs -text +checkpoint-300/rng_state_0.pth filter=lfs diff=lfs merge=lfs -text +a3af16249f2531fd/extended|lcb:codegeneration|0/52eb0b8d3282c52f/GENERATIVE.parquet filter=lfs diff=lfs merge=lfs -text +a3af16249f2531fd/extended|ifeval|0/79263c3302c83f11/GENERATIVE.parquet filter=lfs diff=lfs merge=lfs -text +checkpoint-300/optimizer.pt filter=lfs diff=lfs merge=lfs -text +checkpoint-300/rng_state_4.pth filter=lfs diff=lfs merge=lfs -text +checkpoint-300/rng_state_1.pth filter=lfs diff=lfs merge=lfs -text +checkpoint-300/rng_state_7.pth filter=lfs diff=lfs merge=lfs -text +checkpoint-300/scheduler.pt filter=lfs diff=lfs merge=lfs -text +checkpoint-300/vocab.json filter=lfs diff=lfs merge=lfs -text +checkpoint-300/training_args.bin filter=lfs diff=lfs merge=lfs -text +checkpoint-300/model-00001-of-00002.safetensors filter=lfs diff=lfs merge=lfs -text +checkpoint-300/model-00002-of-00002.safetensors filter=lfs diff=lfs merge=lfs -text +checkpoint-300/rng_state_2.pth filter=lfs diff=lfs merge=lfs -text +checkpoint-300/rng_state_5.pth filter=lfs diff=lfs merge=lfs -text +checkpoint-300/tokenizer.json filter=lfs diff=lfs merge=lfs -text \ No newline at end of file diff --git a/README.md b/README.md new file mode 100644 index 0000000..bdc6232 --- 
/dev/null +++ b/README.md @@ -0,0 +1,58 @@ +--- +base_model: Qwen/Qwen3-4B +library_name: transformers +model_name: Qwen3-4B-SFT-Codeforces +tags: +- generated_from_trainer +- trl +- sft +licence: license +--- + +# Model Card for Qwen3-4B-SFT-Codeforces + +This model is a fine-tuned version of [Qwen/Qwen3-4B](https://huggingface.co/Qwen/Qwen3-4B). +It has been trained using [TRL](https://github.com/huggingface/trl). + +## Quick start + +```python +from transformers import pipeline + +question = "If you had a time machine, but could only go to the past or the future once and never return, which would you choose and why?" +generator = pipeline("text-generation", model="burtenshaw/Qwen3-4B-SFT-Codeforces", device="cuda") +output = generator([{"role": "user", "content": question}], max_new_tokens=128, return_full_text=False)[0] +print(output["generated_text"]) +``` + +## Training procedure + + + + +This model was trained with SFT. + +### Framework versions + +- TRL: 0.24.0.dev0 +- Transformers: 4.57.1 +- Pytorch: 2.9.0 +- Datasets: 4.3.0 +- Tokenizers: 0.22.1 + +## Citations + + + +Cite TRL as: + +```bibtex +@misc{vonwerra2022trl, + title = {{TRL: Transformer Reinforcement Learning}}, + author = {Leandro von Werra and Younes Belkada and Lewis Tunstall and Edward Beeching and Tristan Thrush and Nathan Lambert and Shengyi Huang and Kashif Rasul and Quentin Gallou{\'e}dec}, + year = 2020, + journal = {GitHub repository}, + publisher = {GitHub}, + howpublished = {\url{https://github.com/huggingface/trl}} +} +``` \ No newline at end of file diff --git a/a3af16249f2531fd/extended|ifeval|0/79263c3302c83f11/GENERATIVE.parquet b/a3af16249f2531fd/extended|ifeval|0/79263c3302c83f11/GENERATIVE.parquet new file mode 100644 index 0000000..b297ce1 --- /dev/null +++ b/a3af16249f2531fd/extended|ifeval|0/79263c3302c83f11/GENERATIVE.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c11b708b580a3a9a2a56557b364ad178b9a75c3295523a411d785c4bd987ce56 +size 
2606747 diff --git a/a3af16249f2531fd/extended|lcb:codegeneration|0/52eb0b8d3282c52f/GENERATIVE.parquet b/a3af16249f2531fd/extended|lcb:codegeneration|0/52eb0b8d3282c52f/GENERATIVE.parquet new file mode 100644 index 0000000..3452e5c --- /dev/null +++ b/a3af16249f2531fd/extended|lcb:codegeneration|0/52eb0b8d3282c52f/GENERATIVE.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:292fef4a64900ac0faa91a1b58c9420f84ebddd2b1b823324a15415090b10bb5 +size 9422307 diff --git a/added_tokens.json b/added_tokens.json new file mode 100644 index 0000000..b54f913 --- /dev/null +++ b/added_tokens.json @@ -0,0 +1,28 @@ +{ + "": 151668, + "": 151658, + "": 151666, + "": 151667, + "": 151657, + "": 151665, + "<|box_end|>": 151649, + "<|box_start|>": 151648, + "<|endoftext|>": 151643, + "<|file_sep|>": 151664, + "<|fim_middle|>": 151660, + "<|fim_pad|>": 151662, + "<|fim_prefix|>": 151659, + "<|fim_suffix|>": 151661, + "<|im_end|>": 151645, + "<|im_start|>": 151644, + "<|image_pad|>": 151655, + "<|object_ref_end|>": 151647, + "<|object_ref_start|>": 151646, + "<|quad_end|>": 151651, + "<|quad_start|>": 151650, + "<|repo_name|>": 151663, + "<|video_pad|>": 151656, + "<|vision_end|>": 151653, + "<|vision_pad|>": 151654, + "<|vision_start|>": 151652 +} diff --git a/chat_template.jinja b/chat_template.jinja new file mode 100644 index 0000000..01be9b3 --- /dev/null +++ b/chat_template.jinja @@ -0,0 +1,89 @@ +{%- if tools %} + {{- '<|im_start|>system\n' }} + {%- if messages[0].role == 'system' %} + {{- messages[0].content + '\n\n' }} + {%- endif %} + {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within XML tags:\n" }} + {%- for tool in tools %} + {{- "\n" }} + {{- tool | tojson }} + {%- endfor %} + {{- "\n\n\nFor each function call, return a json object with function name and arguments within XML tags:\n\n{\"name\": , \"arguments\": }\n<|im_end|>\n" }} +{%- else %} + {%- if 
messages[0].role == 'system' %} + {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }} + {%- endif %} +{%- endif %} +{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %} +{%- for message in messages[::-1] %} + {%- set index = (messages|length - 1) - loop.index0 %} + {%- if ns.multi_step_tool and message.role == "user" and message.content is string and not(message.content.startswith('') and message.content.endswith('')) %} + {%- set ns.multi_step_tool = false %} + {%- set ns.last_query_index = index %} + {%- endif %} +{%- endfor %} +{%- for message in messages %} + {%- if message.content is string %} + {%- set content = message.content %} + {%- else %} + {%- set content = '' %} + {%- endif %} + {%- if (message.role == "user") or (message.role == "system" and not loop.first) %} + {{- '<|im_start|>' + message.role + '\n' + content + '<|im_end|>' + '\n' }} + {%- elif message.role == "assistant" %} + {%- set reasoning_content = '' %} + {%- if message.reasoning_content is string %} + {%- set reasoning_content = message.reasoning_content %} + {%- else %} + {%- if '' in content %} + {%- set reasoning_content = content.split('')[0].rstrip('\n').split('')[-1].lstrip('\n') %} + {%- set content = content.split('')[-1].lstrip('\n') %} + {%- endif %} + {%- endif %} + {%- if loop.index0 > ns.last_query_index %} + {%- if loop.last or (not loop.last and reasoning_content) %} + {{- '<|im_start|>' + message.role + '\n\n' + reasoning_content.strip('\n') + '\n\n\n' + content.lstrip('\n') }} + {%- else %} + {{- '<|im_start|>' + message.role + '\n' + content }} + {%- endif %} + {%- else %} + {{- '<|im_start|>' + message.role + '\n' + content }} + {%- endif %} + {%- if message.tool_calls %} + {%- for tool_call in message.tool_calls %} + {%- if (loop.first and content) or (not loop.first) %} + {{- '\n' }} + {%- endif %} + {%- if tool_call.function %} + {%- set tool_call = tool_call.function %} + {%- endif %} + {{- '\n{"name": "' }} + {{- 
tool_call.name }} + {{- '", "arguments": ' }} + {%- if tool_call.arguments is string %} + {{- tool_call.arguments }} + {%- else %} + {{- tool_call.arguments | tojson }} + {%- endif %} + {{- '}\n' }} + {%- endfor %} + {%- endif %} + {{- '<|im_end|>\n' }} + {%- elif message.role == "tool" %} + {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %} + {{- '<|im_start|>user' }} + {%- endif %} + {{- '\n\n' }} + {{- content }} + {{- '\n' }} + {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %} + {{- '<|im_end|>\n' }} + {%- endif %} + {%- endif %} +{%- endfor %} +{%- if add_generation_prompt %} + {{- '<|im_start|>assistant\n' }} + {%- if enable_thinking is defined and enable_thinking is false %} + {{- '\n\n\n\n' }} + {%- endif %} +{%- endif %} \ No newline at end of file diff --git a/checkpoint-300/added_tokens.json b/checkpoint-300/added_tokens.json new file mode 100644 index 0000000..b54f913 --- /dev/null +++ b/checkpoint-300/added_tokens.json @@ -0,0 +1,28 @@ +{ + "": 151668, + "": 151658, + "": 151666, + "": 151667, + "": 151657, + "": 151665, + "<|box_end|>": 151649, + "<|box_start|>": 151648, + "<|endoftext|>": 151643, + "<|file_sep|>": 151664, + "<|fim_middle|>": 151660, + "<|fim_pad|>": 151662, + "<|fim_prefix|>": 151659, + "<|fim_suffix|>": 151661, + "<|im_end|>": 151645, + "<|im_start|>": 151644, + "<|image_pad|>": 151655, + "<|object_ref_end|>": 151647, + "<|object_ref_start|>": 151646, + "<|quad_end|>": 151651, + "<|quad_start|>": 151650, + "<|repo_name|>": 151663, + "<|video_pad|>": 151656, + "<|vision_end|>": 151653, + "<|vision_pad|>": 151654, + "<|vision_start|>": 151652 +} diff --git a/checkpoint-300/chat_template.jinja b/checkpoint-300/chat_template.jinja new file mode 100644 index 0000000..01be9b3 --- /dev/null +++ b/checkpoint-300/chat_template.jinja @@ -0,0 +1,89 @@ +{%- if tools %} + {{- '<|im_start|>system\n' }} + {%- if messages[0].role == 'system' %} + {{- messages[0].content + '\n\n' }} + {%- endif %} + {{- "# 
Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within XML tags:\n" }} + {%- for tool in tools %} + {{- "\n" }} + {{- tool | tojson }} + {%- endfor %} + {{- "\n\n\nFor each function call, return a json object with function name and arguments within XML tags:\n\n{\"name\": , \"arguments\": }\n<|im_end|>\n" }} +{%- else %} + {%- if messages[0].role == 'system' %} + {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }} + {%- endif %} +{%- endif %} +{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %} +{%- for message in messages[::-1] %} + {%- set index = (messages|length - 1) - loop.index0 %} + {%- if ns.multi_step_tool and message.role == "user" and message.content is string and not(message.content.startswith('') and message.content.endswith('')) %} + {%- set ns.multi_step_tool = false %} + {%- set ns.last_query_index = index %} + {%- endif %} +{%- endfor %} +{%- for message in messages %} + {%- if message.content is string %} + {%- set content = message.content %} + {%- else %} + {%- set content = '' %} + {%- endif %} + {%- if (message.role == "user") or (message.role == "system" and not loop.first) %} + {{- '<|im_start|>' + message.role + '\n' + content + '<|im_end|>' + '\n' }} + {%- elif message.role == "assistant" %} + {%- set reasoning_content = '' %} + {%- if message.reasoning_content is string %} + {%- set reasoning_content = message.reasoning_content %} + {%- else %} + {%- if '' in content %} + {%- set reasoning_content = content.split('')[0].rstrip('\n').split('')[-1].lstrip('\n') %} + {%- set content = content.split('')[-1].lstrip('\n') %} + {%- endif %} + {%- endif %} + {%- if loop.index0 > ns.last_query_index %} + {%- if loop.last or (not loop.last and reasoning_content) %} + {{- '<|im_start|>' + message.role + '\n\n' + reasoning_content.strip('\n') + '\n\n\n' + content.lstrip('\n') }} + {%- else %} + {{- '<|im_start|>' + message.role + 
'\n' + content }} + {%- endif %} + {%- else %} + {{- '<|im_start|>' + message.role + '\n' + content }} + {%- endif %} + {%- if message.tool_calls %} + {%- for tool_call in message.tool_calls %} + {%- if (loop.first and content) or (not loop.first) %} + {{- '\n' }} + {%- endif %} + {%- if tool_call.function %} + {%- set tool_call = tool_call.function %} + {%- endif %} + {{- '\n{"name": "' }} + {{- tool_call.name }} + {{- '", "arguments": ' }} + {%- if tool_call.arguments is string %} + {{- tool_call.arguments }} + {%- else %} + {{- tool_call.arguments | tojson }} + {%- endif %} + {{- '}\n' }} + {%- endfor %} + {%- endif %} + {{- '<|im_end|>\n' }} + {%- elif message.role == "tool" %} + {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %} + {{- '<|im_start|>user' }} + {%- endif %} + {{- '\n\n' }} + {{- content }} + {{- '\n' }} + {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %} + {{- '<|im_end|>\n' }} + {%- endif %} + {%- endif %} +{%- endfor %} +{%- if add_generation_prompt %} + {{- '<|im_start|>assistant\n' }} + {%- if enable_thinking is defined and enable_thinking is false %} + {{- '\n\n\n\n' }} + {%- endif %} +{%- endif %} \ No newline at end of file diff --git a/checkpoint-300/config.json b/checkpoint-300/config.json new file mode 100644 index 0000000..d43df39 --- /dev/null +++ b/checkpoint-300/config.json @@ -0,0 +1,68 @@ +{ + "architectures": [ + "Qwen3ForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "dtype": "bfloat16", + "eos_token_id": 151645, + "head_dim": 128, + "hidden_act": "silu", + "hidden_size": 2560, + "initializer_range": 0.02, + "intermediate_size": 9728, + "layer_types": [ + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + 
"full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention" + ], + "max_position_embeddings": 40960, + "max_window_layers": 36, + "model_type": "qwen3", + "num_attention_heads": 32, + "num_hidden_layers": 36, + "num_key_value_heads": 8, + "pad_token_id": 151643, + "rms_norm_eps": 1e-06, + "rope_scaling": null, + "rope_theta": 1000000, + "sliding_window": null, + "tie_word_embeddings": true, + "transformers_version": "4.57.1", + "use_cache": false, + "use_sliding_window": false, + "vocab_size": 151936 +} diff --git a/checkpoint-300/generation_config.json b/checkpoint-300/generation_config.json new file mode 100644 index 0000000..d121c2f --- /dev/null +++ b/checkpoint-300/generation_config.json @@ -0,0 +1,12 @@ +{ + "do_sample": true, + "eos_token_id": [ + 151645, + 151643 + ], + "pad_token_id": 151643, + "temperature": 0.6, + "top_k": 20, + "top_p": 0.95, + "transformers_version": "4.57.1" +} diff --git a/checkpoint-300/merges.txt b/checkpoint-300/merges.txt new file mode 100644 index 0000000..80c1a19 --- /dev/null +++ b/checkpoint-300/merges.txt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8831e4f1a044471340f7c0a83d7bd71306a5b867e95fd870f74d0c5308a904d5 +size 1671853 diff --git a/checkpoint-300/model-00001-of-00002.safetensors b/checkpoint-300/model-00001-of-00002.safetensors new file mode 100644 index 0000000..efd88ed --- /dev/null +++ b/checkpoint-300/model-00001-of-00002.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:624c1f21c9c694664d860f87353399ebf0e8e424a43cf8afcb1fd3c29a85c322 +size 4967215360 diff --git a/checkpoint-300/model-00002-of-00002.safetensors 
b/checkpoint-300/model-00002-of-00002.safetensors new file mode 100644 index 0000000..f7c7212 --- /dev/null +++ b/checkpoint-300/model-00002-of-00002.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:36b41f5e5976d7f93a4a00954fdb2a1622dd24d69582fe4cd7a7b31d8368da38 +size 3077766632 diff --git a/checkpoint-300/model.safetensors.index.json b/checkpoint-300/model.safetensors.index.json new file mode 100644 index 0000000..b65d806 --- /dev/null +++ b/checkpoint-300/model.safetensors.index.json @@ -0,0 +1,406 @@ +{ + "metadata": { + "total_parameters": 4022468096, + "total_size": 8044936192 + }, + "weight_map": { + "model.embed_tokens.weight": "model-00001-of-00002.safetensors", + "model.layers.0.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.0.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.0.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.0.self_attn.k_norm.weight": "model-00001-of-00002.safetensors", + "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.0.self_attn.q_norm.weight": "model-00001-of-00002.safetensors", + "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.1.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + 
"model.layers.1.self_attn.k_norm.weight": "model-00001-of-00002.safetensors", + "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.self_attn.q_norm.weight": "model-00001-of-00002.safetensors", + "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.10.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.10.self_attn.k_norm.weight": "model-00001-of-00002.safetensors", + "model.layers.10.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.self_attn.q_norm.weight": "model-00001-of-00002.safetensors", + "model.layers.10.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.11.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.11.self_attn.k_norm.weight": "model-00001-of-00002.safetensors", + "model.layers.11.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.self_attn.o_proj.weight": 
"model-00001-of-00002.safetensors", + "model.layers.11.self_attn.q_norm.weight": "model-00001-of-00002.safetensors", + "model.layers.11.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.12.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.12.self_attn.k_norm.weight": "model-00001-of-00002.safetensors", + "model.layers.12.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.self_attn.q_norm.weight": "model-00001-of-00002.safetensors", + "model.layers.12.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.13.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.13.self_attn.k_norm.weight": "model-00001-of-00002.safetensors", + "model.layers.13.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.self_attn.q_norm.weight": "model-00001-of-00002.safetensors", + "model.layers.13.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + 
"model.layers.13.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.14.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.14.self_attn.k_norm.weight": "model-00001-of-00002.safetensors", + "model.layers.14.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.self_attn.q_norm.weight": "model-00001-of-00002.safetensors", + "model.layers.14.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.15.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.15.self_attn.k_norm.weight": "model-00001-of-00002.safetensors", + "model.layers.15.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.self_attn.q_norm.weight": "model-00001-of-00002.safetensors", + "model.layers.15.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.16.mlp.down_proj.weight": 
"model-00001-of-00002.safetensors", + "model.layers.16.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.16.self_attn.k_norm.weight": "model-00001-of-00002.safetensors", + "model.layers.16.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.self_attn.q_norm.weight": "model-00001-of-00002.safetensors", + "model.layers.16.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.17.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.17.self_attn.k_norm.weight": "model-00001-of-00002.safetensors", + "model.layers.17.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.self_attn.q_norm.weight": "model-00001-of-00002.safetensors", + "model.layers.17.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.18.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + 
"model.layers.18.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.18.self_attn.k_norm.weight": "model-00001-of-00002.safetensors", + "model.layers.18.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.self_attn.q_norm.weight": "model-00001-of-00002.safetensors", + "model.layers.18.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.19.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.19.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.19.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.19.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.19.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.19.self_attn.k_norm.weight": "model-00001-of-00002.safetensors", + "model.layers.19.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.19.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.19.self_attn.q_norm.weight": "model-00001-of-00002.safetensors", + "model.layers.19.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.19.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.2.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.2.self_attn.k_norm.weight": "model-00001-of-00002.safetensors", + "model.layers.2.self_attn.k_proj.weight": 
"model-00001-of-00002.safetensors", + "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.self_attn.q_norm.weight": "model-00001-of-00002.safetensors", + "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.20.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.20.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.20.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.20.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.20.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.20.self_attn.k_norm.weight": "model-00001-of-00002.safetensors", + "model.layers.20.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.20.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.20.self_attn.q_norm.weight": "model-00001-of-00002.safetensors", + "model.layers.20.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.20.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.21.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.21.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.21.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.21.mlp.up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.21.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.21.self_attn.k_norm.weight": "model-00002-of-00002.safetensors", + "model.layers.21.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.21.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.21.self_attn.q_norm.weight": "model-00002-of-00002.safetensors", + 
"model.layers.21.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.21.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.22.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.22.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.22.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.22.mlp.up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.22.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.22.self_attn.k_norm.weight": "model-00002-of-00002.safetensors", + "model.layers.22.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.22.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.22.self_attn.q_norm.weight": "model-00002-of-00002.safetensors", + "model.layers.22.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.22.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.23.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.23.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.23.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.23.mlp.up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.23.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.23.self_attn.k_norm.weight": "model-00002-of-00002.safetensors", + "model.layers.23.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.23.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.23.self_attn.q_norm.weight": "model-00002-of-00002.safetensors", + "model.layers.23.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.23.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.24.input_layernorm.weight": 
"model-00002-of-00002.safetensors", + "model.layers.24.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.24.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.24.mlp.up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.24.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.24.self_attn.k_norm.weight": "model-00002-of-00002.safetensors", + "model.layers.24.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.24.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.24.self_attn.q_norm.weight": "model-00002-of-00002.safetensors", + "model.layers.24.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.24.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.25.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.25.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.25.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.25.mlp.up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.25.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.25.self_attn.k_norm.weight": "model-00002-of-00002.safetensors", + "model.layers.25.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.25.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.25.self_attn.q_norm.weight": "model-00002-of-00002.safetensors", + "model.layers.25.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.25.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.26.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.26.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.26.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", + 
"model.layers.26.mlp.up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.26.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.26.self_attn.k_norm.weight": "model-00002-of-00002.safetensors", + "model.layers.26.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.26.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.26.self_attn.q_norm.weight": "model-00002-of-00002.safetensors", + "model.layers.26.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.26.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.27.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.27.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.27.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.27.mlp.up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.27.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.27.self_attn.k_norm.weight": "model-00002-of-00002.safetensors", + "model.layers.27.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.27.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.27.self_attn.q_norm.weight": "model-00002-of-00002.safetensors", + "model.layers.27.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.27.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.28.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.28.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.28.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.28.mlp.up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.28.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.28.self_attn.k_norm.weight": 
"model-00002-of-00002.safetensors", + "model.layers.28.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.28.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.28.self_attn.q_norm.weight": "model-00002-of-00002.safetensors", + "model.layers.28.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.28.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.29.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.29.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.29.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.29.mlp.up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.29.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.29.self_attn.k_norm.weight": "model-00002-of-00002.safetensors", + "model.layers.29.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.29.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.29.self_attn.q_norm.weight": "model-00002-of-00002.safetensors", + "model.layers.29.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.29.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.3.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.3.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.3.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.3.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.3.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.3.self_attn.k_norm.weight": "model-00001-of-00002.safetensors", + "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + 
"model.layers.3.self_attn.q_norm.weight": "model-00001-of-00002.safetensors", + "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.30.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.30.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.30.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.30.mlp.up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.30.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.30.self_attn.k_norm.weight": "model-00002-of-00002.safetensors", + "model.layers.30.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.30.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.30.self_attn.q_norm.weight": "model-00002-of-00002.safetensors", + "model.layers.30.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.30.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.31.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.31.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.31.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.31.mlp.up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.31.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.31.self_attn.k_norm.weight": "model-00002-of-00002.safetensors", + "model.layers.31.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.31.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.31.self_attn.q_norm.weight": "model-00002-of-00002.safetensors", + "model.layers.31.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.31.self_attn.v_proj.weight": 
"model-00002-of-00002.safetensors", + "model.layers.32.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.32.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.32.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.32.mlp.up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.32.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.32.self_attn.k_norm.weight": "model-00002-of-00002.safetensors", + "model.layers.32.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.32.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.32.self_attn.q_norm.weight": "model-00002-of-00002.safetensors", + "model.layers.32.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.32.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.33.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.33.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.33.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.33.mlp.up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.33.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.33.self_attn.k_norm.weight": "model-00002-of-00002.safetensors", + "model.layers.33.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.33.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.33.self_attn.q_norm.weight": "model-00002-of-00002.safetensors", + "model.layers.33.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.33.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.34.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.34.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + 
"model.layers.34.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.34.mlp.up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.34.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.34.self_attn.k_norm.weight": "model-00002-of-00002.safetensors", + "model.layers.34.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.34.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.34.self_attn.q_norm.weight": "model-00002-of-00002.safetensors", + "model.layers.34.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.34.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.35.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.35.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.35.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.35.mlp.up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.35.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.35.self_attn.k_norm.weight": "model-00002-of-00002.safetensors", + "model.layers.35.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.35.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.35.self_attn.q_norm.weight": "model-00002-of-00002.safetensors", + "model.layers.35.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.35.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.4.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.4.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.post_attention_layernorm.weight": 
"model-00001-of-00002.safetensors", + "model.layers.4.self_attn.k_norm.weight": "model-00001-of-00002.safetensors", + "model.layers.4.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.self_attn.q_norm.weight": "model-00001-of-00002.safetensors", + "model.layers.4.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.5.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.5.self_attn.k_norm.weight": "model-00001-of-00002.safetensors", + "model.layers.5.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.self_attn.q_norm.weight": "model-00001-of-00002.safetensors", + "model.layers.5.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.6.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.6.self_attn.k_norm.weight": "model-00001-of-00002.safetensors", + "model.layers.6.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.self_attn.o_proj.weight": 
"model-00001-of-00002.safetensors", + "model.layers.6.self_attn.q_norm.weight": "model-00001-of-00002.safetensors", + "model.layers.6.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.7.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.7.self_attn.k_norm.weight": "model-00001-of-00002.safetensors", + "model.layers.7.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.self_attn.q_norm.weight": "model-00001-of-00002.safetensors", + "model.layers.7.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.8.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.8.self_attn.k_norm.weight": "model-00001-of-00002.safetensors", + "model.layers.8.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.self_attn.q_norm.weight": "model-00001-of-00002.safetensors", + "model.layers.8.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.self_attn.v_proj.weight": 
"model-00001-of-00002.safetensors", + "model.layers.9.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.9.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.9.self_attn.k_norm.weight": "model-00001-of-00002.safetensors", + "model.layers.9.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.self_attn.q_norm.weight": "model-00001-of-00002.safetensors", + "model.layers.9.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.norm.weight": "model-00002-of-00002.safetensors" + } +} diff --git a/checkpoint-300/optimizer.pt b/checkpoint-300/optimizer.pt new file mode 100644 index 0000000..8ee1105 --- /dev/null +++ b/checkpoint-300/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f72fb97a5797068d2d2d01dce276a018e2cb993446790dbdf67fea67b8a16825 +size 16090225449 diff --git a/checkpoint-300/rng_state_0.pth b/checkpoint-300/rng_state_0.pth new file mode 100644 index 0000000..4b0bea5 --- /dev/null +++ b/checkpoint-300/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f8f92cf63e0989759370d24108b469c492c12202403f036015307ce49f12cedc +size 16389 diff --git a/checkpoint-300/rng_state_1.pth b/checkpoint-300/rng_state_1.pth new file mode 100644 index 0000000..2c94d69 --- /dev/null +++ b/checkpoint-300/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9ed40a0a4e9f365d2c6cc004d97e6705894eba46c8be4c160c1455bc3062dee1 +size 16389 diff --git a/checkpoint-300/rng_state_2.pth 
b/checkpoint-300/rng_state_2.pth new file mode 100644 index 0000000..e3d2f61 --- /dev/null +++ b/checkpoint-300/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d688b304d19c260b5cfa471535ed51d7e1d60b3a0d0159dfd1a04b87904a9f42 +size 16389 diff --git a/checkpoint-300/rng_state_3.pth b/checkpoint-300/rng_state_3.pth new file mode 100644 index 0000000..84660ff --- /dev/null +++ b/checkpoint-300/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9967425ebcaee80d9b518fa0244d52f739b1b983d87cda71d5fede0c073e9d3b +size 16389 diff --git a/checkpoint-300/rng_state_4.pth b/checkpoint-300/rng_state_4.pth new file mode 100644 index 0000000..fd63d0f --- /dev/null +++ b/checkpoint-300/rng_state_4.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:469900fd39c667ffbd49c3c407c0ba317a1e9f5f9339a99b5d38423b7d0ce6d4 +size 16389 diff --git a/checkpoint-300/rng_state_5.pth b/checkpoint-300/rng_state_5.pth new file mode 100644 index 0000000..a9f2a80 --- /dev/null +++ b/checkpoint-300/rng_state_5.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:124688471ff2a6e80f2fcefedbf741fb18d08dd539d5bd07a52e81be545142a5 +size 16389 diff --git a/checkpoint-300/rng_state_6.pth b/checkpoint-300/rng_state_6.pth new file mode 100644 index 0000000..ffdeaa9 --- /dev/null +++ b/checkpoint-300/rng_state_6.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1e69f1ced9f992a72c948698e5eb06088610788988cdb2fdbdd624e064319d60 +size 16389 diff --git a/checkpoint-300/rng_state_7.pth b/checkpoint-300/rng_state_7.pth new file mode 100644 index 0000000..ad30a51 --- /dev/null +++ b/checkpoint-300/rng_state_7.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a376268a55d6ee10c371c06aa952334c4c6a1af9ea2d71b1951a57367a0c6722 +size 16389 diff --git a/checkpoint-300/scheduler.pt b/checkpoint-300/scheduler.pt new file mode 100644 index 0000000..e37b515 
--- /dev/null +++ b/checkpoint-300/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ab8a59e491dc0faa5038b05bd09b5712655903fd29b89bb8e94b1907702fe31a +size 1465 diff --git a/checkpoint-300/special_tokens_map.json b/checkpoint-300/special_tokens_map.json new file mode 100644 index 0000000..ac23c0a --- /dev/null +++ b/checkpoint-300/special_tokens_map.json @@ -0,0 +1,31 @@ +{ + "additional_special_tokens": [ + "<|im_start|>", + "<|im_end|>", + "<|object_ref_start|>", + "<|object_ref_end|>", + "<|box_start|>", + "<|box_end|>", + "<|quad_start|>", + "<|quad_end|>", + "<|vision_start|>", + "<|vision_end|>", + "<|vision_pad|>", + "<|image_pad|>", + "<|video_pad|>" + ], + "eos_token": { + "content": "<|im_end|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": { + "content": "<|endoftext|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/checkpoint-300/tokenizer.json b/checkpoint-300/tokenizer.json new file mode 100644 index 0000000..cd71f61 --- /dev/null +++ b/checkpoint-300/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:aeb13307a71acd8fe81861d94ad54ab689df773318809eed3cbe794b4492dae4 +size 11422654 diff --git a/checkpoint-300/tokenizer_config.json b/checkpoint-300/tokenizer_config.json new file mode 100644 index 0000000..ddaf698 --- /dev/null +++ b/checkpoint-300/tokenizer_config.json @@ -0,0 +1,239 @@ +{ + "add_bos_token": false, + "add_prefix_space": false, + "added_tokens_decoder": { + "151643": { + "content": "<|endoftext|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151644": { + "content": "<|im_start|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151645": { + "content": "<|im_end|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + 
"single_word": false, + "special": true + }, + "151646": { + "content": "<|object_ref_start|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151647": { + "content": "<|object_ref_end|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151648": { + "content": "<|box_start|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151649": { + "content": "<|box_end|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151650": { + "content": "<|quad_start|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151651": { + "content": "<|quad_end|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151652": { + "content": "<|vision_start|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151653": { + "content": "<|vision_end|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151654": { + "content": "<|vision_pad|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151655": { + "content": "<|image_pad|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151656": { + "content": "<|video_pad|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151657": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151658": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + 
"special": false + }, + "151659": { + "content": "<|fim_prefix|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151660": { + "content": "<|fim_middle|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151661": { + "content": "<|fim_suffix|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151662": { + "content": "<|fim_pad|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151663": { + "content": "<|repo_name|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151664": { + "content": "<|file_sep|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151665": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151666": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151667": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151668": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + } + }, + "additional_special_tokens": [ + "<|im_start|>", + "<|im_end|>", + "<|object_ref_start|>", + "<|object_ref_end|>", + "<|box_start|>", + "<|box_end|>", + "<|quad_start|>", + "<|quad_end|>", + "<|vision_start|>", + "<|vision_end|>", + "<|vision_pad|>", + "<|image_pad|>", + "<|video_pad|>" + ], + "bos_token": null, + "clean_up_tokenization_spaces": false, + "eos_token": "<|im_end|>", + "errors": "replace", + "extra_special_tokens": {}, + "model_max_length": 131072, + "pad_token": 
"<|endoftext|>", + "split_special_tokens": false, + "tokenizer_class": "Qwen2Tokenizer", + "unk_token": null +} diff --git a/checkpoint-300/trainer_state.json b/checkpoint-300/trainer_state.json new file mode 100644 index 0000000..7dffc27 --- /dev/null +++ b/checkpoint-300/trainer_state.json @@ -0,0 +1,2434 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.23603461841070023, + "eval_steps": 500, + "global_step": 300, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0007867820613690008, + "grad_norm": 6.0625, + "learning_rate": 0.0, + "loss": 0.7636, + "num_tokens": 381797.0, + "step": 1 + }, + { + "epoch": 0.0015735641227380016, + "grad_norm": 5.90625, + "learning_rate": 6.666666666666667e-07, + "loss": 0.7623, + "num_tokens": 837007.0, + "step": 2 + }, + { + "epoch": 0.0023603461841070024, + "grad_norm": 5.53125, + "learning_rate": 1.3333333333333334e-06, + "loss": 0.7581, + "num_tokens": 1282591.0, + "step": 3 + }, + { + "epoch": 0.003147128245476003, + "grad_norm": 5.4375, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7678, + "num_tokens": 1736983.0, + "step": 4 + }, + { + "epoch": 0.003933910306845004, + "grad_norm": 5.21875, + "learning_rate": 2.666666666666667e-06, + "loss": 0.748, + "num_tokens": 2211811.0, + "step": 5 + }, + { + "epoch": 0.004720692368214005, + "grad_norm": 5.25, + "learning_rate": 3.3333333333333333e-06, + "loss": 0.7548, + "num_tokens": 2608382.0, + "step": 6 + }, + { + "epoch": 0.0055074744295830055, + "grad_norm": 4.5625, + "learning_rate": 4.000000000000001e-06, + "loss": 0.7358, + "num_tokens": 3064838.0, + "step": 7 + }, + { + "epoch": 0.006294256490952006, + "grad_norm": 3.96875, + "learning_rate": 4.666666666666667e-06, + "loss": 0.704, + "num_tokens": 3465837.0, + "step": 8 + }, + { + "epoch": 0.007081038552321007, + "grad_norm": 3.328125, + "learning_rate": 5.333333333333334e-06, + 
"loss": 0.6919, + "num_tokens": 3900673.0, + "step": 9 + }, + { + "epoch": 0.007867820613690008, + "grad_norm": 3.0, + "learning_rate": 6e-06, + "loss": 0.6964, + "num_tokens": 4322216.0, + "step": 10 + }, + { + "epoch": 0.00865460267505901, + "grad_norm": 2.59375, + "learning_rate": 6.666666666666667e-06, + "loss": 0.6865, + "num_tokens": 4763831.0, + "step": 11 + }, + { + "epoch": 0.00944138473642801, + "grad_norm": 2.125, + "learning_rate": 7.333333333333333e-06, + "loss": 0.6674, + "num_tokens": 5292618.0, + "step": 12 + }, + { + "epoch": 0.010228166797797011, + "grad_norm": 1.9375, + "learning_rate": 8.000000000000001e-06, + "loss": 0.6771, + "num_tokens": 5687036.0, + "step": 13 + }, + { + "epoch": 0.011014948859166011, + "grad_norm": 1.3828125, + "learning_rate": 8.666666666666668e-06, + "loss": 0.6447, + "num_tokens": 6116911.0, + "step": 14 + }, + { + "epoch": 0.011801730920535013, + "grad_norm": 1.03125, + "learning_rate": 9.333333333333334e-06, + "loss": 0.6377, + "num_tokens": 6516802.0, + "step": 15 + }, + { + "epoch": 0.012588512981904013, + "grad_norm": 0.7890625, + "learning_rate": 1e-05, + "loss": 0.635, + "num_tokens": 6954839.0, + "step": 16 + }, + { + "epoch": 0.013375295043273014, + "grad_norm": 0.703125, + "learning_rate": 9.999726606524545e-06, + "loss": 0.614, + "num_tokens": 7431706.0, + "step": 17 + }, + { + "epoch": 0.014162077104642014, + "grad_norm": 0.72265625, + "learning_rate": 9.998906459317727e-06, + "loss": 0.638, + "num_tokens": 7873139.0, + "step": 18 + }, + { + "epoch": 0.014948859166011016, + "grad_norm": 0.76171875, + "learning_rate": 9.997539658034168e-06, + "loss": 0.6381, + "num_tokens": 8336156.0, + "step": 19 + }, + { + "epoch": 0.015735641227380016, + "grad_norm": 0.671875, + "learning_rate": 9.995626368751447e-06, + "loss": 0.6165, + "num_tokens": 8809699.0, + "step": 20 + }, + { + "epoch": 0.016522423288749016, + "grad_norm": 0.64453125, + "learning_rate": 9.993166823949924e-06, + "loss": 0.5833, + "num_tokens": 
9276482.0, + "step": 21 + }, + { + "epoch": 0.01730920535011802, + "grad_norm": 0.76171875, + "learning_rate": 9.990161322484486e-06, + "loss": 0.6285, + "num_tokens": 9761551.0, + "step": 22 + }, + { + "epoch": 0.01809598741148702, + "grad_norm": 0.609375, + "learning_rate": 9.986610229548242e-06, + "loss": 0.607, + "num_tokens": 10189001.0, + "step": 23 + }, + { + "epoch": 0.01888276947285602, + "grad_norm": 0.546875, + "learning_rate": 9.982513976628143e-06, + "loss": 0.6078, + "num_tokens": 10643150.0, + "step": 24 + }, + { + "epoch": 0.01966955153422502, + "grad_norm": 0.5546875, + "learning_rate": 9.977873061452551e-06, + "loss": 0.6149, + "num_tokens": 11078815.0, + "step": 25 + }, + { + "epoch": 0.020456333595594022, + "grad_norm": 0.466796875, + "learning_rate": 9.972688047930773e-06, + "loss": 0.6132, + "num_tokens": 11527719.0, + "step": 26 + }, + { + "epoch": 0.021243115656963022, + "grad_norm": 0.43359375, + "learning_rate": 9.966959566084523e-06, + "loss": 0.5965, + "num_tokens": 12019704.0, + "step": 27 + }, + { + "epoch": 0.022029897718332022, + "grad_norm": 0.421875, + "learning_rate": 9.960688311971389e-06, + "loss": 0.5971, + "num_tokens": 12463410.0, + "step": 28 + }, + { + "epoch": 0.022816679779701022, + "grad_norm": 0.396484375, + "learning_rate": 9.953875047600236e-06, + "loss": 0.5854, + "num_tokens": 12905537.0, + "step": 29 + }, + { + "epoch": 0.023603461841070025, + "grad_norm": 0.39453125, + "learning_rate": 9.946520600838634e-06, + "loss": 0.5803, + "num_tokens": 13354614.0, + "step": 30 + }, + { + "epoch": 0.024390243902439025, + "grad_norm": 0.39453125, + "learning_rate": 9.938625865312252e-06, + "loss": 0.5919, + "num_tokens": 13770618.0, + "step": 31 + }, + { + "epoch": 0.025177025963808025, + "grad_norm": 0.3828125, + "learning_rate": 9.930191800296282e-06, + "loss": 0.5905, + "num_tokens": 14215259.0, + "step": 32 + }, + { + "epoch": 0.025963808025177025, + "grad_norm": 0.3828125, + "learning_rate": 9.921219430598881e-06, + 
"loss": 0.5701, + "num_tokens": 14649720.0, + "step": 33 + }, + { + "epoch": 0.02675059008654603, + "grad_norm": 0.40234375, + "learning_rate": 9.911709846436643e-06, + "loss": 0.5838, + "num_tokens": 15030456.0, + "step": 34 + }, + { + "epoch": 0.02753737214791503, + "grad_norm": 0.373046875, + "learning_rate": 9.901664203302126e-06, + "loss": 0.593, + "num_tokens": 15502604.0, + "step": 35 + }, + { + "epoch": 0.02832415420928403, + "grad_norm": 0.380859375, + "learning_rate": 9.89108372182346e-06, + "loss": 0.581, + "num_tokens": 15967084.0, + "step": 36 + }, + { + "epoch": 0.029110936270653028, + "grad_norm": 0.37890625, + "learning_rate": 9.879969687616026e-06, + "loss": 0.599, + "num_tokens": 16427896.0, + "step": 37 + }, + { + "epoch": 0.02989771833202203, + "grad_norm": 0.361328125, + "learning_rate": 9.86832345112624e-06, + "loss": 0.5883, + "num_tokens": 16861474.0, + "step": 38 + }, + { + "epoch": 0.03068450039339103, + "grad_norm": 0.38671875, + "learning_rate": 9.856146427467469e-06, + "loss": 0.5826, + "num_tokens": 17331485.0, + "step": 39 + }, + { + "epoch": 0.03147128245476003, + "grad_norm": 0.341796875, + "learning_rate": 9.84344009624807e-06, + "loss": 0.5965, + "num_tokens": 17804516.0, + "step": 40 + }, + { + "epoch": 0.03225806451612903, + "grad_norm": 0.37109375, + "learning_rate": 9.830206001391627e-06, + "loss": 0.5905, + "num_tokens": 18227140.0, + "step": 41 + }, + { + "epoch": 0.03304484657749803, + "grad_norm": 0.375, + "learning_rate": 9.816445750949336e-06, + "loss": 0.5782, + "num_tokens": 18625691.0, + "step": 42 + }, + { + "epoch": 0.03383162863886703, + "grad_norm": 0.35546875, + "learning_rate": 9.80216101690461e-06, + "loss": 0.5623, + "num_tokens": 18997781.0, + "step": 43 + }, + { + "epoch": 0.03461841070023604, + "grad_norm": 0.3515625, + "learning_rate": 9.787353534969936e-06, + "loss": 0.6028, + "num_tokens": 19431031.0, + "step": 44 + }, + { + "epoch": 0.03540519276160504, + "grad_norm": 0.31640625, + "learning_rate": 
9.77202510437596e-06, + "loss": 0.59, + "num_tokens": 19904203.0, + "step": 45 + }, + { + "epoch": 0.03619197482297404, + "grad_norm": 0.328125, + "learning_rate": 9.756177587652857e-06, + "loss": 0.5801, + "num_tokens": 20363517.0, + "step": 46 + }, + { + "epoch": 0.03697875688434304, + "grad_norm": 0.314453125, + "learning_rate": 9.739812910404045e-06, + "loss": 0.5792, + "num_tokens": 20827976.0, + "step": 47 + }, + { + "epoch": 0.03776553894571204, + "grad_norm": 0.322265625, + "learning_rate": 9.722933061072185e-06, + "loss": 0.5831, + "num_tokens": 21250892.0, + "step": 48 + }, + { + "epoch": 0.03855232100708104, + "grad_norm": 0.349609375, + "learning_rate": 9.705540090697576e-06, + "loss": 0.5789, + "num_tokens": 21650869.0, + "step": 49 + }, + { + "epoch": 0.03933910306845004, + "grad_norm": 0.3203125, + "learning_rate": 9.687636112668933e-06, + "loss": 0.5961, + "num_tokens": 22076209.0, + "step": 50 + }, + { + "epoch": 0.04012588512981904, + "grad_norm": 0.47265625, + "learning_rate": 9.669223302466609e-06, + "loss": 0.5895, + "num_tokens": 22470639.0, + "step": 51 + }, + { + "epoch": 0.040912667191188044, + "grad_norm": 0.314453125, + "learning_rate": 9.650303897398232e-06, + "loss": 0.5857, + "num_tokens": 22930578.0, + "step": 52 + }, + { + "epoch": 0.041699449252557044, + "grad_norm": 0.310546875, + "learning_rate": 9.630880196326874e-06, + "loss": 0.5821, + "num_tokens": 23369639.0, + "step": 53 + }, + { + "epoch": 0.042486231313926044, + "grad_norm": 0.328125, + "learning_rate": 9.610954559391704e-06, + "loss": 0.5868, + "num_tokens": 23836873.0, + "step": 54 + }, + { + "epoch": 0.043273013375295044, + "grad_norm": 0.302734375, + "learning_rate": 9.590529407721232e-06, + "loss": 0.5755, + "num_tokens": 24277558.0, + "step": 55 + }, + { + "epoch": 0.044059795436664044, + "grad_norm": 0.326171875, + "learning_rate": 9.5696072231391e-06, + "loss": 0.5816, + "num_tokens": 24662313.0, + "step": 56 + }, + { + "epoch": 0.044846577498033044, + "grad_norm": 
0.333984375, + "learning_rate": 9.548190547862532e-06, + "loss": 0.5677, + "num_tokens": 25128896.0, + "step": 57 + }, + { + "epoch": 0.045633359559402044, + "grad_norm": 0.33984375, + "learning_rate": 9.526281984193437e-06, + "loss": 0.6084, + "num_tokens": 25535507.0, + "step": 58 + }, + { + "epoch": 0.046420141620771044, + "grad_norm": 0.310546875, + "learning_rate": 9.503884194202195e-06, + "loss": 0.5585, + "num_tokens": 25950294.0, + "step": 59 + }, + { + "epoch": 0.04720692368214005, + "grad_norm": 0.294921875, + "learning_rate": 9.480999899404207e-06, + "loss": 0.5812, + "num_tokens": 26394450.0, + "step": 60 + }, + { + "epoch": 0.04799370574350905, + "grad_norm": 0.29296875, + "learning_rate": 9.4576318804292e-06, + "loss": 0.5978, + "num_tokens": 26895403.0, + "step": 61 + }, + { + "epoch": 0.04878048780487805, + "grad_norm": 0.31640625, + "learning_rate": 9.433782976683366e-06, + "loss": 0.5836, + "num_tokens": 27334154.0, + "step": 62 + }, + { + "epoch": 0.04956726986624705, + "grad_norm": 0.326171875, + "learning_rate": 9.409456086004336e-06, + "loss": 0.5956, + "num_tokens": 27755608.0, + "step": 63 + }, + { + "epoch": 0.05035405192761605, + "grad_norm": 0.328125, + "learning_rate": 9.384654164309083e-06, + "loss": 0.5797, + "num_tokens": 28204981.0, + "step": 64 + }, + { + "epoch": 0.05114083398898505, + "grad_norm": 0.306640625, + "learning_rate": 9.359380225234752e-06, + "loss": 0.5252, + "num_tokens": 28587979.0, + "step": 65 + }, + { + "epoch": 0.05192761605035405, + "grad_norm": 0.314453125, + "learning_rate": 9.333637339772472e-06, + "loss": 0.5811, + "num_tokens": 28974623.0, + "step": 66 + }, + { + "epoch": 0.05271439811172305, + "grad_norm": 0.294921875, + "learning_rate": 9.30742863589421e-06, + "loss": 0.5713, + "num_tokens": 29397669.0, + "step": 67 + }, + { + "epoch": 0.05350118017309206, + "grad_norm": 0.30859375, + "learning_rate": 9.280757298172696e-06, + "loss": 0.5666, + "num_tokens": 29838270.0, + "step": 68 + }, + { + "epoch": 
0.05428796223446106, + "grad_norm": 0.28515625, + "learning_rate": 9.253626567394466e-06, + "loss": 0.5659, + "num_tokens": 30321067.0, + "step": 69 + }, + { + "epoch": 0.05507474429583006, + "grad_norm": 0.318359375, + "learning_rate": 9.226039740166091e-06, + "loss": 0.5729, + "num_tokens": 30751915.0, + "step": 70 + }, + { + "epoch": 0.05586152635719906, + "grad_norm": 0.2734375, + "learning_rate": 9.198000168513604e-06, + "loss": 0.5932, + "num_tokens": 31282597.0, + "step": 71 + }, + { + "epoch": 0.05664830841856806, + "grad_norm": 0.291015625, + "learning_rate": 9.169511259475202e-06, + "loss": 0.5736, + "num_tokens": 31733067.0, + "step": 72 + }, + { + "epoch": 0.057435090479937057, + "grad_norm": 0.3046875, + "learning_rate": 9.140576474687263e-06, + "loss": 0.5712, + "num_tokens": 32130865.0, + "step": 73 + }, + { + "epoch": 0.058221872541306056, + "grad_norm": 0.328125, + "learning_rate": 9.111199329963735e-06, + "loss": 0.5755, + "num_tokens": 32548792.0, + "step": 74 + }, + { + "epoch": 0.059008654602675056, + "grad_norm": 0.296875, + "learning_rate": 9.081383394868924e-06, + "loss": 0.594, + "num_tokens": 32971975.0, + "step": 75 + }, + { + "epoch": 0.05979543666404406, + "grad_norm": 0.3203125, + "learning_rate": 9.051132292283772e-06, + "loss": 0.5819, + "num_tokens": 33372811.0, + "step": 76 + }, + { + "epoch": 0.06058221872541306, + "grad_norm": 0.322265625, + "learning_rate": 9.020449697965645e-06, + "loss": 0.5888, + "num_tokens": 33784941.0, + "step": 77 + }, + { + "epoch": 0.06136900078678206, + "grad_norm": 0.30859375, + "learning_rate": 8.989339340101698e-06, + "loss": 0.5837, + "num_tokens": 34224127.0, + "step": 78 + }, + { + "epoch": 0.06215578284815106, + "grad_norm": 0.29296875, + "learning_rate": 8.957804998855866e-06, + "loss": 0.5725, + "num_tokens": 34658551.0, + "step": 79 + }, + { + "epoch": 0.06294256490952006, + "grad_norm": 0.294921875, + "learning_rate": 8.92585050590955e-06, + "loss": 0.5749, + "num_tokens": 35094129.0, + 
"step": 80 + }, + { + "epoch": 0.06372934697088907, + "grad_norm": 0.328125, + "learning_rate": 8.893479743996034e-06, + "loss": 0.5657, + "num_tokens": 35436410.0, + "step": 81 + }, + { + "epoch": 0.06451612903225806, + "grad_norm": 0.322265625, + "learning_rate": 8.860696646428693e-06, + "loss": 0.5629, + "num_tokens": 35825073.0, + "step": 82 + }, + { + "epoch": 0.06530291109362707, + "grad_norm": 0.3203125, + "learning_rate": 8.827505196623074e-06, + "loss": 0.5661, + "num_tokens": 36223791.0, + "step": 83 + }, + { + "epoch": 0.06608969315499606, + "grad_norm": 0.294921875, + "learning_rate": 8.793909427612878e-06, + "loss": 0.5734, + "num_tokens": 36656241.0, + "step": 84 + }, + { + "epoch": 0.06687647521636507, + "grad_norm": 0.384765625, + "learning_rate": 8.759913421559902e-06, + "loss": 0.5675, + "num_tokens": 37077658.0, + "step": 85 + }, + { + "epoch": 0.06766325727773406, + "grad_norm": 0.298828125, + "learning_rate": 8.725521309258031e-06, + "loss": 0.5734, + "num_tokens": 37476774.0, + "step": 86 + }, + { + "epoch": 0.06845003933910307, + "grad_norm": 0.302734375, + "learning_rate": 8.690737269631318e-06, + "loss": 0.5708, + "num_tokens": 37871004.0, + "step": 87 + }, + { + "epoch": 0.06923682140047208, + "grad_norm": 0.287109375, + "learning_rate": 8.655565529226199e-06, + "loss": 0.5664, + "num_tokens": 38301817.0, + "step": 88 + }, + { + "epoch": 0.07002360346184107, + "grad_norm": 0.318359375, + "learning_rate": 8.62001036169794e-06, + "loss": 0.5746, + "num_tokens": 38681899.0, + "step": 89 + }, + { + "epoch": 0.07081038552321008, + "grad_norm": 0.302734375, + "learning_rate": 8.58407608729135e-06, + "loss": 0.5912, + "num_tokens": 39130089.0, + "step": 90 + }, + { + "epoch": 0.07159716758457907, + "grad_norm": 0.37890625, + "learning_rate": 8.547767072315835e-06, + "loss": 0.5813, + "num_tokens": 39591218.0, + "step": 91 + }, + { + "epoch": 0.07238394964594808, + "grad_norm": 0.302734375, + "learning_rate": 8.511087728614863e-06, + "loss": 
0.574, + "num_tokens": 40032847.0, + "step": 92 + }, + { + "epoch": 0.07317073170731707, + "grad_norm": 0.294921875, + "learning_rate": 8.474042513029876e-06, + "loss": 0.5803, + "num_tokens": 40473954.0, + "step": 93 + }, + { + "epoch": 0.07395751376868608, + "grad_norm": 0.30078125, + "learning_rate": 8.43663592685876e-06, + "loss": 0.5779, + "num_tokens": 40900930.0, + "step": 94 + }, + { + "epoch": 0.07474429583005507, + "grad_norm": 0.27734375, + "learning_rate": 8.39887251530889e-06, + "loss": 0.5835, + "num_tokens": 41368577.0, + "step": 95 + }, + { + "epoch": 0.07553107789142408, + "grad_norm": 0.296875, + "learning_rate": 8.360756866944858e-06, + "loss": 0.5785, + "num_tokens": 41767993.0, + "step": 96 + }, + { + "epoch": 0.07631785995279308, + "grad_norm": 0.302734375, + "learning_rate": 8.322293613130917e-06, + "loss": 0.5932, + "num_tokens": 42197577.0, + "step": 97 + }, + { + "epoch": 0.07710464201416208, + "grad_norm": 0.298828125, + "learning_rate": 8.283487427468244e-06, + "loss": 0.5848, + "num_tokens": 42649936.0, + "step": 98 + }, + { + "epoch": 0.07789142407553108, + "grad_norm": 0.29296875, + "learning_rate": 8.244343025227041e-06, + "loss": 0.5812, + "num_tokens": 43084525.0, + "step": 99 + }, + { + "epoch": 0.07867820613690008, + "grad_norm": 0.2734375, + "learning_rate": 8.204865162773613e-06, + "loss": 0.5629, + "num_tokens": 43587988.0, + "step": 100 + }, + { + "epoch": 0.07946498819826908, + "grad_norm": 0.3984375, + "learning_rate": 8.165058636992411e-06, + "loss": 0.5931, + "num_tokens": 43968594.0, + "step": 101 + }, + { + "epoch": 0.08025177025963807, + "grad_norm": 0.376953125, + "learning_rate": 8.12492828470318e-06, + "loss": 0.5556, + "num_tokens": 44454046.0, + "step": 102 + }, + { + "epoch": 0.08103855232100708, + "grad_norm": 0.32421875, + "learning_rate": 8.084478982073247e-06, + "loss": 0.6005, + "num_tokens": 44928448.0, + "step": 103 + }, + { + "epoch": 0.08182533438237609, + "grad_norm": 0.283203125, + "learning_rate": 
8.043715644025025e-06, + "loss": 0.5685, + "num_tokens": 45405277.0, + "step": 104 + }, + { + "epoch": 0.08261211644374508, + "grad_norm": 0.28125, + "learning_rate": 8.002643223638803e-06, + "loss": 0.5731, + "num_tokens": 45870226.0, + "step": 105 + }, + { + "epoch": 0.08339889850511409, + "grad_norm": 0.28515625, + "learning_rate": 7.961266711550922e-06, + "loss": 0.573, + "num_tokens": 46308121.0, + "step": 106 + }, + { + "epoch": 0.08418568056648308, + "grad_norm": 0.30078125, + "learning_rate": 7.919591135347354e-06, + "loss": 0.5727, + "num_tokens": 46727971.0, + "step": 107 + }, + { + "epoch": 0.08497246262785209, + "grad_norm": 0.283203125, + "learning_rate": 7.877621558952817e-06, + "loss": 0.5783, + "num_tokens": 47180249.0, + "step": 108 + }, + { + "epoch": 0.08575924468922108, + "grad_norm": 0.3671875, + "learning_rate": 7.83536308201547e-06, + "loss": 0.5669, + "num_tokens": 47709483.0, + "step": 109 + }, + { + "epoch": 0.08654602675059009, + "grad_norm": 0.28125, + "learning_rate": 7.792820839287257e-06, + "loss": 0.5861, + "num_tokens": 48152246.0, + "step": 110 + }, + { + "epoch": 0.08733280881195908, + "grad_norm": 0.296875, + "learning_rate": 7.75e-06, + "loss": 0.5473, + "num_tokens": 48563870.0, + "step": 111 + }, + { + "epoch": 0.08811959087332809, + "grad_norm": 0.30859375, + "learning_rate": 7.706905767237288e-06, + "loss": 0.5773, + "num_tokens": 49025023.0, + "step": 112 + }, + { + "epoch": 0.0889063729346971, + "grad_norm": 0.310546875, + "learning_rate": 7.663543377302257e-06, + "loss": 0.582, + "num_tokens": 49467162.0, + "step": 113 + }, + { + "epoch": 0.08969315499606609, + "grad_norm": 0.326171875, + "learning_rate": 7.6199180990813535e-06, + "loss": 0.5933, + "num_tokens": 49856067.0, + "step": 114 + }, + { + "epoch": 0.0904799370574351, + "grad_norm": 0.306640625, + "learning_rate": 7.576035233404097e-06, + "loss": 0.5623, + "num_tokens": 50334050.0, + "step": 115 + }, + { + "epoch": 0.09126671911880409, + "grad_norm": 0.2734375, + 
"learning_rate": 7.531900112399004e-06, + "loss": 0.5649, + "num_tokens": 50811750.0, + "step": 116 + }, + { + "epoch": 0.0920535011801731, + "grad_norm": 0.267578125, + "learning_rate": 7.487518098845684e-06, + "loss": 0.5709, + "num_tokens": 51318751.0, + "step": 117 + }, + { + "epoch": 0.09284028324154209, + "grad_norm": 0.306640625, + "learning_rate": 7.442894585523218e-06, + "loss": 0.5651, + "num_tokens": 51726918.0, + "step": 118 + }, + { + "epoch": 0.0936270653029111, + "grad_norm": 0.291015625, + "learning_rate": 7.398034994554895e-06, + "loss": 0.5844, + "num_tokens": 52184995.0, + "step": 119 + }, + { + "epoch": 0.0944138473642801, + "grad_norm": 0.28125, + "learning_rate": 7.352944776749374e-06, + "loss": 0.573, + "num_tokens": 52641746.0, + "step": 120 + }, + { + "epoch": 0.0952006294256491, + "grad_norm": 0.28515625, + "learning_rate": 7.307629410938364e-06, + "loss": 0.5648, + "num_tokens": 53083720.0, + "step": 121 + }, + { + "epoch": 0.0959874114870181, + "grad_norm": 0.5390625, + "learning_rate": 7.262094403310912e-06, + "loss": 0.5709, + "num_tokens": 53509360.0, + "step": 122 + }, + { + "epoch": 0.0967741935483871, + "grad_norm": 0.2734375, + "learning_rate": 7.216345286744349e-06, + "loss": 0.5748, + "num_tokens": 53994487.0, + "step": 123 + }, + { + "epoch": 0.0975609756097561, + "grad_norm": 0.34375, + "learning_rate": 7.1703876201319935e-06, + "loss": 0.5866, + "num_tokens": 54432027.0, + "step": 124 + }, + { + "epoch": 0.0983477576711251, + "grad_norm": 0.294921875, + "learning_rate": 7.124226987707717e-06, + "loss": 0.6052, + "num_tokens": 54867083.0, + "step": 125 + }, + { + "epoch": 0.0991345397324941, + "grad_norm": 0.455078125, + "learning_rate": 7.0778689983673955e-06, + "loss": 0.5641, + "num_tokens": 55248221.0, + "step": 126 + }, + { + "epoch": 0.0999213217938631, + "grad_norm": 0.27734375, + "learning_rate": 7.031319284987395e-06, + "loss": 0.5822, + "num_tokens": 55746694.0, + "step": 127 + }, + { + "epoch": 0.1007081038552321, + 
"grad_norm": 0.267578125, + "learning_rate": 6.984583503740123e-06, + "loss": 0.5488, + "num_tokens": 56213653.0, + "step": 128 + }, + { + "epoch": 0.10149488591660111, + "grad_norm": 0.29296875, + "learning_rate": 6.937667333406767e-06, + "loss": 0.5643, + "num_tokens": 56638790.0, + "step": 129 + }, + { + "epoch": 0.1022816679779701, + "grad_norm": 0.27734375, + "learning_rate": 6.890576474687264e-06, + "loss": 0.5563, + "num_tokens": 57062435.0, + "step": 130 + }, + { + "epoch": 0.10306845003933911, + "grad_norm": 0.3359375, + "learning_rate": 6.843316649507627e-06, + "loss": 0.5764, + "num_tokens": 57474963.0, + "step": 131 + }, + { + "epoch": 0.1038552321007081, + "grad_norm": 0.28125, + "learning_rate": 6.795893600324678e-06, + "loss": 0.5544, + "num_tokens": 57881417.0, + "step": 132 + }, + { + "epoch": 0.10464201416207711, + "grad_norm": 0.314453125, + "learning_rate": 6.748313089428301e-06, + "loss": 0.5557, + "num_tokens": 58314408.0, + "step": 133 + }, + { + "epoch": 0.1054287962234461, + "grad_norm": 0.3359375, + "learning_rate": 6.700580898241268e-06, + "loss": 0.5747, + "num_tokens": 58739816.0, + "step": 134 + }, + { + "epoch": 0.10621557828481511, + "grad_norm": 0.33984375, + "learning_rate": 6.6527028266167515e-06, + "loss": 0.5608, + "num_tokens": 59159651.0, + "step": 135 + }, + { + "epoch": 0.10700236034618411, + "grad_norm": 0.271484375, + "learning_rate": 6.604684692133597e-06, + "loss": 0.5736, + "num_tokens": 59662731.0, + "step": 136 + }, + { + "epoch": 0.1077891424075531, + "grad_norm": 0.314453125, + "learning_rate": 6.556532329389435e-06, + "loss": 0.5666, + "num_tokens": 60042452.0, + "step": 137 + }, + { + "epoch": 0.10857592446892211, + "grad_norm": 0.275390625, + "learning_rate": 6.508251589291732e-06, + "loss": 0.5424, + "num_tokens": 60493087.0, + "step": 138 + }, + { + "epoch": 0.1093627065302911, + "grad_norm": 0.29296875, + "learning_rate": 6.459848338346861e-06, + "loss": 0.5746, + "num_tokens": 60931029.0, + "step": 139 + }, + 
{ + "epoch": 0.11014948859166011, + "grad_norm": 0.296875, + "learning_rate": 6.411328457947264e-06, + "loss": 0.5777, + "num_tokens": 61366704.0, + "step": 140 + }, + { + "epoch": 0.1109362706530291, + "grad_norm": 0.3046875, + "learning_rate": 6.362697843656823e-06, + "loss": 0.5656, + "num_tokens": 61764432.0, + "step": 141 + }, + { + "epoch": 0.11172305271439811, + "grad_norm": 0.28125, + "learning_rate": 6.313962404494496e-06, + "loss": 0.5826, + "num_tokens": 62217966.0, + "step": 142 + }, + { + "epoch": 0.1125098347757671, + "grad_norm": 0.310546875, + "learning_rate": 6.265128062216319e-06, + "loss": 0.5523, + "num_tokens": 62650059.0, + "step": 143 + }, + { + "epoch": 0.11329661683713611, + "grad_norm": 0.263671875, + "learning_rate": 6.216200750595878e-06, + "loss": 0.5631, + "num_tokens": 63177970.0, + "step": 144 + }, + { + "epoch": 0.11408339889850512, + "grad_norm": 0.310546875, + "learning_rate": 6.167186414703289e-06, + "loss": 0.5898, + "num_tokens": 63563979.0, + "step": 145 + }, + { + "epoch": 0.11487018095987411, + "grad_norm": 0.28515625, + "learning_rate": 6.118091010182837e-06, + "loss": 0.5582, + "num_tokens": 64029187.0, + "step": 146 + }, + { + "epoch": 0.11565696302124312, + "grad_norm": 0.291015625, + "learning_rate": 6.068920502529309e-06, + "loss": 0.5884, + "num_tokens": 64506763.0, + "step": 147 + }, + { + "epoch": 0.11644374508261211, + "grad_norm": 0.275390625, + "learning_rate": 6.019680866363139e-06, + "loss": 0.5653, + "num_tokens": 64953757.0, + "step": 148 + }, + { + "epoch": 0.11723052714398112, + "grad_norm": 0.298828125, + "learning_rate": 5.970378084704441e-06, + "loss": 0.5931, + "num_tokens": 65389299.0, + "step": 149 + }, + { + "epoch": 0.11801730920535011, + "grad_norm": 0.337890625, + "learning_rate": 5.921018148246031e-06, + "loss": 0.5773, + "num_tokens": 65813399.0, + "step": 150 + }, + { + "epoch": 0.11880409126671912, + "grad_norm": 0.30078125, + "learning_rate": 5.871607054625497e-06, + "loss": 0.5549, + 
"num_tokens": 66215206.0, + "step": 151 + }, + { + "epoch": 0.11959087332808813, + "grad_norm": 0.2890625, + "learning_rate": 5.822150807696443e-06, + "loss": 0.5653, + "num_tokens": 66640758.0, + "step": 152 + }, + { + "epoch": 0.12037765538945712, + "grad_norm": 0.306640625, + "learning_rate": 5.772655416798972e-06, + "loss": 0.5904, + "num_tokens": 67055455.0, + "step": 153 + }, + { + "epoch": 0.12116443745082613, + "grad_norm": 0.296875, + "learning_rate": 5.723126896029501e-06, + "loss": 0.5743, + "num_tokens": 67483884.0, + "step": 154 + }, + { + "epoch": 0.12195121951219512, + "grad_norm": 0.28125, + "learning_rate": 5.6735712635099975e-06, + "loss": 0.568, + "num_tokens": 67926609.0, + "step": 155 + }, + { + "epoch": 0.12273800157356413, + "grad_norm": 0.2734375, + "learning_rate": 5.62399454065673e-06, + "loss": 0.5768, + "num_tokens": 68392157.0, + "step": 156 + }, + { + "epoch": 0.12352478363493312, + "grad_norm": 0.287109375, + "learning_rate": 5.574402751448614e-06, + "loss": 0.5595, + "num_tokens": 68834479.0, + "step": 157 + }, + { + "epoch": 0.12431156569630213, + "grad_norm": 0.28515625, + "learning_rate": 5.524801921695253e-06, + "loss": 0.5762, + "num_tokens": 69263714.0, + "step": 158 + }, + { + "epoch": 0.12509834775767112, + "grad_norm": 0.275390625, + "learning_rate": 5.475198078304749e-06, + "loss": 0.5703, + "num_tokens": 69766869.0, + "step": 159 + }, + { + "epoch": 0.12588512981904013, + "grad_norm": 0.5703125, + "learning_rate": 5.4255972485513875e-06, + "loss": 0.5676, + "num_tokens": 70234072.0, + "step": 160 + }, + { + "epoch": 0.12667191188040913, + "grad_norm": 0.33984375, + "learning_rate": 5.376005459343272e-06, + "loss": 0.5646, + "num_tokens": 70568089.0, + "step": 161 + }, + { + "epoch": 0.12745869394177814, + "grad_norm": 0.29296875, + "learning_rate": 5.326428736490002e-06, + "loss": 0.5526, + "num_tokens": 70975207.0, + "step": 162 + }, + { + "epoch": 0.12824547600314712, + "grad_norm": 0.265625, + "learning_rate": 
5.2768731039705005e-06, + "loss": 0.5821, + "num_tokens": 71474447.0, + "step": 163 + }, + { + "epoch": 0.12903225806451613, + "grad_norm": 0.48046875, + "learning_rate": 5.227344583201031e-06, + "loss": 0.5662, + "num_tokens": 71959059.0, + "step": 164 + }, + { + "epoch": 0.12981904012588513, + "grad_norm": 0.306640625, + "learning_rate": 5.17784919230356e-06, + "loss": 0.5653, + "num_tokens": 72364485.0, + "step": 165 + }, + { + "epoch": 0.13060582218725414, + "grad_norm": 0.30859375, + "learning_rate": 5.1283929453745055e-06, + "loss": 0.569, + "num_tokens": 72743904.0, + "step": 166 + }, + { + "epoch": 0.13139260424862312, + "grad_norm": 0.2734375, + "learning_rate": 5.0789818517539715e-06, + "loss": 0.5833, + "num_tokens": 73214490.0, + "step": 167 + }, + { + "epoch": 0.13217938630999213, + "grad_norm": 0.30859375, + "learning_rate": 5.02962191529556e-06, + "loss": 0.5677, + "num_tokens": 73626086.0, + "step": 168 + }, + { + "epoch": 0.13296616837136113, + "grad_norm": 0.318359375, + "learning_rate": 4.980319133636863e-06, + "loss": 0.5839, + "num_tokens": 74019455.0, + "step": 169 + }, + { + "epoch": 0.13375295043273014, + "grad_norm": 0.29296875, + "learning_rate": 4.9310794974706926e-06, + "loss": 0.583, + "num_tokens": 74481873.0, + "step": 170 + }, + { + "epoch": 0.13453973249409915, + "grad_norm": 0.298828125, + "learning_rate": 4.881908989817163e-06, + "loss": 0.5567, + "num_tokens": 74879232.0, + "step": 171 + }, + { + "epoch": 0.13532651455546812, + "grad_norm": 0.298828125, + "learning_rate": 4.832813585296711e-06, + "loss": 0.5718, + "num_tokens": 75325621.0, + "step": 172 + }, + { + "epoch": 0.13611329661683713, + "grad_norm": 0.30078125, + "learning_rate": 4.783799249404123e-06, + "loss": 0.5743, + "num_tokens": 75719223.0, + "step": 173 + }, + { + "epoch": 0.13690007867820614, + "grad_norm": 0.314453125, + "learning_rate": 4.734871937783683e-06, + "loss": 0.5742, + "num_tokens": 76143196.0, + "step": 174 + }, + { + "epoch": 0.13768686073957515, + 
"grad_norm": 0.291015625, + "learning_rate": 4.686037595505507e-06, + "loss": 0.588, + "num_tokens": 76585451.0, + "step": 175 + }, + { + "epoch": 0.13847364280094415, + "grad_norm": 0.279296875, + "learning_rate": 4.637302156343178e-06, + "loss": 0.5555, + "num_tokens": 77054043.0, + "step": 176 + }, + { + "epoch": 0.13926042486231313, + "grad_norm": 0.291015625, + "learning_rate": 4.588671542052737e-06, + "loss": 0.5721, + "num_tokens": 77503285.0, + "step": 177 + }, + { + "epoch": 0.14004720692368214, + "grad_norm": 0.27734375, + "learning_rate": 4.54015166165314e-06, + "loss": 0.5688, + "num_tokens": 77954347.0, + "step": 178 + }, + { + "epoch": 0.14083398898505115, + "grad_norm": 0.306640625, + "learning_rate": 4.491748410708268e-06, + "loss": 0.5803, + "num_tokens": 78357424.0, + "step": 179 + }, + { + "epoch": 0.14162077104642015, + "grad_norm": 0.302734375, + "learning_rate": 4.4434676706105665e-06, + "loss": 0.5747, + "num_tokens": 78784439.0, + "step": 180 + }, + { + "epoch": 0.14240755310778913, + "grad_norm": 0.3046875, + "learning_rate": 4.395315307866404e-06, + "loss": 0.564, + "num_tokens": 79210285.0, + "step": 181 + }, + { + "epoch": 0.14319433516915814, + "grad_norm": 0.283203125, + "learning_rate": 4.347297173383248e-06, + "loss": 0.5558, + "num_tokens": 79630366.0, + "step": 182 + }, + { + "epoch": 0.14398111723052714, + "grad_norm": 0.306640625, + "learning_rate": 4.299419101758733e-06, + "loss": 0.5728, + "num_tokens": 80044690.0, + "step": 183 + }, + { + "epoch": 0.14476789929189615, + "grad_norm": 0.265625, + "learning_rate": 4.2516869105717e-06, + "loss": 0.5898, + "num_tokens": 80557643.0, + "step": 184 + }, + { + "epoch": 0.14555468135326516, + "grad_norm": 0.30859375, + "learning_rate": 4.204106399675324e-06, + "loss": 0.5807, + "num_tokens": 80973066.0, + "step": 185 + }, + { + "epoch": 0.14634146341463414, + "grad_norm": 0.2578125, + "learning_rate": 4.156683350492376e-06, + "loss": 0.5606, + "num_tokens": 81510795.0, + "step": 186 + 
}, + { + "epoch": 0.14712824547600314, + "grad_norm": 0.296875, + "learning_rate": 4.109423525312738e-06, + "loss": 0.5899, + "num_tokens": 81922256.0, + "step": 187 + }, + { + "epoch": 0.14791502753737215, + "grad_norm": 0.28515625, + "learning_rate": 4.062332666593234e-06, + "loss": 0.5633, + "num_tokens": 82388653.0, + "step": 188 + }, + { + "epoch": 0.14870180959874116, + "grad_norm": 0.291015625, + "learning_rate": 4.015416496259878e-06, + "loss": 0.559, + "num_tokens": 82787043.0, + "step": 189 + }, + { + "epoch": 0.14948859166011014, + "grad_norm": 0.271484375, + "learning_rate": 3.968680715012606e-06, + "loss": 0.5568, + "num_tokens": 83289042.0, + "step": 190 + }, + { + "epoch": 0.15027537372147914, + "grad_norm": 0.314453125, + "learning_rate": 3.922131001632607e-06, + "loss": 0.5856, + "num_tokens": 83677670.0, + "step": 191 + }, + { + "epoch": 0.15106215578284815, + "grad_norm": 0.341796875, + "learning_rate": 3.875773012292286e-06, + "loss": 0.5845, + "num_tokens": 84134629.0, + "step": 192 + }, + { + "epoch": 0.15184893784421716, + "grad_norm": 0.263671875, + "learning_rate": 3.829612379868006e-06, + "loss": 0.5485, + "num_tokens": 84652172.0, + "step": 193 + }, + { + "epoch": 0.15263571990558616, + "grad_norm": 0.2734375, + "learning_rate": 3.7836547132556534e-06, + "loss": 0.5595, + "num_tokens": 85135514.0, + "step": 194 + }, + { + "epoch": 0.15342250196695514, + "grad_norm": 0.294921875, + "learning_rate": 3.73790559668909e-06, + "loss": 0.5513, + "num_tokens": 85538238.0, + "step": 195 + }, + { + "epoch": 0.15420928402832415, + "grad_norm": 0.294921875, + "learning_rate": 3.692370589061639e-06, + "loss": 0.5643, + "num_tokens": 85964644.0, + "step": 196 + }, + { + "epoch": 0.15499606608969316, + "grad_norm": 0.29296875, + "learning_rate": 3.6470552232506282e-06, + "loss": 0.5812, + "num_tokens": 86406088.0, + "step": 197 + }, + { + "epoch": 0.15578284815106216, + "grad_norm": 0.310546875, + "learning_rate": 3.601965005445106e-06, + "loss": 0.58, 
+ "num_tokens": 86795385.0, + "step": 198 + }, + { + "epoch": 0.15656963021243114, + "grad_norm": 0.2734375, + "learning_rate": 3.5571054144767823e-06, + "loss": 0.5695, + "num_tokens": 87271627.0, + "step": 199 + }, + { + "epoch": 0.15735641227380015, + "grad_norm": 0.3359375, + "learning_rate": 3.5124819011543177e-06, + "loss": 0.5785, + "num_tokens": 87610861.0, + "step": 200 + }, + { + "epoch": 0.15814319433516916, + "grad_norm": 0.2890625, + "learning_rate": 3.468099887600999e-06, + "loss": 0.5551, + "num_tokens": 88074052.0, + "step": 201 + }, + { + "epoch": 0.15892997639653816, + "grad_norm": 0.2890625, + "learning_rate": 3.423964766595906e-06, + "loss": 0.5936, + "num_tokens": 88524495.0, + "step": 202 + }, + { + "epoch": 0.15971675845790717, + "grad_norm": 0.275390625, + "learning_rate": 3.380081900918648e-06, + "loss": 0.5787, + "num_tokens": 89019306.0, + "step": 203 + }, + { + "epoch": 0.16050354051927615, + "grad_norm": 0.314453125, + "learning_rate": 3.3364566226977414e-06, + "loss": 0.563, + "num_tokens": 89502903.0, + "step": 204 + }, + { + "epoch": 0.16129032258064516, + "grad_norm": 0.6328125, + "learning_rate": 3.293094232762715e-06, + "loss": 0.5732, + "num_tokens": 89911337.0, + "step": 205 + }, + { + "epoch": 0.16207710464201416, + "grad_norm": 0.271484375, + "learning_rate": 3.2500000000000015e-06, + "loss": 0.5682, + "num_tokens": 90392411.0, + "step": 206 + }, + { + "epoch": 0.16286388670338317, + "grad_norm": 0.30859375, + "learning_rate": 3.207179160712744e-06, + "loss": 0.5702, + "num_tokens": 90806043.0, + "step": 207 + }, + { + "epoch": 0.16365066876475218, + "grad_norm": 0.31640625, + "learning_rate": 3.1646369179845336e-06, + "loss": 0.5928, + "num_tokens": 91216707.0, + "step": 208 + }, + { + "epoch": 0.16443745082612116, + "grad_norm": 0.283203125, + "learning_rate": 3.1223784410471857e-06, + "loss": 0.5525, + "num_tokens": 91655883.0, + "step": 209 + }, + { + "epoch": 0.16522423288749016, + "grad_norm": 0.34375, + "learning_rate": 
3.0804088646526488e-06, + "loss": 0.5697, + "num_tokens": 92157394.0, + "step": 210 + }, + { + "epoch": 0.16601101494885917, + "grad_norm": 0.283203125, + "learning_rate": 3.0387332884490806e-06, + "loss": 0.5829, + "num_tokens": 92616352.0, + "step": 211 + }, + { + "epoch": 0.16679779701022818, + "grad_norm": 0.30078125, + "learning_rate": 2.9973567763611975e-06, + "loss": 0.5913, + "num_tokens": 93057214.0, + "step": 212 + }, + { + "epoch": 0.16758457907159716, + "grad_norm": 0.3359375, + "learning_rate": 2.9562843559749765e-06, + "loss": 0.5842, + "num_tokens": 93505292.0, + "step": 213 + }, + { + "epoch": 0.16837136113296616, + "grad_norm": 0.296875, + "learning_rate": 2.9155210179267546e-06, + "loss": 0.5798, + "num_tokens": 93947491.0, + "step": 214 + }, + { + "epoch": 0.16915814319433517, + "grad_norm": 0.291015625, + "learning_rate": 2.8750717152968226e-06, + "loss": 0.5765, + "num_tokens": 94385854.0, + "step": 215 + }, + { + "epoch": 0.16994492525570418, + "grad_norm": 0.279296875, + "learning_rate": 2.8349413630075907e-06, + "loss": 0.5708, + "num_tokens": 94879232.0, + "step": 216 + }, + { + "epoch": 0.17073170731707318, + "grad_norm": 0.283203125, + "learning_rate": 2.7951348372263875e-06, + "loss": 0.5776, + "num_tokens": 95323846.0, + "step": 217 + }, + { + "epoch": 0.17151848937844216, + "grad_norm": 0.2734375, + "learning_rate": 2.75565697477296e-06, + "loss": 0.5687, + "num_tokens": 95785832.0, + "step": 218 + }, + { + "epoch": 0.17230527143981117, + "grad_norm": 0.29296875, + "learning_rate": 2.716512572531759e-06, + "loss": 0.5851, + "num_tokens": 96209254.0, + "step": 219 + }, + { + "epoch": 0.17309205350118018, + "grad_norm": 0.294921875, + "learning_rate": 2.677706386869083e-06, + "loss": 0.5794, + "num_tokens": 96626894.0, + "step": 220 + }, + { + "epoch": 0.17387883556254918, + "grad_norm": 0.302734375, + "learning_rate": 2.639243133055145e-06, + "loss": 0.5668, + "num_tokens": 97033416.0, + "step": 221 + }, + { + "epoch": 
0.17466561762391816, + "grad_norm": 0.2890625, + "learning_rate": 2.6011274846911117e-06, + "loss": 0.5687, + "num_tokens": 97471843.0, + "step": 222 + }, + { + "epoch": 0.17545239968528717, + "grad_norm": 0.271484375, + "learning_rate": 2.5633640731412414e-06, + "loss": 0.5308, + "num_tokens": 97935067.0, + "step": 223 + }, + { + "epoch": 0.17623918174665618, + "grad_norm": 0.275390625, + "learning_rate": 2.5259574869701252e-06, + "loss": 0.5587, + "num_tokens": 98406708.0, + "step": 224 + }, + { + "epoch": 0.17702596380802518, + "grad_norm": 0.2890625, + "learning_rate": 2.4889122713851397e-06, + "loss": 0.5912, + "num_tokens": 98895234.0, + "step": 225 + }, + { + "epoch": 0.1778127458693942, + "grad_norm": 0.298828125, + "learning_rate": 2.4522329276841664e-06, + "loss": 0.5614, + "num_tokens": 99347577.0, + "step": 226 + }, + { + "epoch": 0.17859952793076317, + "grad_norm": 0.265625, + "learning_rate": 2.415923912708652e-06, + "loss": 0.5617, + "num_tokens": 99798396.0, + "step": 227 + }, + { + "epoch": 0.17938630999213218, + "grad_norm": 0.291015625, + "learning_rate": 2.379989638302062e-06, + "loss": 0.5726, + "num_tokens": 100234238.0, + "step": 228 + }, + { + "epoch": 0.18017309205350118, + "grad_norm": 0.318359375, + "learning_rate": 2.3444344707738017e-06, + "loss": 0.5679, + "num_tokens": 100582182.0, + "step": 229 + }, + { + "epoch": 0.1809598741148702, + "grad_norm": 0.310546875, + "learning_rate": 2.3092627303686827e-06, + "loss": 0.5707, + "num_tokens": 101009465.0, + "step": 230 + }, + { + "epoch": 0.18174665617623917, + "grad_norm": 0.30859375, + "learning_rate": 2.2744786907419704e-06, + "loss": 0.5719, + "num_tokens": 101390878.0, + "step": 231 + }, + { + "epoch": 0.18253343823760818, + "grad_norm": 0.29296875, + "learning_rate": 2.2400865784401e-06, + "loss": 0.579, + "num_tokens": 101822510.0, + "step": 232 + }, + { + "epoch": 0.18332022029897718, + "grad_norm": 0.306640625, + "learning_rate": 2.2060905723871225e-06, + "loss": 0.5608, + 
"num_tokens": 102238479.0, + "step": 233 + }, + { + "epoch": 0.1841070023603462, + "grad_norm": 0.275390625, + "learning_rate": 2.1724948033769257e-06, + "loss": 0.5843, + "num_tokens": 102714846.0, + "step": 234 + }, + { + "epoch": 0.1848937844217152, + "grad_norm": 0.357421875, + "learning_rate": 2.139303353571309e-06, + "loss": 0.5772, + "num_tokens": 103153235.0, + "step": 235 + }, + { + "epoch": 0.18568056648308418, + "grad_norm": 0.302734375, + "learning_rate": 2.1065202560039678e-06, + "loss": 0.5757, + "num_tokens": 103556743.0, + "step": 236 + }, + { + "epoch": 0.18646734854445318, + "grad_norm": 0.2890625, + "learning_rate": 2.0741494940904495e-06, + "loss": 0.5741, + "num_tokens": 103996580.0, + "step": 237 + }, + { + "epoch": 0.1872541306058222, + "grad_norm": 0.279296875, + "learning_rate": 2.0421950011441354e-06, + "loss": 0.5644, + "num_tokens": 104425967.0, + "step": 238 + }, + { + "epoch": 0.1880409126671912, + "grad_norm": 0.28515625, + "learning_rate": 2.0106606598983036e-06, + "loss": 0.5633, + "num_tokens": 104844108.0, + "step": 239 + }, + { + "epoch": 0.1888276947285602, + "grad_norm": 0.279296875, + "learning_rate": 1.9795503020343557e-06, + "loss": 0.5896, + "num_tokens": 105302805.0, + "step": 240 + }, + { + "epoch": 0.18961447678992918, + "grad_norm": 0.33203125, + "learning_rate": 1.94886770771623e-06, + "loss": 0.5854, + "num_tokens": 105708319.0, + "step": 241 + }, + { + "epoch": 0.1904012588512982, + "grad_norm": 0.296875, + "learning_rate": 1.9186166051310772e-06, + "loss": 0.5613, + "num_tokens": 106112247.0, + "step": 242 + }, + { + "epoch": 0.1911880409126672, + "grad_norm": 0.267578125, + "learning_rate": 1.8888006700362654e-06, + "loss": 0.5528, + "num_tokens": 106610177.0, + "step": 243 + }, + { + "epoch": 0.1919748229740362, + "grad_norm": 0.3125, + "learning_rate": 1.8594235253127373e-06, + "loss": 0.5574, + "num_tokens": 106966568.0, + "step": 244 + }, + { + "epoch": 0.19276160503540518, + "grad_norm": 0.275390625, + 
"learning_rate": 1.8304887405247986e-06, + "loss": 0.5738, + "num_tokens": 107408366.0, + "step": 245 + }, + { + "epoch": 0.1935483870967742, + "grad_norm": 0.26953125, + "learning_rate": 1.8019998314863974e-06, + "loss": 0.5744, + "num_tokens": 107911855.0, + "step": 246 + }, + { + "epoch": 0.1943351691581432, + "grad_norm": 0.314453125, + "learning_rate": 1.77396025983391e-06, + "loss": 0.5776, + "num_tokens": 108341668.0, + "step": 247 + }, + { + "epoch": 0.1951219512195122, + "grad_norm": 0.27734375, + "learning_rate": 1.7463734326055365e-06, + "loss": 0.5808, + "num_tokens": 108836854.0, + "step": 248 + }, + { + "epoch": 0.1959087332808812, + "grad_norm": 0.34765625, + "learning_rate": 1.7192427018273066e-06, + "loss": 0.5761, + "num_tokens": 109245431.0, + "step": 249 + }, + { + "epoch": 0.1966955153422502, + "grad_norm": 0.275390625, + "learning_rate": 1.6925713641057904e-06, + "loss": 0.5597, + "num_tokens": 109703188.0, + "step": 250 + }, + { + "epoch": 0.1974822974036192, + "grad_norm": 0.29296875, + "learning_rate": 1.666362660227529e-06, + "loss": 0.5857, + "num_tokens": 110128169.0, + "step": 251 + }, + { + "epoch": 0.1982690794649882, + "grad_norm": 0.287109375, + "learning_rate": 1.6406197747652485e-06, + "loss": 0.5891, + "num_tokens": 110581824.0, + "step": 252 + }, + { + "epoch": 0.1990558615263572, + "grad_norm": 0.29296875, + "learning_rate": 1.6153458356909177e-06, + "loss": 0.5529, + "num_tokens": 111007577.0, + "step": 253 + }, + { + "epoch": 0.1998426435877262, + "grad_norm": 0.263671875, + "learning_rate": 1.590543913995666e-06, + "loss": 0.5472, + "num_tokens": 111486780.0, + "step": 254 + }, + { + "epoch": 0.2006294256490952, + "grad_norm": 0.302734375, + "learning_rate": 1.5662170233166353e-06, + "loss": 0.5905, + "num_tokens": 111908526.0, + "step": 255 + }, + { + "epoch": 0.2014162077104642, + "grad_norm": 0.294921875, + "learning_rate": 1.5423681195707997e-06, + "loss": 0.5717, + "num_tokens": 112333164.0, + "step": 256 + }, + { + 
"epoch": 0.2022029897718332, + "grad_norm": 0.275390625, + "learning_rate": 1.5190001005957938e-06, + "loss": 0.5803, + "num_tokens": 112818479.0, + "step": 257 + }, + { + "epoch": 0.20298977183320221, + "grad_norm": 0.298828125, + "learning_rate": 1.4961158057978064e-06, + "loss": 0.5808, + "num_tokens": 113208745.0, + "step": 258 + }, + { + "epoch": 0.2037765538945712, + "grad_norm": 0.30859375, + "learning_rate": 1.4737180158065645e-06, + "loss": 0.6051, + "num_tokens": 113612135.0, + "step": 259 + }, + { + "epoch": 0.2045633359559402, + "grad_norm": 0.294921875, + "learning_rate": 1.4518094521374682e-06, + "loss": 0.5936, + "num_tokens": 114022639.0, + "step": 260 + }, + { + "epoch": 0.2053501180173092, + "grad_norm": 0.28125, + "learning_rate": 1.4303927768609016e-06, + "loss": 0.5694, + "num_tokens": 114460242.0, + "step": 261 + }, + { + "epoch": 0.20613690007867821, + "grad_norm": 0.267578125, + "learning_rate": 1.4094705922787688e-06, + "loss": 0.5386, + "num_tokens": 114905828.0, + "step": 262 + }, + { + "epoch": 0.2069236821400472, + "grad_norm": 0.28515625, + "learning_rate": 1.389045440608296e-06, + "loss": 0.5655, + "num_tokens": 115362240.0, + "step": 263 + }, + { + "epoch": 0.2077104642014162, + "grad_norm": 0.2890625, + "learning_rate": 1.3691198036731285e-06, + "loss": 0.5915, + "num_tokens": 115822643.0, + "step": 264 + }, + { + "epoch": 0.2084972462627852, + "grad_norm": 0.26953125, + "learning_rate": 1.3496961026017689e-06, + "loss": 0.5665, + "num_tokens": 116307039.0, + "step": 265 + }, + { + "epoch": 0.20928402832415421, + "grad_norm": 0.30078125, + "learning_rate": 1.3307766975333922e-06, + "loss": 0.5882, + "num_tokens": 116705870.0, + "step": 266 + }, + { + "epoch": 0.21007081038552322, + "grad_norm": 0.28515625, + "learning_rate": 1.3123638873310676e-06, + "loss": 0.5847, + "num_tokens": 117156035.0, + "step": 267 + }, + { + "epoch": 0.2108575924468922, + "grad_norm": 0.30859375, + "learning_rate": 1.2944599093024268e-06, + "loss": 
0.5662, + "num_tokens": 117525103.0, + "step": 268 + }, + { + "epoch": 0.2116443745082612, + "grad_norm": 0.302734375, + "learning_rate": 1.277066938927816e-06, + "loss": 0.5949, + "num_tokens": 117912232.0, + "step": 269 + }, + { + "epoch": 0.21243115656963021, + "grad_norm": 0.279296875, + "learning_rate": 1.260187089595956e-06, + "loss": 0.5724, + "num_tokens": 118347405.0, + "step": 270 + }, + { + "epoch": 0.21321793863099922, + "grad_norm": 0.3203125, + "learning_rate": 1.2438224123471442e-06, + "loss": 0.6001, + "num_tokens": 118728218.0, + "step": 271 + }, + { + "epoch": 0.21400472069236823, + "grad_norm": 0.345703125, + "learning_rate": 1.2279748956240435e-06, + "loss": 0.5706, + "num_tokens": 119114776.0, + "step": 272 + }, + { + "epoch": 0.2147915027537372, + "grad_norm": 0.279296875, + "learning_rate": 1.2126464650300652e-06, + "loss": 0.5783, + "num_tokens": 119617143.0, + "step": 273 + }, + { + "epoch": 0.2155782848151062, + "grad_norm": 0.294921875, + "learning_rate": 1.1978389830953908e-06, + "loss": 0.5722, + "num_tokens": 120024450.0, + "step": 274 + }, + { + "epoch": 0.21636506687647522, + "grad_norm": 0.30859375, + "learning_rate": 1.1835542490506658e-06, + "loss": 0.5742, + "num_tokens": 120395245.0, + "step": 275 + }, + { + "epoch": 0.21715184893784423, + "grad_norm": 0.294921875, + "learning_rate": 1.1697939986083732e-06, + "loss": 0.565, + "num_tokens": 120854077.0, + "step": 276 + }, + { + "epoch": 0.2179386309992132, + "grad_norm": 0.30859375, + "learning_rate": 1.1565599037519317e-06, + "loss": 0.5903, + "num_tokens": 121277050.0, + "step": 277 + }, + { + "epoch": 0.2187254130605822, + "grad_norm": 0.30078125, + "learning_rate": 1.1438535725325342e-06, + "loss": 0.5744, + "num_tokens": 121651600.0, + "step": 278 + }, + { + "epoch": 0.21951219512195122, + "grad_norm": 0.47265625, + "learning_rate": 1.1316765488737602e-06, + "loss": 0.5641, + "num_tokens": 122132040.0, + "step": 279 + }, + { + "epoch": 0.22029897718332023, + "grad_norm": 
0.275390625, + "learning_rate": 1.1200303123839744e-06, + "loss": 0.566, + "num_tokens": 122653140.0, + "step": 280 + }, + { + "epoch": 0.22108575924468923, + "grad_norm": 0.296875, + "learning_rate": 1.10891627817654e-06, + "loss": 0.5721, + "num_tokens": 123073782.0, + "step": 281 + }, + { + "epoch": 0.2218725413060582, + "grad_norm": 0.3671875, + "learning_rate": 1.0983357966978747e-06, + "loss": 0.6001, + "num_tokens": 123501000.0, + "step": 282 + }, + { + "epoch": 0.22265932336742722, + "grad_norm": 0.279296875, + "learning_rate": 1.088290153563358e-06, + "loss": 0.5673, + "num_tokens": 123967087.0, + "step": 283 + }, + { + "epoch": 0.22344610542879623, + "grad_norm": 0.271484375, + "learning_rate": 1.0787805694011185e-06, + "loss": 0.5559, + "num_tokens": 124432312.0, + "step": 284 + }, + { + "epoch": 0.22423288749016523, + "grad_norm": 0.30078125, + "learning_rate": 1.0698081997037178e-06, + "loss": 0.5893, + "num_tokens": 124898330.0, + "step": 285 + }, + { + "epoch": 0.2250196695515342, + "grad_norm": 0.296875, + "learning_rate": 1.0613741346877498e-06, + "loss": 0.5695, + "num_tokens": 125305283.0, + "step": 286 + }, + { + "epoch": 0.22580645161290322, + "grad_norm": 0.29296875, + "learning_rate": 1.053479399161368e-06, + "loss": 0.5538, + "num_tokens": 125723575.0, + "step": 287 + }, + { + "epoch": 0.22659323367427223, + "grad_norm": 0.314453125, + "learning_rate": 1.0461249523997647e-06, + "loss": 0.5838, + "num_tokens": 126109744.0, + "step": 288 + }, + { + "epoch": 0.22738001573564123, + "grad_norm": 0.29296875, + "learning_rate": 1.0393116880286117e-06, + "loss": 0.5723, + "num_tokens": 126532859.0, + "step": 289 + }, + { + "epoch": 0.22816679779701024, + "grad_norm": 0.2890625, + "learning_rate": 1.0330404339154763e-06, + "loss": 0.5648, + "num_tokens": 126984590.0, + "step": 290 + }, + { + "epoch": 0.22895357985837922, + "grad_norm": 0.291015625, + "learning_rate": 1.0273119520692274e-06, + "loss": 0.5862, + "num_tokens": 127423409.0, + "step": 291 
+ }, + { + "epoch": 0.22974036191974823, + "grad_norm": 0.28515625, + "learning_rate": 1.0221269385474486e-06, + "loss": 0.541, + "num_tokens": 127896042.0, + "step": 292 + }, + { + "epoch": 0.23052714398111723, + "grad_norm": 0.296875, + "learning_rate": 1.0174860233718585e-06, + "loss": 0.5795, + "num_tokens": 128311556.0, + "step": 293 + }, + { + "epoch": 0.23131392604248624, + "grad_norm": 0.296875, + "learning_rate": 1.0133897704517585e-06, + "loss": 0.5622, + "num_tokens": 128702300.0, + "step": 294 + }, + { + "epoch": 0.23210070810385522, + "grad_norm": 0.275390625, + "learning_rate": 1.0098386775155147e-06, + "loss": 0.5686, + "num_tokens": 129182359.0, + "step": 295 + }, + { + "epoch": 0.23288749016522423, + "grad_norm": 0.296875, + "learning_rate": 1.0068331760500773e-06, + "loss": 0.5527, + "num_tokens": 129563093.0, + "step": 296 + }, + { + "epoch": 0.23367427222659323, + "grad_norm": 0.296875, + "learning_rate": 1.0043736312485536e-06, + "loss": 0.5668, + "num_tokens": 129956060.0, + "step": 297 + }, + { + "epoch": 0.23446105428796224, + "grad_norm": 0.271484375, + "learning_rate": 1.0024603419658329e-06, + "loss": 0.5481, + "num_tokens": 130445808.0, + "step": 298 + }, + { + "epoch": 0.23524783634933125, + "grad_norm": 0.28515625, + "learning_rate": 1.0010935406822748e-06, + "loss": 0.567, + "num_tokens": 130875660.0, + "step": 299 + }, + { + "epoch": 0.23603461841070023, + "grad_norm": 0.30078125, + "learning_rate": 1.0002733934754567e-06, + "loss": 0.5759, + "num_tokens": 131289617.0, + "step": 300 + } + ], + "logging_steps": 1, + "max_steps": 300, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 2.862254349520732e+18, + "train_batch_size": 1, + "trial_name": null, + "trial_params": 
null +} diff --git a/checkpoint-300/training_args.bin b/checkpoint-300/training_args.bin new file mode 100644 index 0000000..b669918 --- /dev/null +++ b/checkpoint-300/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b4e15c7bb0e02b9d7b5a9b58b0a7d2298c0df7decebb7c6298844e7f2abf8317 +size 6929 diff --git a/checkpoint-300/vocab.json b/checkpoint-300/vocab.json new file mode 100644 index 0000000..6c49fc6 --- /dev/null +++ b/checkpoint-300/vocab.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ca10d7e9fb3ed18575dd1e277a2579c16d108e32f27439684afa0e10b1440910 +size 2776833 diff --git a/config.json b/config.json new file mode 100644 index 0000000..d43df39 --- /dev/null +++ b/config.json @@ -0,0 +1,68 @@ +{ + "architectures": [ + "Qwen3ForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "dtype": "bfloat16", + "eos_token_id": 151645, + "head_dim": 128, + "hidden_act": "silu", + "hidden_size": 2560, + "initializer_range": 0.02, + "intermediate_size": 9728, + "layer_types": [ + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention" + ], + "max_position_embeddings": 40960, + "max_window_layers": 36, + "model_type": "qwen3", + "num_attention_heads": 32, + "num_hidden_layers": 36, + "num_key_value_heads": 8, + "pad_token_id": 151643, + 
"rms_norm_eps": 1e-06, + "rope_scaling": null, + "rope_theta": 1000000, + "sliding_window": null, + "tie_word_embeddings": true, + "transformers_version": "4.57.1", + "use_cache": false, + "use_sliding_window": false, + "vocab_size": 151936 +} diff --git a/configuration.json b/configuration.json new file mode 100644 index 0000000..bbeeda1 --- /dev/null +++ b/configuration.json @@ -0,0 +1 @@ +{"framework": "pytorch", "task": "text-generation", "allow_remote": true} \ No newline at end of file diff --git a/generation_config.json b/generation_config.json new file mode 100644 index 0000000..d121c2f --- /dev/null +++ b/generation_config.json @@ -0,0 +1,12 @@ +{ + "do_sample": true, + "eos_token_id": [ + 151645, + 151643 + ], + "pad_token_id": 151643, + "temperature": 0.6, + "top_k": 20, + "top_p": 0.95, + "transformers_version": "4.57.1" +} diff --git a/merges.txt b/merges.txt new file mode 100644 index 0000000..80c1a19 --- /dev/null +++ b/merges.txt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8831e4f1a044471340f7c0a83d7bd71306a5b867e95fd870f74d0c5308a904d5 +size 1671853 diff --git a/model-00001-of-00002.safetensors b/model-00001-of-00002.safetensors new file mode 100644 index 0000000..5b36131 --- /dev/null +++ b/model-00001-of-00002.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:aebf0a209015e60dfc5b0cc0b21d5a1e60711cba94717c9ad21e4bfc5187015e +size 4967215360 diff --git a/model-00002-of-00002.safetensors b/model-00002-of-00002.safetensors new file mode 100644 index 0000000..a871a82 --- /dev/null +++ b/model-00002-of-00002.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b92b40990ab20aae796f691ab7c429004391ce1ffb6543d28f0eb5152b153a8f +size 3077766632 diff --git a/model.safetensors.index.json b/model.safetensors.index.json new file mode 100644 index 0000000..b65d806 --- /dev/null +++ b/model.safetensors.index.json @@ -0,0 +1,406 @@ +{ + "metadata": { + "total_parameters": 
4022468096, + "total_size": 8044936192 + }, + "weight_map": { + "model.embed_tokens.weight": "model-00001-of-00002.safetensors", + "model.layers.0.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.0.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.0.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.0.self_attn.k_norm.weight": "model-00001-of-00002.safetensors", + "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.0.self_attn.q_norm.weight": "model-00001-of-00002.safetensors", + "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.1.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.1.self_attn.k_norm.weight": "model-00001-of-00002.safetensors", + "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.self_attn.q_norm.weight": "model-00001-of-00002.safetensors", + "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.input_layernorm.weight": "model-00001-of-00002.safetensors", + 
"model.layers.10.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.10.self_attn.k_norm.weight": "model-00001-of-00002.safetensors", + "model.layers.10.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.self_attn.q_norm.weight": "model-00001-of-00002.safetensors", + "model.layers.10.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.11.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.11.self_attn.k_norm.weight": "model-00001-of-00002.safetensors", + "model.layers.11.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.self_attn.q_norm.weight": "model-00001-of-00002.safetensors", + "model.layers.11.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.12.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.mlp.up_proj.weight": 
"model-00001-of-00002.safetensors", + "model.layers.12.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.12.self_attn.k_norm.weight": "model-00001-of-00002.safetensors", + "model.layers.12.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.self_attn.q_norm.weight": "model-00001-of-00002.safetensors", + "model.layers.12.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.13.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.13.self_attn.k_norm.weight": "model-00001-of-00002.safetensors", + "model.layers.13.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.self_attn.q_norm.weight": "model-00001-of-00002.safetensors", + "model.layers.13.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.14.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.14.self_attn.k_norm.weight": "model-00001-of-00002.safetensors", + 
"model.layers.14.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.self_attn.q_norm.weight": "model-00001-of-00002.safetensors", + "model.layers.14.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.15.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.15.self_attn.k_norm.weight": "model-00001-of-00002.safetensors", + "model.layers.15.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.self_attn.q_norm.weight": "model-00001-of-00002.safetensors", + "model.layers.15.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.16.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.16.self_attn.k_norm.weight": "model-00001-of-00002.safetensors", + "model.layers.16.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.self_attn.q_norm.weight": 
"model-00001-of-00002.safetensors", + "model.layers.16.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.17.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.17.self_attn.k_norm.weight": "model-00001-of-00002.safetensors", + "model.layers.17.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.self_attn.q_norm.weight": "model-00001-of-00002.safetensors", + "model.layers.17.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.18.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.18.self_attn.k_norm.weight": "model-00001-of-00002.safetensors", + "model.layers.18.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.self_attn.q_norm.weight": "model-00001-of-00002.safetensors", + "model.layers.18.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + 
"model.layers.19.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.19.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.19.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.19.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.19.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.19.self_attn.k_norm.weight": "model-00001-of-00002.safetensors", + "model.layers.19.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.19.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.19.self_attn.q_norm.weight": "model-00001-of-00002.safetensors", + "model.layers.19.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.19.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.2.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.2.self_attn.k_norm.weight": "model-00001-of-00002.safetensors", + "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.self_attn.q_norm.weight": "model-00001-of-00002.safetensors", + "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.20.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.20.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.20.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + 
"model.layers.20.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.20.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.20.self_attn.k_norm.weight": "model-00001-of-00002.safetensors", + "model.layers.20.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.20.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.20.self_attn.q_norm.weight": "model-00001-of-00002.safetensors", + "model.layers.20.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.20.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.21.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.21.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.21.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.21.mlp.up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.21.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.21.self_attn.k_norm.weight": "model-00002-of-00002.safetensors", + "model.layers.21.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.21.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.21.self_attn.q_norm.weight": "model-00002-of-00002.safetensors", + "model.layers.21.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.21.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.22.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.22.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.22.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.22.mlp.up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.22.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.22.self_attn.k_norm.weight": 
"model-00002-of-00002.safetensors", + "model.layers.22.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.22.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.22.self_attn.q_norm.weight": "model-00002-of-00002.safetensors", + "model.layers.22.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.22.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.23.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.23.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.23.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.23.mlp.up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.23.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.23.self_attn.k_norm.weight": "model-00002-of-00002.safetensors", + "model.layers.23.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.23.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.23.self_attn.q_norm.weight": "model-00002-of-00002.safetensors", + "model.layers.23.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.23.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.24.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.24.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.24.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.24.mlp.up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.24.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.24.self_attn.k_norm.weight": "model-00002-of-00002.safetensors", + "model.layers.24.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.24.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + 
"model.layers.24.self_attn.q_norm.weight": "model-00002-of-00002.safetensors", + "model.layers.24.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.24.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.25.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.25.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.25.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.25.mlp.up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.25.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.25.self_attn.k_norm.weight": "model-00002-of-00002.safetensors", + "model.layers.25.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.25.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.25.self_attn.q_norm.weight": "model-00002-of-00002.safetensors", + "model.layers.25.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.25.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.26.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.26.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.26.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.26.mlp.up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.26.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.26.self_attn.k_norm.weight": "model-00002-of-00002.safetensors", + "model.layers.26.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.26.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.26.self_attn.q_norm.weight": "model-00002-of-00002.safetensors", + "model.layers.26.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.26.self_attn.v_proj.weight": 
"model-00002-of-00002.safetensors", + "model.layers.27.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.27.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.27.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.27.mlp.up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.27.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.27.self_attn.k_norm.weight": "model-00002-of-00002.safetensors", + "model.layers.27.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.27.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.27.self_attn.q_norm.weight": "model-00002-of-00002.safetensors", + "model.layers.27.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.27.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.28.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.28.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.28.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.28.mlp.up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.28.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.28.self_attn.k_norm.weight": "model-00002-of-00002.safetensors", + "model.layers.28.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.28.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.28.self_attn.q_norm.weight": "model-00002-of-00002.safetensors", + "model.layers.28.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.28.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.29.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.29.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + 
"model.layers.29.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.29.mlp.up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.29.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.29.self_attn.k_norm.weight": "model-00002-of-00002.safetensors", + "model.layers.29.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.29.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.29.self_attn.q_norm.weight": "model-00002-of-00002.safetensors", + "model.layers.29.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.29.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.3.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.3.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.3.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.3.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.3.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.3.self_attn.k_norm.weight": "model-00001-of-00002.safetensors", + "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.3.self_attn.q_norm.weight": "model-00001-of-00002.safetensors", + "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.30.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.30.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.30.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.30.mlp.up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.30.post_attention_layernorm.weight": 
"model-00002-of-00002.safetensors", + "model.layers.30.self_attn.k_norm.weight": "model-00002-of-00002.safetensors", + "model.layers.30.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.30.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.30.self_attn.q_norm.weight": "model-00002-of-00002.safetensors", + "model.layers.30.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.30.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.31.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.31.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.31.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.31.mlp.up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.31.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.31.self_attn.k_norm.weight": "model-00002-of-00002.safetensors", + "model.layers.31.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.31.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.31.self_attn.q_norm.weight": "model-00002-of-00002.safetensors", + "model.layers.31.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.31.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.32.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.32.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.32.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.32.mlp.up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.32.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.32.self_attn.k_norm.weight": "model-00002-of-00002.safetensors", + "model.layers.32.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + 
"model.layers.32.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.32.self_attn.q_norm.weight": "model-00002-of-00002.safetensors", + "model.layers.32.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.32.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.33.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.33.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.33.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.33.mlp.up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.33.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.33.self_attn.k_norm.weight": "model-00002-of-00002.safetensors", + "model.layers.33.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.33.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.33.self_attn.q_norm.weight": "model-00002-of-00002.safetensors", + "model.layers.33.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.33.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.34.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.34.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.34.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.34.mlp.up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.34.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.34.self_attn.k_norm.weight": "model-00002-of-00002.safetensors", + "model.layers.34.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.34.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.34.self_attn.q_norm.weight": "model-00002-of-00002.safetensors", + "model.layers.34.self_attn.q_proj.weight": 
"model-00002-of-00002.safetensors", + "model.layers.34.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.35.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.35.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.35.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.35.mlp.up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.35.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.35.self_attn.k_norm.weight": "model-00002-of-00002.safetensors", + "model.layers.35.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.35.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.35.self_attn.q_norm.weight": "model-00002-of-00002.safetensors", + "model.layers.35.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.35.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.4.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.4.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.4.self_attn.k_norm.weight": "model-00001-of-00002.safetensors", + "model.layers.4.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.self_attn.q_norm.weight": "model-00001-of-00002.safetensors", + "model.layers.4.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.5.mlp.down_proj.weight": 
"model-00001-of-00002.safetensors", + "model.layers.5.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.5.self_attn.k_norm.weight": "model-00001-of-00002.safetensors", + "model.layers.5.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.self_attn.q_norm.weight": "model-00001-of-00002.safetensors", + "model.layers.5.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.6.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.6.self_attn.k_norm.weight": "model-00001-of-00002.safetensors", + "model.layers.6.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.self_attn.q_norm.weight": "model-00001-of-00002.safetensors", + "model.layers.6.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.7.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.post_attention_layernorm.weight": 
"model-00001-of-00002.safetensors", + "model.layers.7.self_attn.k_norm.weight": "model-00001-of-00002.safetensors", + "model.layers.7.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.self_attn.q_norm.weight": "model-00001-of-00002.safetensors", + "model.layers.7.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.8.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.8.self_attn.k_norm.weight": "model-00001-of-00002.safetensors", + "model.layers.8.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.self_attn.q_norm.weight": "model-00001-of-00002.safetensors", + "model.layers.8.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.9.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.9.self_attn.k_norm.weight": "model-00001-of-00002.safetensors", + "model.layers.9.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.self_attn.o_proj.weight": 
"model-00001-of-00002.safetensors", + "model.layers.9.self_attn.q_norm.weight": "model-00001-of-00002.safetensors", + "model.layers.9.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.norm.weight": "model-00002-of-00002.safetensors" + } +} diff --git a/special_tokens_map.json b/special_tokens_map.json new file mode 100644 index 0000000..ac23c0a --- /dev/null +++ b/special_tokens_map.json @@ -0,0 +1,31 @@ +{ + "additional_special_tokens": [ + "<|im_start|>", + "<|im_end|>", + "<|object_ref_start|>", + "<|object_ref_end|>", + "<|box_start|>", + "<|box_end|>", + "<|quad_start|>", + "<|quad_end|>", + "<|vision_start|>", + "<|vision_end|>", + "<|vision_pad|>", + "<|image_pad|>", + "<|video_pad|>" + ], + "eos_token": { + "content": "<|im_end|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": { + "content": "<|endoftext|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/tokenizer.json b/tokenizer.json new file mode 100644 index 0000000..cd71f61 --- /dev/null +++ b/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:aeb13307a71acd8fe81861d94ad54ab689df773318809eed3cbe794b4492dae4 +size 11422654 diff --git a/tokenizer_config.json b/tokenizer_config.json new file mode 100644 index 0000000..ddaf698 --- /dev/null +++ b/tokenizer_config.json @@ -0,0 +1,239 @@ +{ + "add_bos_token": false, + "add_prefix_space": false, + "added_tokens_decoder": { + "151643": { + "content": "<|endoftext|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151644": { + "content": "<|im_start|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151645": { + "content": "<|im_end|>", + "lstrip": false, + "normalized": false, + 
"rstrip": false, + "single_word": false, + "special": true + }, + "151646": { + "content": "<|object_ref_start|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151647": { + "content": "<|object_ref_end|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151648": { + "content": "<|box_start|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151649": { + "content": "<|box_end|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151650": { + "content": "<|quad_start|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151651": { + "content": "<|quad_end|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151652": { + "content": "<|vision_start|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151653": { + "content": "<|vision_end|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151654": { + "content": "<|vision_pad|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151655": { + "content": "<|image_pad|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151656": { + "content": "<|video_pad|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151657": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151658": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": 
false, + "special": false + }, + "151659": { + "content": "<|fim_prefix|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151660": { + "content": "<|fim_middle|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151661": { + "content": "<|fim_suffix|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151662": { + "content": "<|fim_pad|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151663": { + "content": "<|repo_name|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151664": { + "content": "<|file_sep|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151665": { + "content": "<tool_response>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151666": { + "content": "</tool_response>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151667": { + "content": "<think>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151668": { + "content": "</think>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + } + }, + "additional_special_tokens": [ + "<|im_start|>", + "<|im_end|>", + "<|object_ref_start|>", + "<|object_ref_end|>", + "<|box_start|>", + "<|box_end|>", + "<|quad_start|>", + "<|quad_end|>", + "<|vision_start|>", + "<|vision_end|>", + "<|vision_pad|>", + "<|image_pad|>", + "<|video_pad|>" + ], + "bos_token": null, + "clean_up_tokenization_spaces": false, + "eos_token": "<|im_end|>", + "errors": "replace", + "extra_special_tokens": {}, + "model_max_length": 131072, + "pad_token": 
"<|endoftext|>", + "split_special_tokens": false, + "tokenizer_class": "Qwen2Tokenizer", + "unk_token": null +} diff --git a/training_args.bin b/training_args.bin new file mode 100644 index 0000000..c6f6086 --- /dev/null +++ b/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f6a381b24ff76f66219a5c6fc66410ddd3fcd84a7f7eac2bbc6125367d4da06f +size 6929 diff --git a/vocab.json b/vocab.json new file mode 100644 index 0000000..6c49fc6 --- /dev/null +++ b/vocab.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ca10d7e9fb3ed18575dd1e277a2579c16d108e32f27439684afa0e10b1440910 +size 2776833