commit c1749a33e372b53ae773e9906ffe9ab16b6ba9ab Author: ModelHub XC Date: Mon Apr 13 16:21:00 2026 +0800 初始化项目,由ModelHub XC社区提供模型 Model: smirki/UIGEN-FX-4B-08-27-full Source: Original Platform diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 0000000..d2fc868 --- /dev/null +++ b/.gitattributes @@ -0,0 +1,51 @@ +*.7z filter=lfs diff=lfs merge=lfs -text +*.arrow filter=lfs diff=lfs merge=lfs -text +*.bin filter=lfs diff=lfs merge=lfs -text +*.bin.* filter=lfs diff=lfs merge=lfs -text +*.bz2 filter=lfs diff=lfs merge=lfs -text +*.ftz filter=lfs diff=lfs merge=lfs -text +*.gz filter=lfs diff=lfs merge=lfs -text +*.h5 filter=lfs diff=lfs merge=lfs -text +*.joblib filter=lfs diff=lfs merge=lfs -text +*.lfs.* filter=lfs diff=lfs merge=lfs -text +*.model filter=lfs diff=lfs merge=lfs -text +*.msgpack filter=lfs diff=lfs merge=lfs -text +*.onnx filter=lfs diff=lfs merge=lfs -text +*.ot filter=lfs diff=lfs merge=lfs -text +*.parquet filter=lfs diff=lfs merge=lfs -text +*.pb filter=lfs diff=lfs merge=lfs -text +*.pt filter=lfs diff=lfs merge=lfs -text +*.pth filter=lfs diff=lfs merge=lfs -text +*.rar filter=lfs diff=lfs merge=lfs -text +saved_model/**/* filter=lfs diff=lfs merge=lfs -text +*.tar.* filter=lfs diff=lfs merge=lfs -text +*.tflite filter=lfs diff=lfs merge=lfs -text +*.tgz filter=lfs diff=lfs merge=lfs -text +*.xz filter=lfs diff=lfs merge=lfs -text +*.zip filter=lfs diff=lfs merge=lfs -text +*.zstandard filter=lfs diff=lfs merge=lfs -text +*.tfevents* filter=lfs diff=lfs merge=lfs -text +*.db* filter=lfs diff=lfs merge=lfs -text +*.ark* filter=lfs diff=lfs merge=lfs -text +**/*ckpt*data* filter=lfs diff=lfs merge=lfs -text +**/*ckpt*.meta filter=lfs diff=lfs merge=lfs -text +**/*ckpt*.index filter=lfs diff=lfs merge=lfs -text +*.safetensors filter=lfs diff=lfs merge=lfs -text +*.ckpt filter=lfs diff=lfs merge=lfs -text +*.gguf* filter=lfs diff=lfs merge=lfs -text +*.ggml filter=lfs diff=lfs merge=lfs -text +*.llamafile* filter=lfs diff=lfs 
merge=lfs -text +*.pt2 filter=lfs diff=lfs merge=lfs -text +*.mlmodel filter=lfs diff=lfs merge=lfs -text +*.npy filter=lfs diff=lfs merge=lfs -text +*.npz filter=lfs diff=lfs merge=lfs -text +*.pickle filter=lfs diff=lfs merge=lfs -text +*.pkl filter=lfs diff=lfs merge=lfs -text +*.tar filter=lfs diff=lfs merge=lfs -text +*.wasm filter=lfs diff=lfs merge=lfs -text +*.zst filter=lfs diff=lfs merge=lfs -text +*tfevents* filter=lfs diff=lfs merge=lfs -text + +merges.txt filter=lfs diff=lfs merge=lfs -text +tokenizer.json filter=lfs diff=lfs merge=lfs -text +vocab.json filter=lfs diff=lfs merge=lfs -text \ No newline at end of file diff --git a/README.md b/README.md new file mode 100644 index 0000000..e0f95c5 --- /dev/null +++ b/README.md @@ -0,0 +1,23 @@ +--- +base_model: Unsloth/Qwen3-4B-Instruct-2507 +tags: +- text-generation-inference +- transformers +- unsloth +- qwen3 +- trl +- sft +license: apache-2.0 +language: +- en +--- + +# Uploaded model + +- **Developed by:** smirki +- **License:** apache-2.0 +- **Finetuned from model :** Unsloth/Qwen3-4B-Instruct-2507 + +This qwen3 model was trained 2x faster with [Unsloth](https://github.com/unslothai/unsloth) and Huggingface's TRL library. 
+ +[<img src="https://raw.githubusercontent.com/unslothai/unsloth/main/images/unsloth%20made%20with%20love.png" width="200"/>](https://github.com/unslothai/unsloth) diff --git a/added_tokens.json b/added_tokens.json new file mode 100644 index 0000000..b54f913 --- /dev/null +++ b/added_tokens.json @@ -0,0 +1,28 @@ +{ + "</think>": 151668, + "</tool_call>": 151658, + "</tool_response>": 151666, + "<think>": 151667, + "<tool_call>": 151657, + "<tool_response>": 151665, + "<|box_end|>": 151649, + "<|box_start|>": 151648, + "<|endoftext|>": 151643, + "<|file_sep|>": 151664, + "<|fim_middle|>": 151660, + "<|fim_pad|>": 151662, + "<|fim_prefix|>": 151659, + "<|fim_suffix|>": 151661, + "<|im_end|>": 151645, + "<|im_start|>": 151644, + "<|image_pad|>": 151655, + "<|object_ref_end|>": 151647, + "<|object_ref_start|>": 151646, + "<|quad_end|>": 151651, + "<|quad_start|>": 151650, + "<|repo_name|>": 151663, + "<|video_pad|>": 151656, + "<|vision_end|>": 151653, + "<|vision_pad|>": 151654, + "<|vision_start|>": 151652 +} diff --git a/chat_template.jinja b/chat_template.jinja new file mode 100644 index 0000000..465e19f --- /dev/null +++ b/chat_template.jinja @@ -0,0 +1,289 @@ +{% if 'role' in messages[0] %} +{%- if tools %} + {{- '<|im_start|>system +' }} + {%- if messages[0].role == 'system' %} + {{- messages[0].content + ' + +' }} + {%- endif %} + {{- "# Tools + +You may call one or more functions to assist with the user query. 
+ +You are provided with function signatures within <tools></tools> XML tags: +<tools>" }} + {%- for tool in tools %} + {{- " +" }} + {{- tool | tojson }} + {%- endfor %} + {{- " +</tools> + +For each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags: +<tool_call> +{\"name\": <function-name>, \"arguments\": <args-json-object>} +</tool_call><|im_end|> +" }} +{%- else %} + {%- if messages[0].role == 'system' %} + {{- '<|im_start|>system +' + messages[0].content + '<|im_end|> +' }} + {%- endif %} +{%- endif %} +{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %} +{%- for forward_message in messages %} + {%- set index = (messages|length - 1) - loop.index0 %} + {%- set message = messages[index] %} + {%- set current_content = message.content if message.content is not none else '' %} + {%- set tool_start = '<tool_response>' %} + {%- set tool_start_length = tool_start|length %} + {%- set start_of_message = current_content[:tool_start_length] %} + {%- set tool_end = '</tool_response>' %} + {%- set tool_end_length = tool_end|length %} + {%- set start_pos = (current_content|length) - tool_end_length %} + {%- if start_pos < 0 %} + {%- set start_pos = 0 %} + {%- endif %} + {%- set end_of_message = current_content[start_pos:] %} + {%- if ns.multi_step_tool and message.role == "user" and not(start_of_message == tool_start and end_of_message == tool_end) %} + {%- set ns.multi_step_tool = false %} + {%- set ns.last_query_index = index %} + {%- endif %} +{%- endfor %} +{%- for message in messages %} + {%- if (message.role == "user") or (message.role == "system" and not loop.first) %} + {{- '<|im_start|>' + message.role + ' +' + message.content + '<|im_end|>' + ' +' }} + {%- elif message.role == "assistant" %} + {%- set content = message.content %} + {%- set reasoning_content = '' %} + {%- if message.reasoning_content is defined and message.reasoning_content is not none %} + {%- set reasoning_content = message.reasoning_content %} + {%- else %} + {%- if '</think>' in message.content %} + {%- set content = 
(message.content.split('</think>')|last).lstrip(' +') %} + {%- set reasoning_content = (message.content.split('</think>')|first).rstrip(' +') %} + {%- set reasoning_content = (reasoning_content.split('<think>')|last).lstrip(' +') %} + {%- endif %} + {%- endif %} + {%- if loop.index0 > ns.last_query_index %} + {%- if loop.last or (not loop.last and reasoning_content) %} + {{- '<|im_start|>' + message.role + ' +<think> +' + reasoning_content.strip(' +') + ' +</think> + +' + content.lstrip(' +') }} + {%- else %} + {{- '<|im_start|>' + message.role + ' +' + content }} + {%- endif %} + {%- else %} + {{- '<|im_start|>' + message.role + ' +' + content }} + {%- endif %} + {%- if message.tool_calls %} + {%- for tool_call in message.tool_calls %} + {%- if (loop.first and content) or (not loop.first) %} + {{- ' +' }} + {%- endif %} + {%- if tool_call.function %} + {%- set tool_call = tool_call.function %} + {%- endif %} + {{- '<tool_call> +{"name": "' }} + {{- tool_call.name }} + {{- '", "arguments": ' }} + {%- if tool_call.arguments is string %} + {{- tool_call.arguments }} + {%- else %} + {{- tool_call.arguments | tojson }} + {%- endif %} + {{- '} +</tool_call>' }} + {%- endfor %} + {%- endif %} + {{- '<|im_end|> +' }} + {%- elif message.role == "tool" %} + {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %} + {{- '<|im_start|>user' }} + {%- endif %} + {{- ' +<tool_response> +' }} + {{- message.content }} + {{- ' +</tool_response>' }} + {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %} + {{- '<|im_end|> +' }} + {%- endif %} + {%- endif %} +{%- endfor %} +{%- if add_generation_prompt %} + {{- '<|im_start|>assistant +' }} + {%- if enable_thinking is defined and enable_thinking is false %} + {{- '<think> + +</think> + +' }} + {%- endif %} +{%- endif %} +{% else %} +{%- if tools %} + {{- '<|im_start|>system +' }} + {%- if messages[0].role == 'system' %} + {{- messages[0].content + ' + +' }} + {%- endif %} + {{- "# Tools + +You may call one or more functions to assist with the user query. 
+ +You are provided with function signatures within <tools></tools> XML tags: +<tools>" }} + {%- for tool in tools %} + {{- " +" }} + {{- tool | tojson }} + {%- endfor %} + {{- " +</tools> + +For each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags: +<tool_call> +{\"name\": <function-name>, \"arguments\": <args-json-object>} +</tool_call><|im_end|> +" }} +{%- else %} + {%- if messages[0].role == 'system' %} + {{- '<|im_start|>system +' + messages[0].content + '<|im_end|> +' }} + {%- endif %} +{%- endif %} +{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %} +{%- for forward_message in messages %} + {%- set index = (messages|length - 1) - loop.index0 %} + {%- set message = messages[index] %} + {%- set current_content = message.content if message.content is not none else '' %} + {%- set tool_start = '<tool_response>' %} + {%- set tool_start_length = tool_start|length %} + {%- set start_of_message = current_content[:tool_start_length] %} + {%- set tool_end = '</tool_response>' %} + {%- set tool_end_length = tool_end|length %} + {%- set start_pos = (current_content|length) - tool_end_length %} + {%- if start_pos < 0 %} + {%- set start_pos = 0 %} + {%- endif %} + {%- set end_of_message = current_content[start_pos:] %} + {%- if ns.multi_step_tool and message.role == "user" and not(start_of_message == tool_start and end_of_message == tool_end) %} + {%- set ns.multi_step_tool = false %} + {%- set ns.last_query_index = index %} + {%- endif %} +{%- endfor %} +{%- for message in messages %} + {%- if (message.role == "user") or (message.role == "system" and not loop.first) %} + {{- '<|im_start|>' + message.role + ' +' + message.content + '<|im_end|>' + ' +' }} + {%- elif message.role == "assistant" %} + {%- set content = message.content %} + {%- set reasoning_content = '' %} + {%- if message.reasoning_content is defined and message.reasoning_content is not none %} + {%- set reasoning_content = message.reasoning_content %} + {%- else %} + {%- if '</think>' in message.content %} + {%- set content = 
(message.content.split('</think>')|last).lstrip(' +') %} + {%- set reasoning_content = (message.content.split('</think>')|first).rstrip(' +') %} + {%- set reasoning_content = (reasoning_content.split('<think>')|last).lstrip(' +') %} + {%- endif %} + {%- endif %} + {%- if loop.index0 > ns.last_query_index %} + {%- if loop.last or (not loop.last and reasoning_content) %} + {{- '<|im_start|>' + message.role + ' +<think> +' + reasoning_content.strip(' +') + ' +</think> + +' + content.lstrip(' +') }} + {%- else %} + {{- '<|im_start|>' + message.role + ' +' + content }} + {%- endif %} + {%- else %} + {{- '<|im_start|>' + message.role + ' +' + content }} + {%- endif %} + {%- if message.tool_calls %} + {%- for tool_call in message.tool_calls %} + {%- if (loop.first and content) or (not loop.first) %} + {{- ' +' }} + {%- endif %} + {%- if tool_call.function %} + {%- set tool_call = tool_call.function %} + {%- endif %} + {{- '<tool_call> +{"name": "' }} + {{- tool_call.name }} + {{- '", "arguments": ' }} + {%- if tool_call.arguments is string %} + {{- tool_call.arguments }} + {%- else %} + {{- tool_call.arguments | tojson }} + {%- endif %} + {{- '} +</tool_call>' }} + {%- endfor %} + {%- endif %} + {{- '<|im_end|> +' }} + {%- elif message.role == "tool" %} + {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %} + {{- '<|im_start|>user' }} + {%- endif %} + {{- ' +<tool_response> +' }} + {{- message.content }} + {{- ' +</tool_response>' }} + {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %} + {{- '<|im_end|> +' }} + {%- endif %} + {%- endif %} +{%- endfor %} +{%- if add_generation_prompt %} + {{- '<|im_start|>assistant +' }} + {%- if enable_thinking is defined and enable_thinking is false %} + {{- '<think> + +</think> + +' }} + {%- endif %} +{%- endif %} +{% endif %} \ No newline at end of file diff --git a/config.json b/config.json new file mode 100644 index 0000000..18c0d87 --- /dev/null +++ b/config.json @@ -0,0 +1,70 @@ +{ + "architectures": [ + "Qwen3ForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "eos_token_id": 151645, 
+ "head_dim": 128, + "hidden_act": "silu", + "hidden_size": 2560, + "initializer_range": 0.02, + "intermediate_size": 9728, + "layer_types": [ + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention" + ], + "max_position_embeddings": 262144, + "max_window_layers": 36, + "model_type": "qwen3", + "num_attention_heads": 32, + "num_hidden_layers": 36, + "num_key_value_heads": 8, + "pad_token_id": 151654, + "rms_norm_eps": 1e-06, + "rope_scaling": null, + "rope_theta": 5000000, + "sliding_window": null, + "tie_word_embeddings": true, + "torch_dtype": "bfloat16", + "transformers_version": "4.55.4", + "unsloth_fixed": true, + "unsloth_version": "2025.8.10", + "use_cache": true, + "use_sliding_window": false, + "vocab_size": 151936 +} diff --git a/configuration.json b/configuration.json new file mode 100644 index 0000000..bbeeda1 --- /dev/null +++ b/configuration.json @@ -0,0 +1 @@ +{"framework": "pytorch", "task": "text-generation", "allow_remote": true} \ No newline at end of file diff --git a/generation_config.json b/generation_config.json new file mode 100644 index 0000000..e8c0b71 --- /dev/null +++ b/generation_config.json @@ -0,0 +1,14 @@ +{ + "bos_token_id": 151643, + "do_sample": true, + "eos_token_id": [ + 151645, + 151643 + ], + "max_length": 262144, + "pad_token_id": 151654, + "temperature": 0.7, + "top_k": 20, + 
"top_p": 0.8, + "transformers_version": "4.55.4" +} diff --git a/merges.txt b/merges.txt new file mode 100644 index 0000000..80c1a19 --- /dev/null +++ b/merges.txt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8831e4f1a044471340f7c0a83d7bd71306a5b867e95fd870f74d0c5308a904d5 +size 1671853 diff --git a/model-00001-of-00002.safetensors b/model-00001-of-00002.safetensors new file mode 100644 index 0000000..6bc8505 --- /dev/null +++ b/model-00001-of-00002.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:949954f3302e7ab4caf2de4013c9ebad1fdc2f38fe2e72c129e71d0cc246abc1 +size 4967215360 diff --git a/model-00002-of-00002.safetensors b/model-00002-of-00002.safetensors new file mode 100644 index 0000000..7a0c5b9 --- /dev/null +++ b/model-00002-of-00002.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c06842070760200934a1818592a41c844bb6b9d8455328c7a3c395b4a6398b59 +size 3077766632 diff --git a/model.safetensors.index.json b/model.safetensors.index.json new file mode 100644 index 0000000..b65d806 --- /dev/null +++ b/model.safetensors.index.json @@ -0,0 +1,406 @@ +{ + "metadata": { + "total_parameters": 4022468096, + "total_size": 8044936192 + }, + "weight_map": { + "model.embed_tokens.weight": "model-00001-of-00002.safetensors", + "model.layers.0.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.0.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.0.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.0.self_attn.k_norm.weight": "model-00001-of-00002.safetensors", + "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + 
"model.layers.0.self_attn.q_norm.weight": "model-00001-of-00002.safetensors", + "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.1.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.1.self_attn.k_norm.weight": "model-00001-of-00002.safetensors", + "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.self_attn.q_norm.weight": "model-00001-of-00002.safetensors", + "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.10.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.10.self_attn.k_norm.weight": "model-00001-of-00002.safetensors", + "model.layers.10.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.self_attn.q_norm.weight": "model-00001-of-00002.safetensors", + "model.layers.10.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.self_attn.v_proj.weight": 
"model-00001-of-00002.safetensors", + "model.layers.11.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.11.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.11.self_attn.k_norm.weight": "model-00001-of-00002.safetensors", + "model.layers.11.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.self_attn.q_norm.weight": "model-00001-of-00002.safetensors", + "model.layers.11.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.12.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.12.self_attn.k_norm.weight": "model-00001-of-00002.safetensors", + "model.layers.12.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.self_attn.q_norm.weight": "model-00001-of-00002.safetensors", + "model.layers.12.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.13.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + 
"model.layers.13.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.13.self_attn.k_norm.weight": "model-00001-of-00002.safetensors", + "model.layers.13.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.self_attn.q_norm.weight": "model-00001-of-00002.safetensors", + "model.layers.13.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.14.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.14.self_attn.k_norm.weight": "model-00001-of-00002.safetensors", + "model.layers.14.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.self_attn.q_norm.weight": "model-00001-of-00002.safetensors", + "model.layers.14.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.15.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.post_attention_layernorm.weight": 
"model-00001-of-00002.safetensors", + "model.layers.15.self_attn.k_norm.weight": "model-00001-of-00002.safetensors", + "model.layers.15.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.self_attn.q_norm.weight": "model-00001-of-00002.safetensors", + "model.layers.15.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.16.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.16.self_attn.k_norm.weight": "model-00001-of-00002.safetensors", + "model.layers.16.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.self_attn.q_norm.weight": "model-00001-of-00002.safetensors", + "model.layers.16.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.17.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.17.self_attn.k_norm.weight": "model-00001-of-00002.safetensors", + "model.layers.17.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + 
"model.layers.17.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.self_attn.q_norm.weight": "model-00001-of-00002.safetensors", + "model.layers.17.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.18.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.18.self_attn.k_norm.weight": "model-00001-of-00002.safetensors", + "model.layers.18.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.self_attn.q_norm.weight": "model-00001-of-00002.safetensors", + "model.layers.18.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.19.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.19.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.19.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.19.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.19.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.19.self_attn.k_norm.weight": "model-00001-of-00002.safetensors", + "model.layers.19.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.19.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.19.self_attn.q_norm.weight": "model-00001-of-00002.safetensors", + "model.layers.19.self_attn.q_proj.weight": 
"model-00001-of-00002.safetensors", + "model.layers.19.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.2.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.2.self_attn.k_norm.weight": "model-00001-of-00002.safetensors", + "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.self_attn.q_norm.weight": "model-00001-of-00002.safetensors", + "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.20.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.20.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.20.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.20.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.20.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.20.self_attn.k_norm.weight": "model-00001-of-00002.safetensors", + "model.layers.20.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.20.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.20.self_attn.q_norm.weight": "model-00001-of-00002.safetensors", + "model.layers.20.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.20.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.21.input_layernorm.weight": "model-00002-of-00002.safetensors", + 
"model.layers.21.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.21.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.21.mlp.up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.21.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.21.self_attn.k_norm.weight": "model-00002-of-00002.safetensors", + "model.layers.21.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.21.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.21.self_attn.q_norm.weight": "model-00002-of-00002.safetensors", + "model.layers.21.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.21.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.22.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.22.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.22.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.22.mlp.up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.22.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.22.self_attn.k_norm.weight": "model-00002-of-00002.safetensors", + "model.layers.22.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.22.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.22.self_attn.q_norm.weight": "model-00002-of-00002.safetensors", + "model.layers.22.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.22.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.23.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.23.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.23.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.23.mlp.up_proj.weight": 
"model-00002-of-00002.safetensors", + "model.layers.23.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.23.self_attn.k_norm.weight": "model-00002-of-00002.safetensors", + "model.layers.23.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.23.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.23.self_attn.q_norm.weight": "model-00002-of-00002.safetensors", + "model.layers.23.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.23.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.24.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.24.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.24.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.24.mlp.up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.24.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.24.self_attn.k_norm.weight": "model-00002-of-00002.safetensors", + "model.layers.24.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.24.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.24.self_attn.q_norm.weight": "model-00002-of-00002.safetensors", + "model.layers.24.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.24.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.25.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.25.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.25.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.25.mlp.up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.25.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.25.self_attn.k_norm.weight": "model-00002-of-00002.safetensors", + 
"model.layers.25.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.25.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.25.self_attn.q_norm.weight": "model-00002-of-00002.safetensors", + "model.layers.25.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.25.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.26.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.26.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.26.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.26.mlp.up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.26.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.26.self_attn.k_norm.weight": "model-00002-of-00002.safetensors", + "model.layers.26.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.26.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.26.self_attn.q_norm.weight": "model-00002-of-00002.safetensors", + "model.layers.26.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.26.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.27.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.27.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.27.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.27.mlp.up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.27.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.27.self_attn.k_norm.weight": "model-00002-of-00002.safetensors", + "model.layers.27.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.27.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.27.self_attn.q_norm.weight": 
"model-00002-of-00002.safetensors", + "model.layers.27.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.27.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.28.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.28.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.28.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.28.mlp.up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.28.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.28.self_attn.k_norm.weight": "model-00002-of-00002.safetensors", + "model.layers.28.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.28.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.28.self_attn.q_norm.weight": "model-00002-of-00002.safetensors", + "model.layers.28.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.28.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.29.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.29.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.29.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.29.mlp.up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.29.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.29.self_attn.k_norm.weight": "model-00002-of-00002.safetensors", + "model.layers.29.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.29.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.29.self_attn.q_norm.weight": "model-00002-of-00002.safetensors", + "model.layers.29.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.29.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + 
"model.layers.3.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.3.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.3.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.3.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.3.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.3.self_attn.k_norm.weight": "model-00001-of-00002.safetensors", + "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.3.self_attn.q_norm.weight": "model-00001-of-00002.safetensors", + "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.30.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.30.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.30.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.30.mlp.up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.30.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.30.self_attn.k_norm.weight": "model-00002-of-00002.safetensors", + "model.layers.30.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.30.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.30.self_attn.q_norm.weight": "model-00002-of-00002.safetensors", + "model.layers.30.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.30.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.31.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.31.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.31.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", + 
"model.layers.31.mlp.up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.31.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.31.self_attn.k_norm.weight": "model-00002-of-00002.safetensors", + "model.layers.31.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.31.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.31.self_attn.q_norm.weight": "model-00002-of-00002.safetensors", + "model.layers.31.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.31.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.32.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.32.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.32.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.32.mlp.up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.32.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.32.self_attn.k_norm.weight": "model-00002-of-00002.safetensors", + "model.layers.32.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.32.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.32.self_attn.q_norm.weight": "model-00002-of-00002.safetensors", + "model.layers.32.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.32.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.33.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.33.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.33.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.33.mlp.up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.33.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.33.self_attn.k_norm.weight": 
"model-00002-of-00002.safetensors", + "model.layers.33.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.33.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.33.self_attn.q_norm.weight": "model-00002-of-00002.safetensors", + "model.layers.33.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.33.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.34.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.34.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.34.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.34.mlp.up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.34.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.34.self_attn.k_norm.weight": "model-00002-of-00002.safetensors", + "model.layers.34.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.34.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.34.self_attn.q_norm.weight": "model-00002-of-00002.safetensors", + "model.layers.34.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.34.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.35.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.35.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.35.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.35.mlp.up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.35.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.35.self_attn.k_norm.weight": "model-00002-of-00002.safetensors", + "model.layers.35.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.35.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + 
"model.layers.35.self_attn.q_norm.weight": "model-00002-of-00002.safetensors", + "model.layers.35.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.35.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.4.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.4.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.4.self_attn.k_norm.weight": "model-00001-of-00002.safetensors", + "model.layers.4.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.self_attn.q_norm.weight": "model-00001-of-00002.safetensors", + "model.layers.4.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.5.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.5.self_attn.k_norm.weight": "model-00001-of-00002.safetensors", + "model.layers.5.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.self_attn.q_norm.weight": "model-00001-of-00002.safetensors", + "model.layers.5.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + 
"model.layers.6.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.6.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.6.self_attn.k_norm.weight": "model-00001-of-00002.safetensors", + "model.layers.6.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.self_attn.q_norm.weight": "model-00001-of-00002.safetensors", + "model.layers.6.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.7.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.7.self_attn.k_norm.weight": "model-00001-of-00002.safetensors", + "model.layers.7.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.self_attn.q_norm.weight": "model-00001-of-00002.safetensors", + "model.layers.7.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.8.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + 
"model.layers.8.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.8.self_attn.k_norm.weight": "model-00001-of-00002.safetensors", + "model.layers.8.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.self_attn.q_norm.weight": "model-00001-of-00002.safetensors", + "model.layers.8.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.9.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.9.self_attn.k_norm.weight": "model-00001-of-00002.safetensors", + "model.layers.9.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.self_attn.q_norm.weight": "model-00001-of-00002.safetensors", + "model.layers.9.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.norm.weight": "model-00002-of-00002.safetensors" + } +} diff --git a/optimizer.pt b/optimizer.pt new file mode 100644 index 0000000..8ae4484 --- /dev/null +++ b/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:74e7031bd8c1819974524d7bd07fec64676b8549056400a622e565b1460b4c14 +size 2553 diff --git a/rng_state.pth b/rng_state.pth new file mode 100644 index 0000000..ecd5163 --- /dev/null +++ b/rng_state.pth @@ -0,0 +1,3 @@ +version 
https://git-lfs.github.com/spec/v1 +oid sha256:89645f9f6a2a8823b334e0748e4f5ea683a27b3d02632f0a241397db55d82fad +size 14645 diff --git a/scheduler.pt b/scheduler.pt new file mode 100644 index 0000000..b30f887 --- /dev/null +++ b/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9556b4bea5a867f0011734360e0dc8fcba4208c84231998238f7f5621aff7dd0 +size 1465 diff --git a/special_tokens_map.json b/special_tokens_map.json new file mode 100644 index 0000000..9b8043f --- /dev/null +++ b/special_tokens_map.json @@ -0,0 +1,31 @@ +{ + "additional_special_tokens": [ + "<|im_start|>", + "<|im_end|>", + "<|object_ref_start|>", + "<|object_ref_end|>", + "<|box_start|>", + "<|box_end|>", + "<|quad_start|>", + "<|quad_end|>", + "<|vision_start|>", + "<|vision_end|>", + "<|vision_pad|>", + "<|image_pad|>", + "<|video_pad|>" + ], + "eos_token": { + "content": "<|im_end|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": { + "content": "<|vision_pad|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/tokenizer.json b/tokenizer.json new file mode 100644 index 0000000..cd71f61 --- /dev/null +++ b/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:aeb13307a71acd8fe81861d94ad54ab689df773318809eed3cbe794b4492dae4 +size 11422654 diff --git a/tokenizer_config.json b/tokenizer_config.json new file mode 100644 index 0000000..da9f80f --- /dev/null +++ b/tokenizer_config.json @@ -0,0 +1,240 @@ +{ + "add_bos_token": false, + "add_prefix_space": false, + "added_tokens_decoder": { + "151643": { + "content": "<|endoftext|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151644": { + "content": "<|im_start|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151645": { + "content": "<|im_end|>", + 
"lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151646": { + "content": "<|object_ref_start|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151647": { + "content": "<|object_ref_end|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151648": { + "content": "<|box_start|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151649": { + "content": "<|box_end|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151650": { + "content": "<|quad_start|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151651": { + "content": "<|quad_end|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151652": { + "content": "<|vision_start|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151653": { + "content": "<|vision_end|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151654": { + "content": "<|vision_pad|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151655": { + "content": "<|image_pad|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151656": { + "content": "<|video_pad|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151657": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151658": { + "content": "", + "lstrip": false, + "normalized": 
false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151659": { + "content": "<|fim_prefix|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151660": { + "content": "<|fim_middle|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151661": { + "content": "<|fim_suffix|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151662": { + "content": "<|fim_pad|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151663": { + "content": "<|repo_name|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151664": { + "content": "<|file_sep|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151665": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151666": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151667": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151668": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + } + }, + "additional_special_tokens": [ + "<|im_start|>", + "<|im_end|>", + "<|object_ref_start|>", + "<|object_ref_end|>", + "<|box_start|>", + "<|box_end|>", + "<|quad_start|>", + "<|quad_end|>", + "<|vision_start|>", + "<|vision_end|>", + "<|vision_pad|>", + "<|image_pad|>", + "<|video_pad|>" + ], + "bos_token": null, + "clean_up_tokenization_spaces": false, + "eos_token": "<|im_end|>", + "errors": "replace", + "extra_special_tokens": {}, + 
"model_max_length": 262144, + "pad_token": "<|vision_pad|>", + "padding_side": "right", + "split_special_tokens": false, + "tokenizer_class": "Qwen2Tokenizer", + "unk_token": null +} diff --git a/trainer_state.json b/trainer_state.json new file mode 100644 index 0000000..4974bb3 --- /dev/null +++ b/trainer_state.json @@ -0,0 +1,1105 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 5.0, + "eval_steps": 500, + "global_step": 1535, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.03262642740619902, + "grad_norm": 0.0, + "learning_rate": 3.6000000000000003e-06, + "loss": 0.7799, + "step": 10 + }, + { + "epoch": 0.06525285481239804, + "grad_norm": 0.0, + "learning_rate": 7.600000000000001e-06, + "loss": 0.7788, + "step": 20 + }, + { + "epoch": 0.09787928221859707, + "grad_norm": 0.0, + "learning_rate": 1.16e-05, + "loss": 0.7558, + "step": 30 + }, + { + "epoch": 0.13050570962479607, + "grad_norm": 0.0, + "learning_rate": 1.5600000000000003e-05, + "loss": 0.7929, + "step": 40 + }, + { + "epoch": 0.1631321370309951, + "grad_norm": 0.0, + "learning_rate": 1.9600000000000002e-05, + "loss": 0.8025, + "step": 50 + }, + { + "epoch": 0.19575856443719414, + "grad_norm": 0.0, + "learning_rate": 1.999818745523526e-05, + "loss": 0.8121, + "step": 60 + }, + { + "epoch": 0.22838499184339314, + "grad_norm": 0.0, + "learning_rate": 1.9991922711960104e-05, + "loss": 0.768, + "step": 70 + }, + { + "epoch": 0.26101141924959215, + "grad_norm": 0.0, + "learning_rate": 1.998118619612634e-05, + "loss": 0.7567, + "step": 80 + }, + { + "epoch": 0.2936378466557912, + "grad_norm": 0.0, + "learning_rate": 1.996598271274081e-05, + "loss": 0.7997, + "step": 90 + }, + { + "epoch": 0.3262642740619902, + "grad_norm": 0.0, + "learning_rate": 1.9946319065951382e-05, + "loss": 0.7646, + "step": 100 + }, + { + "epoch": 0.35889070146818924, + "grad_norm": 0.0, + 
"learning_rate": 1.9922204056001896e-05, + "loss": 0.793, + "step": 110 + }, + { + "epoch": 0.3915171288743883, + "grad_norm": 0.0, + "learning_rate": 1.9893648475293646e-05, + "loss": 0.7884, + "step": 120 + }, + { + "epoch": 0.42414355628058725, + "grad_norm": 0.0, + "learning_rate": 1.9860665103555418e-05, + "loss": 0.7747, + "step": 130 + }, + { + "epoch": 0.4567699836867863, + "grad_norm": 0.0, + "learning_rate": 1.982326870212402e-05, + "loss": 0.7603, + "step": 140 + }, + { + "epoch": 0.4893964110929853, + "grad_norm": 0.0, + "learning_rate": 1.9781476007338058e-05, + "loss": 0.7703, + "step": 150 + }, + { + "epoch": 0.5220228384991843, + "grad_norm": 0.0, + "learning_rate": 1.973530572304773e-05, + "loss": 0.7675, + "step": 160 + }, + { + "epoch": 0.5546492659053833, + "grad_norm": 0.0, + "learning_rate": 1.9684778512244172e-05, + "loss": 0.8043, + "step": 170 + }, + { + "epoch": 0.5872756933115824, + "grad_norm": 0.0, + "learning_rate": 1.9629916987811924e-05, + "loss": 0.8024, + "step": 180 + }, + { + "epoch": 0.6199021207177814, + "grad_norm": 0.0, + "learning_rate": 1.957074570240883e-05, + "loss": 0.7716, + "step": 190 + }, + { + "epoch": 0.6525285481239804, + "grad_norm": 0.0, + "learning_rate": 1.9507291137477744e-05, + "loss": 0.7651, + "step": 200 + }, + { + "epoch": 0.6851549755301795, + "grad_norm": 0.0, + "learning_rate": 1.943958169139507e-05, + "loss": 0.7819, + "step": 210 + }, + { + "epoch": 0.7177814029363785, + "grad_norm": 0.0, + "learning_rate": 1.9367647666761384e-05, + "loss": 0.7792, + "step": 220 + }, + { + "epoch": 0.7504078303425775, + "grad_norm": 0.0, + "learning_rate": 1.929152125683986e-05, + "loss": 0.7937, + "step": 230 + }, + { + "epoch": 0.7830342577487766, + "grad_norm": 0.0, + "learning_rate": 1.92112365311485e-05, + "loss": 0.7721, + "step": 240 + }, + { + "epoch": 0.8156606851549756, + "grad_norm": 0.0, + "learning_rate": 1.9126829420212764e-05, + "loss": 0.772, + "step": 250 + }, + { + "epoch": 0.8482871125611745, + 
"grad_norm": 0.0, + "learning_rate": 1.9038337699485207e-05, + "loss": 0.7611, + "step": 260 + }, + { + "epoch": 0.8809135399673735, + "grad_norm": 0.0, + "learning_rate": 1.894580097243954e-05, + "loss": 0.7829, + "step": 270 + }, + { + "epoch": 0.9135399673735726, + "grad_norm": 0.0, + "learning_rate": 1.884926065284652e-05, + "loss": 0.7815, + "step": 280 + }, + { + "epoch": 0.9461663947797716, + "grad_norm": 0.0, + "learning_rate": 1.87487599462397e-05, + "loss": 0.7742, + "step": 290 + }, + { + "epoch": 0.9787928221859706, + "grad_norm": 0.0, + "learning_rate": 1.864434383057927e-05, + "loss": 0.7561, + "step": 300 + }, + { + "epoch": 1.0097879282218598, + "grad_norm": 0.0, + "learning_rate": 1.853605903612267e-05, + "loss": 0.7452, + "step": 310 + }, + { + "epoch": 1.0424143556280587, + "grad_norm": 0.0, + "learning_rate": 1.8423954024510995e-05, + "loss": 0.7773, + "step": 320 + }, + { + "epoch": 1.0750407830342577, + "grad_norm": 0.0, + "learning_rate": 1.8308078967080547e-05, + "loss": 0.8153, + "step": 330 + }, + { + "epoch": 1.1076672104404568, + "grad_norm": 0.0, + "learning_rate": 1.8188485722409196e-05, + "loss": 0.793, + "step": 340 + }, + { + "epoch": 1.1402936378466557, + "grad_norm": 0.0, + "learning_rate": 1.8065227813107667e-05, + "loss": 0.7822, + "step": 350 + }, + { + "epoch": 1.1729200652528549, + "grad_norm": 0.0, + "learning_rate": 1.7938360401866096e-05, + "loss": 0.7703, + "step": 360 + }, + { + "epoch": 1.2055464926590538, + "grad_norm": 0.0, + "learning_rate": 1.7807940266766595e-05, + "loss": 0.78, + "step": 370 + }, + { + "epoch": 1.238172920065253, + "grad_norm": 0.0, + "learning_rate": 1.767402577587285e-05, + "loss": 0.7718, + "step": 380 + }, + { + "epoch": 1.2707993474714518, + "grad_norm": 0.0, + "learning_rate": 1.7536676861108167e-05, + "loss": 0.7895, + "step": 390 + }, + { + "epoch": 1.3034257748776508, + "grad_norm": 0.0, + "learning_rate": 1.7395954991433588e-05, + "loss": 0.7638, + "step": 400 + }, + { + "epoch": 
1.33605220228385, + "grad_norm": 0.0, + "learning_rate": 1.7251923145338175e-05, + "loss": 0.7874, + "step": 410 + }, + { + "epoch": 1.368678629690049, + "grad_norm": 0.0, + "learning_rate": 1.710464578265369e-05, + "loss": 0.7947, + "step": 420 + }, + { + "epoch": 1.401305057096248, + "grad_norm": 0.0, + "learning_rate": 1.6954188815706306e-05, + "loss": 0.7811, + "step": 430 + }, + { + "epoch": 1.433931484502447, + "grad_norm": 0.0, + "learning_rate": 1.680061957981831e-05, + "loss": 0.736, + "step": 440 + }, + { + "epoch": 1.466557911908646, + "grad_norm": 0.0, + "learning_rate": 1.6644006803172926e-05, + "loss": 0.765, + "step": 450 + }, + { + "epoch": 1.499184339314845, + "grad_norm": 0.0, + "learning_rate": 1.6484420576055787e-05, + "loss": 0.7608, + "step": 460 + }, + { + "epoch": 1.531810766721044, + "grad_norm": 0.0, + "learning_rate": 1.6321932319486822e-05, + "loss": 0.779, + "step": 470 + }, + { + "epoch": 1.564437194127243, + "grad_norm": 0.0, + "learning_rate": 1.6156614753256583e-05, + "loss": 0.7824, + "step": 480 + }, + { + "epoch": 1.597063621533442, + "grad_norm": 0.0, + "learning_rate": 1.5988541863381323e-05, + "loss": 0.7859, + "step": 490 + }, + { + "epoch": 1.629690048939641, + "grad_norm": 0.0, + "learning_rate": 1.581778886899138e-05, + "loss": 0.784, + "step": 500 + }, + { + "epoch": 1.6623164763458402, + "grad_norm": 0.0, + "learning_rate": 1.5644432188667695e-05, + "loss": 0.7578, + "step": 510 + }, + { + "epoch": 1.6949429037520392, + "grad_norm": 0.0, + "learning_rate": 1.546854940624156e-05, + "loss": 0.7779, + "step": 520 + }, + { + "epoch": 1.727569331158238, + "grad_norm": 0.0, + "learning_rate": 1.5290219236072833e-05, + "loss": 0.7667, + "step": 530 + }, + { + "epoch": 1.7601957585644372, + "grad_norm": 0.0, + "learning_rate": 1.5109521487822208e-05, + "loss": 0.7765, + "step": 540 + }, + { + "epoch": 1.7928221859706364, + "grad_norm": 0.0, + "learning_rate": 1.4926537030733301e-05, + "loss": 0.8005, + "step": 550 + }, + { + 
"epoch": 1.8254486133768353, + "grad_norm": 0.0, + "learning_rate": 1.474134775744054e-05, + "loss": 0.7501, + "step": 560 + }, + { + "epoch": 1.8580750407830342, + "grad_norm": 0.0, + "learning_rate": 1.4554036547319033e-05, + "loss": 0.7968, + "step": 570 + }, + { + "epoch": 1.8907014681892331, + "grad_norm": 0.0, + "learning_rate": 1.4364687229392823e-05, + "loss": 0.7676, + "step": 580 + }, + { + "epoch": 1.9233278955954323, + "grad_norm": 0.0, + "learning_rate": 1.417338454481818e-05, + "loss": 0.8098, + "step": 590 + }, + { + "epoch": 1.9559543230016314, + "grad_norm": 0.0, + "learning_rate": 1.3980214108958626e-05, + "loss": 0.7602, + "step": 600 + }, + { + "epoch": 1.9885807504078303, + "grad_norm": 0.0, + "learning_rate": 1.3785262373068742e-05, + "loss": 0.78, + "step": 610 + }, + { + "epoch": 2.0195758564437196, + "grad_norm": 0.0, + "learning_rate": 1.3588616585603908e-05, + "loss": 0.79, + "step": 620 + }, + { + "epoch": 2.0522022838499185, + "grad_norm": 0.0, + "learning_rate": 1.3390364753173206e-05, + "loss": 0.7759, + "step": 630 + }, + { + "epoch": 2.0848287112561175, + "grad_norm": 0.0, + "learning_rate": 1.319059560115308e-05, + "loss": 0.7811, + "step": 640 + }, + { + "epoch": 2.1174551386623164, + "grad_norm": 0.0, + "learning_rate": 1.2989398533979271e-05, + "loss": 0.793, + "step": 650 + }, + { + "epoch": 2.1500815660685153, + "grad_norm": 0.0, + "learning_rate": 1.278686359513488e-05, + "loss": 0.7435, + "step": 660 + }, + { + "epoch": 2.1827079934747147, + "grad_norm": 0.0, + "learning_rate": 1.2583081426852412e-05, + "loss": 0.7775, + "step": 670 + }, + { + "epoch": 2.2153344208809136, + "grad_norm": 0.0, + "learning_rate": 1.237814322954788e-05, + "loss": 0.7885, + "step": 680 + }, + { + "epoch": 2.2479608482871125, + "grad_norm": 0.0, + "learning_rate": 1.217214072100508e-05, + "loss": 0.7745, + "step": 690 + }, + { + "epoch": 2.2805872756933114, + "grad_norm": 0.0, + "learning_rate": 1.1965166095328302e-05, + "loss": 0.7463, + "step": 
700 + }, + { + "epoch": 2.3132137030995104, + "grad_norm": 0.0, + "learning_rate": 1.1757311981681943e-05, + "loss": 0.7962, + "step": 710 + }, + { + "epoch": 2.3458401305057097, + "grad_norm": 0.0, + "learning_rate": 1.1548671402835325e-05, + "loss": 0.7699, + "step": 720 + }, + { + "epoch": 2.3784665579119086, + "grad_norm": 0.0, + "learning_rate": 1.1339337733531435e-05, + "loss": 0.8087, + "step": 730 + }, + { + "epoch": 2.4110929853181076, + "grad_norm": 0.0, + "learning_rate": 1.1129404658698082e-05, + "loss": 0.7399, + "step": 740 + }, + { + "epoch": 2.443719412724307, + "grad_norm": 0.0, + "learning_rate": 1.0918966131520276e-05, + "loss": 0.7841, + "step": 750 + }, + { + "epoch": 2.476345840130506, + "grad_norm": 0.0, + "learning_rate": 1.0708116331392542e-05, + "loss": 0.7998, + "step": 760 + }, + { + "epoch": 2.5089722675367048, + "grad_norm": 0.0, + "learning_rate": 1.0496949621769976e-05, + "loss": 0.7869, + "step": 770 + }, + { + "epoch": 2.5415986949429037, + "grad_norm": 0.0, + "learning_rate": 1.0285560507936962e-05, + "loss": 0.789, + "step": 780 + }, + { + "epoch": 2.5742251223491026, + "grad_norm": 0.0, + "learning_rate": 1.007404359471238e-05, + "loss": 0.7694, + "step": 790 + }, + { + "epoch": 2.6068515497553015, + "grad_norm": 0.0, + "learning_rate": 9.862493544110282e-06, + "loss": 0.7746, + "step": 800 + }, + { + "epoch": 2.639477977161501, + "grad_norm": 0.0, + "learning_rate": 9.651005032974994e-06, + "loss": 0.7776, + "step": 810 + }, + { + "epoch": 2.6721044045677, + "grad_norm": 0.0, + "learning_rate": 9.439672710609532e-06, + "loss": 0.8017, + "step": 820 + }, + { + "epoch": 2.7047308319738987, + "grad_norm": 0.0, + "learning_rate": 9.228591156416405e-06, + "loss": 0.7494, + "step": 830 + }, + { + "epoch": 2.737357259380098, + "grad_norm": 0.0, + "learning_rate": 9.017854837569629e-06, + "loss": 0.7635, + "step": 840 + }, + { + "epoch": 2.769983686786297, + "grad_norm": 0.0, + "learning_rate": 8.807558066737042e-06, + "loss": 0.7947, 
+ "step": 850 + }, + { + "epoch": 2.802610114192496, + "grad_norm": 0.0, + "learning_rate": 8.597794959871694e-06, + "loss": 0.7897, + "step": 860 + }, + { + "epoch": 2.835236541598695, + "grad_norm": 0.0, + "learning_rate": 8.388659394091362e-06, + "loss": 0.7715, + "step": 870 + }, + { + "epoch": 2.867862969004894, + "grad_norm": 0.0, + "learning_rate": 8.180244965664845e-06, + "loss": 0.7685, + "step": 880 + }, + { + "epoch": 2.9004893964110927, + "grad_norm": 0.0, + "learning_rate": 7.97264494812405e-06, + "loss": 0.7456, + "step": 890 + }, + { + "epoch": 2.933115823817292, + "grad_norm": 0.0, + "learning_rate": 7.765952250520459e-06, + "loss": 0.8071, + "step": 900 + }, + { + "epoch": 2.965742251223491, + "grad_norm": 0.0, + "learning_rate": 7.560259375844719e-06, + "loss": 0.7667, + "step": 910 + }, + { + "epoch": 2.99836867862969, + "grad_norm": 0.0, + "learning_rate": 7.355658379627981e-06, + "loss": 0.764, + "step": 920 + }, + { + "epoch": 3.029363784665579, + "grad_norm": 0.0, + "learning_rate": 7.1522408287434774e-06, + "loss": 0.8021, + "step": 930 + }, + { + "epoch": 3.061990212071778, + "grad_norm": 0.0, + "learning_rate": 6.950097760426814e-06, + "loss": 0.7764, + "step": 940 + }, + { + "epoch": 3.094616639477977, + "grad_norm": 0.0, + "learning_rate": 6.74931964153325e-06, + "loss": 0.8317, + "step": 950 + }, + { + "epoch": 3.1272430668841764, + "grad_norm": 0.0, + "learning_rate": 6.549996328050296e-06, + "loss": 0.789, + "step": 960 + }, + { + "epoch": 3.1598694942903753, + "grad_norm": 0.0, + "learning_rate": 6.352217024883678e-06, + "loss": 0.7928, + "step": 970 + }, + { + "epoch": 3.1924959216965743, + "grad_norm": 0.0, + "learning_rate": 6.1560702459346845e-06, + "loss": 0.7768, + "step": 980 + }, + { + "epoch": 3.225122349102773, + "grad_norm": 0.0, + "learning_rate": 5.961643774486754e-06, + "loss": 0.7542, + "step": 990 + }, + { + "epoch": 3.257748776508972, + "grad_norm": 0.0, + "learning_rate": 5.769024623919064e-06, + "loss": 0.7807, + 
"step": 1000 + }, + { + "epoch": 3.2903752039151715, + "grad_norm": 0.0, + "learning_rate": 5.57829899876469e-06, + "loss": 0.7849, + "step": 1010 + }, + { + "epoch": 3.3230016313213704, + "grad_norm": 0.0, + "learning_rate": 5.38955225613069e-06, + "loss": 0.78, + "step": 1020 + }, + { + "epoch": 3.3556280587275693, + "grad_norm": 0.0, + "learning_rate": 5.202868867497542e-06, + "loss": 0.777, + "step": 1030 + }, + { + "epoch": 3.3882544861337682, + "grad_norm": 0.0, + "learning_rate": 5.01833238091485e-06, + "loss": 0.7735, + "step": 1040 + }, + { + "epoch": 3.4208809135399676, + "grad_norm": 0.0, + "learning_rate": 4.836025383610382e-06, + "loss": 0.7732, + "step": 1050 + }, + { + "epoch": 3.4535073409461665, + "grad_norm": 0.0, + "learning_rate": 4.656029465029057e-06, + "loss": 0.7516, + "step": 1060 + }, + { + "epoch": 3.4861337683523654, + "grad_norm": 0.0, + "learning_rate": 4.478425180318523e-06, + "loss": 0.7534, + "step": 1070 + }, + { + "epoch": 3.5187601957585644, + "grad_norm": 0.0, + "learning_rate": 4.3032920142776125e-06, + "loss": 0.7672, + "step": 1080 + }, + { + "epoch": 3.5513866231647633, + "grad_norm": 0.0, + "learning_rate": 4.1307083457838004e-06, + "loss": 0.7406, + "step": 1090 + }, + { + "epoch": 3.5840130505709626, + "grad_norm": 0.0, + "learning_rate": 3.960751412715629e-06, + "loss": 0.82, + "step": 1100 + }, + { + "epoch": 3.6166394779771616, + "grad_norm": 0.0, + "learning_rate": 3.7934972773857637e-06, + "loss": 0.7934, + "step": 1110 + }, + { + "epoch": 3.6492659053833605, + "grad_norm": 0.0, + "learning_rate": 3.6290207925001585e-06, + "loss": 0.7772, + "step": 1120 + }, + { + "epoch": 3.6818923327895594, + "grad_norm": 0.0, + "learning_rate": 3.4673955676585734e-06, + "loss": 0.7678, + "step": 1130 + }, + { + "epoch": 3.7145187601957588, + "grad_norm": 0.0, + "learning_rate": 3.308693936411421e-06, + "loss": 0.7717, + "step": 1140 + }, + { + "epoch": 3.7471451876019577, + "grad_norm": 0.0, + "learning_rate": 
3.152986923887703e-06, + "loss": 0.7977, + "step": 1150 + }, + { + "epoch": 3.7797716150081566, + "grad_norm": 0.0, + "learning_rate": 3.000344215008524e-06, + "loss": 0.76, + "step": 1160 + }, + { + "epoch": 3.8123980424143555, + "grad_norm": 0.0, + "learning_rate": 2.8508341233003656e-06, + "loss": 0.7893, + "step": 1170 + }, + { + "epoch": 3.8450244698205545, + "grad_norm": 0.0, + "learning_rate": 2.7045235603221533e-06, + "loss": 0.7612, + "step": 1180 + }, + { + "epoch": 3.877650897226754, + "grad_norm": 0.0, + "learning_rate": 2.561478005719743e-06, + "loss": 0.7541, + "step": 1190 + }, + { + "epoch": 3.9102773246329527, + "grad_norm": 0.0, + "learning_rate": 2.421761477921232e-06, + "loss": 0.7643, + "step": 1200 + }, + { + "epoch": 3.9429037520391517, + "grad_norm": 0.0, + "learning_rate": 2.2854365054862383e-06, + "loss": 0.7838, + "step": 1210 + }, + { + "epoch": 3.9755301794453506, + "grad_norm": 0.0, + "learning_rate": 2.152564099121944e-06, + "loss": 0.788, + "step": 1220 + }, + { + "epoch": 4.006525285481239, + "grad_norm": 0.0, + "learning_rate": 2.0232037243784475e-06, + "loss": 0.7716, + "step": 1230 + }, + { + "epoch": 4.039151712887439, + "grad_norm": 0.0, + "learning_rate": 1.8974132750356156e-06, + "loss": 0.792, + "step": 1240 + }, + { + "epoch": 4.071778140293638, + "grad_norm": 0.0, + "learning_rate": 1.7752490471933769e-06, + "loss": 0.768, + "step": 1250 + }, + { + "epoch": 4.104404567699837, + "grad_norm": 0.0, + "learning_rate": 1.6567657140770477e-06, + "loss": 0.7654, + "step": 1260 + }, + { + "epoch": 4.137030995106036, + "grad_norm": 0.0, + "learning_rate": 1.542016301568926e-06, + "loss": 0.7698, + "step": 1270 + }, + { + "epoch": 4.169657422512235, + "grad_norm": 0.0, + "learning_rate": 1.4310521644771657e-06, + "loss": 0.745, + "step": 1280 + }, + { + "epoch": 4.202283849918434, + "grad_norm": 0.0, + "learning_rate": 1.3239229635525074e-06, + "loss": 0.7774, + "step": 1290 + }, + { + "epoch": 4.234910277324633, + "grad_norm": 0.0, 
+ "learning_rate": 1.2206766432631766e-06, + "loss": 0.7848, + "step": 1300 + }, + { + "epoch": 4.267536704730832, + "grad_norm": 0.0, + "learning_rate": 1.121359410337859e-06, + "loss": 0.7814, + "step": 1310 + }, + { + "epoch": 4.300163132137031, + "grad_norm": 0.0, + "learning_rate": 1.0260157130864178e-06, + "loss": 0.809, + "step": 1320 + }, + { + "epoch": 4.33278955954323, + "grad_norm": 0.0, + "learning_rate": 9.346882215075348e-07, + "loss": 0.7976, + "step": 1330 + }, + { + "epoch": 4.365415986949429, + "grad_norm": 0.0, + "learning_rate": 8.474178081922524e-07, + "loss": 0.7825, + "step": 1340 + }, + { + "epoch": 4.398042414355628, + "grad_norm": 0.0, + "learning_rate": 7.642435300318906e-07, + "loss": 0.7712, + "step": 1350 + }, + { + "epoch": 4.430668841761827, + "grad_norm": 0.0, + "learning_rate": 6.852026107385756e-07, + "loss": 0.7711, + "step": 1360 + }, + { + "epoch": 4.463295269168026, + "grad_norm": 0.0, + "learning_rate": 6.103304241862006e-07, + "loss": 0.7903, + "step": 1370 + }, + { + "epoch": 4.495921696574225, + "grad_norm": 0.0, + "learning_rate": 5.396604785792281e-07, + "loss": 0.7527, + "step": 1380 + }, + { + "epoch": 4.528548123980424, + "grad_norm": 0.0, + "learning_rate": 4.7322440145647905e-07, + "loss": 0.7781, + "step": 1390 + }, + { + "epoch": 4.561174551386623, + "grad_norm": 0.0, + "learning_rate": 4.110519255365852e-07, + "loss": 0.8016, + "step": 1400 + }, + { + "epoch": 4.593800978792823, + "grad_norm": 0.0, + "learning_rate": 3.531708754114438e-07, + "loss": 0.7768, + "step": 1410 + }, + { + "epoch": 4.626427406199021, + "grad_norm": 0.0, + "learning_rate": 2.996071550936319e-07, + "loss": 0.7688, + "step": 1420 + }, + { + "epoch": 4.6590538336052205, + "grad_norm": 0.0, + "learning_rate": 2.503847364233614e-07, + "loss": 0.8049, + "step": 1430 + }, + { + "epoch": 4.691680261011419, + "grad_norm": 0.0, + "learning_rate": 2.0552564834014797e-07, + "loss": 0.7818, + "step": 1440 + }, + { + "epoch": 4.724306688417618, + 
"grad_norm": 0.0, + "learning_rate": 1.6504996702401243e-07, + "loss": 0.7737, + "step": 1450 + }, + { + "epoch": 4.756933115823817, + "grad_norm": 0.0, + "learning_rate": 1.2897580691060506e-07, + "loss": 0.8014, + "step": 1460 + }, + { + "epoch": 4.789559543230016, + "grad_norm": 0.0, + "learning_rate": 9.731931258429638e-08, + "loss": 0.7563, + "step": 1470 + }, + { + "epoch": 4.822185970636215, + "grad_norm": 0.0, + "learning_rate": 7.009465155285777e-08, + "loss": 0.7504, + "step": 1480 + }, + { + "epoch": 4.854812398042414, + "grad_norm": 0.0, + "learning_rate": 4.731400790693785e-08, + "loss": 0.7879, + "step": 1490 + }, + { + "epoch": 4.887438825448614, + "grad_norm": 0.0, + "learning_rate": 2.898757686722542e-08, + "loss": 0.7755, + "step": 1500 + }, + { + "epoch": 4.920065252854813, + "grad_norm": 0.0, + "learning_rate": 1.5123560221681488e-08, + "loss": 0.7803, + "step": 1510 + }, + { + "epoch": 4.952691680261012, + "grad_norm": 0.0, + "learning_rate": 5.728162654927705e-09, + "loss": 0.7493, + "step": 1520 + }, + { + "epoch": 4.985318107667211, + "grad_norm": 0.0, + "learning_rate": 8.05588971406479e-10, + "loss": 0.7814, + "step": 1530 + } + ], + "logging_steps": 10, + "max_steps": 1535, + "num_input_tokens_seen": 0, + "num_train_epochs": 5, + "save_steps": 200, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 2.877356359182975e+18, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +} diff --git a/training_args.bin b/training_args.bin new file mode 100644 index 0000000..126f39b --- /dev/null +++ b/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0dc458d5c020e7f45611ca9984cda2f300de37b0df78ae2c52365842a40434e6 +size 6161 diff --git a/vocab.json b/vocab.json new file mode 100644 index 0000000..6c49fc6 
--- /dev/null +++ b/vocab.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ca10d7e9fb3ed18575dd1e277a2579c16d108e32f27439684afa0e10b1440910 +size 2776833