commit 021c63722ec18a1453e589895f6aa587f84443b2 Author: ModelHub XC Date: Fri May 22 17:27:16 2026 +0800 初始化项目,由ModelHub XC社区提供模型 Model: hoornet/nives-fg-270m-v1 Source: Original Platform diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 0000000..9cb588f --- /dev/null +++ b/.gitattributes @@ -0,0 +1,39 @@ +*.7z filter=lfs diff=lfs merge=lfs -text +*.arrow filter=lfs diff=lfs merge=lfs -text +*.bin filter=lfs diff=lfs merge=lfs -text +*.bz2 filter=lfs diff=lfs merge=lfs -text +*.ckpt filter=lfs diff=lfs merge=lfs -text +*.ftz filter=lfs diff=lfs merge=lfs -text +*.gz filter=lfs diff=lfs merge=lfs -text +*.h5 filter=lfs diff=lfs merge=lfs -text +*.joblib filter=lfs diff=lfs merge=lfs -text +*.lfs.* filter=lfs diff=lfs merge=lfs -text +*.mlmodel filter=lfs diff=lfs merge=lfs -text +*.model filter=lfs diff=lfs merge=lfs -text +*.msgpack filter=lfs diff=lfs merge=lfs -text +*.npy filter=lfs diff=lfs merge=lfs -text +*.npz filter=lfs diff=lfs merge=lfs -text +*.onnx filter=lfs diff=lfs merge=lfs -text +*.ot filter=lfs diff=lfs merge=lfs -text +*.parquet filter=lfs diff=lfs merge=lfs -text +*.pb filter=lfs diff=lfs merge=lfs -text +*.pickle filter=lfs diff=lfs merge=lfs -text +*.pkl filter=lfs diff=lfs merge=lfs -text +*.pt filter=lfs diff=lfs merge=lfs -text +*.pth filter=lfs diff=lfs merge=lfs -text +*.rar filter=lfs diff=lfs merge=lfs -text +*.safetensors filter=lfs diff=lfs merge=lfs -text +saved_model/**/* filter=lfs diff=lfs merge=lfs -text +*.tar.* filter=lfs diff=lfs merge=lfs -text +*.tar filter=lfs diff=lfs merge=lfs -text +*.tflite filter=lfs diff=lfs merge=lfs -text +*.tgz filter=lfs diff=lfs merge=lfs -text +*.wasm filter=lfs diff=lfs merge=lfs -text +*.xz filter=lfs diff=lfs merge=lfs -text +*.zip filter=lfs diff=lfs merge=lfs -text +*.zst filter=lfs diff=lfs merge=lfs -text +*tfevents* filter=lfs diff=lfs merge=lfs -text +checkpoint-897/tokenizer.json filter=lfs diff=lfs merge=lfs -text +tokenizer.json filter=lfs diff=lfs merge=lfs 
-text +nives-fg-270m-v1-F16.gguf filter=lfs diff=lfs merge=lfs -text +nives-fg-270m-v1-Q4_K_M.gguf filter=lfs diff=lfs merge=lfs -text diff --git a/README.md b/README.md new file mode 100644 index 0000000..91e5ccb --- /dev/null +++ b/README.md @@ -0,0 +1,58 @@ +--- +base_model: google/functiongemma-270m-it +library_name: transformers +model_name: fg270m-nives-ft-v1 +tags: +- generated_from_trainer +- trl +- sft +licence: license +--- + +# Model Card for fg270m-nives-ft-v1 + +This model is a fine-tuned version of [google/functiongemma-270m-it](https://huggingface.co/google/functiongemma-270m-it). +It has been trained using [TRL](https://github.com/huggingface/trl). + +## Quick start + +```python +from transformers import pipeline + +question = "If you had a time machine, but could only go to the past or the future once and never return, which would you choose and why?" +generator = pipeline("text-generation", model="hoornet/nives-fg-270m-v1", device="cuda") +output = generator([{"role": "user", "content": question}], max_new_tokens=128, return_full_text=False)[0] +print(output["generated_text"]) +``` + +## Training procedure + + + + + +This model was trained with SFT. 
+ +### Framework versions + +- TRL: 1.3.0 +- Transformers: 5.7.0 +- Pytorch: 2.4.1+cu124 +- Datasets: 4.8.5 +- Tokenizers: 0.22.2 + +## Citations + + + +Cite TRL as: + +```bibtex +@software{vonwerra2020trl, + title = {{TRL: Transformers Reinforcement Learning}}, + author = {von Werra, Leandro and Belkada, Younes and Tunstall, Lewis and Beeching, Edward and Thrush, Tristan and Lambert, Nathan and Huang, Shengyi and Rasul, Kashif and Gallouédec, Quentin}, + license = {Apache-2.0}, + url = {https://github.com/huggingface/trl}, + year = {2020} +} +``` \ No newline at end of file diff --git a/chat_template.jinja b/chat_template.jinja new file mode 100644 index 0000000..1629479 --- /dev/null +++ b/chat_template.jinja @@ -0,0 +1,279 @@ +{%- macro format_parameters(properties, required) -%} + {%- set standard_keys = ['description', 'type', 'properties', 'required', 'nullable'] -%} + {%- set ns = namespace(found_first=false) -%} + {%- for key, value in properties | dictsort -%} + {%- if key not in standard_keys -%} + {%- if ns.found_first %},{% endif -%} + {%- set ns.found_first = true -%} + {{- key }}:{description:{{ value['description'] }} + {%- if value['type'] | upper == 'STRING' -%} + {%- if value['enum'] -%} + ,enum:{{ format_argument(value['enum']) }} + {%- endif -%} + {%- elif value['type'] | upper == 'OBJECT' -%} + ,properties:{ + {%- if value['properties'] is defined and value['properties'] is mapping -%} + {{- format_parameters(value['properties'], value['required'] | default([])) -}} + {%- elif value is mapping -%} + {{- format_parameters(value, value['required'] | default([])) -}} + {%- endif -%} + } + {%- if value['required'] -%} + ,required:[ + {%- for item in value['required'] | default([]) -%} + {{- item -}} + {%- if not loop.last %},{% endif -%} + {%- endfor -%} + ] + {%- endif -%} + {%- elif value['type'] | upper == 'ARRAY' -%} + {%- if value['items'] is mapping and value['items'] -%} + ,items:{ + {%- set ns_items = namespace(found_first=false) -%} + {%- 
for item_key, item_value in value['items'] | dictsort -%} + {%- if item_value is not none -%} + {%- if ns_items.found_first %},{% endif -%} + {%- set ns_items.found_first = true -%} + {%- if item_key == 'properties' -%} + properties:{ + {%- if item_value is mapping -%} + {{- format_parameters(item_value, value['items']['required'] | default([])) -}} + {%- endif -%} + } + {%- elif item_key == 'required' -%} + required:[ + {%- for req_item in item_value -%} + {{- req_item -}} + {%- if not loop.last %},{% endif -%} + {%- endfor -%} + ] + {%- elif item_key == 'type' -%} + {%- if item_value is string -%} + type:{{ format_argument(item_value | upper) }} + {%- else -%} + type:{{ format_argument(item_value | map('upper') | list) }} + {%- endif -%} + {%- else -%} + {{ item_key }}:{{ format_argument(item_value) }} + {%- endif -%} + {%- endif -%} + {%- endfor -%} + } + {%- endif -%} + {%- endif -%} + ,type:{{ value['type'] | upper }}} + {%- endif -%} + {%- endfor -%} +{%- endmacro -%} +{% macro format_function_declaration(tool_data) -%} +declaration:{{- tool_data['function']['name'] -}} +{description:{{- tool_data['function']['description'] -}} +{%- set params = tool_data['function']['parameters'] -%} +{%- if params -%} + ,parameters:{ + {%- if params['properties'] -%} + properties:{ {{- format_parameters(params['properties'], params['required']) -}} }, + {%- endif -%} + {%- if params['required'] -%} + required:[ + {%- for item in params['required'] -%} + {{- item -}} + {{- ',' if not loop.last -}} + {%- endfor -%} + ], + {%- endif -%} + {%- if params['type'] -%} + type:{{- params['type'] | upper -}}} + {%- endif -%} +{%- endif -%} +} +{%- endmacro -%} +{% macro format_argument(argument, escape_keys=True) -%} +{%- if argument is string -%} + {{- '' + argument + '' -}} +{%- elif argument is boolean -%} + {%- if argument -%} + {{- 'true' -}} + {%- else -%} + {{- 'false' -}} + {%- endif -%} +{%- elif argument is mapping -%} + {{- '{' -}} + {%- set ns = 
namespace(found_first=false) -%} + {%- for key, value in argument | dictsort -%} + {%- if ns.found_first %},{% endif -%} + {%- set ns.found_first = true -%} + {%- if escape_keys -%} + {{- '' + key + '' -}} + {%- else -%} + {{- key -}} + {%- endif -%} + :{{- format_argument(value, escape_keys=escape_keys) -}} + {%- endfor -%} + {{- '}' -}} +{%- elif argument is sequence -%} + {{- '[' -}} + {%- for item in argument -%} + {{- format_argument(item, escape_keys=escape_keys) -}} + {%- if not loop.last %},{% endif -%} + {%- endfor -%} + {{- ']' -}} +{%- else -%} + {{- argument -}} +{%- endif -%} +{%- endmacro -%} +{{ bos_token }} +{%- set ns = namespace(prev_message_type=None) -%} +{#- Tool Declarations -#} +{%- set loop_messages = messages -%} +{%- if tools or messages[0]['role'] == 'system' or messages[0]['role'] == 'developer' -%} + {{- 'developer\n' -}} + {%- if messages[0]['role'] == 'system' or messages[0]['role'] == 'developer' -%} + {%- if messages[0]['content'] is string -%} + {{- messages[0]['content'] | trim -}} + {%- elif messages[0]['content'] is sequence -%} + {%- for item in messages[0]['content'] -%} + {%- if item['type'] == 'text' -%} + {{- item['text'] | trim -}} + {%- endif -%} + {%- endfor -%} + {%- endif -%} + {%- set loop_messages = messages[1:] -%} + {%- endif -%} + {%- if tools -%} + {%- for tool in tools %} + {{- '' -}} + {{- format_function_declaration(tool) | trim }} + {{- '' -}} + {%- endfor %} + {%- endif -%} + {{- '\n' }} +{%- endif %} +{#- Loop through messages. -#} +{%- for message in loop_messages -%} + {%- if (message['role'] == 'assistant') -%} + {#- Rename "assistant" to "model". 
-#} + {%- set role = "model" -%} + {%- else -%} + {%- set role = message['role'] -%} + {%- endif -%} + {%- if role != 'tool' -%} + {%- if ns.prev_message_type != 'tool_response' -%} + {{- '' + role + '\n' }} + {%- endif -%} + {%- set ns.prev_message_type = None -%} + {%- if 'content' in message and message['content'] is not none -%} + {%- if message['content'] is string -%} + {{ message['content'] | trim }} + {%- elif message['content'] is sequence -%} + {%- for item in message['content'] -%} + {%- if item['type'] == 'image' -%} + {{ '' }} + {%- elif item['type'] == 'text' -%} + {{ item['text'] | trim }} + {%- endif -%} + {%- endfor -%} + {%- else -%} + {{ raise_exception("Invalid content type in user/assistant message") }} + {%- endif -%} + {%- set ns.prev_message_type = 'content' -%} + {%- endif -%} + {%- if 'tool_calls' in message and message['tool_calls'] and message['tool_calls'] is iterable -%} + {#- Tool Calls -#} + {%- for tool_call in message['tool_calls'] -%} + {% set function = tool_call['function'] %} + {{- 'call:' + function['name'] + '{' -}} + {%- if 'arguments' in function -%} + {%- if function['arguments'] is mapping -%} + {%- set ns = namespace(found_first=false) -%} + {%- for key, value in function['arguments'] | dictsort -%} + {%- if ns.found_first %},{% endif -%} + {%- set ns.found_first = true -%} + {{- key -}}:{{- format_argument(value, escape_keys=False) -}} + {%- endfor -%} + {%- elif function['arguments'] is string -%} + {# This handles string-JSON, just in case #} + {{ function['arguments'] }} + {%- endif %} + {%- endif -%} + {{- '}' -}} + {%- endfor -%} + {%- if loop.last -%} + {{ '' }} + {%- endif -%} + {%- set ns.prev_message_type = 'tool_call' -%} + {%- endif -%} + {%- else -%} + {#- Tool Responses -#} + {%- if 'content' in message and message['content'] -%} + {%- if message['content'] is mapping -%} + {%- if 'name' in message['content'] and 'response' in message['content'] -%} + {{ 'response:' + message['content']['name'] | trim + '{' 
}} + {%- set response_ns = namespace(found_first=false) -%} + {%- for key, value in message['content']['response'] | dictsort -%} + {%- if response_ns.found_first %},{% endif -%} + {%- set response_ns.found_first = true -%} + {{- key -}}:{{- format_argument(value, escape_keys=False) -}} + {%- endfor -%} + {{- '}' -}} + {%- elif 'name' in message -%} + {{ 'response:' + message['name'] | trim + '{' }} + {%- set response_ns = namespace(found_first=false) -%} + {%- for key, value in message['content'] | dictsort -%} + {%- if response_ns.found_first %},{% endif -%} + {%- set response_ns.found_first = true -%} + {{- key -}}:{{- format_argument(value, escape_keys=False) -}} + {%- endfor -%} + {{- '}' -}} + {%- else -%} + {{ raise_exception("Invalid tool response mapping: must contain 'name' and 'response' keys, or 'name' must be in the message.") }} + {%- endif -%} + {%- elif message['content'] is string -%} + {%- if 'name' in message -%} + {{ 'response:' + message['name'] | trim + '{value:' + format_argument(message['content'], escape_keys=False) + '}' }} + {%- else -%} + {{ raise_exception("Invalid tool response: 'name' must be provided.") }} + {%- endif -%} + {%- elif message['content'] is sequence -%} + {%- for item in message['content'] -%} + {%- if item is mapping -%} + {%- if 'name' in item and 'response' in item -%} + {{ 'response:' + item['name'] | trim + '{' }} + {%- set response_ns = namespace(found_first=false) -%} + {%- for key, value in item['response'] | dictsort -%} + {%- if response_ns.found_first %},{% endif -%} + {%- set response_ns.found_first = true -%} + {{- key -}}:{{- format_argument(value, escape_keys=False) -}} + {%- endfor -%} + {{- '}' -}} + {%- elif 'name' in message -%} + {{ 'response:' + message['name'] | trim + '{' }} + {%- set response_ns = namespace(found_first=false) -%} + {%- for key, value in item | dictsort -%} + {%- if response_ns.found_first %},{% endif -%} + {%- set response_ns.found_first = true -%} + {{- key -}}:{{- 
format_argument(value, escape_keys=False) -}} + {%- endfor -%} + {{- '}' -}} + {%- else -%} + {{ raise_exception("Invalid tool response mapping: must contain 'name' and 'response' keys, or 'name' must be in the message.") }} + {%- endif -%} + {%- else -%} + {{ raise_exception("Invalid tool response message: multiple responses must all be mappings") }} + {%- endif -%} + {%- endfor -%} + {%- else -%} + {{ raise_exception("Invalid content type in tool message: must be mapping, sequence of mappings, or string.") }} + {%- endif -%} + {%- endif -%} + {%- set ns.prev_message_type = 'tool_response' -%} + {%- endif -%} + {%- if ns.prev_message_type not in ['tool_call', 'tool_response'] -%} + {{ '\n' }} + {%- endif -%} +{%- endfor -%} +{%- if add_generation_prompt -%} + {%- if ns.prev_message_type != 'tool_response' -%} + {{- 'model\n' -}} + {%- endif -%} +{%- endif -%} diff --git a/checkpoint-897/chat_template.jinja b/checkpoint-897/chat_template.jinja new file mode 100644 index 0000000..1629479 --- /dev/null +++ b/checkpoint-897/chat_template.jinja @@ -0,0 +1,279 @@ +{%- macro format_parameters(properties, required) -%} + {%- set standard_keys = ['description', 'type', 'properties', 'required', 'nullable'] -%} + {%- set ns = namespace(found_first=false) -%} + {%- for key, value in properties | dictsort -%} + {%- if key not in standard_keys -%} + {%- if ns.found_first %},{% endif -%} + {%- set ns.found_first = true -%} + {{- key }}:{description:{{ value['description'] }} + {%- if value['type'] | upper == 'STRING' -%} + {%- if value['enum'] -%} + ,enum:{{ format_argument(value['enum']) }} + {%- endif -%} + {%- elif value['type'] | upper == 'OBJECT' -%} + ,properties:{ + {%- if value['properties'] is defined and value['properties'] is mapping -%} + {{- format_parameters(value['properties'], value['required'] | default([])) -}} + {%- elif value is mapping -%} + {{- format_parameters(value, value['required'] | default([])) -}} + {%- endif -%} + } + {%- if value['required'] -%} 
+ ,required:[ + {%- for item in value['required'] | default([]) -%} + {{- item -}} + {%- if not loop.last %},{% endif -%} + {%- endfor -%} + ] + {%- endif -%} + {%- elif value['type'] | upper == 'ARRAY' -%} + {%- if value['items'] is mapping and value['items'] -%} + ,items:{ + {%- set ns_items = namespace(found_first=false) -%} + {%- for item_key, item_value in value['items'] | dictsort -%} + {%- if item_value is not none -%} + {%- if ns_items.found_first %},{% endif -%} + {%- set ns_items.found_first = true -%} + {%- if item_key == 'properties' -%} + properties:{ + {%- if item_value is mapping -%} + {{- format_parameters(item_value, value['items']['required'] | default([])) -}} + {%- endif -%} + } + {%- elif item_key == 'required' -%} + required:[ + {%- for req_item in item_value -%} + {{- req_item -}} + {%- if not loop.last %},{% endif -%} + {%- endfor -%} + ] + {%- elif item_key == 'type' -%} + {%- if item_value is string -%} + type:{{ format_argument(item_value | upper) }} + {%- else -%} + type:{{ format_argument(item_value | map('upper') | list) }} + {%- endif -%} + {%- else -%} + {{ item_key }}:{{ format_argument(item_value) }} + {%- endif -%} + {%- endif -%} + {%- endfor -%} + } + {%- endif -%} + {%- endif -%} + ,type:{{ value['type'] | upper }}} + {%- endif -%} + {%- endfor -%} +{%- endmacro -%} +{% macro format_function_declaration(tool_data) -%} +declaration:{{- tool_data['function']['name'] -}} +{description:{{- tool_data['function']['description'] -}} +{%- set params = tool_data['function']['parameters'] -%} +{%- if params -%} + ,parameters:{ + {%- if params['properties'] -%} + properties:{ {{- format_parameters(params['properties'], params['required']) -}} }, + {%- endif -%} + {%- if params['required'] -%} + required:[ + {%- for item in params['required'] -%} + {{- item -}} + {{- ',' if not loop.last -}} + {%- endfor -%} + ], + {%- endif -%} + {%- if params['type'] -%} + type:{{- params['type'] | upper -}}} + {%- endif -%} +{%- endif -%} +} +{%- 
endmacro -%} +{% macro format_argument(argument, escape_keys=True) -%} +{%- if argument is string -%} + {{- '' + argument + '' -}} +{%- elif argument is boolean -%} + {%- if argument -%} + {{- 'true' -}} + {%- else -%} + {{- 'false' -}} + {%- endif -%} +{%- elif argument is mapping -%} + {{- '{' -}} + {%- set ns = namespace(found_first=false) -%} + {%- for key, value in argument | dictsort -%} + {%- if ns.found_first %},{% endif -%} + {%- set ns.found_first = true -%} + {%- if escape_keys -%} + {{- '' + key + '' -}} + {%- else -%} + {{- key -}} + {%- endif -%} + :{{- format_argument(value, escape_keys=escape_keys) -}} + {%- endfor -%} + {{- '}' -}} +{%- elif argument is sequence -%} + {{- '[' -}} + {%- for item in argument -%} + {{- format_argument(item, escape_keys=escape_keys) -}} + {%- if not loop.last %},{% endif -%} + {%- endfor -%} + {{- ']' -}} +{%- else -%} + {{- argument -}} +{%- endif -%} +{%- endmacro -%} +{{ bos_token }} +{%- set ns = namespace(prev_message_type=None) -%} +{#- Tool Declarations -#} +{%- set loop_messages = messages -%} +{%- if tools or messages[0]['role'] == 'system' or messages[0]['role'] == 'developer' -%} + {{- 'developer\n' -}} + {%- if messages[0]['role'] == 'system' or messages[0]['role'] == 'developer' -%} + {%- if messages[0]['content'] is string -%} + {{- messages[0]['content'] | trim -}} + {%- elif messages[0]['content'] is sequence -%} + {%- for item in messages[0]['content'] -%} + {%- if item['type'] == 'text' -%} + {{- item['text'] | trim -}} + {%- endif -%} + {%- endfor -%} + {%- endif -%} + {%- set loop_messages = messages[1:] -%} + {%- endif -%} + {%- if tools -%} + {%- for tool in tools %} + {{- '' -}} + {{- format_function_declaration(tool) | trim }} + {{- '' -}} + {%- endfor %} + {%- endif -%} + {{- '\n' }} +{%- endif %} +{#- Loop through messages. -#} +{%- for message in loop_messages -%} + {%- if (message['role'] == 'assistant') -%} + {#- Rename "assistant" to "model". 
-#} + {%- set role = "model" -%} + {%- else -%} + {%- set role = message['role'] -%} + {%- endif -%} + {%- if role != 'tool' -%} + {%- if ns.prev_message_type != 'tool_response' -%} + {{- '' + role + '\n' }} + {%- endif -%} + {%- set ns.prev_message_type = None -%} + {%- if 'content' in message and message['content'] is not none -%} + {%- if message['content'] is string -%} + {{ message['content'] | trim }} + {%- elif message['content'] is sequence -%} + {%- for item in message['content'] -%} + {%- if item['type'] == 'image' -%} + {{ '' }} + {%- elif item['type'] == 'text' -%} + {{ item['text'] | trim }} + {%- endif -%} + {%- endfor -%} + {%- else -%} + {{ raise_exception("Invalid content type in user/assistant message") }} + {%- endif -%} + {%- set ns.prev_message_type = 'content' -%} + {%- endif -%} + {%- if 'tool_calls' in message and message['tool_calls'] and message['tool_calls'] is iterable -%} + {#- Tool Calls -#} + {%- for tool_call in message['tool_calls'] -%} + {% set function = tool_call['function'] %} + {{- 'call:' + function['name'] + '{' -}} + {%- if 'arguments' in function -%} + {%- if function['arguments'] is mapping -%} + {%- set ns = namespace(found_first=false) -%} + {%- for key, value in function['arguments'] | dictsort -%} + {%- if ns.found_first %},{% endif -%} + {%- set ns.found_first = true -%} + {{- key -}}:{{- format_argument(value, escape_keys=False) -}} + {%- endfor -%} + {%- elif function['arguments'] is string -%} + {# This handles string-JSON, just in case #} + {{ function['arguments'] }} + {%- endif %} + {%- endif -%} + {{- '}' -}} + {%- endfor -%} + {%- if loop.last -%} + {{ '' }} + {%- endif -%} + {%- set ns.prev_message_type = 'tool_call' -%} + {%- endif -%} + {%- else -%} + {#- Tool Responses -#} + {%- if 'content' in message and message['content'] -%} + {%- if message['content'] is mapping -%} + {%- if 'name' in message['content'] and 'response' in message['content'] -%} + {{ 'response:' + message['content']['name'] | trim + '{' 
}} + {%- set response_ns = namespace(found_first=false) -%} + {%- for key, value in message['content']['response'] | dictsort -%} + {%- if response_ns.found_first %},{% endif -%} + {%- set response_ns.found_first = true -%} + {{- key -}}:{{- format_argument(value, escape_keys=False) -}} + {%- endfor -%} + {{- '}' -}} + {%- elif 'name' in message -%} + {{ 'response:' + message['name'] | trim + '{' }} + {%- set response_ns = namespace(found_first=false) -%} + {%- for key, value in message['content'] | dictsort -%} + {%- if response_ns.found_first %},{% endif -%} + {%- set response_ns.found_first = true -%} + {{- key -}}:{{- format_argument(value, escape_keys=False) -}} + {%- endfor -%} + {{- '}' -}} + {%- else -%} + {{ raise_exception("Invalid tool response mapping: must contain 'name' and 'response' keys, or 'name' must be in the message.") }} + {%- endif -%} + {%- elif message['content'] is string -%} + {%- if 'name' in message -%} + {{ 'response:' + message['name'] | trim + '{value:' + format_argument(message['content'], escape_keys=False) + '}' }} + {%- else -%} + {{ raise_exception("Invalid tool response: 'name' must be provided.") }} + {%- endif -%} + {%- elif message['content'] is sequence -%} + {%- for item in message['content'] -%} + {%- if item is mapping -%} + {%- if 'name' in item and 'response' in item -%} + {{ 'response:' + item['name'] | trim + '{' }} + {%- set response_ns = namespace(found_first=false) -%} + {%- for key, value in item['response'] | dictsort -%} + {%- if response_ns.found_first %},{% endif -%} + {%- set response_ns.found_first = true -%} + {{- key -}}:{{- format_argument(value, escape_keys=False) -}} + {%- endfor -%} + {{- '}' -}} + {%- elif 'name' in message -%} + {{ 'response:' + message['name'] | trim + '{' }} + {%- set response_ns = namespace(found_first=false) -%} + {%- for key, value in item | dictsort -%} + {%- if response_ns.found_first %},{% endif -%} + {%- set response_ns.found_first = true -%} + {{- key -}}:{{- 
format_argument(value, escape_keys=False) -}} + {%- endfor -%} + {{- '}' -}} + {%- else -%} + {{ raise_exception("Invalid tool response mapping: must contain 'name' and 'response' keys, or 'name' must be in the message.") }} + {%- endif -%} + {%- else -%} + {{ raise_exception("Invalid tool response message: multiple responses must all be mappings") }} + {%- endif -%} + {%- endfor -%} + {%- else -%} + {{ raise_exception("Invalid content type in tool message: must be mapping, sequence of mappings, or string.") }} + {%- endif -%} + {%- endif -%} + {%- set ns.prev_message_type = 'tool_response' -%} + {%- endif -%} + {%- if ns.prev_message_type not in ['tool_call', 'tool_response'] -%} + {{ '\n' }} + {%- endif -%} +{%- endfor -%} +{%- if add_generation_prompt -%} + {%- if ns.prev_message_type != 'tool_response' -%} + {{- 'model\n' -}} + {%- endif -%} +{%- endif -%} diff --git a/checkpoint-897/config.json b/checkpoint-897/config.json new file mode 100644 index 0000000..662be51 --- /dev/null +++ b/checkpoint-897/config.json @@ -0,0 +1,62 @@ +{ + "_sliding_window_pattern": 6, + "architectures": [ + "Gemma3ForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "attn_logit_softcapping": null, + "bos_token_id": 2, + "dtype": "bfloat16", + "eos_token_id": 1, + "final_logit_softcapping": null, + "head_dim": 256, + "hidden_activation": "gelu_pytorch_tanh", + "hidden_size": 640, + "initializer_range": 0.02, + "intermediate_size": 2048, + "layer_types": [ + "sliding_attention", + "sliding_attention", + "sliding_attention", + "sliding_attention", + "sliding_attention", + "full_attention", + "sliding_attention", + "sliding_attention", + "sliding_attention", + "sliding_attention", + "sliding_attention", + "full_attention", + "sliding_attention", + "sliding_attention", + "sliding_attention", + "sliding_attention", + "sliding_attention", + "full_attention" + ], + "max_position_embeddings": 32768, + "model_type": "gemma3_text", + "num_attention_heads": 4, + 
"num_hidden_layers": 18, + "num_key_value_heads": 1, + "pad_token_id": 0, + "query_pre_attn_scalar": 256, + "rms_norm_eps": 1e-06, + "rope_parameters": { + "full_attention": { + "rope_theta": 1000000.0, + "rope_type": "default" + }, + "sliding_attention": { + "rope_theta": 10000.0, + "rope_type": "default" + } + }, + "sliding_window": 512, + "tie_word_embeddings": true, + "transformers_version": "5.7.0", + "use_bidirectional_attention": false, + "use_cache": false, + "vocab_size": 262144 +} diff --git a/checkpoint-897/generation_config.json b/checkpoint-897/generation_config.json new file mode 100644 index 0000000..69cb412 --- /dev/null +++ b/checkpoint-897/generation_config.json @@ -0,0 +1,15 @@ +{ + "bos_token_id": 2, + "cache_implementation": "hybrid", + "do_sample": true, + "eos_token_id": [ + 1, + 1, + 50, + 106 + ], + "pad_token_id": 0, + "top_k": 64, + "top_p": 0.95, + "transformers_version": "5.7.0" +} diff --git a/checkpoint-897/model.safetensors b/checkpoint-897/model.safetensors new file mode 100644 index 0000000..c7ebd0e --- /dev/null +++ b/checkpoint-897/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:efbb32f4990b6a076d86e5dc1b83f90236f9f459ac4cdb44d1847818f2d99171 +size 536223056 diff --git a/checkpoint-897/optimizer.pt b/checkpoint-897/optimizer.pt new file mode 100644 index 0000000..a884b53 --- /dev/null +++ b/checkpoint-897/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:58b98cc8495901de6250e6e1addaf4bdf6122996dba38605f9fbd5512d915db8 +size 1072593978 diff --git a/checkpoint-897/rng_state.pth b/checkpoint-897/rng_state.pth new file mode 100644 index 0000000..33cefe6 --- /dev/null +++ b/checkpoint-897/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1ff264f99d31b522cc7e2a4eac9d38606d0c58a34c0adc74d71e0ca8b371dc36 +size 14244 diff --git a/checkpoint-897/scheduler.pt b/checkpoint-897/scheduler.pt new file mode 100644 index 
0000000..05b1561 --- /dev/null +++ b/checkpoint-897/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:62894fde7f58d2738c7681577552abca250a7b16b270ec0708d33286c8de9398 +size 1064 diff --git a/checkpoint-897/structured_eval.json b/checkpoint-897/structured_eval.json new file mode 100644 index 0000000..9a897d4 --- /dev/null +++ b/checkpoint-897/structured_eval.json @@ -0,0 +1,423 @@ +{ + "step": 897, + "metrics": { + "n_total": 32, + "n_tool_call": 22, + "n_chat_only": 10, + "emit_rate": 1.0, + "schema_validity": 0.9090909090909091, + "escalation_correct": 1.0, + "argument_accuracy": 0.393939393939394, + "by_category": { + "turn_on_light": { + "n": 3, + "ok": 3, + "pct": 1.0 + }, + "turn_off_light": { + "n": 3, + "ok": 3, + "pct": 1.0 + }, + "dim_light": { + "n": 2, + "ok": 2, + "pct": 1.0 + }, + "set_color": { + "n": 2, + "ok": 2, + "pct": 1.0 + }, + "get_state": { + "n": 2, + "ok": 2, + "pct": 1.0 + }, + "get_state_sensor": { + "n": 2, + "ok": 2, + "pct": 1.0 + }, + "search_devices": { + "n": 2, + "ok": 2, + "pct": 1.0 + }, + "list_in_area": { + "n": 2, + "ok": 0, + "pct": 0.0 + }, + "history_sensor": { + "n": 2, + "ok": 2, + "pct": 1.0 + }, + "complex_reasoning": { + "n": 2, + "ok": 2, + "pct": 1.0 + }, + "chat_greeting": { + "n": 2, + "ok": 2, + "pct": 1.0 + }, + "chat_ack": { + "n": 2, + "ok": 2, + "pct": 1.0 + }, + "chat_out_of_scope": { + "n": 2, + "ok": 2, + "pct": 1.0 + }, + "chat_ambiguous": { + "n": 2, + "ok": 2, + "pct": 1.0 + }, + "chat_capability": { + "n": 2, + "ok": 2, + "pct": 1.0 + } + } + }, + "results": [ + { + "id": "tc_turn_on_light_001", + "category": "turn_on_light", + "type": "tool_call", + "emitted_call": true, + "called_name": "call_service", + "schema_valid": true, + "argument_accuracy": 0.6666666666666666, + "raw": "call:call_service{domain:light,entity_id:light.living_room_ceiling_light,service:turn_on}", + "latency_s": 1.08 + }, + { + "id": "tc_turn_on_light_002", + "category": "turn_on_light", + 
"type": "tool_call", + "emitted_call": true, + "called_name": "call_service", + "schema_valid": true, + "argument_accuracy": 0.6666666666666666, + "raw": "call:call_service{domain:light,entity_id:light.bedroom_ceiling_light,service:turn_on}", + "latency_s": 0.97 + }, + { + "id": "tc_turn_on_light_003", + "category": "turn_on_light", + "type": "tool_call", + "emitted_call": true, + "called_name": "call_service", + "schema_valid": true, + "argument_accuracy": 0.6666666666666666, + "raw": "call:call_service{domain:light,entity_id:light.desk_desk_downlight,service:turn_on}", + "latency_s": 1.0 + }, + { + "id": "tc_turn_off_light_001", + "category": "turn_off_light", + "type": "tool_call", + "emitted_call": true, + "called_name": "call_service", + "schema_valid": true, + "argument_accuracy": 1.0, + "raw": "call:call_service{domain:light,entity_id:light.kitchen_pendant,service:turn_off}", + "latency_s": 0.94 + }, + { + "id": "tc_turn_off_light_002", + "category": "turn_off_light", + "type": "tool_call", + "emitted_call": true, + "called_name": "call_service", + "schema_valid": true, + "argument_accuracy": 1.0, + "raw": "call:call_service{domain:light,entity_id:light.bedroom_ceiling_light,service:turn_off}", + "latency_s": 0.97 + }, + { + "id": "tc_turn_off_light_003", + "category": "turn_off_light", + "type": "tool_call", + "emitted_call": true, + "called_name": "call_service", + "schema_valid": true, + "argument_accuracy": 0.6666666666666666, + "raw": "call:call_service{domain:light,entity_id:light.kids_night_light,service:turn_off}", + "latency_s": 0.97 + }, + { + "id": "tc_dim_light_001", + "category": "dim_light", + "type": "tool_call", + "emitted_call": true, + "called_name": "call_service", + "schema_valid": true, + "argument_accuracy": 0.0, + "raw": "call:call_service{data:{brightness_pct:30},domain:light,entity_id:light.bedroom_lamp,service:turn_on}", + "latency_s": 1.15 + }, + { + "id": "tc_dim_light_002", + "category": "dim_light", + "type": "tool_call", + 
"emitted_call": true, + "called_name": "call_service", + "schema_valid": true, + "argument_accuracy": 0.0, + "raw": "call:call_service{data:{brightness_pct:70},domain:light,entity_id:light.kitchen_island,service:turn_on}", + "latency_s": 1.15 + }, + { + "id": "tc_set_color_001", + "category": "set_color", + "type": "tool_call", + "emitted_call": true, + "called_name": "call_service", + "schema_valid": true, + "argument_accuracy": 0.0, + "raw": "call:call_service{data:{rgb_color:[255,0,0]},domain:light,entity_id:light.living_room_main_light,service:turn_on}", + "latency_s": 1.39 + }, + { + "id": "tc_set_color_002", + "category": "set_color", + "type": "tool_call", + "emitted_call": true, + "called_name": "call_service", + "schema_valid": true, + "argument_accuracy": 0.0, + "raw": "call:call_service{data:{rgb_color:[255,0,0]},domain:light,entity_id:light.bedroom_night_light,service:turn_on}", + "latency_s": 1.34 + }, + { + "id": "tc_get_state_001", + "category": "get_state", + "type": "tool_call", + "emitted_call": true, + "called_name": "get_state", + "schema_valid": true, + "argument_accuracy": 1.0, + "raw": "call:get_state{entity_id:switch.bedroom_fan}", + "latency_s": 0.54 + }, + { + "id": "tc_get_state_002", + "category": "get_state", + "type": "tool_call", + "emitted_call": true, + "called_name": "get_state", + "schema_valid": true, + "argument_accuracy": 0.0, + "raw": "call:get_state{entity_id:climate.master_bedroom_mini_split}", + "latency_s": 0.65 + }, + { + "id": "tc_get_state_sensor_001", + "category": "get_state_sensor", + "type": "tool_call", + "emitted_call": true, + "called_name": "get_state", + "schema_valid": true, + "argument_accuracy": 0.0, + "raw": "call:get_state{entity_id:sensor.living_room_temperature_sensor}", + "latency_s": 0.65 + }, + { + "id": "tc_get_state_sensor_002", + "category": "get_state_sensor", + "type": "tool_call", + "emitted_call": true, + "called_name": "get_state", + "schema_valid": true, + "argument_accuracy": 0.0, + "raw": 
"call:get_state{entity_id:sensor.kitchen_humidity_sensor}", + "latency_s": 0.6 + }, + { + "id": "tc_search_001", + "category": "search_devices", + "type": "tool_call", + "emitted_call": true, + "called_name": "search_entities", + "schema_valid": true, + "argument_accuracy": 0.0, + "raw": "call:search_entities{query:sensors}", + "latency_s": 0.38 + }, + { + "id": "tc_search_002", + "category": "search_devices", + "type": "tool_call", + "emitted_call": true, + "called_name": "search_entities", + "schema_valid": true, + "argument_accuracy": 1.0, + "raw": "call:search_entities{query:blinds}", + "latency_s": 0.41 + }, + { + "id": "tc_list_001", + "category": "list_in_area", + "type": "tool_call", + "emitted_call": true, + "called_name": "search_entities", + "schema_valid": false, + "argument_accuracy": 0.0, + "raw": "call:search_entities{query:lights}", + "latency_s": 0.38 + }, + { + "id": "tc_list_002", + "category": "list_in_area", + "type": "tool_call", + "emitted_call": true, + "called_name": "search_entities", + "schema_valid": false, + "argument_accuracy": 0.0, + "raw": "call:search_entities{query:climate}", + "latency_s": 0.38 + }, + { + "id": "tc_history_001", + "category": "history_sensor", + "type": "tool_call", + "emitted_call": true, + "called_name": "get_history", + "schema_valid": true, + "argument_accuracy": 0.0, + "raw": "call:get_history{end_time:2026-05-02T00:00:00Z,entity_id:sensor.living_room_temperature_sensor,start_time:2026-05-01T00:00:00Z}", + "latency_s": 2.06 + }, + { + "id": "tc_history_002", + "category": "history_sensor", + "type": "tool_call", + "emitted_call": true, + "called_name": "get_history", + "schema_valid": true, + "argument_accuracy": 0.0, + "raw": "call:get_history{end_time:2026-05-02T00:00:00Z,entity_id:sensor.living_room_co2_sensor,start_time:2026-05-01T00:00:00Z}", + "latency_s": 2.12 + }, + { + "id": "tc_complex_001", + "category": "complex_reasoning", + "type": "tool_call", + "emitted_call": true, + "called_name": 
"escalate_to_sonnet", + "schema_valid": true, + "argument_accuracy": 1.0, + "raw": "call:escalate_to_sonnet{reason:complex_query}", + "latency_s": 0.55 + }, + { + "id": "tc_complex_002", + "category": "complex_reasoning", + "type": "tool_call", + "emitted_call": true, + "called_name": "escalate_to_sonnet", + "schema_valid": true, + "argument_accuracy": 1.0, + "raw": "call:escalate_to_sonnet{reason:complex_query}", + "latency_s": 0.54 + }, + { + "id": "co_greet_001", + "category": "chat_greeting", + "type": "chat_only", + "emitted_call": false, + "escalation_correct": true, + "raw": "Hi! What can I do for you?", + "latency_s": 0.28 + }, + { + "id": "co_greet_002", + "category": "chat_greeting", + "type": "chat_only", + "emitted_call": false, + "escalation_correct": true, + "raw": "Hello! How can I help with your smart home today?", + "latency_s": 0.36 + }, + { + "id": "co_ack_001", + "category": "chat_ack", + "type": "chat_only", + "emitted_call": false, + "escalation_correct": true, + "raw": "Sure thing.", + "latency_s": 0.12 + }, + { + "id": "co_ack_002", + "category": "chat_ack", + "type": "chat_only", + "emitted_call": false, + "escalation_correct": true, + "raw": "Anytime.", + "latency_s": 0.11 + }, + { + "id": "co_oos_001", + "category": "chat_out_of_scope", + "type": "chat_only", + "emitted_call": false, + "escalation_correct": true, + "raw": "I can't help with that, but I can turn on lights, set scenes, or check what's going on around the house.", + "latency_s": 0.8 + }, + { + "id": "co_oos_002", + "category": "chat_out_of_scope", + "type": "chat_only", + "emitted_call": false, + "escalation_correct": true, + "raw": "I'm focused on your smart home \u2014 for that one you'll want a different assistant.", + "latency_s": 0.56 + }, + { + "id": "co_amb_001", + "category": "chat_ambiguous", + "type": "chat_only", + "emitted_call": false, + "escalation_correct": true, + "raw": "Sure \u2014 could you tell me a bit more about what you'd like? 
For example, I can turn on lights, set a thermostat, or check a sensor.", + "latency_s": 0.93 + }, + { + "id": "co_amb_002", + "category": "chat_ambiguous", + "type": "chat_only", + "emitted_call": false, + "escalation_correct": true, + "raw": "I can do plenty around the house \u2014 could you point me at a specific room or device?", + "latency_s": 0.54 + }, + { + "id": "co_cap_001", + "category": "chat_capability", + "type": "chat_only", + "emitted_call": false, + "escalation_correct": true, + "raw": "I control your smart home \u2014 lights, switches, climate, covers, media players \u2014 and I can read sensors and look up history. Just tell me what you want.", + "latency_s": 0.95 + }, + { + "id": "co_cap_002", + "category": "chat_capability", + "type": "chat_only", + "emitted_call": false, + "escalation_correct": true, + "raw": "I'm a smart home assistant connected to Home Assistant. I can turn devices on and off, dim lights, set thermostats, open and close covers, and read sensor data. 
Try asking me to turn on a light.", + "latency_s": 1.23 + } + ] +} \ No newline at end of file diff --git a/checkpoint-897/tokenizer.json b/checkpoint-897/tokenizer.json new file mode 100644 index 0000000..93a0904 --- /dev/null +++ b/checkpoint-897/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:80d7f800b949accd7eb940bac75e642f9468e4df157403032a55bf54ed23b650 +size 33384898 diff --git a/checkpoint-897/tokenizer_config.json b/checkpoint-897/tokenizer_config.json new file mode 100644 index 0000000..5a51e1f --- /dev/null +++ b/checkpoint-897/tokenizer_config.json @@ -0,0 +1,27 @@ +{ + "backend": "tokenizers", + "boi_token": "", + "bos_token": "", + "clean_up_tokenization_spaces": false, + "eoi_token": "", + "eos_token": "", + "image_token": "", + "is_local": false, + "local_files_only": false, + "mask_token": "", + "model_max_length": 1000000000000000019884624838656, + "model_specific_special_tokens": { + "boi_token": "", + "eoi_token": "", + "image_token": "", + "sfr_token": "" + }, + "pad_token": "", + "padding_side": "left", + "sfr_token": "", + "sp_model_kwargs": null, + "spaces_between_special_tokens": false, + "tokenizer_class": "GemmaTokenizer", + "unk_token": "", + "use_default_system_prompt": false +} diff --git a/checkpoint-897/trainer_state.json b/checkpoint-897/trainer_state.json new file mode 100644 index 0000000..07832c6 --- /dev/null +++ b/checkpoint-897/trainer_state.json @@ -0,0 +1,935 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 500, + "global_step": 897, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "entropy": 0.8498557328246534, + "epoch": 0.011154489682097044, + "grad_norm": 44.5, + "learning_rate": 1e-05, + "loss": 3.704855728149414, + "mean_token_accuracy": 0.5304131177254021, + "num_tokens": 833799.0, + "step": 10 + }, + { + "entropy": 
1.765550174564123, + "epoch": 0.022308979364194088, + "grad_norm": 10.25, + "learning_rate": 2.111111111111111e-05, + "loss": 1.9117172241210938, + "mean_token_accuracy": 0.6469556039199233, + "num_tokens": 1671910.0, + "step": 20 + }, + { + "entropy": 0.8713466321351007, + "epoch": 0.03346346904629113, + "grad_norm": 3.78125, + "learning_rate": 3.222222222222223e-05, + "loss": 0.8604758262634278, + "mean_token_accuracy": 0.8343592453747988, + "num_tokens": 2510443.0, + "step": 30 + }, + { + "entropy": 0.23340068878605963, + "epoch": 0.044617958728388175, + "grad_norm": 1.953125, + "learning_rate": 4.3333333333333334e-05, + "loss": 0.24464244842529298, + "mean_token_accuracy": 0.9471705831587315, + "num_tokens": 3345229.0, + "step": 40 + }, + { + "entropy": 0.16152286342112349, + "epoch": 0.05577244841048522, + "grad_norm": 3.171875, + "learning_rate": 4.9997280790439974e-05, + "loss": 0.17044107913970946, + "mean_token_accuracy": 0.9553312979638576, + "num_tokens": 4184871.0, + "step": 50 + }, + { + "entropy": 0.1448873324552551, + "epoch": 0.06692693809258227, + "grad_norm": 0.984375, + "learning_rate": 4.996669647581318e-05, + "loss": 0.14741255044937135, + "mean_token_accuracy": 0.9585917994379998, + "num_tokens": 5039068.0, + "step": 60 + }, + { + "entropy": 0.12383970170631073, + "epoch": 0.0780814277746793, + "grad_norm": 1.125, + "learning_rate": 4.990217055187362e-05, + "loss": 0.12599575519561768, + "mean_token_accuracy": 0.9621855434030294, + "num_tokens": 5904036.0, + "step": 70 + }, + { + "entropy": 0.11301726293168031, + "epoch": 0.08923591745677635, + "grad_norm": 1.3828125, + "learning_rate": 4.980379074002661e-05, + "loss": 0.11982399225234985, + "mean_token_accuracy": 0.964000066742301, + "num_tokens": 6747251.0, + "step": 80 + }, + { + "entropy": 0.10463761446881108, + "epoch": 0.1003904071388734, + "grad_norm": 0.71484375, + "learning_rate": 4.967169078520476e-05, + "loss": 0.11220132112503052, + "mean_token_accuracy": 0.9658373668789864, + 
"num_tokens": 7572769.0, + "step": 90 + }, + { + "entropy": 0.10532618285797071, + "epoch": 0.11154489682097044, + "grad_norm": 0.9296875, + "learning_rate": 4.9506050274045076e-05, + "loss": 0.11419826745986938, + "mean_token_accuracy": 0.965171106159687, + "num_tokens": 8410764.0, + "step": 100 + }, + { + "entropy": 0.10305561173590831, + "epoch": 0.12269938650306748, + "grad_norm": 0.89453125, + "learning_rate": 4.930709439074528e-05, + "loss": 0.10990087985992432, + "mean_token_accuracy": 0.9659170717000961, + "num_tokens": 9255104.0, + "step": 110 + }, + { + "entropy": 0.10211670671415049, + "epoch": 0.13385387618516453, + "grad_norm": 0.76953125, + "learning_rate": 4.90750936109315e-05, + "loss": 0.11037271022796631, + "mean_token_accuracy": 0.966075143776834, + "num_tokens": 10103000.0, + "step": 120 + }, + { + "entropy": 0.09717915701621678, + "epoch": 0.14500836586726157, + "grad_norm": 0.73046875, + "learning_rate": 4.881036333395329e-05, + "loss": 0.10295262336730956, + "mean_token_accuracy": 0.9677550513297319, + "num_tokens": 10945768.0, + "step": 130 + }, + { + "entropy": 0.09635820150724612, + "epoch": 0.1561628555493586, + "grad_norm": 0.7265625, + "learning_rate": 4.851326345410594e-05, + "loss": 0.10199121236801148, + "mean_token_accuracy": 0.9681327627971769, + "num_tokens": 11801129.0, + "step": 140 + }, + { + "entropy": 0.09002169943414629, + "epoch": 0.16731734523145567, + "grad_norm": 0.98046875, + "learning_rate": 4.818419787136311e-05, + "loss": 0.09567687511444092, + "mean_token_accuracy": 0.9701874911785126, + "num_tokens": 12636424.0, + "step": 150 + }, + { + "entropy": 0.09068954657413997, + "epoch": 0.1784718349135527, + "grad_norm": 1.1015625, + "learning_rate": 4.782361394228472e-05, + "loss": 0.0969263732433319, + "mean_token_accuracy": 0.9701515214517713, + "num_tokens": 13490177.0, + "step": 160 + }, + { + "entropy": 0.08610689677589108, + "epoch": 0.18962632459564974, + "grad_norm": 0.96484375, + "learning_rate": 
4.74320018718467e-05, + "loss": 0.08986451625823974, + "mean_token_accuracy": 0.9720516135916114, + "num_tokens": 14328572.0, + "step": 170 + }, + { + "entropy": 0.08233372264367063, + "epoch": 0.2007808142777468, + "grad_norm": 1.890625, + "learning_rate": 4.700989404701941e-05, + "loss": 0.08586806058883667, + "mean_token_accuracy": 0.9734413396567106, + "num_tokens": 15189703.0, + "step": 180 + }, + { + "entropy": 0.07736555864394176, + "epoch": 0.21193530395984383, + "grad_norm": 1.28125, + "learning_rate": 4.6557864313000695e-05, + "loss": 0.07913717031478881, + "mean_token_accuracy": 0.9759283743798732, + "num_tokens": 16028746.0, + "step": 190 + }, + { + "entropy": 0.07013691037718672, + "epoch": 0.22308979364194087, + "grad_norm": 1.1953125, + "learning_rate": 4.60765271930874e-05, + "loss": 0.07151558399200439, + "mean_token_accuracy": 0.978206392377615, + "num_tokens": 16849567.0, + "step": 200 + }, + { + "entropy": 0.07075640709081199, + "epoch": 0.23424428332403793, + "grad_norm": 1.4921875, + "learning_rate": 4.55665370532461e-05, + "loss": 0.07073653936386108, + "mean_token_accuracy": 0.9784815656021237, + "num_tokens": 17705656.0, + "step": 210 + }, + { + "entropy": 0.06637019099725876, + "epoch": 0.24539877300613497, + "grad_norm": 1.28125, + "learning_rate": 4.5028587212518705e-05, + "loss": 0.06597371697425843, + "mean_token_accuracy": 0.979678837954998, + "num_tokens": 18565065.0, + "step": 220 + }, + { + "entropy": 0.05923618896049447, + "epoch": 0.25655326268823203, + "grad_norm": 1.2734375, + "learning_rate": 4.4463409000472234e-05, + "loss": 0.058509671688079835, + "mean_token_accuracy": 0.9822878390550613, + "num_tokens": 19392725.0, + "step": 230 + }, + { + "entropy": 0.05874249367916491, + "epoch": 0.26770775237032907, + "grad_norm": 1.171875, + "learning_rate": 4.3871770762974306e-05, + "loss": 0.05813463926315308, + "mean_token_accuracy": 0.9824939148500562, + "num_tokens": 20223583.0, + "step": 240 + }, + { + "entropy": 
0.05689131756371353, + "epoch": 0.2788622420524261, + "grad_norm": 1.4609375, + "learning_rate": 4.325447681764586e-05, + "loss": 0.055121219158172606, + "mean_token_accuracy": 0.9830958772450685, + "num_tokens": 21085157.0, + "step": 250 + }, + { + "entropy": 0.05158787120017223, + "epoch": 0.29001673173452314, + "grad_norm": 1.0390625, + "learning_rate": 4.261236636041108e-05, + "loss": 0.05031982660293579, + "mean_token_accuracy": 0.984761236794293, + "num_tokens": 21904752.0, + "step": 260 + }, + { + "entropy": 0.05179886775440536, + "epoch": 0.30117122141662017, + "grad_norm": 0.78515625, + "learning_rate": 4.194631232463128e-05, + "loss": 0.0493065744638443, + "mean_token_accuracy": 0.9848343567922712, + "num_tokens": 22765981.0, + "step": 270 + }, + { + "entropy": 0.049823664524592456, + "epoch": 0.3123257110987172, + "grad_norm": 0.62890625, + "learning_rate": 4.1257220194373424e-05, + "loss": 0.04740493595600128, + "mean_token_accuracy": 0.9855156386271119, + "num_tokens": 23621926.0, + "step": 280 + }, + { + "entropy": 0.04758051415265072, + "epoch": 0.3234802007808143, + "grad_norm": 0.5, + "learning_rate": 4.054602677342684e-05, + "loss": 0.04637431204319, + "mean_token_accuracy": 0.9858794504776597, + "num_tokens": 24465607.0, + "step": 290 + }, + { + "entropy": 0.04668422270915471, + "epoch": 0.33463469046291133, + "grad_norm": 0.474609375, + "learning_rate": 3.981369891174155e-05, + "loss": 0.04507455825805664, + "mean_token_accuracy": 0.9860366908833385, + "num_tokens": 25299933.0, + "step": 300 + }, + { + "entropy": 0.04640703263867181, + "epoch": 0.34578918014500837, + "grad_norm": 0.390625, + "learning_rate": 3.906123219101952e-05, + "loss": 0.04516075849533081, + "mean_token_accuracy": 0.9859529983252286, + "num_tokens": 26154681.0, + "step": 310 + }, + { + "entropy": 0.04438853256579023, + "epoch": 0.3569436698271054, + "grad_norm": 0.423828125, + "learning_rate": 3.8289649571245885e-05, + "loss": 0.044096818566322325, + "mean_token_accuracy": 
0.986466808244586, + "num_tokens": 26983541.0, + "step": 320 + }, + { + "entropy": 0.044660908347577785, + "epoch": 0.36809815950920244, + "grad_norm": 0.376953125, + "learning_rate": 3.7500000000000003e-05, + "loss": 0.043841251730918886, + "mean_token_accuracy": 0.986228640563786, + "num_tokens": 27833414.0, + "step": 330 + }, + { + "entropy": 0.043841811595484614, + "epoch": 0.3792526491912995, + "grad_norm": 0.328125, + "learning_rate": 3.669335698643704e-05, + "loss": 0.04326000213623047, + "mean_token_accuracy": 0.9865183688700199, + "num_tokens": 28677577.0, + "step": 340 + }, + { + "entropy": 0.04371235728031024, + "epoch": 0.39040713887339656, + "grad_norm": 0.390625, + "learning_rate": 3.587081714187874e-05, + "loss": 0.043233224749565126, + "mean_token_accuracy": 0.9865481941029429, + "num_tokens": 29512895.0, + "step": 350 + }, + { + "entropy": 0.04327065504912753, + "epoch": 0.4015616285554936, + "grad_norm": 0.330078125, + "learning_rate": 3.503349868899722e-05, + "loss": 0.04288822710514069, + "mean_token_accuracy": 0.986665309779346, + "num_tokens": 30343018.0, + "step": 360 + }, + { + "entropy": 0.04256358170532622, + "epoch": 0.41271611823759063, + "grad_norm": 0.376953125, + "learning_rate": 3.418253994161892e-05, + "loss": 0.042832252383232114, + "mean_token_accuracy": 0.9867880517616868, + "num_tokens": 31158940.0, + "step": 370 + }, + { + "entropy": 0.04262932341953274, + "epoch": 0.42387060791968767, + "grad_norm": 0.30078125, + "learning_rate": 3.3319097757214843e-05, + "loss": 0.04222110211849213, + "mean_token_accuracy": 0.9866109801456332, + "num_tokens": 32008434.0, + "step": 380 + }, + { + "entropy": 0.04266308699734509, + "epoch": 0.4350250976017847, + "grad_norm": 0.314453125, + "learning_rate": 3.244434596418139e-05, + "loss": 0.042472487688064574, + "mean_token_accuracy": 0.9866344403475523, + "num_tokens": 32854651.0, + "step": 390 + }, + { + "entropy": 0.0429983379173791, + "epoch": 0.44617958728388174, + "grad_norm": 0.357421875, 
+ "learning_rate": 3.155947376604948e-05, + "loss": 0.04324407577514648, + "mean_token_accuracy": 0.9865379808470607, + "num_tokens": 33705423.0, + "step": 400 + }, + { + "entropy": 0.04282604140753392, + "epoch": 0.45733407696597883, + "grad_norm": 0.294921875, + "learning_rate": 3.066568412479167e-05, + "loss": 0.04259026348590851, + "mean_token_accuracy": 0.9867103593423963, + "num_tokens": 34538632.0, + "step": 410 + }, + { + "entropy": 0.04212904951127712, + "epoch": 0.46848856664807587, + "grad_norm": 0.4140625, + "learning_rate": 2.976419212542495e-05, + "loss": 0.04252048432826996, + "mean_token_accuracy": 0.9867507757619023, + "num_tokens": 35381587.0, + "step": 420 + }, + { + "entropy": 0.0412595656584017, + "epoch": 0.4796430563301729, + "grad_norm": 0.3125, + "learning_rate": 2.885622332413256e-05, + "loss": 0.041145503520965576, + "mean_token_accuracy": 0.9870552903041243, + "num_tokens": 36234112.0, + "step": 430 + }, + { + "entropy": 0.041778112881002014, + "epoch": 0.49079754601226994, + "grad_norm": 0.349609375, + "learning_rate": 2.7943012082150533e-05, + "loss": 0.041335687041282654, + "mean_token_accuracy": 0.9867611099034548, + "num_tokens": 37077497.0, + "step": 440 + }, + { + "entropy": 0.04076959296362474, + "epoch": 0.501952035694367, + "grad_norm": 0.318359375, + "learning_rate": 2.7025799887684002e-05, + "loss": 0.041106203198432924, + "mean_token_accuracy": 0.9871867259964346, + "num_tokens": 37919261.0, + "step": 450 + }, + { + "entropy": 0.04188558856840245, + "epoch": 0.5131065253764641, + "grad_norm": 0.314453125, + "learning_rate": 2.6105833668134473e-05, + "loss": 0.041896390914916995, + "mean_token_accuracy": 0.9867892485111952, + "num_tokens": 38782505.0, + "step": 460 + }, + { + "entropy": 0.04178600409068167, + "epoch": 0.524261015058561, + "grad_norm": 0.3359375, + "learning_rate": 2.518436409493281e-05, + "loss": 0.04188077747821808, + "mean_token_accuracy": 0.9868596900254488, + "num_tokens": 39610893.0, + "step": 470 + }, + 
{ + "entropy": 0.04152031776320655, + "epoch": 0.5354155047406581, + "grad_norm": 0.267578125, + "learning_rate": 2.426264388328214e-05, + "loss": 0.04174352586269379, + "mean_token_accuracy": 0.9868257040157914, + "num_tokens": 40427636.0, + "step": 480 + }, + { + "entropy": 0.040754605704569256, + "epoch": 0.5465699944227551, + "grad_norm": 0.3125, + "learning_rate": 2.334192608912241e-05, + "loss": 0.04108997285366058, + "mean_token_accuracy": 0.9870152780786157, + "num_tokens": 41252001.0, + "step": 490 + }, + { + "entropy": 0.042179943548399025, + "epoch": 0.5577244841048522, + "grad_norm": 0.353515625, + "learning_rate": 2.2423462405631616e-05, + "loss": 0.04207477867603302, + "mean_token_accuracy": 0.9866394894197583, + "num_tokens": 42113694.0, + "step": 500 + }, + { + "entropy": 0.04122589101898484, + "epoch": 0.5688789737869493, + "grad_norm": 0.330078125, + "learning_rate": 2.150850146157985e-05, + "loss": 0.04146281182765961, + "mean_token_accuracy": 0.9869526766240597, + "num_tokens": 42941138.0, + "step": 510 + }, + { + "entropy": 0.04114197726012207, + "epoch": 0.5800334634690463, + "grad_norm": 0.345703125, + "learning_rate": 2.0598287123849095e-05, + "loss": 0.040973353385925296, + "mean_token_accuracy": 0.9871221456676722, + "num_tokens": 43770822.0, + "step": 520 + }, + { + "entropy": 0.04192428553069476, + "epoch": 0.5911879531511434, + "grad_norm": 0.302734375, + "learning_rate": 1.9694056806426928e-05, + "loss": 0.04169855713844299, + "mean_token_accuracy": 0.9866715084761382, + "num_tokens": 44637866.0, + "step": 530 + }, + { + "entropy": 0.03977415001136251, + "epoch": 0.6023424428332403, + "grad_norm": 0.314453125, + "learning_rate": 1.879703978817256e-05, + "loss": 0.04036850333213806, + "mean_token_accuracy": 0.9872556058689952, + "num_tokens": 45453923.0, + "step": 540 + }, + { + "entropy": 0.04234540155448485, + "epoch": 0.6134969325153374, + "grad_norm": 0.33203125, + "learning_rate": 1.7908455541642584e-05, + "loss": 
0.04180983603000641, + "mean_token_accuracy": 0.9865613304078579, + "num_tokens": 46306551.0, + "step": 550 + }, + { + "entropy": 0.041263596300268546, + "epoch": 0.6246514221974344, + "grad_norm": 0.31640625, + "learning_rate": 1.7029512075247967e-05, + "loss": 0.04135525822639465, + "mean_token_accuracy": 0.986899808421731, + "num_tokens": 47143518.0, + "step": 560 + }, + { + "entropy": 0.04119314953277353, + "epoch": 0.6358059118795315, + "grad_norm": 0.3203125, + "learning_rate": 1.6161404290996412e-05, + "loss": 0.04146760106086731, + "mean_token_accuracy": 0.9868535120040178, + "num_tokens": 47992113.0, + "step": 570 + }, + { + "entropy": 0.04120078657870181, + "epoch": 0.6469604015616286, + "grad_norm": 0.30859375, + "learning_rate": 1.5305312360052442e-05, + "loss": 0.0413068950176239, + "mean_token_accuracy": 0.986842698045075, + "num_tokens": 48843712.0, + "step": 580 + }, + { + "entropy": 0.04128208919428289, + "epoch": 0.6581148912437256, + "grad_norm": 0.326171875, + "learning_rate": 1.4462400118323798e-05, + "loss": 0.041500210762023926, + "mean_token_accuracy": 0.9869369497522712, + "num_tokens": 49688129.0, + "step": 590 + }, + { + "entropy": 0.04088454471202567, + "epoch": 0.6692693809258227, + "grad_norm": 0.33203125, + "learning_rate": 1.3633813484255131e-05, + "loss": 0.041133826971054076, + "mean_token_accuracy": 0.9869873868301511, + "num_tokens": 50520741.0, + "step": 600 + }, + { + "entropy": 0.04227327090920881, + "epoch": 0.6804238706079196, + "grad_norm": 0.275390625, + "learning_rate": 1.2820678900980093e-05, + "loss": 0.04190162420272827, + "mean_token_accuracy": 0.9865590412169695, + "num_tokens": 51392294.0, + "step": 610 + }, + { + "entropy": 0.04065559499140363, + "epoch": 0.6915783602900167, + "grad_norm": 0.328125, + "learning_rate": 1.2024101804949638e-05, + "loss": 0.04115171730518341, + "mean_token_accuracy": 0.9869800698012113, + "num_tokens": 52240430.0, + "step": 620 + }, + { + "entropy": 0.042412614575005135, + "epoch": 
0.7027328499721138, + "grad_norm": 0.3203125, + "learning_rate": 1.124516512311836e-05, + "loss": 0.04237264692783356, + "mean_token_accuracy": 0.9865791719406843, + "num_tokens": 53087953.0, + "step": 630 + }, + { + "entropy": 0.04051031620183494, + "epoch": 0.7138873396542108, + "grad_norm": 0.28515625, + "learning_rate": 1.0484927800731984e-05, + "loss": 0.040881377458572385, + "mean_token_accuracy": 0.9869993204250932, + "num_tokens": 53927989.0, + "step": 640 + }, + { + "entropy": 0.04193990352796391, + "epoch": 0.7250418293363079, + "grad_norm": 0.298828125, + "learning_rate": 9.744423361717323e-06, + "loss": 0.04187402129173279, + "mean_token_accuracy": 0.9866631610319019, + "num_tokens": 54774541.0, + "step": 650 + }, + { + "entropy": 0.03943951329856645, + "epoch": 0.7361963190184049, + "grad_norm": 0.3125, + "learning_rate": 9.024658503631967e-06, + "loss": 0.04017325043678284, + "mean_token_accuracy": 0.9874097904190421, + "num_tokens": 55607613.0, + "step": 660 + }, + { + "entropy": 0.041415746443090026, + "epoch": 0.747350808700502, + "grad_norm": 0.296875, + "learning_rate": 8.32661172908373e-06, + "loss": 0.04164916574954987, + "mean_token_accuracy": 0.9868578946217894, + "num_tokens": 56444089.0, + "step": 670 + }, + { + "entropy": 0.04108071085065603, + "epoch": 0.758505298382599, + "grad_norm": 0.265625, + "learning_rate": 7.651232015480462e-06, + "loss": 0.04107390642166138, + "mean_token_accuracy": 0.986830660328269, + "num_tokens": 57290368.0, + "step": 680 + }, + { + "entropy": 0.04229205273441039, + "epoch": 0.769659788064696, + "grad_norm": 0.306640625, + "learning_rate": 6.99943752491857e-06, + "loss": 0.04177336990833282, + "mean_token_accuracy": 0.9866125296801329, + "num_tokens": 58128968.0, + "step": 690 + }, + { + "entropy": 0.041548075363971294, + "epoch": 0.7808142777467931, + "grad_norm": 0.279296875, + "learning_rate": 6.372114355964293e-06, + "loss": 0.04167112410068512, + "mean_token_accuracy": 0.9867573702707887, + "num_tokens": 
58984460.0, + "step": 700 + }, + { + "entropy": 0.0403200296277646, + "epoch": 0.7919687674288901, + "grad_norm": 0.306640625, + "learning_rate": 5.770115339024484e-06, + "loss": 0.04050106704235077, + "mean_token_accuracy": 0.9871903322637081, + "num_tokens": 59827367.0, + "step": 710 + }, + { + "entropy": 0.0406710500101326, + "epoch": 0.8031232571109872, + "grad_norm": 0.294921875, + "learning_rate": 5.194258876944705e-06, + "loss": 0.04084862470626831, + "mean_token_accuracy": 0.9871157312765717, + "num_tokens": 60654050.0, + "step": 720 + }, + { + "entropy": 0.040411298532853836, + "epoch": 0.8142777467930842, + "grad_norm": 0.28515625, + "learning_rate": 4.645327832410648e-06, + "loss": 0.040474030375480655, + "mean_token_accuracy": 0.9871165057644248, + "num_tokens": 61488530.0, + "step": 730 + }, + { + "entropy": 0.04196117307874374, + "epoch": 0.8254322364751813, + "grad_norm": 0.310546875, + "learning_rate": 4.12406846366562e-06, + "loss": 0.04189785122871399, + "mean_token_accuracy": 0.9867719961330295, + "num_tokens": 62326744.0, + "step": 740 + }, + { + "entropy": 0.040386362894787455, + "epoch": 0.8365867261572784, + "grad_norm": 0.2734375, + "learning_rate": 3.631189409990815e-06, + "loss": 0.04039705097675324, + "mean_token_accuracy": 0.9871017251163721, + "num_tokens": 63170753.0, + "step": 750 + }, + { + "entropy": 0.040684799235896206, + "epoch": 0.8477412158393753, + "grad_norm": 0.265625, + "learning_rate": 3.1673607283276813e-06, + "loss": 0.04109015464782715, + "mean_token_accuracy": 0.9869557719677686, + "num_tokens": 64013316.0, + "step": 760 + }, + { + "entropy": 0.042057951152673925, + "epoch": 0.8588957055214724, + "grad_norm": 0.275390625, + "learning_rate": 2.733212982351957e-06, + "loss": 0.04174878001213074, + "mean_token_accuracy": 0.986495653167367, + "num_tokens": 64863074.0, + "step": 770 + }, + { + "entropy": 0.04044588297838345, + "epoch": 0.8700501952035694, + "grad_norm": 0.283203125, + "learning_rate": 
2.3293363852379125e-06, + "loss": 0.04043938219547272, + "mean_token_accuracy": 0.9872395290061832, + "num_tokens": 65709101.0, + "step": 780 + }, + { + "entropy": 0.04246396276575979, + "epoch": 0.8812046848856665, + "grad_norm": 0.275390625, + "learning_rate": 1.956279997278043e-06, + "loss": 0.041996100544929506, + "mean_token_accuracy": 0.9866539994254708, + "num_tokens": 66553598.0, + "step": 790 + }, + { + "entropy": 0.041515190360951235, + "epoch": 0.8923591745677635, + "grad_norm": 0.322265625, + "learning_rate": 1.6145509794491364e-06, + "loss": 0.041551712155342105, + "mean_token_accuracy": 0.9867776447907091, + "num_tokens": 67405428.0, + "step": 800 + }, + { + "entropy": 0.04269145799044054, + "epoch": 0.9035136642498606, + "grad_norm": 0.283203125, + "learning_rate": 1.3046139039394e-06, + "loss": 0.042556726932525636, + "mean_token_accuracy": 0.9866596391424537, + "num_tokens": 68245631.0, + "step": 810 + }, + { + "entropy": 0.04082234081579372, + "epoch": 0.9146681539319577, + "grad_norm": 0.30859375, + "learning_rate": 1.026890122573998e-06, + "loss": 0.04080590307712555, + "mean_token_accuracy": 0.987078714184463, + "num_tokens": 69069024.0, + "step": 820 + }, + { + "entropy": 0.04289137564774137, + "epoch": 0.9258226436140546, + "grad_norm": 0.29296875, + "learning_rate": 7.817571939976288e-07, + "loss": 0.04283967912197113, + "mean_token_accuracy": 0.9864464558660984, + "num_tokens": 69903803.0, + "step": 830 + }, + { + "entropy": 0.041589401010423896, + "epoch": 0.9369771332961517, + "grad_norm": 0.2890625, + "learning_rate": 5.695483703928306e-07, + "loss": 0.04148242473602295, + "mean_token_accuracy": 0.9868634788319468, + "num_tokens": 70747562.0, + "step": 840 + }, + { + "entropy": 0.04105772517505102, + "epoch": 0.9481316229782487, + "grad_norm": 0.271484375, + "learning_rate": 3.905521444318605e-07, + "loss": 0.04133652150630951, + "mean_token_accuracy": 0.986898991279304, + "num_tokens": 71601623.0, + "step": 850 + }, + { + "entropy": 
0.040625668261782266, + "epoch": 0.9592861126603458, + "grad_norm": 0.322265625, + "learning_rate": 2.450118570779786e-07, + "loss": 0.04106319844722748, + "mean_token_accuracy": 0.9870263114571571, + "num_tokens": 72449591.0, + "step": 860 + }, + { + "entropy": 0.041737568736425604, + "epoch": 0.9704406023424428, + "grad_norm": 0.28515625, + "learning_rate": 1.3312536676942377e-07, + "loss": 0.04145269989967346, + "mean_token_accuracy": 0.9866915429010987, + "num_tokens": 73305562.0, + "step": 870 + }, + { + "entropy": 0.04121337772812694, + "epoch": 0.9815950920245399, + "grad_norm": 0.28515625, + "learning_rate": 5.5044780435722923e-08, + "loss": 0.04093064665794373, + "mean_token_accuracy": 0.986900057643652, + "num_tokens": 74143208.0, + "step": 880 + }, + { + "entropy": 0.040503930338309145, + "epoch": 0.992749581706637, + "grad_norm": 0.287109375, + "learning_rate": 1.0876246712074322e-08, + "loss": 0.0403281182050705, + "mean_token_accuracy": 0.9871442951261997, + "num_tokens": 74991508.0, + "step": 890 + }, + { + "epoch": 1.0, + "eval_entropy": 0.041321987748146057, + "eval_loss": 0.03999410942196846, + "eval_mean_token_accuracy": 0.9868998980522156, + "eval_num_tokens": 75541075.0, + "eval_runtime": 50.0483, + "eval_samples_per_second": 19.981, + "eval_steps_per_second": 9.99, + "step": 897 + } + ], + "logging_steps": 10, + "max_steps": 897, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 5.495254912950835e+16, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-897/training_args.bin b/checkpoint-897/training_args.bin new file mode 100644 index 0000000..8c58fec --- /dev/null +++ b/checkpoint-897/training_args.bin @@ -0,0 +1,3 @@ +version 
https://git-lfs.github.com/spec/v1 +oid sha256:cf09ccdfa77c381d3289f0527036b2ded899e83c480f73afe09a2e9f4202aaef +size 5368 diff --git a/config.json b/config.json new file mode 100644 index 0000000..662be51 --- /dev/null +++ b/config.json @@ -0,0 +1,62 @@ +{ + "_sliding_window_pattern": 6, + "architectures": [ + "Gemma3ForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "attn_logit_softcapping": null, + "bos_token_id": 2, + "dtype": "bfloat16", + "eos_token_id": 1, + "final_logit_softcapping": null, + "head_dim": 256, + "hidden_activation": "gelu_pytorch_tanh", + "hidden_size": 640, + "initializer_range": 0.02, + "intermediate_size": 2048, + "layer_types": [ + "sliding_attention", + "sliding_attention", + "sliding_attention", + "sliding_attention", + "sliding_attention", + "full_attention", + "sliding_attention", + "sliding_attention", + "sliding_attention", + "sliding_attention", + "sliding_attention", + "full_attention", + "sliding_attention", + "sliding_attention", + "sliding_attention", + "sliding_attention", + "sliding_attention", + "full_attention" + ], + "max_position_embeddings": 32768, + "model_type": "gemma3_text", + "num_attention_heads": 4, + "num_hidden_layers": 18, + "num_key_value_heads": 1, + "pad_token_id": 0, + "query_pre_attn_scalar": 256, + "rms_norm_eps": 1e-06, + "rope_parameters": { + "full_attention": { + "rope_theta": 1000000.0, + "rope_type": "default" + }, + "sliding_attention": { + "rope_theta": 10000.0, + "rope_type": "default" + } + }, + "sliding_window": 512, + "tie_word_embeddings": true, + "transformers_version": "5.7.0", + "use_bidirectional_attention": false, + "use_cache": false, + "vocab_size": 262144 +} diff --git a/generation_config.json b/generation_config.json new file mode 100644 index 0000000..69cb412 --- /dev/null +++ b/generation_config.json @@ -0,0 +1,15 @@ +{ + "bos_token_id": 2, + "cache_implementation": "hybrid", + "do_sample": true, + "eos_token_id": [ + 1, + 1, + 50, + 106 + ], + 
"pad_token_id": 0, + "top_k": 64, + "top_p": 0.95, + "transformers_version": "5.7.0" +} diff --git a/model.safetensors b/model.safetensors new file mode 100644 index 0000000..c7ebd0e --- /dev/null +++ b/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:efbb32f4990b6a076d86e5dc1b83f90236f9f459ac4cdb44d1847818f2d99171 +size 536223056 diff --git a/nives-fg-270m-v1-F16.gguf b/nives-fg-270m-v1-F16.gguf new file mode 100644 index 0000000..96e962c --- /dev/null +++ b/nives-fg-270m-v1-F16.gguf @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:26e0a65d5714008b9c8158a2b79d25de02a2adf05b4c0856ae6e3c191f2e9d31 +size 542848640 diff --git a/nives-fg-270m-v1-Q4_K_M.gguf b/nives-fg-270m-v1-Q4_K_M.gguf new file mode 100644 index 0000000..a838f91 --- /dev/null +++ b/nives-fg-270m-v1-Q4_K_M.gguf @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:36263ceda5f394b030cfc8415af615216cb11b76124a9ca25bcf1706f38ced0e +size 253128320 diff --git a/runs/May05_17-50-33_d47cde4b1ec4/events.out.tfevents.1778003433.d47cde4b1ec4.11675.0 b/runs/May05_17-50-33_d47cde4b1ec4/events.out.tfevents.1778003433.d47cde4b1ec4.11675.0 new file mode 100644 index 0000000..bd503b8 --- /dev/null +++ b/runs/May05_17-50-33_d47cde4b1ec4/events.out.tfevents.1778003433.d47cde4b1ec4.11675.0 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f048174a168bace4a11eb831c560e96793f0f1f751390d8ca21fea146f6379b7 +size 40488 diff --git a/tokenizer.json b/tokenizer.json new file mode 100644 index 0000000..93a0904 --- /dev/null +++ b/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:80d7f800b949accd7eb940bac75e642f9468e4df157403032a55bf54ed23b650 +size 33384898 diff --git a/tokenizer_config.json b/tokenizer_config.json new file mode 100644 index 0000000..5a51e1f --- /dev/null +++ b/tokenizer_config.json @@ -0,0 +1,27 @@ +{ + "backend": "tokenizers", + "boi_token": "", + "bos_token": "", + 
"clean_up_tokenization_spaces": false, + "eoi_token": "", + "eos_token": "", + "image_token": "", + "is_local": false, + "local_files_only": false, + "mask_token": "", + "model_max_length": 1000000000000000019884624838656, + "model_specific_special_tokens": { + "boi_token": "", + "eoi_token": "", + "image_token": "", + "sfr_token": "" + }, + "pad_token": "", + "padding_side": "left", + "sfr_token": "", + "sp_model_kwargs": null, + "spaces_between_special_tokens": false, + "tokenizer_class": "GemmaTokenizer", + "unk_token": "", + "use_default_system_prompt": false +} diff --git a/training_args.bin b/training_args.bin new file mode 100644 index 0000000..8c58fec --- /dev/null +++ b/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cf09ccdfa77c381d3289f0527036b2ded899e83c480f73afe09a2e9f4202aaef +size 5368