初始化项目，由ModelHub XC社区提供模型

Model: heavycoderhh/counsel-env-qwen3-0.6b-grpo-run2 Source: Original Platform
2026-06-16 08:16:17 +08:00
commit d5d0e722af
93 changed files with 4106 additions and 0 deletions
--- a/checkpoint-125/chat_template.jinja
+++ b/checkpoint-125/chat_template.jinja
@@ -0,0 +1,89 @@
+{%- if tools %}
+    {{- '<|im_start|>system\n' }}
+    {%- if messages[0].role == 'system' %}
+        {{- messages[0].content + '\n\n' }}
+    {%- endif %}
+    {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
+    {%- for tool in tools %}
+        {{- "\n" }}
+        {{- tool | tojson }}
+    {%- endfor %}
+    {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
+{%- else %}
+    {%- if messages[0].role == 'system' %}
+        {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }}
+    {%- endif %}
+{%- endif %}
+{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}
+{%- for message in messages[::-1] %}
+    {%- set index = (messages|length - 1) - loop.index0 %}
+    {%- if ns.multi_step_tool and message.role == "user" and message.content is string and not(message.content.startswith('<tool_response>') and message.content.endswith('</tool_response>')) %}
+        {%- set ns.multi_step_tool = false %}
+        {%- set ns.last_query_index = index %}
+    {%- endif %}
+{%- endfor %}
+{%- for message in messages %}
+    {%- if message.content is string %}
+        {%- set content = message.content %}
+    {%- else %}
+        {%- set content = '' %}
+    {%- endif %}
+    {%- if (message.role == "user") or (message.role == "system" and not loop.first) %}
+        {{- '<|im_start|>' + message.role + '\n' + content + '<|im_end|>' + '\n' }}
+    {%- elif message.role == "assistant" %}
+        {%- set reasoning_content = '' %}
+        {%- if message.reasoning_content is string %}
+            {%- set reasoning_content = message.reasoning_content %}
+        {%- else %}
+            {%- if '</think>' in content %}
+                {%- set reasoning_content = content.split('</think>')[0].rstrip('\n').split('<think>')[-1].lstrip('\n') %}
+                {%- set content = content.split('</think>')[-1].lstrip('\n') %}
+            {%- endif %}
+        {%- endif %}
+        {%- if loop.index0 > ns.last_query_index %}
+            {%- if loop.last or (not loop.last and reasoning_content) %}
+                {{- '<|im_start|>' + message.role + '\n<think>\n' + reasoning_content.strip('\n') + '\n</think>\n\n' + content.lstrip('\n') }}
+            {%- else %}
+                {{- '<|im_start|>' + message.role + '\n' + content }}
+            {%- endif %}
+        {%- else %}
+            {{- '<|im_start|>' + message.role + '\n' + content }}
+        {%- endif %}
+        {%- if message.tool_calls %}
+            {%- for tool_call in message.tool_calls %}
+                {%- if (loop.first and content) or (not loop.first) %}
+                    {{- '\n' }}
+                {%- endif %}
+                {%- if tool_call.function %}
+                    {%- set tool_call = tool_call.function %}
+                {%- endif %}
+                {{- '<tool_call>\n{"name": "' }}
+                {{- tool_call.name }}
+                {{- '", "arguments": ' }}
+                {%- if tool_call.arguments is string %}
+                    {{- tool_call.arguments }}
+                {%- else %}
+                    {{- tool_call.arguments | tojson }}
+                {%- endif %}
+                {{- '}\n</tool_call>' }}
+            {%- endfor %}
+        {%- endif %}
+        {{- '<|im_end|>\n' }}
+    {%- elif message.role == "tool" %}
+        {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %}
+            {{- '<|im_start|>user' }}
+        {%- endif %}
+        {{- '\n<tool_response>\n' }}
+        {{- content }}
+        {{- '\n</tool_response>' }}
+        {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
+            {{- '<|im_end|>\n' }}
+        {%- endif %}
+    {%- endif %}
+{%- endfor %}
+{%- if add_generation_prompt %}
+    {{- '<|im_start|>assistant\n' }}
+    {%- if enable_thinking is defined and enable_thinking is false %}
+        {{- '<think>\n\n</think>\n\n' }}
+    {%- endif %}
+{%- endif %}
--- a/checkpoint-125/config.json
+++ b/checkpoint-125/config.json
@@ -0,0 +1,63 @@
+{
+  "architectures": [
+    "Qwen3ForCausalLM"
+  ],
+  "attention_bias": false,
+  "attention_dropout": 0.0,
+  "bos_token_id": null,
+  "dtype": "float32",
+  "eos_token_id": 151645,
+  "head_dim": 128,
+  "hidden_act": "silu",
+  "hidden_size": 1024,
+  "initializer_range": 0.02,
+  "intermediate_size": 3072,
+  "layer_types": [
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention"
+  ],
+  "max_position_embeddings": 40960,
+  "max_window_layers": 28,
+  "model_type": "qwen3",
+  "num_attention_heads": 16,
+  "num_hidden_layers": 28,
+  "num_key_value_heads": 8,
+  "pad_token_id": 151643,
+  "rms_norm_eps": 1e-06,
+  "rope_parameters": {
+    "rope_theta": 1000000,
+    "rope_type": "default"
+  },
+  "sliding_window": null,
+  "tie_word_embeddings": true,
+  "transformers_version": "5.6.2",
+  "use_cache": false,
+  "use_sliding_window": false,
+  "vocab_size": 151936
+}
--- a/checkpoint-125/generation_config.json
+++ b/checkpoint-125/generation_config.json
@@ -0,0 +1,12 @@
+{
+  "do_sample": true,
+  "eos_token_id": [
+    151645,
+    151643
+  ],
+  "pad_token_id": 151643,
+  "temperature": 0.6,
+  "top_k": 20,
+  "top_p": 0.95,
+  "transformers_version": "5.6.2"
+}
--- a/checkpoint-125/model.safetensors
+++ b/checkpoint-125/model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9cfa4edd146d430b8bbff0802fd8c777682b57609ee370304d76982a5919498b
+size 2384234968
--- a/checkpoint-125/optimizer.pt
+++ b/checkpoint-125/optimizer.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:2ca431c190b97d71d34b7476e9c0b446b1f7a0854090fdc7233f7e708432c5f4
+size 4768669395
--- a/checkpoint-125/rng_state.pth
+++ b/checkpoint-125/rng_state.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ce9d06b6c01a6abb92a5b52d5d361d96821b8e35f68158be1830efadebe0f334
+size 14645
--- a/checkpoint-125/scheduler.pt
+++ b/checkpoint-125/scheduler.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e084e51a71375a2e59178fefd4967d7cdc87b72fa22758078088a60489f32adb
+size 1465
--- a/checkpoint-125/tokenizer.json
+++ b/checkpoint-125/tokenizer.json
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:be75606093db2094d7cd20f3c2f385c212750648bd6ea4fb2bf507a6a4c55506
+size 11422650
--- a/checkpoint-125/tokenizer_config.json
+++ b/checkpoint-125/tokenizer_config.json
@@ -0,0 +1,78 @@
+{
+  "add_prefix_space": false,
+  "backend": "tokenizers",
+  "bos_token": null,
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "<|im_end|>",
+  "errors": "replace",
+  "extra_special_tokens": [
+    "<|im_start|>",
+    "<|im_end|>",
+    "<|object_ref_start|>",
+    "<|object_ref_end|>",
+    "<|box_start|>",
+    "<|box_end|>",
+    "<|quad_start|>",
+    "<|quad_end|>",
+    "<|vision_start|>",
+    "<|vision_end|>",
+    "<|vision_pad|>",
+    "<|image_pad|>",
+    "<|video_pad|>"
+  ],
+  "is_local": true,
+  "local_files_only": false,
+  "max_length": 3072,
+  "model_max_length": 131072,
+  "pad_token": "<|endoftext|>",
+  "padding_side": "left",
+  "response_schema": {
+    "properties": {
+      "content": {
+        "type": "string"
+      },
+      "reasoning_content": {
+        "type": "string"
+      },
+      "role": {
+        "const": "assistant"
+      },
+      "tool_calls": {
+        "items": {
+          "properties": {
+            "function": {
+              "properties": {
+                "arguments": {
+                  "additionalProperties": {},
+                  "type": "object"
+                },
+                "name": {
+                  "type": "string"
+                }
+              },
+              "type": "object"
+            },
+            "type": {
+              "const": "function"
+            }
+          },
+          "type": "object",
+          "x-parser": "json",
+          "x-parser-args": {
+            "transform": "{type: 'function', function: @}"
+          }
+        },
+        "type": "array",
+        "x-regex-iterator": "<tool_call>\\s*(.+?)\\s*</tool_call>"
+      }
+    },
+    "type": "object",
+    "x-regex": "^(?:<think>\\n?(?:(?P<reasoning_content>.*?\\S.*?)\\n?|[\\s]*)</think>\\s*)?(?P<content>.*?)(?:\\n(?=<tool_call>))?(?=(?:<tool_call>|<\\|im_end\\|>|$))(?P<tool_calls>(?:<tool_call>.+?</tool_call>\\s*)+)?\\s*(?:<\\|im_end\\|>|$)"
+  },
+  "split_special_tokens": false,
+  "stride": 0,
+  "tokenizer_class": "Qwen2Tokenizer",
+  "truncation_side": "left",
+  "truncation_strategy": "longest_first",
+  "unk_token": null
+}
--- a/checkpoint-125/trainer_state.json
+++ b/checkpoint-125/trainer_state.json
@@ -0,0 +1,784 @@
+{
+  "best_global_step": null,
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 0.78125,
+  "eval_steps": 500,
+  "global_step": 125,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.6,
+      "completions/max_length": 199.6,
+      "completions/max_terminated_length": 134.0,
+      "completions/mean_length": 171.9,
+      "completions/mean_terminated_length": 122.36666870117188,
+      "completions/min_length": 113.0,
+      "completions/min_terminated_length": 113.0,
+      "entropy": 0.06970996516756714,
+      "epoch": 0.03125,
+      "frac_reward_zero_std": 0.6,
+      "grad_norm": 3.630038261413574,
+      "kl": 0.00014932112862879875,
+      "learning_rate": 4.92e-06,
+      "loss": 0.029165178537368774,
+      "num_tokens": 15758.0,
+      "reward": -0.31389998495578764,
+      "reward_std": 0.2122000053524971,
+      "rewards/reward_func/mean": -0.31389998495578764,
+      "rewards/reward_func/std": 0.21219999492168426,
+      "step": 5,
+      "step_time": 14.728857926794444,
+      "tools/call_frequency": 3.45,
+      "tools/failure_frequency": 0.21573015451431274
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 168.6,
+      "completions/max_terminated_length": 168.6,
+      "completions/mean_length": 148.3,
+      "completions/mean_terminated_length": 148.3,
+      "completions/min_length": 129.6,
+      "completions/min_terminated_length": 129.6,
+      "entropy": 0.042718362715095284,
+      "epoch": 0.0625,
+      "frac_reward_zero_std": 0.2,
+      "grad_norm": 3.325033187866211,
+      "kl": 0.037860750965774057,
+      "learning_rate": 4.8200000000000004e-06,
+      "loss": -0.011221970617771148,
+      "num_tokens": 31053.0,
+      "reward": 0.2989000082015991,
+      "reward_std": 0.4415143087506294,
+      "rewards/reward_func/mean": 0.2989000082015991,
+      "rewards/reward_func/std": 0.4415143221616745,
+      "step": 10,
+      "step_time": 9.975367512006779,
+      "tools/call_frequency": 2.5,
+      "tools/failure_frequency": 0.0
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 152.8,
+      "completions/max_terminated_length": 152.8,
+      "completions/mean_length": 131.3,
+      "completions/mean_terminated_length": 131.3,
+      "completions/min_length": 113.6,
+      "completions/min_terminated_length": 113.6,
+      "entropy": 0.016039706021547317,
+      "epoch": 0.09375,
+      "frac_reward_zero_std": 0.4,
+      "grad_norm": 1.1289054155349731,
+      "kl": 0.06640795171260834,
+      "learning_rate": 4.7200000000000005e-06,
+      "loss": 0.04752160608768463,
+      "num_tokens": 45857.0,
+      "reward": 1.1023000121116637,
+      "reward_std": 0.4320605039596558,
+      "rewards/reward_func/mean": 1.1023000121116637,
+      "rewards/reward_func/std": 0.43206052780151366,
+      "step": 15,
+      "step_time": 8.620344271202338,
+      "tools/call_frequency": 2.35,
+      "tools/failure_frequency": 0.0
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.1,
+      "completions/max_length": 172.0,
+      "completions/max_terminated_length": 172.0,
+      "completions/mean_length": 136.55,
+      "completions/mean_terminated_length": 137.2500030517578,
+      "completions/min_length": 111.4,
+      "completions/min_terminated_length": 111.4,
+      "entropy": 0.027425602450966834,
+      "epoch": 0.125,
+      "frac_reward_zero_std": 0.6,
+      "grad_norm": 0.8991426229476929,
+      "kl": 0.09577701878733932,
+      "learning_rate": 4.620000000000001e-06,
+      "loss": -0.1201351523399353,
+      "num_tokens": 60826.0,
+      "reward": 0.7200000047683716,
+      "reward_std": 0.3419178485870361,
+      "rewards/reward_func/mean": 0.7200000047683716,
+      "rewards/reward_func/std": 0.3419178485870361,
+      "step": 20,
+      "step_time": 11.403528443601681,
+      "tools/call_frequency": 2.05,
+      "tools/failure_frequency": 0.026666668057441712
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.35,
+      "completions/max_length": 203.2,
+      "completions/max_terminated_length": 161.2,
+      "completions/mean_length": 188.6,
+      "completions/mean_terminated_length": 150.6666687011719,
+      "completions/min_length": 173.4,
+      "completions/min_terminated_length": 139.6,
+      "entropy": 0.033282498246990144,
+      "epoch": 0.15625,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 2.041987657546997,
+      "kl": 0.051508421916514634,
+      "learning_rate": 4.520000000000001e-06,
+      "loss": 0.03198407888412476,
+      "num_tokens": 76838.0,
+      "reward": 1.2669333696365357,
+      "reward_std": 0.3234894543886185,
+      "rewards/reward_func/mean": 1.2669333696365357,
+      "rewards/reward_func/std": 0.32348946332931516,
+      "step": 25,
+      "step_time": 13.736867211584467,
+      "tools/call_frequency": 4.4,
+      "tools/failure_frequency": 0.14583333432674409
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.65,
+      "completions/max_length": 209.4,
+      "completions/max_terminated_length": 160.8,
+      "completions/mean_length": 192.45,
+      "completions/mean_terminated_length": 148.3,
+      "completions/min_length": 169.2,
+      "completions/min_terminated_length": 135.8,
+      "entropy": 0.04025774166220799,
+      "epoch": 0.1875,
+      "frac_reward_zero_std": 0.4,
+      "grad_norm": 1.6383038759231567,
+      "kl": 0.09242036554496735,
+      "learning_rate": 4.42e-06,
+      "loss": -0.03659022152423859,
+      "num_tokens": 93054.0,
+      "reward": 1.0333500146865844,
+      "reward_std": 0.38981522917747496,
+      "rewards/reward_func/mean": 1.0333500146865844,
+      "rewards/reward_func/std": 0.389815217256546,
+      "step": 30,
+      "step_time": 14.735964270806289,
+      "tools/call_frequency": 3.85,
+      "tools/failure_frequency": 0.023529411852359773
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.7,
+      "completions/max_length": 207.4,
+      "completions/max_terminated_length": 168.8,
+      "completions/mean_length": 196.4,
+      "completions/mean_terminated_length": 166.7,
+      "completions/min_length": 191.2,
+      "completions/min_terminated_length": 164.6,
+      "entropy": 0.02646293715806678,
+      "epoch": 0.21875,
+      "frac_reward_zero_std": 0.2,
+      "grad_norm": 0.6842532157897949,
+      "kl": 0.09354882184416055,
+      "learning_rate": 4.32e-06,
+      "loss": 0.014650090038776398,
+      "num_tokens": 109141.0,
+      "reward": 1.0134333491325378,
+      "reward_std": 0.28623148798942566,
+      "rewards/reward_func/mean": 1.0134333491325378,
+      "rewards/reward_func/std": 0.2862314820289612,
+      "step": 35,
+      "step_time": 14.25194917320332,
+      "tools/call_frequency": 3.95,
+      "tools/failure_frequency": 0.0
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.7,
+      "completions/max_length": 228.8,
+      "completions/max_terminated_length": 163.6,
+      "completions/mean_length": 207.8,
+      "completions/mean_terminated_length": 162.3,
+      "completions/min_length": 188.8,
+      "completions/min_terminated_length": 161.0,
+      "entropy": 0.049686831969302146,
+      "epoch": 0.25,
+      "frac_reward_zero_std": 0.4,
+      "grad_norm": 2.386836528778076,
+      "kl": 0.12552661653608083,
+      "learning_rate": 4.22e-06,
+      "loss": 0.023246073722839357,
+      "num_tokens": 125712.0,
+      "reward": 0.9764333426952362,
+      "reward_std": 0.3545127585530281,
+      "rewards/reward_func/mean": 0.9764333426952362,
+      "rewards/reward_func/std": 0.35451277494430544,
+      "step": 40,
+      "step_time": 16.735324517198023,
+      "tools/call_frequency": 3.45,
+      "tools/failure_frequency": 0.02857142984867096
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.8,
+      "completions/max_length": 231.4,
+      "completions/max_terminated_length": 127.2,
+      "completions/mean_length": 209.4,
+      "completions/mean_terminated_length": 124.2,
+      "completions/min_length": 187.2,
+      "completions/min_terminated_length": 121.2,
+      "entropy": 0.14096241008955984,
+      "epoch": 0.28125,
+      "frac_reward_zero_std": 0.2,
+      "grad_norm": 5.072839260101318,
+      "kl": 0.10897002797573804,
+      "learning_rate": 4.12e-06,
+      "loss": 0.05337468385696411,
+      "num_tokens": 142131.0,
+      "reward": 1.0291000008583069,
+      "reward_std": 0.5297403573989868,
+      "rewards/reward_func/mean": 1.0291000008583069,
+      "rewards/reward_func/std": 0.5297403573989868,
+      "step": 45,
+      "step_time": 17.371078941601446,
+      "tools/call_frequency": 3.4,
+      "tools/failure_frequency": 0.01428571492433548
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.6,
+      "completions/max_length": 237.2,
+      "completions/max_terminated_length": 172.2,
+      "completions/mean_length": 201.7,
+      "completions/mean_terminated_length": 160.46666870117187,
+      "completions/min_length": 186.0,
+      "completions/min_terminated_length": 150.8,
+      "entropy": 0.1540619947016239,
+      "epoch": 0.3125,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 5.2555952072143555,
+      "kl": 0.17082785218954086,
+      "learning_rate": 4.0200000000000005e-06,
+      "loss": 0.06733548641204834,
+      "num_tokens": 158431.0,
+      "reward": 0.8427666783332824,
+      "reward_std": 0.6860074520111084,
+      "rewards/reward_func/mean": 0.8427666783332824,
+      "rewards/reward_func/std": 0.6860074281692505,
+      "step": 50,
+      "step_time": 17.60776922639343,
+      "tools/call_frequency": 3.6,
+      "tools/failure_frequency": 0.027619048953056335
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.55,
+      "completions/max_length": 209.2,
+      "completions/max_terminated_length": 209.2,
+      "completions/mean_length": 198.0,
+      "completions/mean_terminated_length": 206.43333435058594,
+      "completions/min_length": 189.4,
+      "completions/min_terminated_length": 203.8,
+      "entropy": 0.11417091116309167,
+      "epoch": 0.34375,
+      "frac_reward_zero_std": 0.8,
+      "grad_norm": 0.1612984985113144,
+      "kl": 0.14481508396565915,
+      "learning_rate": 3.920000000000001e-06,
+      "loss": -0.0013940947130322457,
+      "num_tokens": 174665.0,
+      "reward": 1.337833333015442,
+      "reward_std": 0.04058598577976227,
+      "rewards/reward_func/mean": 1.337833333015442,
+      "rewards/reward_func/std": 0.04058598577976227,
+      "step": 55,
+      "step_time": 13.894916865596315,
+      "tools/call_frequency": 3.9,
+      "tools/failure_frequency": 0.0
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.35,
+      "completions/max_length": 210.6,
+      "completions/max_terminated_length": 210.0,
+      "completions/mean_length": 189.7,
+      "completions/mean_terminated_length": 184.93333435058594,
+      "completions/min_length": 152.0,
+      "completions/min_terminated_length": 153.4,
+      "entropy": 0.18207021439447998,
+      "epoch": 0.375,
+      "frac_reward_zero_std": 0.2,
+      "grad_norm": 7.57163667678833,
+      "kl": 0.2769763808697462,
+      "learning_rate": 3.820000000000001e-06,
+      "loss": -0.08738029599189759,
+      "num_tokens": 190974.0,
+      "reward": 0.9539999723434448,
+      "reward_std": 0.24900673925876618,
+      "rewards/reward_func/mean": 0.9539999723434448,
+      "rewards/reward_func/std": 0.2490067459642887,
+      "step": 60,
+      "step_time": 13.735741792595945,
+      "tools/call_frequency": 3.35,
+      "tools/failure_frequency": 0.0
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.55,
+      "completions/max_length": 213.6,
+      "completions/max_terminated_length": 186.6,
+      "completions/mean_length": 196.15,
+      "completions/mean_terminated_length": 181.2,
+      "completions/min_length": 173.2,
+      "completions/min_terminated_length": 177.0,
+      "entropy": 0.18931779703125357,
+      "epoch": 0.40625,
+      "frac_reward_zero_std": 0.2,
+      "grad_norm": 0.3368631601333618,
+      "kl": 0.19928277991712093,
+      "learning_rate": 3.7200000000000004e-06,
+      "loss": -0.03082091510295868,
+      "num_tokens": 207221.0,
+      "reward": 1.1948333382606506,
+      "reward_std": 0.3531351625919342,
+      "rewards/reward_func/mean": 1.1948333382606506,
+      "rewards/reward_func/std": 0.3531351566314697,
+      "step": 65,
+      "step_time": 14.853071747999639,
+      "tools/call_frequency": 3.45,
+      "tools/failure_frequency": 0.0
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.45,
+      "completions/max_length": 232.4,
+      "completions/max_terminated_length": 231.4,
+      "completions/mean_length": 214.8,
+      "completions/mean_terminated_length": 221.10000305175782,
+      "completions/min_length": 199.4,
+      "completions/min_terminated_length": 211.6,
+      "entropy": 0.20331259737722576,
+      "epoch": 0.4375,
+      "frac_reward_zero_std": 0.6,
+      "grad_norm": 3.155299663543701,
+      "kl": 0.21616111248731612,
+      "learning_rate": 3.62e-06,
+      "loss": -0.014388753473758698,
+      "num_tokens": 223949.0,
+      "reward": 1.187999999523163,
+      "reward_std": 0.06400000005960464,
+      "rewards/reward_func/mean": 1.187999999523163,
+      "rewards/reward_func/std": 0.06399999856948853,
+      "step": 70,
+      "step_time": 15.7972018689994,
+      "tools/call_frequency": 3.3,
+      "tools/failure_frequency": 0.0
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.75,
+      "completions/max_length": 228.0,
+      "completions/max_terminated_length": 180.2,
+      "completions/mean_length": 217.2,
+      "completions/mean_terminated_length": 174.9,
+      "completions/min_length": 208.2,
+      "completions/min_terminated_length": 169.6,
+      "entropy": 0.09874274502508343,
+      "epoch": 0.46875,
+      "frac_reward_zero_std": 0.4,
+      "grad_norm": 0.1496252417564392,
+      "kl": 0.19251887053251265,
+      "learning_rate": 3.52e-06,
+      "loss": 0.0129203662276268,
+      "num_tokens": 240663.0,
+      "reward": 1.166100013256073,
+      "reward_std": 0.27513332962989806,
+      "rewards/reward_func/mean": 1.166100013256073,
+      "rewards/reward_func/std": 0.275133341550827,
+      "step": 75,
+      "step_time": 15.653593644002104,
+      "tools/call_frequency": 3.15,
+      "tools/failure_frequency": 0.0
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.4,
+      "completions/max_length": 250.6,
+      "completions/max_terminated_length": 235.4,
+      "completions/mean_length": 217.2,
+      "completions/mean_terminated_length": 209.23333740234375,
+      "completions/min_length": 169.8,
+      "completions/min_terminated_length": 174.0,
+      "entropy": 0.18624852728098631,
+      "epoch": 0.5,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 3.9008662700653076,
+      "kl": 0.19779104925692081,
+      "learning_rate": 3.4200000000000007e-06,
+      "loss": -0.060715597867965695,
+      "num_tokens": 257232.0,
+      "reward": 1.094600009918213,
+      "reward_std": 0.533681058883667,
+      "rewards/reward_func/mean": 1.094600009918213,
+      "rewards/reward_func/std": 0.5336810708045959,
+      "step": 80,
+      "step_time": 16.87674882839783,
+      "tools/call_frequency": 2.7,
+      "tools/failure_frequency": 0.0
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.8,
+      "completions/max_length": 263.2,
+      "completions/max_terminated_length": 149.0,
+      "completions/mean_length": 238.9,
+      "completions/mean_terminated_length": 138.2,
+      "completions/min_length": 220.0,
+      "completions/min_terminated_length": 127.4,
+      "entropy": 0.06761846686713398,
+      "epoch": 0.53125,
+      "frac_reward_zero_std": 0.6,
+      "grad_norm": 0.0583312027156353,
+      "kl": 0.16298045124858618,
+      "learning_rate": 3.3200000000000004e-06,
+      "loss": 0.0317715585231781,
+      "num_tokens": 274377.0,
+      "reward": 1.168333351612091,
+      "reward_std": 0.21399999260902405,
+      "rewards/reward_func/mean": 1.168333351612091,
+      "rewards/reward_func/std": 0.214000004529953,
+      "step": 85,
+      "step_time": 19.253501980405417,
+      "tools/call_frequency": 2.45,
+      "tools/failure_frequency": 0.0
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.9,
+      "completions/max_length": 252.0,
+      "completions/max_terminated_length": 79.6,
+      "completions/mean_length": 229.35,
+      "completions/mean_terminated_length": 79.6,
+      "completions/min_length": 212.6,
+      "completions/min_terminated_length": 79.6,
+      "entropy": 0.04304317501373589,
+      "epoch": 0.5625,
+      "frac_reward_zero_std": 0.4,
+      "grad_norm": 0.05765737593173981,
+      "kl": 0.1589741975069046,
+      "learning_rate": 3.2200000000000005e-06,
+      "loss": -0.009884151071310044,
+      "num_tokens": 291640.0,
+      "reward": 1.0771000266075135,
+      "reward_std": 0.2571271777153015,
+      "rewards/reward_func/mean": 1.0771000266075135,
+      "rewards/reward_func/std": 0.257127183675766,
+      "step": 90,
+      "step_time": 19.810263851404308,
+      "tools/call_frequency": 2.7,
+      "tools/failure_frequency": 0.0
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.75,
+      "completions/max_length": 252.2,
+      "completions/max_terminated_length": 155.6,
+      "completions/mean_length": 227.45,
+      "completions/mean_terminated_length": 154.7,
+      "completions/min_length": 202.6,
+      "completions/min_terminated_length": 153.8,
+      "entropy": 0.03856636304408312,
+      "epoch": 0.59375,
+      "frac_reward_zero_std": 0.2,
+      "grad_norm": 2.2899415493011475,
+      "kl": 0.18391469195485116,
+      "learning_rate": 3.12e-06,
+      "loss": 0.012278559803962707,
+      "num_tokens": 308671.0,
+      "reward": 0.9493666887283325,
+      "reward_std": 0.3057107627391815,
+      "rewards/reward_func/mean": 0.9493666887283325,
+      "rewards/reward_func/std": 0.3057107746601105,
+      "step": 95,
+      "step_time": 18.270148772597896,
+      "tools/call_frequency": 2.75,
+      "tools/failure_frequency": 0.0
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.65,
+      "completions/max_length": 229.0,
+      "completions/max_terminated_length": 128.6,
+      "completions/mean_length": 210.05,
+      "completions/mean_terminated_length": 121.23333435058593,
+      "completions/min_length": 191.4,
+      "completions/min_terminated_length": 114.6,
+      "entropy": 0.03718785918317735,
+      "epoch": 0.625,
+      "frac_reward_zero_std": 0.2,
+      "grad_norm": 1.4016427993774414,
+      "kl": 0.19276840873062612,
+      "learning_rate": 3.0200000000000003e-06,
+      "loss": -0.02043401300907135,
+      "num_tokens": 325246.0,
+      "reward": 0.9758000135421753,
+      "reward_std": 0.439729905128479,
+      "rewards/reward_func/mean": 0.9758000135421753,
+      "rewards/reward_func/std": 0.439729905128479,
+      "step": 100,
+      "step_time": 16.536685503809714,
+      "tools/call_frequency": 3.4,
+      "tools/failure_frequency": 0.0
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.45,
+      "completions/max_length": 231.0,
+      "completions/max_terminated_length": 175.6,
+      "completions/mean_length": 209.7,
+      "completions/mean_terminated_length": 164.73333435058595,
+      "completions/min_length": 197.2,
+      "completions/min_terminated_length": 156.2,
+      "entropy": 0.0890876273624599,
+      "epoch": 0.65625,
+      "frac_reward_zero_std": 0.6,
+      "grad_norm": 1.857412338256836,
+      "kl": 0.20793221928179265,
+      "learning_rate": 2.92e-06,
+      "loss": 0.010671529173851012,
+      "num_tokens": 341743.0,
+      "reward": 1.244200015068054,
+      "reward_std": 0.25437753796577456,
+      "rewards/reward_func/mean": 1.244200015068054,
+      "rewards/reward_func/std": 0.25437754988670347,
+      "step": 105,
+      "step_time": 14.550393618003,
+      "tools/call_frequency": 3.4,
+      "tools/failure_frequency": 0.0
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.6,
+      "completions/max_length": 228.0,
+      "completions/max_terminated_length": 215.0,
+      "completions/mean_length": 211.05,
+      "completions/mean_terminated_length": 204.6,
+      "completions/min_length": 194.2,
+      "completions/min_terminated_length": 194.2,
+      "entropy": 0.09650332322344184,
+      "epoch": 0.6875,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.5915409922599792,
+      "kl": 0.1943995427340269,
+      "learning_rate": 2.82e-06,
+      "loss": -0.007803649455308914,
+      "num_tokens": 358474.0,
+      "reward": 0.9213667035102844,
+      "reward_std": 0.48010437488555907,
+      "rewards/reward_func/mean": 0.9213667035102844,
+      "rewards/reward_func/std": 0.480104398727417,
+      "step": 110,
+      "step_time": 15.29034832160105,
+      "tools/call_frequency": 3.3,
+      "tools/failure_frequency": 0.0
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.5,
+      "completions/max_length": 244.8,
+      "completions/max_terminated_length": 244.8,
+      "completions/mean_length": 224.4,
+      "completions/mean_terminated_length": 232.93333435058594,
+      "completions/min_length": 204.4,
+      "completions/min_terminated_length": 219.8,
+      "entropy": 0.06257005939260125,
+      "epoch": 0.71875,
+      "frac_reward_zero_std": 0.2,
+      "grad_norm": 0.09643584489822388,
+      "kl": 0.18671961799263953,
+      "learning_rate": 2.7200000000000002e-06,
+      "loss": 0.0009367348626255989,
+      "num_tokens": 375512.0,
+      "reward": 0.9198000192642212,
+      "reward_std": 0.41239041090011597,
+      "rewards/reward_func/mean": 0.9198000192642212,
+      "rewards/reward_func/std": 0.41239042282104493,
+      "step": 115,
+      "step_time": 16.68962257000094,
+      "tools/call_frequency": 3.05,
+      "tools/failure_frequency": 0.0
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.5,
+      "completions/max_length": 238.4,
+      "completions/max_terminated_length": 194.2,
+      "completions/mean_length": 221.6,
+      "completions/mean_terminated_length": 185.86666870117188,
+      "completions/min_length": 202.6,
+      "completions/min_terminated_length": 174.8,
+      "entropy": 0.19831047160550952,
+      "epoch": 0.75,
+      "frac_reward_zero_std": 0.4,
+      "grad_norm": 0.06480103731155396,
+      "kl": 0.2127195455133915,
+      "learning_rate": 2.6200000000000003e-06,
+      "loss": -0.002893347479403019,
+      "num_tokens": 392259.0,
+      "reward": 1.1177000164985658,
+      "reward_std": 0.34459384679794314,
+      "rewards/reward_func/mean": 1.1177000164985658,
+      "rewards/reward_func/std": 0.34459385871887205,
+      "step": 120,
+      "step_time": 15.74592421480629,
+      "tools/call_frequency": 3.1,
+      "tools/failure_frequency": 0.0
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.55,
+      "completions/max_length": 249.6,
+      "completions/max_terminated_length": 248.4,
+      "completions/mean_length": 225.9,
+      "completions/mean_terminated_length": 238.83333435058594,
+      "completions/min_length": 207.6,
+      "completions/min_terminated_length": 229.0,
+      "entropy": 0.12472135615535081,
+      "epoch": 0.78125,
+      "frac_reward_zero_std": 0.2,
+      "grad_norm": 1.3778189420700073,
+      "kl": 0.22096077986061574,
+      "learning_rate": 2.52e-06,
+      "loss": 0.018771570920944215,
+      "num_tokens": 409108.0,
+      "reward": 0.6021333426237107,
+      "reward_std": 0.6133833765983582,
+      "rewards/reward_func/mean": 0.6021333426237107,
+      "rewards/reward_func/std": 0.6133833885192871,
+      "step": 125,
+      "step_time": 17.135429813191877,
+      "tools/call_frequency": 3.0,
+      "tools/failure_frequency": 0.0
+    }
+  ],
+  "logging_steps": 5,
+  "max_steps": 250,
+  "num_input_tokens_seen": 409108,
+  "num_train_epochs": 2,
+  "save_steps": 125,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 0.0,
+  "train_batch_size": 1,
+  "trial_name": null,
+  "trial_params": null
+}
--- a/checkpoint-125/training_args.bin
+++ b/checkpoint-125/training_args.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:623158c464bdeaec4fa82a52817596e39771e95ece6659a737aa9e9d07d03327
+size 7185