commit d5d0e722afd077a00a9bc2ace010f331537718bc Author: ModelHub XC Date: Tue Jun 16 08:16:17 2026 +0800 初始化项目,由ModelHub XC社区提供模型 Model: heavycoderhh/counsel-env-qwen3-0.6b-grpo-run2 Source: Original Platform diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 0000000..0652f22 --- /dev/null +++ b/.gitattributes @@ -0,0 +1,39 @@ +*.7z filter=lfs diff=lfs merge=lfs -text +*.arrow filter=lfs diff=lfs merge=lfs -text +*.bin filter=lfs diff=lfs merge=lfs -text +*.bz2 filter=lfs diff=lfs merge=lfs -text +*.ckpt filter=lfs diff=lfs merge=lfs -text +*.ftz filter=lfs diff=lfs merge=lfs -text +*.gz filter=lfs diff=lfs merge=lfs -text +*.h5 filter=lfs diff=lfs merge=lfs -text +*.joblib filter=lfs diff=lfs merge=lfs -text +*.lfs.* filter=lfs diff=lfs merge=lfs -text +*.mlmodel filter=lfs diff=lfs merge=lfs -text +*.model filter=lfs diff=lfs merge=lfs -text +*.msgpack filter=lfs diff=lfs merge=lfs -text +*.npy filter=lfs diff=lfs merge=lfs -text +*.npz filter=lfs diff=lfs merge=lfs -text +*.onnx filter=lfs diff=lfs merge=lfs -text +*.ot filter=lfs diff=lfs merge=lfs -text +*.parquet filter=lfs diff=lfs merge=lfs -text +*.pb filter=lfs diff=lfs merge=lfs -text +*.pickle filter=lfs diff=lfs merge=lfs -text +*.pkl filter=lfs diff=lfs merge=lfs -text +*.pt filter=lfs diff=lfs merge=lfs -text +*.pth filter=lfs diff=lfs merge=lfs -text +*.rar filter=lfs diff=lfs merge=lfs -text +*.safetensors filter=lfs diff=lfs merge=lfs -text +saved_model/**/* filter=lfs diff=lfs merge=lfs -text +*.tar.* filter=lfs diff=lfs merge=lfs -text +*.tar filter=lfs diff=lfs merge=lfs -text +*.tflite filter=lfs diff=lfs merge=lfs -text +*.tgz filter=lfs diff=lfs merge=lfs -text +*.wasm filter=lfs diff=lfs merge=lfs -text +*.xz filter=lfs diff=lfs merge=lfs -text +*.zip filter=lfs diff=lfs merge=lfs -text +*.zst filter=lfs diff=lfs merge=lfs -text +*tfevents* filter=lfs diff=lfs merge=lfs -text +checkpoint-125/tokenizer.json filter=lfs diff=lfs merge=lfs -text +checkpoint-250/tokenizer.json filter=lfs diff=lfs merge=lfs -text +sft_warm_start/tokenizer.json filter=lfs diff=lfs merge=lfs -text +tokenizer.json filter=lfs diff=lfs merge=lfs -text diff --git a/README.md b/README.md new file mode 100644 index 0000000..f5a7f63 --- /dev/null +++ b/README.md @@ -0,0 +1,67 @@ +--- +library_name: transformers +model_name: counsel-sft-grpo-output +tags: +- generated_from_trainer +- hf_jobs +- trl +- grpo +licence: license +--- + +# Model Card for counsel-sft-grpo-output + +This model is a fine-tuned version of [None](https://huggingface.co/None). +It has been trained using [TRL](https://github.com/huggingface/trl). + +## Quick start + +```python +from transformers import pipeline + +question = "If you had a time machine, but could only go to the past or the future once and never return, which would you choose and why?" +generator = pipeline("text-generation", model="None", device="cuda") +output = generator([{"role": "user", "content": question}], max_new_tokens=128, return_full_text=False)[0] +print(output["generated_text"]) +``` + +## Training procedure + + + + + +This model was trained with GRPO, a method introduced in [DeepSeekMath: Pushing the Limits of Mathematical Reasoning in Open Language Models](https://huggingface.co/papers/2402.03300). + +### Framework versions + +- TRL: 1.2.0 +- Transformers: 5.6.2 +- Pytorch: 2.11.0 +- Datasets: 4.8.4 +- Tokenizers: 0.22.2 + +## Citations + +Cite GRPO as: + +```bibtex +@article{shao2024deepseekmath, + title = {{DeepSeekMath: Pushing the Limits of Mathematical Reasoning in Open Language Models}}, + author = {Zhihong Shao and Peiyi Wang and Qihao Zhu and Runxin Xu and Junxiao Song and Mingchuan Zhang and Y. K. Li and Y. Wu and Daya Guo}, + year = 2024, + eprint = {arXiv:2402.03300}, +} +``` + +Cite TRL as: + +```bibtex +@software{vonwerra2020trl, + title = {{TRL: Transformers Reinforcement Learning}}, + author = {von Werra, Leandro and Belkada, Younes and Tunstall, Lewis and Beeching, Edward and Thrush, Tristan and Lambert, Nathan and Huang, Shengyi and Rasul, Kashif and Gallouédec, Quentin}, + license = {Apache-2.0}, + url = {https://github.com/huggingface/trl}, + year = {2020} +} +``` \ No newline at end of file diff --git a/chat_template.jinja b/chat_template.jinja new file mode 100644 index 0000000..01be9b3 --- /dev/null +++ b/chat_template.jinja @@ -0,0 +1,89 @@ +{%- if tools %} + {{- '<|im_start|>system\n' }} + {%- if messages[0].role == 'system' %} + {{- messages[0].content + '\n\n' }} + {%- endif %} + {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within XML tags:\n" }} + {%- for tool in tools %} + {{- "\n" }} + {{- tool | tojson }} + {%- endfor %} + {{- "\n\n\nFor each function call, return a json object with function name and arguments within XML tags:\n\n{\"name\": , \"arguments\": }\n<|im_end|>\n" }} +{%- else %} + {%- if messages[0].role == 'system' %} + {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }} + {%- endif %} +{%- endif %} +{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %} +{%- for message in messages[::-1] %} + {%- set index = (messages|length - 1) - loop.index0 %} + {%- if ns.multi_step_tool and message.role == "user" and message.content is string and not(message.content.startswith('') and message.content.endswith('')) %} + {%- set ns.multi_step_tool = false %} + {%- set ns.last_query_index = index %} + {%- endif %} +{%- endfor %} +{%- for message in messages %} + {%- if message.content is string %} + {%- set content = message.content %} + {%- else %} + {%- set content = '' %} + {%- endif %} + {%- if (message.role == "user") or (message.role == "system" and not loop.first) %} + {{- '<|im_start|>' + message.role + '\n' + content + '<|im_end|>' + '\n' }} + {%- elif message.role == "assistant" %} + {%- set reasoning_content = '' %} + {%- if message.reasoning_content is string %} + {%- set reasoning_content = message.reasoning_content %} + {%- else %} + {%- if '' in content %} + {%- set reasoning_content = content.split('')[0].rstrip('\n').split('')[-1].lstrip('\n') %} + {%- set content = content.split('')[-1].lstrip('\n') %} + {%- endif %} + {%- endif %} + {%- if loop.index0 > ns.last_query_index %} + {%- if loop.last or (not loop.last and reasoning_content) %} + {{- '<|im_start|>' + message.role + '\n\n' + reasoning_content.strip('\n') + '\n\n\n' + content.lstrip('\n') }} + {%- else %} + {{- '<|im_start|>' + message.role + '\n' + content }} + {%- endif %} + {%- else %} + {{- '<|im_start|>' + message.role + '\n' + content }} + {%- endif %} + {%- if message.tool_calls %} + {%- for tool_call in message.tool_calls %} + {%- if (loop.first and content) or (not loop.first) %} + {{- '\n' }} + {%- endif %} + {%- if tool_call.function %} + {%- set tool_call = tool_call.function %} + {%- endif %} + {{- '\n{"name": "' }} + {{- tool_call.name }} + {{- '", "arguments": ' }} + {%- if tool_call.arguments is string %} + {{- tool_call.arguments }} + {%- else %} + {{- tool_call.arguments | tojson }} + {%- endif %} + {{- '}\n' }} + {%- endfor %} + {%- endif %} + {{- '<|im_end|>\n' }} + {%- elif message.role == "tool" %} + {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %} + {{- '<|im_start|>user' }} + {%- endif %} + {{- '\n\n' }} + {{- content }} + {{- '\n' }} + {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %} + {{- '<|im_end|>\n' }} + {%- endif %} + {%- endif %} +{%- endfor %} +{%- if add_generation_prompt %} + {{- '<|im_start|>assistant\n' }} + {%- if enable_thinking is defined and enable_thinking is false %} + {{- '\n\n\n\n' }} + {%- endif %} +{%- endif %} \ No newline at end of file diff --git a/checkpoint-125/chat_template.jinja b/checkpoint-125/chat_template.jinja new file mode 100644 index 0000000..01be9b3 --- /dev/null +++ b/checkpoint-125/chat_template.jinja @@ -0,0 +1,89 @@ +{%- if tools %} + {{- '<|im_start|>system\n' }} + {%- if messages[0].role == 'system' %} + {{- messages[0].content + '\n\n' }} + {%- endif %} + {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within XML tags:\n" }} + {%- for tool in tools %} + {{- "\n" }} + {{- tool | tojson }} + {%- endfor %} + {{- "\n\n\nFor each function call, return a json object with function name and arguments within XML tags:\n\n{\"name\": , \"arguments\": }\n<|im_end|>\n" }} +{%- else %} + {%- if messages[0].role == 'system' %} + {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }} + {%- endif %} +{%- endif %} +{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %} +{%- for message in messages[::-1] %} + {%- set index = (messages|length - 1) - loop.index0 %} + {%- if ns.multi_step_tool and message.role == "user" and message.content is string and not(message.content.startswith('') and message.content.endswith('')) %} + {%- set ns.multi_step_tool = false %} + {%- set ns.last_query_index = index %} + {%- endif %} +{%- endfor %} +{%- for message in messages %} + {%- if message.content is string %} + {%- set content = message.content %} + {%- else %} + {%- set content = '' %} + {%- endif %} + {%- if (message.role == "user") or (message.role == "system" and not loop.first) %} + {{- '<|im_start|>' + message.role + '\n' + content + '<|im_end|>' + '\n' }} + {%- elif message.role == "assistant" %} + {%- set reasoning_content = '' %} + {%- if message.reasoning_content is string %} + {%- set reasoning_content = message.reasoning_content %} + {%- else %} + {%- if '' in content %} + {%- set reasoning_content = content.split('')[0].rstrip('\n').split('')[-1].lstrip('\n') %} + {%- set content = content.split('')[-1].lstrip('\n') %} + {%- endif %} + {%- endif %} + {%- if loop.index0 > ns.last_query_index %} + {%- if loop.last or (not loop.last and reasoning_content) %} + {{- '<|im_start|>' + message.role + '\n\n' + reasoning_content.strip('\n') + '\n\n\n' + content.lstrip('\n') }} + {%- else %} + {{- '<|im_start|>' + message.role + '\n' + content }} + {%- endif %} + {%- else %} + {{- '<|im_start|>' + message.role + '\n' + content }} + {%- endif %} + {%- if message.tool_calls %} + {%- for tool_call in message.tool_calls %} + {%- if (loop.first and content) or (not loop.first) %} + {{- '\n' }} + {%- endif %} + {%- if tool_call.function %} + {%- set tool_call = tool_call.function %} + {%- endif %} + {{- '\n{"name": "' }} + {{- tool_call.name }} + {{- '", "arguments": ' }} + {%- if tool_call.arguments is string %} + {{- tool_call.arguments }} + {%- else %} + {{- tool_call.arguments | tojson }} + {%- endif %} + {{- '}\n' }} + {%- endfor %} + {%- endif %} + {{- '<|im_end|>\n' }} + {%- elif message.role == "tool" %} + {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %} + {{- '<|im_start|>user' }} + {%- endif %} + {{- '\n\n' }} + {{- content }} + {{- '\n' }} + {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %} + {{- '<|im_end|>\n' }} + {%- endif %} + {%- endif %} +{%- endfor %} +{%- if add_generation_prompt %} + {{- '<|im_start|>assistant\n' }} + {%- if enable_thinking is defined and enable_thinking is false %} + {{- '\n\n\n\n' }} + {%- endif %} +{%- endif %} \ No newline at end of file diff --git a/checkpoint-125/config.json b/checkpoint-125/config.json new file mode 100644 index 0000000..9b2e878 --- /dev/null +++ b/checkpoint-125/config.json @@ -0,0 +1,63 @@ +{ + "architectures": [ + "Qwen3ForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "bos_token_id": null, + "dtype": "float32", + "eos_token_id": 151645, + "head_dim": 128, + "hidden_act": "silu", + "hidden_size": 1024, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_types": [ + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention" + ], + "max_position_embeddings": 40960, + "max_window_layers": 28, + "model_type": "qwen3", + "num_attention_heads": 16, + "num_hidden_layers": 28, + "num_key_value_heads": 8, + "pad_token_id": 151643, + "rms_norm_eps": 1e-06, + "rope_parameters": { + "rope_theta": 1000000, + "rope_type": "default" + }, + "sliding_window": null, + "tie_word_embeddings": true, + "transformers_version": "5.6.2", + "use_cache": false, + "use_sliding_window": false, + "vocab_size": 151936 +} diff --git a/checkpoint-125/generation_config.json b/checkpoint-125/generation_config.json new file mode 100644 index 0000000..0ecce6e --- /dev/null +++ b/checkpoint-125/generation_config.json @@ -0,0 +1,12 @@ +{ + "do_sample": true, + "eos_token_id": [ + 151645, + 151643 + ], + "pad_token_id": 151643, + "temperature": 0.6, + "top_k": 20, + "top_p": 0.95, + "transformers_version": "5.6.2" +} diff --git a/checkpoint-125/model.safetensors b/checkpoint-125/model.safetensors new file mode 100644 index 0000000..62aefeb --- /dev/null +++ b/checkpoint-125/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9cfa4edd146d430b8bbff0802fd8c777682b57609ee370304d76982a5919498b +size 2384234968 diff --git a/checkpoint-125/optimizer.pt b/checkpoint-125/optimizer.pt new file mode 100644 index 0000000..43e9eb5 --- /dev/null +++ b/checkpoint-125/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2ca431c190b97d71d34b7476e9c0b446b1f7a0854090fdc7233f7e708432c5f4 +size 4768669395 diff --git a/checkpoint-125/rng_state.pth b/checkpoint-125/rng_state.pth new file mode 100644 index 0000000..d9e47d3 --- /dev/null +++ b/checkpoint-125/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ce9d06b6c01a6abb92a5b52d5d361d96821b8e35f68158be1830efadebe0f334 +size 14645 diff --git a/checkpoint-125/scheduler.pt b/checkpoint-125/scheduler.pt new file mode 100644 index 0000000..055f813 --- /dev/null +++ b/checkpoint-125/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e084e51a71375a2e59178fefd4967d7cdc87b72fa22758078088a60489f32adb +size 1465 diff --git a/checkpoint-125/tokenizer.json b/checkpoint-125/tokenizer.json new file mode 100644 index 0000000..c7afbed --- /dev/null +++ b/checkpoint-125/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:be75606093db2094d7cd20f3c2f385c212750648bd6ea4fb2bf507a6a4c55506 +size 11422650 diff --git a/checkpoint-125/tokenizer_config.json b/checkpoint-125/tokenizer_config.json new file mode 100644 index 0000000..f8a0cff --- /dev/null +++ b/checkpoint-125/tokenizer_config.json @@ -0,0 +1,78 @@ +{ + "add_prefix_space": false, + "backend": "tokenizers", + "bos_token": null, + "clean_up_tokenization_spaces": false, + "eos_token": "<|im_end|>", + "errors": "replace", + "extra_special_tokens": [ + "<|im_start|>", + "<|im_end|>", + "<|object_ref_start|>", + "<|object_ref_end|>", + "<|box_start|>", + "<|box_end|>", + "<|quad_start|>", + "<|quad_end|>", + "<|vision_start|>", + "<|vision_end|>", + "<|vision_pad|>", + "<|image_pad|>", + "<|video_pad|>" + ], + "is_local": true, + "local_files_only": false, + "max_length": 3072, + "model_max_length": 131072, + "pad_token": "<|endoftext|>", + "padding_side": "left", + "response_schema": { + "properties": { + "content": { + "type": "string" + }, + "reasoning_content": { + "type": "string" + }, + "role": { + "const": "assistant" + }, + "tool_calls": { + "items": { + "properties": { + "function": { + "properties": { + "arguments": { + "additionalProperties": {}, + "type": "object" + }, + "name": { + "type": "string" + } + }, + "type": "object" + }, + "type": { + "const": "function" + } + }, + "type": "object", + "x-parser": "json", + "x-parser-args": { + "transform": "{type: 'function', function: @}" + } + }, + "type": "array", + "x-regex-iterator": "\\s*(.+?)\\s*" + } + }, + "type": "object", + "x-regex": "^(?:\\n?(?:(?P.*?\\S.*?)\\n?|[\\s]*)\\s*)?(?P.*?)(?:\\n(?=))?(?=(?:|<\\|im_end\\|>|$))(?P(?:.+?\\s*)+)?\\s*(?:<\\|im_end\\|>|$)" + }, + "split_special_tokens": false, + "stride": 0, + "tokenizer_class": "Qwen2Tokenizer", + "truncation_side": "left", + "truncation_strategy": "longest_first", + "unk_token": null +} diff --git a/checkpoint-125/trainer_state.json b/checkpoint-125/trainer_state.json new file mode 100644 index 0000000..78dd407 --- /dev/null +++ b/checkpoint-125/trainer_state.json @@ -0,0 +1,784 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.78125, + "eval_steps": 500, + "global_step": 125, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.6, + "completions/max_length": 199.6, + "completions/max_terminated_length": 134.0, + "completions/mean_length": 171.9, + "completions/mean_terminated_length": 122.36666870117188, + "completions/min_length": 113.0, + "completions/min_terminated_length": 113.0, + "entropy": 0.06970996516756714, + "epoch": 0.03125, + "frac_reward_zero_std": 0.6, + "grad_norm": 3.630038261413574, + "kl": 0.00014932112862879875, + "learning_rate": 4.92e-06, + "loss": 0.029165178537368774, + "num_tokens": 15758.0, + "reward": -0.31389998495578764, + "reward_std": 0.2122000053524971, + "rewards/reward_func/mean": -0.31389998495578764, + "rewards/reward_func/std": 0.21219999492168426, + "step": 5, + "step_time": 14.728857926794444, + "tools/call_frequency": 3.45, + "tools/failure_frequency": 0.21573015451431274 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 168.6, + "completions/max_terminated_length": 168.6, + "completions/mean_length": 148.3, + "completions/mean_terminated_length": 148.3, + "completions/min_length": 129.6, + "completions/min_terminated_length": 129.6, + "entropy": 0.042718362715095284, + "epoch": 0.0625, + "frac_reward_zero_std": 0.2, + "grad_norm": 3.325033187866211, + "kl": 0.037860750965774057, + "learning_rate": 4.8200000000000004e-06, + "loss": -0.011221970617771148, + "num_tokens": 31053.0, + "reward": 0.2989000082015991, + "reward_std": 0.4415143087506294, + "rewards/reward_func/mean": 0.2989000082015991, + "rewards/reward_func/std": 0.4415143221616745, + "step": 10, + "step_time": 9.975367512006779, + "tools/call_frequency": 2.5, + "tools/failure_frequency": 0.0 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 152.8, + "completions/max_terminated_length": 152.8, + "completions/mean_length": 131.3, + "completions/mean_terminated_length": 131.3, + "completions/min_length": 113.6, + "completions/min_terminated_length": 113.6, + "entropy": 0.016039706021547317, + "epoch": 0.09375, + "frac_reward_zero_std": 0.4, + "grad_norm": 1.1289054155349731, + "kl": 0.06640795171260834, + "learning_rate": 4.7200000000000005e-06, + "loss": 0.04752160608768463, + "num_tokens": 45857.0, + "reward": 1.1023000121116637, + "reward_std": 0.4320605039596558, + "rewards/reward_func/mean": 1.1023000121116637, + "rewards/reward_func/std": 0.43206052780151366, + "step": 15, + "step_time": 8.620344271202338, + "tools/call_frequency": 2.35, + "tools/failure_frequency": 0.0 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1, + "completions/max_length": 172.0, + "completions/max_terminated_length": 172.0, + "completions/mean_length": 136.55, + "completions/mean_terminated_length": 137.2500030517578, + "completions/min_length": 111.4, + "completions/min_terminated_length": 111.4, + "entropy": 0.027425602450966834, + "epoch": 0.125, + "frac_reward_zero_std": 0.6, + "grad_norm": 0.8991426229476929, + "kl": 0.09577701878733932, + "learning_rate": 4.620000000000001e-06, + "loss": -0.1201351523399353, + "num_tokens": 60826.0, + "reward": 0.7200000047683716, + "reward_std": 0.3419178485870361, + "rewards/reward_func/mean": 0.7200000047683716, + "rewards/reward_func/std": 0.3419178485870361, + "step": 20, + "step_time": 11.403528443601681, + "tools/call_frequency": 2.05, + "tools/failure_frequency": 0.026666668057441712 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.35, + "completions/max_length": 203.2, + "completions/max_terminated_length": 161.2, + "completions/mean_length": 188.6, + "completions/mean_terminated_length": 150.6666687011719, + "completions/min_length": 173.4, + "completions/min_terminated_length": 139.6, + "entropy": 0.033282498246990144, + "epoch": 0.15625, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.041987657546997, + "kl": 0.051508421916514634, + "learning_rate": 4.520000000000001e-06, + "loss": 0.03198407888412476, + "num_tokens": 76838.0, + "reward": 1.2669333696365357, + "reward_std": 0.3234894543886185, + "rewards/reward_func/mean": 1.2669333696365357, + "rewards/reward_func/std": 0.32348946332931516, + "step": 25, + "step_time": 13.736867211584467, + "tools/call_frequency": 4.4, + "tools/failure_frequency": 0.14583333432674409 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.65, + "completions/max_length": 209.4, + "completions/max_terminated_length": 160.8, + "completions/mean_length": 192.45, + "completions/mean_terminated_length": 148.3, + "completions/min_length": 169.2, + "completions/min_terminated_length": 135.8, + "entropy": 0.04025774166220799, + "epoch": 0.1875, + "frac_reward_zero_std": 0.4, + "grad_norm": 1.6383038759231567, + "kl": 0.09242036554496735, + "learning_rate": 4.42e-06, + "loss": -0.03659022152423859, + "num_tokens": 93054.0, + "reward": 1.0333500146865844, + "reward_std": 0.38981522917747496, + "rewards/reward_func/mean": 1.0333500146865844, + "rewards/reward_func/std": 0.389815217256546, + "step": 30, + "step_time": 14.735964270806289, + "tools/call_frequency": 3.85, + "tools/failure_frequency": 0.023529411852359773 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.7, + "completions/max_length": 207.4, + "completions/max_terminated_length": 168.8, + "completions/mean_length": 196.4, + "completions/mean_terminated_length": 166.7, + "completions/min_length": 191.2, + "completions/min_terminated_length": 164.6, + "entropy": 0.02646293715806678, + "epoch": 0.21875, + "frac_reward_zero_std": 0.2, + "grad_norm": 0.6842532157897949, + "kl": 0.09354882184416055, + "learning_rate": 4.32e-06, + "loss": 0.014650090038776398, + "num_tokens": 109141.0, + "reward": 1.0134333491325378, + "reward_std": 0.28623148798942566, + "rewards/reward_func/mean": 1.0134333491325378, + "rewards/reward_func/std": 0.2862314820289612, + "step": 35, + "step_time": 14.25194917320332, + "tools/call_frequency": 3.95, + "tools/failure_frequency": 0.0 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.7, + "completions/max_length": 228.8, + "completions/max_terminated_length": 163.6, + "completions/mean_length": 207.8, + "completions/mean_terminated_length": 162.3, + "completions/min_length": 188.8, + "completions/min_terminated_length": 161.0, + "entropy": 0.049686831969302146, + "epoch": 0.25, + "frac_reward_zero_std": 0.4, + "grad_norm": 2.386836528778076, + "kl": 0.12552661653608083, + "learning_rate": 4.22e-06, + "loss": 0.023246073722839357, + "num_tokens": 125712.0, + "reward": 0.9764333426952362, + "reward_std": 0.3545127585530281, + "rewards/reward_func/mean": 0.9764333426952362, + "rewards/reward_func/std": 0.35451277494430544, + "step": 40, + "step_time": 16.735324517198023, + "tools/call_frequency": 3.45, + "tools/failure_frequency": 0.02857142984867096 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.8, + "completions/max_length": 231.4, + "completions/max_terminated_length": 127.2, + "completions/mean_length": 209.4, + "completions/mean_terminated_length": 124.2, + "completions/min_length": 187.2, + "completions/min_terminated_length": 121.2, + "entropy": 0.14096241008955984, + "epoch": 0.28125, + "frac_reward_zero_std": 0.2, + "grad_norm": 5.072839260101318, + "kl": 0.10897002797573804, + "learning_rate": 4.12e-06, + "loss": 0.05337468385696411, + "num_tokens": 142131.0, + "reward": 1.0291000008583069, + "reward_std": 0.5297403573989868, + "rewards/reward_func/mean": 1.0291000008583069, + "rewards/reward_func/std": 0.5297403573989868, + "step": 45, + "step_time": 17.371078941601446, + "tools/call_frequency": 3.4, + "tools/failure_frequency": 0.01428571492433548 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.6, + "completions/max_length": 237.2, + "completions/max_terminated_length": 172.2, + "completions/mean_length": 201.7, + "completions/mean_terminated_length": 160.46666870117187, + "completions/min_length": 186.0, + "completions/min_terminated_length": 150.8, + "entropy": 0.1540619947016239, + "epoch": 0.3125, + "frac_reward_zero_std": 0.0, + "grad_norm": 5.2555952072143555, + "kl": 0.17082785218954086, + "learning_rate": 4.0200000000000005e-06, + "loss": 0.06733548641204834, + "num_tokens": 158431.0, + "reward": 0.8427666783332824, + "reward_std": 0.6860074520111084, + "rewards/reward_func/mean": 0.8427666783332824, + "rewards/reward_func/std": 0.6860074281692505, + "step": 50, + "step_time": 17.60776922639343, + "tools/call_frequency": 3.6, + "tools/failure_frequency": 0.027619048953056335 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.55, + "completions/max_length": 209.2, + "completions/max_terminated_length": 209.2, + "completions/mean_length": 198.0, + "completions/mean_terminated_length": 206.43333435058594, + "completions/min_length": 189.4, + "completions/min_terminated_length": 203.8, + "entropy": 0.11417091116309167, + "epoch": 0.34375, + "frac_reward_zero_std": 0.8, + "grad_norm": 0.1612984985113144, + "kl": 0.14481508396565915, + "learning_rate": 3.920000000000001e-06, + "loss": -0.0013940947130322457, + "num_tokens": 174665.0, + "reward": 1.337833333015442, + "reward_std": 0.04058598577976227, + "rewards/reward_func/mean": 1.337833333015442, + "rewards/reward_func/std": 0.04058598577976227, + "step": 55, + "step_time": 13.894916865596315, + "tools/call_frequency": 3.9, + "tools/failure_frequency": 0.0 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.35, + "completions/max_length": 210.6, + "completions/max_terminated_length": 210.0, + "completions/mean_length": 189.7, + "completions/mean_terminated_length": 184.93333435058594, + "completions/min_length": 152.0, + "completions/min_terminated_length": 153.4, + "entropy": 0.18207021439447998, + "epoch": 0.375, + "frac_reward_zero_std": 0.2, + "grad_norm": 7.57163667678833, + "kl": 0.2769763808697462, + "learning_rate": 3.820000000000001e-06, + "loss": -0.08738029599189759, + "num_tokens": 190974.0, + "reward": 0.9539999723434448, + "reward_std": 0.24900673925876618, + "rewards/reward_func/mean": 0.9539999723434448, + "rewards/reward_func/std": 0.2490067459642887, + "step": 60, + "step_time": 13.735741792595945, + "tools/call_frequency": 3.35, + "tools/failure_frequency": 0.0 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.55, + "completions/max_length": 213.6, + "completions/max_terminated_length": 186.6, + "completions/mean_length": 196.15, + "completions/mean_terminated_length": 181.2, + "completions/min_length": 173.2, + "completions/min_terminated_length": 177.0, + "entropy": 0.18931779703125357, + "epoch": 0.40625, + "frac_reward_zero_std": 0.2, + "grad_norm": 0.3368631601333618, + "kl": 0.19928277991712093, + "learning_rate": 3.7200000000000004e-06, + "loss": -0.03082091510295868, + "num_tokens": 207221.0, + "reward": 1.1948333382606506, + "reward_std": 0.3531351625919342, + "rewards/reward_func/mean": 1.1948333382606506, + "rewards/reward_func/std": 0.3531351566314697, + "step": 65, + "step_time": 14.853071747999639, + "tools/call_frequency": 3.45, + "tools/failure_frequency": 0.0 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.45, + "completions/max_length": 232.4, + "completions/max_terminated_length": 231.4, + "completions/mean_length": 214.8, + "completions/mean_terminated_length": 221.10000305175782, + "completions/min_length": 199.4, + "completions/min_terminated_length": 211.6, + "entropy": 0.20331259737722576, + "epoch": 0.4375, + "frac_reward_zero_std": 0.6, + "grad_norm": 3.155299663543701, + "kl": 0.21616111248731612, + "learning_rate": 3.62e-06, + "loss": -0.014388753473758698, + "num_tokens": 223949.0, + "reward": 1.187999999523163, + "reward_std": 0.06400000005960464, + "rewards/reward_func/mean": 1.187999999523163, + "rewards/reward_func/std": 0.06399999856948853, + "step": 70, + "step_time": 15.7972018689994, + "tools/call_frequency": 3.3, + "tools/failure_frequency": 0.0 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.75, + "completions/max_length": 228.0, + "completions/max_terminated_length": 180.2, + "completions/mean_length": 217.2, + "completions/mean_terminated_length": 174.9, + "completions/min_length": 208.2, + "completions/min_terminated_length": 169.6, + "entropy": 0.09874274502508343, + "epoch": 0.46875, + "frac_reward_zero_std": 0.4, + "grad_norm": 0.1496252417564392, + "kl": 0.19251887053251265, + "learning_rate": 3.52e-06, + "loss": 0.0129203662276268, + "num_tokens": 240663.0, + "reward": 1.166100013256073, + "reward_std": 0.27513332962989806, + "rewards/reward_func/mean": 1.166100013256073, + "rewards/reward_func/std": 0.275133341550827, + "step": 75, + "step_time": 15.653593644002104, + "tools/call_frequency": 3.15, + "tools/failure_frequency": 0.0 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.4, + "completions/max_length": 250.6, + "completions/max_terminated_length": 235.4, + "completions/mean_length": 217.2, + "completions/mean_terminated_length": 209.23333740234375, + "completions/min_length": 169.8, + "completions/min_terminated_length": 174.0, + "entropy": 0.18624852728098631, + "epoch": 0.5, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.9008662700653076, + "kl": 0.19779104925692081, + "learning_rate": 3.4200000000000007e-06, + "loss": -0.060715597867965695, + "num_tokens": 257232.0, + "reward": 1.094600009918213, + "reward_std": 0.533681058883667, + "rewards/reward_func/mean": 1.094600009918213, + "rewards/reward_func/std": 0.5336810708045959, + "step": 80, + "step_time": 16.87674882839783, + "tools/call_frequency": 2.7, + "tools/failure_frequency": 0.0 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.8, + "completions/max_length": 263.2, + "completions/max_terminated_length": 149.0, + "completions/mean_length": 238.9, + "completions/mean_terminated_length": 138.2, + "completions/min_length": 220.0, + "completions/min_terminated_length": 127.4, + "entropy": 0.06761846686713398, + "epoch": 0.53125, + "frac_reward_zero_std": 0.6, + "grad_norm": 0.0583312027156353, + "kl": 0.16298045124858618, + "learning_rate": 3.3200000000000004e-06, + "loss": 0.0317715585231781, + "num_tokens": 274377.0, + "reward": 1.168333351612091, + "reward_std": 0.21399999260902405, + "rewards/reward_func/mean": 1.168333351612091, + "rewards/reward_func/std": 0.214000004529953, + "step": 85, + "step_time": 19.253501980405417, + "tools/call_frequency": 2.45, + "tools/failure_frequency": 0.0 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.9, + "completions/max_length": 252.0, + "completions/max_terminated_length": 79.6, + "completions/mean_length": 229.35, + "completions/mean_terminated_length": 79.6, + "completions/min_length": 212.6, + "completions/min_terminated_length": 79.6, + "entropy": 0.04304317501373589, + "epoch": 0.5625, + "frac_reward_zero_std": 0.4, + "grad_norm": 0.05765737593173981, + "kl": 0.1589741975069046, + "learning_rate": 3.2200000000000005e-06, + "loss": -0.009884151071310044, + "num_tokens": 291640.0, + "reward": 1.0771000266075135, + "reward_std": 0.2571271777153015, + "rewards/reward_func/mean": 1.0771000266075135, + "rewards/reward_func/std": 0.257127183675766, + "step": 90, + "step_time": 19.810263851404308, + "tools/call_frequency": 2.7, + "tools/failure_frequency": 0.0 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.75, + "completions/max_length": 252.2, + "completions/max_terminated_length": 155.6, + "completions/mean_length": 227.45, + "completions/mean_terminated_length": 154.7, + "completions/min_length": 202.6, + "completions/min_terminated_length": 153.8, + "entropy": 0.03856636304408312, + "epoch": 0.59375, + "frac_reward_zero_std": 0.2, + "grad_norm": 2.2899415493011475, + "kl": 0.18391469195485116, + "learning_rate": 3.12e-06, + "loss": 0.012278559803962707, + "num_tokens": 308671.0, + "reward": 0.9493666887283325, + "reward_std": 0.3057107627391815, + "rewards/reward_func/mean": 0.9493666887283325, + "rewards/reward_func/std": 0.3057107746601105, + "step": 95, + "step_time": 18.270148772597896, + "tools/call_frequency": 2.75, + "tools/failure_frequency": 0.0 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.65, + "completions/max_length": 229.0, + "completions/max_terminated_length": 128.6, + "completions/mean_length": 210.05, + "completions/mean_terminated_length": 121.23333435058593, + "completions/min_length": 191.4, + "completions/min_terminated_length": 114.6, + "entropy": 0.03718785918317735, + "epoch": 0.625, + "frac_reward_zero_std": 0.2, + "grad_norm": 1.4016427993774414, + "kl": 0.19276840873062612, + "learning_rate": 3.0200000000000003e-06, + "loss": -0.02043401300907135, + "num_tokens": 325246.0, + "reward": 0.9758000135421753, + "reward_std": 0.439729905128479, + "rewards/reward_func/mean": 0.9758000135421753, + "rewards/reward_func/std": 0.439729905128479, + "step": 100, + "step_time": 16.536685503809714, + "tools/call_frequency": 3.4, + "tools/failure_frequency": 0.0 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.45, + "completions/max_length": 231.0, + "completions/max_terminated_length": 175.6, + "completions/mean_length": 209.7, + "completions/mean_terminated_length": 164.73333435058595, + "completions/min_length": 197.2, + "completions/min_terminated_length": 156.2, + "entropy": 0.0890876273624599, + "epoch": 0.65625, + "frac_reward_zero_std": 0.6, + "grad_norm": 1.857412338256836, + "kl": 0.20793221928179265, + "learning_rate": 2.92e-06, + "loss": 0.010671529173851012, + "num_tokens": 341743.0, + "reward": 1.244200015068054, + "reward_std": 0.25437753796577456, + "rewards/reward_func/mean": 1.244200015068054, + "rewards/reward_func/std": 0.25437754988670347, + "step": 105, + "step_time": 14.550393618003, + "tools/call_frequency": 3.4, + "tools/failure_frequency": 0.0 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.6, + "completions/max_length": 228.0, + "completions/max_terminated_length": 215.0, + "completions/mean_length": 211.05, + "completions/mean_terminated_length": 204.6, + "completions/min_length": 194.2, + "completions/min_terminated_length": 194.2, + "entropy": 0.09650332322344184, + "epoch": 0.6875, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.5915409922599792, + "kl": 0.1943995427340269, + "learning_rate": 2.82e-06, + "loss": -0.007803649455308914, + "num_tokens": 358474.0, + "reward": 0.9213667035102844, + "reward_std": 0.48010437488555907, + "rewards/reward_func/mean": 0.9213667035102844, + "rewards/reward_func/std": 0.480104398727417, + "step": 110, + "step_time": 15.29034832160105, + "tools/call_frequency": 3.3, + "tools/failure_frequency": 0.0 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.5, + "completions/max_length": 244.8, + "completions/max_terminated_length": 244.8, + "completions/mean_length": 224.4, + "completions/mean_terminated_length": 232.93333435058594, + "completions/min_length": 204.4, + "completions/min_terminated_length": 219.8, + "entropy": 0.06257005939260125, + "epoch": 0.71875, + "frac_reward_zero_std": 0.2, + "grad_norm": 0.09643584489822388, + "kl": 0.18671961799263953, + "learning_rate": 2.7200000000000002e-06, + "loss": 0.0009367348626255989, + "num_tokens": 375512.0, + "reward": 0.9198000192642212, + "reward_std": 0.41239041090011597, + "rewards/reward_func/mean": 0.9198000192642212, + "rewards/reward_func/std": 0.41239042282104493, + "step": 115, + "step_time": 16.68962257000094, + "tools/call_frequency": 3.05, + "tools/failure_frequency": 0.0 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.5, + "completions/max_length": 238.4, + "completions/max_terminated_length": 194.2, + "completions/mean_length": 221.6, + "completions/mean_terminated_length": 185.86666870117188, + "completions/min_length": 202.6, + "completions/min_terminated_length": 174.8, + "entropy": 0.19831047160550952, + "epoch": 0.75, + "frac_reward_zero_std": 0.4, + "grad_norm": 0.06480103731155396, + "kl": 0.2127195455133915, + "learning_rate": 2.6200000000000003e-06, + "loss": -0.002893347479403019, + "num_tokens": 392259.0, + "reward": 1.1177000164985658, + "reward_std": 0.34459384679794314, + "rewards/reward_func/mean": 1.1177000164985658, + "rewards/reward_func/std": 0.34459385871887205, + "step": 120, + "step_time": 15.74592421480629, + "tools/call_frequency": 3.1, + "tools/failure_frequency": 0.0 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.55, + "completions/max_length": 249.6, + "completions/max_terminated_length": 248.4, + "completions/mean_length": 225.9, + "completions/mean_terminated_length": 238.83333435058594, + "completions/min_length": 207.6, + "completions/min_terminated_length": 229.0, + "entropy": 0.12472135615535081, + "epoch": 0.78125, + "frac_reward_zero_std": 0.2, + "grad_norm": 1.3778189420700073, + "kl": 0.22096077986061574, + "learning_rate": 2.52e-06, + "loss": 0.018771570920944215, + "num_tokens": 409108.0, + "reward": 0.6021333426237107, + "reward_std": 0.6133833765983582, + "rewards/reward_func/mean": 0.6021333426237107, + "rewards/reward_func/std": 0.6133833885192871, + "step": 125, + "step_time": 17.135429813191877, + "tools/call_frequency": 3.0, + "tools/failure_frequency": 0.0 + } + ], + "logging_steps": 5, + "max_steps": 250, + "num_input_tokens_seen": 409108, + "num_train_epochs": 2, + "save_steps": 125, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-125/training_args.bin b/checkpoint-125/training_args.bin new file mode 100644 index 0000000..2f7d0b0 --- /dev/null +++ b/checkpoint-125/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:623158c464bdeaec4fa82a52817596e39771e95ece6659a737aa9e9d07d03327 +size 7185 diff --git a/checkpoint-250/chat_template.jinja b/checkpoint-250/chat_template.jinja new file mode 100644 index 0000000..01be9b3 --- /dev/null +++ b/checkpoint-250/chat_template.jinja @@ -0,0 +1,89 @@ +{%- if tools %} + {{- '<|im_start|>system\n' }} + {%- if messages[0].role == 'system' %} + {{- messages[0].content + '\n\n' }} + {%- endif %} + {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within XML tags:\n" }} + {%- for tool in tools %} + {{- "\n" }} + {{- tool | tojson }} + {%- endfor %} + {{- "\n\n\nFor each function call, return a json object with function name and arguments within XML tags:\n\n{\"name\": , \"arguments\": }\n<|im_end|>\n" }} +{%- else %} + {%- if messages[0].role == 'system' %} + {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }} + {%- endif %} +{%- endif %} +{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %} +{%- for message in messages[::-1] %} + {%- set index = (messages|length - 1) - loop.index0 %} + {%- if ns.multi_step_tool and message.role == "user" and message.content is string and not(message.content.startswith('') and message.content.endswith('')) %} + {%- set ns.multi_step_tool = false %} + {%- set ns.last_query_index = index %} + {%- endif %} +{%- endfor %} +{%- for message in messages %} + {%- if message.content is string %} + {%- set content = message.content %} + {%- else %} + {%- set content = '' %} + {%- endif %} + {%- if (message.role == "user") or (message.role == "system" and not loop.first) %} + {{- '<|im_start|>' + message.role + '\n' + content + '<|im_end|>' + '\n' }} + {%- elif message.role == "assistant" %} + {%- set reasoning_content = '' %} + {%- if message.reasoning_content is string %} + {%- set reasoning_content = message.reasoning_content %} + {%- else %} + {%- if '' in content %} + {%- set reasoning_content = content.split('')[0].rstrip('\n').split('')[-1].lstrip('\n') %} + {%- set content = content.split('')[-1].lstrip('\n') %} + {%- endif %} + {%- endif %} + {%- if loop.index0 > ns.last_query_index %} + {%- if loop.last or (not loop.last and reasoning_content) %} + {{- '<|im_start|>' + message.role + '\n\n' + reasoning_content.strip('\n') + '\n\n\n' + content.lstrip('\n') }} + {%- else %} + {{- '<|im_start|>' + message.role + '\n' + content }} + {%- endif %} + {%- else %} + {{- '<|im_start|>' + message.role + '\n' + content }} + {%- endif %} + {%- if message.tool_calls %} + {%- for tool_call in message.tool_calls %} + {%- if (loop.first and content) or (not loop.first) %} + {{- '\n' }} + {%- endif %} + {%- if tool_call.function %} + {%- set tool_call = tool_call.function %} + {%- endif %} + {{- '\n{"name": "' }} + {{- tool_call.name }} + {{- '", "arguments": ' }} + {%- if tool_call.arguments is string %} + {{- tool_call.arguments }} + {%- else %} + {{- tool_call.arguments | tojson }} + {%- endif %} + {{- '}\n' }} + {%- endfor %} + {%- endif %} + {{- '<|im_end|>\n' }} + {%- elif message.role == "tool" %} + {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %} + {{- '<|im_start|>user' }} + {%- endif %} + {{- '\n\n' }} + {{- content }} + {{- '\n' }} + {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %} + {{- '<|im_end|>\n' }} + {%- endif %} + {%- endif %} +{%- endfor %} +{%- if add_generation_prompt %} + {{- '<|im_start|>assistant\n' }} + {%- if enable_thinking is defined and enable_thinking is false %} + {{- '\n\n\n\n' }} + {%- endif %} +{%- endif %} \ No newline at end of file diff --git a/checkpoint-250/config.json b/checkpoint-250/config.json new file mode 100644 index 0000000..9b2e878 --- /dev/null +++ b/checkpoint-250/config.json @@ -0,0 +1,63 @@ +{ + "architectures": [ + "Qwen3ForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "bos_token_id": null, + "dtype": "float32", + "eos_token_id": 151645, + "head_dim": 128, + "hidden_act": "silu", + "hidden_size": 1024, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_types": [ + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention" + ], + "max_position_embeddings": 40960, + "max_window_layers": 28, + "model_type": "qwen3", + "num_attention_heads": 16, + "num_hidden_layers": 28, + "num_key_value_heads": 8, + "pad_token_id": 151643, + "rms_norm_eps": 1e-06, + "rope_parameters": { + "rope_theta": 1000000, + "rope_type": "default" + }, + "sliding_window": null, + "tie_word_embeddings": true, + "transformers_version": "5.6.2", + "use_cache": false, + "use_sliding_window": false, + "vocab_size": 151936 +} diff --git a/checkpoint-250/generation_config.json b/checkpoint-250/generation_config.json new file mode 100644 index 0000000..0ecce6e --- /dev/null +++ b/checkpoint-250/generation_config.json @@ -0,0 +1,12 @@ +{ + "do_sample": true, + "eos_token_id": [ + 151645, + 151643 + ], + "pad_token_id": 151643, + "temperature": 0.6, + "top_k": 20, + "top_p": 0.95, + "transformers_version": "5.6.2" +} diff --git a/checkpoint-250/model.safetensors b/checkpoint-250/model.safetensors new file mode 100644 index 0000000..3a98e07 --- /dev/null +++ b/checkpoint-250/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5eb33e25e043c4103e6cee6aa637be513e7c2c0b7afb8cdd2325c7891721e22f +size 2384234968 diff --git a/checkpoint-250/optimizer.pt b/checkpoint-250/optimizer.pt new file mode 100644 index 0000000..bfe1b49 --- /dev/null +++ b/checkpoint-250/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ad30d0785567ceb6d1472c0c58b027a9569701ec0b3f7f9e1b6557412d8231c6 +size 4768669395 diff --git a/checkpoint-250/rng_state.pth b/checkpoint-250/rng_state.pth new file mode 100644 index 0000000..d73cdd8 --- /dev/null +++ b/checkpoint-250/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:63b0bd96b0d28d762e23adea21cbb0a4f5ae44ffadba986e6529b752e987c40c +size 14645 diff --git a/checkpoint-250/scheduler.pt b/checkpoint-250/scheduler.pt new file mode 100644 index 0000000..b33f80d --- /dev/null +++ b/checkpoint-250/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:aa68c9756e4b45f9fc5e31507423afdfe40083f2b850885e7dec29d5f3970b1f +size 1465 diff --git a/checkpoint-250/tokenizer.json b/checkpoint-250/tokenizer.json new file mode 100644 index 0000000..c7afbed --- /dev/null +++ b/checkpoint-250/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:be75606093db2094d7cd20f3c2f385c212750648bd6ea4fb2bf507a6a4c55506 +size 11422650 diff --git a/checkpoint-250/tokenizer_config.json b/checkpoint-250/tokenizer_config.json new file mode 100644 index 0000000..f8a0cff --- /dev/null +++ b/checkpoint-250/tokenizer_config.json @@ -0,0 +1,78 @@ +{ + "add_prefix_space": false, + "backend": "tokenizers", + "bos_token": null, + "clean_up_tokenization_spaces": false, + "eos_token": "<|im_end|>", + "errors": "replace", + "extra_special_tokens": [ + "<|im_start|>", + "<|im_end|>", + "<|object_ref_start|>", + "<|object_ref_end|>", + "<|box_start|>", + "<|box_end|>", + "<|quad_start|>", + "<|quad_end|>", + "<|vision_start|>", + "<|vision_end|>", + "<|vision_pad|>", + "<|image_pad|>", + "<|video_pad|>" + ], + "is_local": true, + "local_files_only": false, + "max_length": 3072, + "model_max_length": 131072, + "pad_token": "<|endoftext|>", + "padding_side": "left", + "response_schema": { + "properties": { + "content": { + "type": "string" + }, + "reasoning_content": { + "type": "string" + }, + "role": { + "const": "assistant" + }, + "tool_calls": { + "items": { + "properties": { + "function": { + "properties": { + "arguments": { + "additionalProperties": {}, + "type": "object" + }, + "name": { + "type": "string" + } + }, + "type": "object" + }, + "type": { + "const": "function" + } + }, + "type": "object", + "x-parser": "json", + "x-parser-args": { + "transform": "{type: 'function', function: @}" + } + }, + "type": "array", + "x-regex-iterator": "\\s*(.+?)\\s*" + } + }, + "type": "object", + "x-regex": "^(?:\\n?(?:(?P.*?\\S.*?)\\n?|[\\s]*)\\s*)?(?P.*?)(?:\\n(?=))?(?=(?:|<\\|im_end\\|>|$))(?P(?:.+?\\s*)+)?\\s*(?:<\\|im_end\\|>|$)" + }, + "split_special_tokens": false, + "stride": 0, + "tokenizer_class": "Qwen2Tokenizer", + "truncation_side": "left", + "truncation_strategy": "longest_first", + "unk_token": null +} diff --git a/checkpoint-250/trainer_state.json b/checkpoint-250/trainer_state.json new file mode 100644 index 0000000..ea7351f --- /dev/null +++ b/checkpoint-250/trainer_state.json @@ -0,0 +1,1534 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.5625, + "eval_steps": 500, + "global_step": 250, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.6, + "completions/max_length": 199.6, + "completions/max_terminated_length": 134.0, + "completions/mean_length": 171.9, + "completions/mean_terminated_length": 122.36666870117188, + "completions/min_length": 113.0, + "completions/min_terminated_length": 113.0, + "entropy": 0.06970996516756714, + "epoch": 0.03125, + "frac_reward_zero_std": 0.6, + "grad_norm": 3.630038261413574, + "kl": 0.00014932112862879875, + "learning_rate": 4.92e-06, + "loss": 0.029165178537368774, + "num_tokens": 15758.0, + "reward": -0.31389998495578764, + "reward_std": 0.2122000053524971, + "rewards/reward_func/mean": -0.31389998495578764, + "rewards/reward_func/std": 0.21219999492168426, + "step": 5, + "step_time": 14.728857926794444, + "tools/call_frequency": 3.45, + "tools/failure_frequency": 0.21573015451431274 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 168.6, + "completions/max_terminated_length": 168.6, + "completions/mean_length": 148.3, + "completions/mean_terminated_length": 148.3, + "completions/min_length": 129.6, + "completions/min_terminated_length": 129.6, + "entropy": 0.042718362715095284, + "epoch": 0.0625, + "frac_reward_zero_std": 0.2, + "grad_norm": 3.325033187866211, + "kl": 0.037860750965774057, + "learning_rate": 4.8200000000000004e-06, + "loss": -0.011221970617771148, + "num_tokens": 31053.0, + "reward": 0.2989000082015991, + "reward_std": 0.4415143087506294, + "rewards/reward_func/mean": 0.2989000082015991, + "rewards/reward_func/std": 0.4415143221616745, + "step": 10, + "step_time": 9.975367512006779, + "tools/call_frequency": 2.5, + "tools/failure_frequency": 0.0 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 152.8, + "completions/max_terminated_length": 152.8, + "completions/mean_length": 131.3, + "completions/mean_terminated_length": 131.3, + "completions/min_length": 113.6, + "completions/min_terminated_length": 113.6, + "entropy": 0.016039706021547317, + "epoch": 0.09375, + "frac_reward_zero_std": 0.4, + "grad_norm": 1.1289054155349731, + "kl": 0.06640795171260834, + "learning_rate": 4.7200000000000005e-06, + "loss": 0.04752160608768463, + "num_tokens": 45857.0, + "reward": 1.1023000121116637, + "reward_std": 0.4320605039596558, + "rewards/reward_func/mean": 1.1023000121116637, + "rewards/reward_func/std": 0.43206052780151366, + "step": 15, + "step_time": 8.620344271202338, + "tools/call_frequency": 2.35, + "tools/failure_frequency": 0.0 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1, + "completions/max_length": 172.0, + "completions/max_terminated_length": 172.0, + "completions/mean_length": 136.55, + "completions/mean_terminated_length": 137.2500030517578, + "completions/min_length": 111.4, + "completions/min_terminated_length": 111.4, + "entropy": 0.027425602450966834, + "epoch": 0.125, + "frac_reward_zero_std": 0.6, + "grad_norm": 0.8991426229476929, + "kl": 0.09577701878733932, + "learning_rate": 4.620000000000001e-06, + "loss": -0.1201351523399353, + "num_tokens": 60826.0, + "reward": 0.7200000047683716, + "reward_std": 0.3419178485870361, + "rewards/reward_func/mean": 0.7200000047683716, + "rewards/reward_func/std": 0.3419178485870361, + "step": 20, + "step_time": 11.403528443601681, + "tools/call_frequency": 2.05, + "tools/failure_frequency": 0.026666668057441712 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.35, + "completions/max_length": 203.2, + "completions/max_terminated_length": 161.2, + "completions/mean_length": 188.6, + "completions/mean_terminated_length": 150.6666687011719, + "completions/min_length": 173.4, + "completions/min_terminated_length": 139.6, + "entropy": 0.033282498246990144, + "epoch": 0.15625, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.041987657546997, + "kl": 0.051508421916514634, + "learning_rate": 4.520000000000001e-06, + "loss": 0.03198407888412476, + "num_tokens": 76838.0, + "reward": 1.2669333696365357, + "reward_std": 0.3234894543886185, + "rewards/reward_func/mean": 1.2669333696365357, + "rewards/reward_func/std": 0.32348946332931516, + "step": 25, + "step_time": 13.736867211584467, + "tools/call_frequency": 4.4, + "tools/failure_frequency": 0.14583333432674409 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.65, + "completions/max_length": 209.4, + "completions/max_terminated_length": 160.8, + "completions/mean_length": 192.45, + "completions/mean_terminated_length": 148.3, + "completions/min_length": 169.2, + "completions/min_terminated_length": 135.8, + "entropy": 0.04025774166220799, + "epoch": 0.1875, + "frac_reward_zero_std": 0.4, + "grad_norm": 1.6383038759231567, + "kl": 0.09242036554496735, + "learning_rate": 4.42e-06, + "loss": -0.03659022152423859, + "num_tokens": 93054.0, + "reward": 1.0333500146865844, + "reward_std": 0.38981522917747496, + "rewards/reward_func/mean": 1.0333500146865844, + "rewards/reward_func/std": 0.389815217256546, + "step": 30, + "step_time": 14.735964270806289, + "tools/call_frequency": 3.85, + "tools/failure_frequency": 0.023529411852359773 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.7, + "completions/max_length": 207.4, + "completions/max_terminated_length": 168.8, + "completions/mean_length": 196.4, + "completions/mean_terminated_length": 166.7, + "completions/min_length": 191.2, + "completions/min_terminated_length": 164.6, + "entropy": 0.02646293715806678, + "epoch": 0.21875, + "frac_reward_zero_std": 0.2, + "grad_norm": 0.6842532157897949, + "kl": 0.09354882184416055, + "learning_rate": 4.32e-06, + "loss": 0.014650090038776398, + "num_tokens": 109141.0, + "reward": 1.0134333491325378, + "reward_std": 0.28623148798942566, + "rewards/reward_func/mean": 1.0134333491325378, + "rewards/reward_func/std": 0.2862314820289612, + "step": 35, + "step_time": 14.25194917320332, + "tools/call_frequency": 3.95, + "tools/failure_frequency": 0.0 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.7, + "completions/max_length": 228.8, + "completions/max_terminated_length": 163.6, + "completions/mean_length": 207.8, + "completions/mean_terminated_length": 162.3, + "completions/min_length": 188.8, + "completions/min_terminated_length": 161.0, + "entropy": 0.049686831969302146, + "epoch": 0.25, + "frac_reward_zero_std": 0.4, + "grad_norm": 2.386836528778076, + "kl": 0.12552661653608083, + "learning_rate": 4.22e-06, + "loss": 0.023246073722839357, + "num_tokens": 125712.0, + "reward": 0.9764333426952362, + "reward_std": 0.3545127585530281, + "rewards/reward_func/mean": 0.9764333426952362, + "rewards/reward_func/std": 0.35451277494430544, + "step": 40, + "step_time": 16.735324517198023, + "tools/call_frequency": 3.45, + "tools/failure_frequency": 0.02857142984867096 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.8, + "completions/max_length": 231.4, + "completions/max_terminated_length": 127.2, + "completions/mean_length": 209.4, + "completions/mean_terminated_length": 124.2, + "completions/min_length": 187.2, + "completions/min_terminated_length": 121.2, + "entropy": 0.14096241008955984, + "epoch": 0.28125, + "frac_reward_zero_std": 0.2, + "grad_norm": 5.072839260101318, + "kl": 0.10897002797573804, + "learning_rate": 4.12e-06, + "loss": 0.05337468385696411, + "num_tokens": 142131.0, + "reward": 1.0291000008583069, + "reward_std": 0.5297403573989868, + "rewards/reward_func/mean": 1.0291000008583069, + "rewards/reward_func/std": 0.5297403573989868, + "step": 45, + "step_time": 17.371078941601446, + "tools/call_frequency": 3.4, + "tools/failure_frequency": 0.01428571492433548 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.6, + "completions/max_length": 237.2, + "completions/max_terminated_length": 172.2, + "completions/mean_length": 201.7, + "completions/mean_terminated_length": 160.46666870117187, + "completions/min_length": 186.0, + "completions/min_terminated_length": 150.8, + "entropy": 0.1540619947016239, + "epoch": 0.3125, + "frac_reward_zero_std": 0.0, + "grad_norm": 5.2555952072143555, + "kl": 0.17082785218954086, + "learning_rate": 4.0200000000000005e-06, + "loss": 0.06733548641204834, + "num_tokens": 158431.0, + "reward": 0.8427666783332824, + "reward_std": 0.6860074520111084, + "rewards/reward_func/mean": 0.8427666783332824, + "rewards/reward_func/std": 0.6860074281692505, + "step": 50, + "step_time": 17.60776922639343, + "tools/call_frequency": 3.6, + "tools/failure_frequency": 0.027619048953056335 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.55, + "completions/max_length": 209.2, + "completions/max_terminated_length": 209.2, + "completions/mean_length": 198.0, + "completions/mean_terminated_length": 206.43333435058594, + "completions/min_length": 189.4, + "completions/min_terminated_length": 203.8, + "entropy": 0.11417091116309167, + "epoch": 0.34375, + "frac_reward_zero_std": 0.8, + "grad_norm": 0.1612984985113144, + "kl": 0.14481508396565915, + "learning_rate": 3.920000000000001e-06, + "loss": -0.0013940947130322457, + "num_tokens": 174665.0, + "reward": 1.337833333015442, + "reward_std": 0.04058598577976227, + "rewards/reward_func/mean": 1.337833333015442, + "rewards/reward_func/std": 0.04058598577976227, + "step": 55, + "step_time": 13.894916865596315, + "tools/call_frequency": 3.9, + "tools/failure_frequency": 0.0 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.35, + "completions/max_length": 210.6, + "completions/max_terminated_length": 210.0, + "completions/mean_length": 189.7, + "completions/mean_terminated_length": 184.93333435058594, + "completions/min_length": 152.0, + "completions/min_terminated_length": 153.4, + "entropy": 0.18207021439447998, + "epoch": 0.375, + "frac_reward_zero_std": 0.2, + "grad_norm": 7.57163667678833, + "kl": 0.2769763808697462, + "learning_rate": 3.820000000000001e-06, + "loss": -0.08738029599189759, + "num_tokens": 190974.0, + "reward": 0.9539999723434448, + "reward_std": 0.24900673925876618, + "rewards/reward_func/mean": 0.9539999723434448, + "rewards/reward_func/std": 0.2490067459642887, + "step": 60, + "step_time": 13.735741792595945, + "tools/call_frequency": 3.35, + "tools/failure_frequency": 0.0 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.55, + "completions/max_length": 213.6, + "completions/max_terminated_length": 186.6, + "completions/mean_length": 196.15, + "completions/mean_terminated_length": 181.2, + "completions/min_length": 173.2, + "completions/min_terminated_length": 177.0, + "entropy": 0.18931779703125357, + "epoch": 0.40625, + "frac_reward_zero_std": 0.2, + "grad_norm": 0.3368631601333618, + "kl": 0.19928277991712093, + "learning_rate": 3.7200000000000004e-06, + "loss": -0.03082091510295868, + "num_tokens": 207221.0, + "reward": 1.1948333382606506, + "reward_std": 0.3531351625919342, + "rewards/reward_func/mean": 1.1948333382606506, + "rewards/reward_func/std": 0.3531351566314697, + "step": 65, + "step_time": 14.853071747999639, + "tools/call_frequency": 3.45, + "tools/failure_frequency": 0.0 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.45, + "completions/max_length": 232.4, + "completions/max_terminated_length": 231.4, + "completions/mean_length": 214.8, + "completions/mean_terminated_length": 221.10000305175782, + "completions/min_length": 199.4, + "completions/min_terminated_length": 211.6, + "entropy": 0.20331259737722576, + "epoch": 0.4375, + "frac_reward_zero_std": 0.6, + "grad_norm": 3.155299663543701, + "kl": 0.21616111248731612, + "learning_rate": 3.62e-06, + "loss": -0.014388753473758698, + "num_tokens": 223949.0, + "reward": 1.187999999523163, + "reward_std": 0.06400000005960464, + "rewards/reward_func/mean": 1.187999999523163, + "rewards/reward_func/std": 0.06399999856948853, + "step": 70, + "step_time": 15.7972018689994, + "tools/call_frequency": 3.3, + "tools/failure_frequency": 0.0 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.75, + "completions/max_length": 228.0, + "completions/max_terminated_length": 180.2, + "completions/mean_length": 217.2, + "completions/mean_terminated_length": 174.9, + "completions/min_length": 208.2, + "completions/min_terminated_length": 169.6, + "entropy": 0.09874274502508343, + "epoch": 0.46875, + "frac_reward_zero_std": 0.4, + "grad_norm": 0.1496252417564392, + "kl": 0.19251887053251265, + "learning_rate": 3.52e-06, + "loss": 0.0129203662276268, + "num_tokens": 240663.0, + "reward": 1.166100013256073, + "reward_std": 0.27513332962989806, + "rewards/reward_func/mean": 1.166100013256073, + "rewards/reward_func/std": 0.275133341550827, + "step": 75, + "step_time": 15.653593644002104, + "tools/call_frequency": 3.15, + "tools/failure_frequency": 0.0 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.4, + "completions/max_length": 250.6, + "completions/max_terminated_length": 235.4, + "completions/mean_length": 217.2, + "completions/mean_terminated_length": 209.23333740234375, + "completions/min_length": 169.8, + "completions/min_terminated_length": 174.0, + "entropy": 0.18624852728098631, + "epoch": 0.5, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.9008662700653076, + "kl": 0.19779104925692081, + "learning_rate": 3.4200000000000007e-06, + "loss": -0.060715597867965695, + "num_tokens": 257232.0, + "reward": 1.094600009918213, + "reward_std": 0.533681058883667, + "rewards/reward_func/mean": 1.094600009918213, + "rewards/reward_func/std": 0.5336810708045959, + "step": 80, + "step_time": 16.87674882839783, + "tools/call_frequency": 2.7, + "tools/failure_frequency": 0.0 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.8, + "completions/max_length": 263.2, + "completions/max_terminated_length": 149.0, + "completions/mean_length": 238.9, + "completions/mean_terminated_length": 138.2, + "completions/min_length": 220.0, + "completions/min_terminated_length": 127.4, + "entropy": 0.06761846686713398, + "epoch": 0.53125, + "frac_reward_zero_std": 0.6, + "grad_norm": 0.0583312027156353, + "kl": 0.16298045124858618, + "learning_rate": 3.3200000000000004e-06, + "loss": 0.0317715585231781, + "num_tokens": 274377.0, + "reward": 1.168333351612091, + "reward_std": 0.21399999260902405, + "rewards/reward_func/mean": 1.168333351612091, + "rewards/reward_func/std": 0.214000004529953, + "step": 85, + "step_time": 19.253501980405417, + "tools/call_frequency": 2.45, + "tools/failure_frequency": 0.0 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.9, + "completions/max_length": 252.0, + "completions/max_terminated_length": 79.6, + "completions/mean_length": 229.35, + "completions/mean_terminated_length": 79.6, + "completions/min_length": 212.6, + "completions/min_terminated_length": 79.6, + "entropy": 0.04304317501373589, + "epoch": 0.5625, + "frac_reward_zero_std": 0.4, + "grad_norm": 0.05765737593173981, + "kl": 0.1589741975069046, + "learning_rate": 3.2200000000000005e-06, + "loss": -0.009884151071310044, + "num_tokens": 291640.0, + "reward": 1.0771000266075135, + "reward_std": 0.2571271777153015, + "rewards/reward_func/mean": 1.0771000266075135, + "rewards/reward_func/std": 0.257127183675766, + "step": 90, + "step_time": 19.810263851404308, + "tools/call_frequency": 2.7, + "tools/failure_frequency": 0.0 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.75, + "completions/max_length": 252.2, + "completions/max_terminated_length": 155.6, + "completions/mean_length": 227.45, + "completions/mean_terminated_length": 154.7, + "completions/min_length": 202.6, + "completions/min_terminated_length": 153.8, + "entropy": 0.03856636304408312, + "epoch": 0.59375, + "frac_reward_zero_std": 0.2, + "grad_norm": 2.2899415493011475, + "kl": 0.18391469195485116, + "learning_rate": 3.12e-06, + "loss": 0.012278559803962707, + "num_tokens": 308671.0, + "reward": 0.9493666887283325, + "reward_std": 0.3057107627391815, + "rewards/reward_func/mean": 0.9493666887283325, + "rewards/reward_func/std": 0.3057107746601105, + "step": 95, + "step_time": 18.270148772597896, + "tools/call_frequency": 2.75, + "tools/failure_frequency": 0.0 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.65, + "completions/max_length": 229.0, + "completions/max_terminated_length": 128.6, + "completions/mean_length": 210.05, + "completions/mean_terminated_length": 121.23333435058593, + "completions/min_length": 191.4, + "completions/min_terminated_length": 114.6, + "entropy": 0.03718785918317735, + "epoch": 0.625, + "frac_reward_zero_std": 0.2, + "grad_norm": 1.4016427993774414, + "kl": 0.19276840873062612, + "learning_rate": 3.0200000000000003e-06, + "loss": -0.02043401300907135, + "num_tokens": 325246.0, + "reward": 0.9758000135421753, + "reward_std": 0.439729905128479, + "rewards/reward_func/mean": 0.9758000135421753, + "rewards/reward_func/std": 0.439729905128479, + "step": 100, + "step_time": 16.536685503809714, + "tools/call_frequency": 3.4, + "tools/failure_frequency": 0.0 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.45, + "completions/max_length": 231.0, + "completions/max_terminated_length": 175.6, + "completions/mean_length": 209.7, + "completions/mean_terminated_length": 164.73333435058595, + "completions/min_length": 197.2, + "completions/min_terminated_length": 156.2, + "entropy": 0.0890876273624599, + "epoch": 0.65625, + "frac_reward_zero_std": 0.6, + "grad_norm": 1.857412338256836, + "kl": 0.20793221928179265, + "learning_rate": 2.92e-06, + "loss": 0.010671529173851012, + "num_tokens": 341743.0, + "reward": 1.244200015068054, + "reward_std": 0.25437753796577456, + "rewards/reward_func/mean": 1.244200015068054, + "rewards/reward_func/std": 0.25437754988670347, + "step": 105, + "step_time": 14.550393618003, + "tools/call_frequency": 3.4, + "tools/failure_frequency": 0.0 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.6, + "completions/max_length": 228.0, + "completions/max_terminated_length": 215.0, + "completions/mean_length": 211.05, + "completions/mean_terminated_length": 204.6, + "completions/min_length": 194.2, + "completions/min_terminated_length": 194.2, + "entropy": 0.09650332322344184, + "epoch": 0.6875, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.5915409922599792, + "kl": 0.1943995427340269, + "learning_rate": 2.82e-06, + "loss": -0.007803649455308914, + "num_tokens": 358474.0, + "reward": 0.9213667035102844, + "reward_std": 0.48010437488555907, + "rewards/reward_func/mean": 0.9213667035102844, + "rewards/reward_func/std": 0.480104398727417, + "step": 110, + "step_time": 15.29034832160105, + "tools/call_frequency": 3.3, + "tools/failure_frequency": 0.0 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.5, + "completions/max_length": 244.8, + "completions/max_terminated_length": 244.8, + "completions/mean_length": 224.4, + "completions/mean_terminated_length": 232.93333435058594, + "completions/min_length": 204.4, + "completions/min_terminated_length": 219.8, + "entropy": 0.06257005939260125, + "epoch": 0.71875, + "frac_reward_zero_std": 0.2, + "grad_norm": 0.09643584489822388, + "kl": 0.18671961799263953, + "learning_rate": 2.7200000000000002e-06, + "loss": 0.0009367348626255989, + "num_tokens": 375512.0, + "reward": 0.9198000192642212, + "reward_std": 0.41239041090011597, + "rewards/reward_func/mean": 0.9198000192642212, + "rewards/reward_func/std": 0.41239042282104493, + "step": 115, + "step_time": 16.68962257000094, + "tools/call_frequency": 3.05, + "tools/failure_frequency": 0.0 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.5, + "completions/max_length": 238.4, + "completions/max_terminated_length": 194.2, + "completions/mean_length": 221.6, + "completions/mean_terminated_length": 185.86666870117188, + "completions/min_length": 202.6, + "completions/min_terminated_length": 174.8, + "entropy": 0.19831047160550952, + "epoch": 0.75, + "frac_reward_zero_std": 0.4, + "grad_norm": 0.06480103731155396, + "kl": 0.2127195455133915, + "learning_rate": 2.6200000000000003e-06, + "loss": -0.002893347479403019, + "num_tokens": 392259.0, + "reward": 1.1177000164985658, + "reward_std": 0.34459384679794314, + "rewards/reward_func/mean": 1.1177000164985658, + "rewards/reward_func/std": 0.34459385871887205, + "step": 120, + "step_time": 15.74592421480629, + "tools/call_frequency": 3.1, + "tools/failure_frequency": 0.0 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.55, + "completions/max_length": 249.6, + "completions/max_terminated_length": 248.4, + "completions/mean_length": 225.9, + "completions/mean_terminated_length": 238.83333435058594, + "completions/min_length": 207.6, + "completions/min_terminated_length": 229.0, + "entropy": 0.12472135615535081, + "epoch": 0.78125, + "frac_reward_zero_std": 0.2, + "grad_norm": 1.3778189420700073, + "kl": 0.22096077986061574, + "learning_rate": 2.52e-06, + "loss": 0.018771570920944215, + "num_tokens": 409108.0, + "reward": 0.6021333426237107, + "reward_std": 0.6133833765983582, + "rewards/reward_func/mean": 0.6021333426237107, + "rewards/reward_func/std": 0.6133833885192871, + "step": 125, + "step_time": 17.135429813191877, + "tools/call_frequency": 3.0, + "tools/failure_frequency": 0.0 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.6, + "completions/max_length": 242.2, + "completions/max_terminated_length": 214.8, + "completions/mean_length": 219.0, + "completions/mean_terminated_length": 209.9, + "completions/min_length": 193.2, + "completions/min_terminated_length": 203.6, + "entropy": 0.15268438071943818, + "epoch": 0.8125, + "frac_reward_zero_std": 0.2, + "grad_norm": 2.5108072757720947, + "kl": 0.2319534882903099, + "learning_rate": 2.42e-06, + "loss": -0.013115590810775757, + "num_tokens": 425748.0, + "reward": 0.9733999967575073, + "reward_std": 0.6780932426452637, + "rewards/reward_func/mean": 0.9733999967575073, + "rewards/reward_func/std": 0.6780932545661926, + "step": 130, + "step_time": 16.86904642219888, + "tools/call_frequency": 2.9, + "tools/failure_frequency": 0.0 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.55, + "completions/max_length": 238.2, + "completions/max_terminated_length": 194.0, + "completions/mean_length": 225.05, + "completions/mean_terminated_length": 187.35, + "completions/min_length": 212.8, + "completions/min_terminated_length": 179.6, + "entropy": 0.11236475300975143, + "epoch": 0.84375, + "frac_reward_zero_std": 0.6, + "grad_norm": 2.316483736038208, + "kl": 0.18593905940651895, + "learning_rate": 2.3200000000000002e-06, + "loss": 0.003791916370391846, + "num_tokens": 442384.0, + "reward": 1.3600000143051147, + "reward_std": 0.12066323161125184, + "rewards/reward_func/mean": 1.3600000143051147, + "rewards/reward_func/std": 0.12066323161125184, + "step": 135, + "step_time": 16.34641739600629, + "tools/call_frequency": 3.05, + "tools/failure_frequency": 0.0 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.65, + "completions/max_length": 242.6, + "completions/max_terminated_length": 192.6, + "completions/mean_length": 225.85, + "completions/mean_terminated_length": 187.1, + "completions/min_length": 212.6, + "completions/min_terminated_length": 181.6, + "entropy": 0.29790011094883084, + "epoch": 0.875, + "frac_reward_zero_std": 0.2, + "grad_norm": 4.7559309005737305, + "kl": 0.25753427743911744, + "learning_rate": 2.2200000000000003e-06, + "loss": 0.003042304515838623, + "num_tokens": 459098.0, + "reward": 0.8764000177383423, + "reward_std": 0.408263224363327, + "rewards/reward_func/mean": 0.8764000177383423, + "rewards/reward_func/std": 0.408263236284256, + "step": 140, + "step_time": 17.673848899203584, + "tools/call_frequency": 2.9, + "tools/failure_frequency": 0.0 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.75, + "completions/max_length": 234.8, + "completions/max_terminated_length": 140.4, + "completions/mean_length": 218.5, + "completions/mean_terminated_length": 135.4, + "completions/min_length": 204.4, + "completions/min_terminated_length": 130.4, + "entropy": 0.49127169298008083, + "epoch": 0.90625, + "frac_reward_zero_std": 0.6, + "grad_norm": 0.07470813393592834, + "kl": 0.2544811189174652, + "learning_rate": 2.12e-06, + "loss": 0.011104442924261094, + "num_tokens": 475747.0, + "reward": 1.2128000020980836, + "reward_std": 0.3457141280174255, + "rewards/reward_func/mean": 1.2128000020980836, + "rewards/reward_func/std": 0.3457141280174255, + "step": 145, + "step_time": 17.512910800203098, + "tools/call_frequency": 3.1, + "tools/failure_frequency": 0.01428571492433548 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.6, + "completions/max_length": 238.4, + "completions/max_terminated_length": 187.0, + "completions/mean_length": 219.55, + "completions/mean_terminated_length": 176.0, + "completions/min_length": 203.0, + "completions/min_terminated_length": 163.4, + "entropy": 0.3977989808190614, + "epoch": 0.9375, + "frac_reward_zero_std": 0.2, + "grad_norm": 1.4591357707977295, + "kl": 0.26335713379085063, + "learning_rate": 2.02e-06, + "loss": 0.013294479250907898, + "num_tokens": 492437.0, + "reward": 0.9657333374023438, + "reward_std": 0.5567267656326294, + "rewards/reward_func/mean": 0.9657333374023438, + "rewards/reward_func/std": 0.5567267417907715, + "step": 150, + "step_time": 17.259096857812256, + "tools/call_frequency": 3.15, + "tools/failure_frequency": 0.0 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.45, + "completions/max_length": 232.2, + "completions/max_terminated_length": 222.4, + "completions/mean_length": 209.05, + "completions/mean_terminated_length": 207.03333435058593, + "completions/min_length": 176.2, + "completions/min_terminated_length": 193.2, + "entropy": 0.5087961174082011, + "epoch": 0.96875, + "frac_reward_zero_std": 0.4, + "grad_norm": 3.283621311187744, + "kl": 0.28082513697445394, + "learning_rate": 1.9200000000000003e-06, + "loss": -0.025146692991256714, + "num_tokens": 508782.0, + "reward": 1.0452000081539154, + "reward_std": 0.44076991081237793, + "rewards/reward_func/mean": 1.0452000081539154, + "rewards/reward_func/std": 0.4407699227333069, + "step": 155, + "step_time": 16.901877696395967, + "tools/call_frequency": 3.05, + "tools/failure_frequency": 0.0 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.75, + "completions/max_length": 237.6, + "completions/max_terminated_length": 182.8, + "completions/mean_length": 219.25, + "completions/mean_terminated_length": 182.6, + "completions/min_length": 204.8, + "completions/min_terminated_length": 182.4, + "entropy": 0.6035067357588559, + "epoch": 1.0, + "frac_reward_zero_std": 0.2, + "grad_norm": 3.8756589889526367, + "kl": 0.27896949015557765, + "learning_rate": 1.8200000000000002e-06, + "loss": 0.018515169620513916, + "num_tokens": 525445.0, + "reward": 0.8603333592414856, + "reward_std": 0.4929263710975647, + "rewards/reward_func/mean": 0.8603333592414856, + "rewards/reward_func/std": 0.4929263830184937, + "step": 160, + "step_time": 18.22537782279833, + "tools/call_frequency": 3.05, + "tools/failure_frequency": 0.0 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.8, + "completions/max_length": 226.4, + "completions/max_terminated_length": 132.6, + "completions/mean_length": 208.4, + "completions/mean_terminated_length": 113.8, + "completions/min_length": 178.0, + "completions/min_terminated_length": 95.0, + "entropy": 0.81973907337524, + "epoch": 1.03125, + "frac_reward_zero_std": 0.4, + "grad_norm": 0.23878704011440277, + "kl": 0.28968340829014777, + "learning_rate": 1.72e-06, + "loss": -0.01732647567987442, + "num_tokens": 541725.0, + "reward": 0.801800012588501, + "reward_std": 0.3781105220317841, + "rewards/reward_func/mean": 0.801800012588501, + "rewards/reward_func/std": 0.3781105220317841, + "step": 165, + "step_time": 17.06264049640158, + "tools/call_frequency": 3.0, + "tools/failure_frequency": 0.0 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.7, + "completions/max_length": 234.2, + "completions/max_terminated_length": 156.0, + "completions/mean_length": 210.4, + "completions/mean_terminated_length": 148.5, + "completions/min_length": 176.8, + "completions/min_terminated_length": 141.0, + "entropy": 0.689805658115074, + "epoch": 1.0625, + "frac_reward_zero_std": 0.2, + "grad_norm": 4.456592082977295, + "kl": 0.23804183304309845, + "learning_rate": 1.6200000000000002e-06, + "loss": -0.05976427793502807, + "num_tokens": 558245.0, + "reward": 0.8562000155448913, + "reward_std": 0.633456540107727, + "rewards/reward_func/mean": 0.8562000155448913, + "rewards/reward_func/std": 0.633456540107727, + "step": 170, + "step_time": 18.225926614992204, + "tools/call_frequency": 3.05, + "tools/failure_frequency": 0.018181818723678588 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.6, + "completions/max_length": 226.2, + "completions/max_terminated_length": 168.2, + "completions/mean_length": 210.85, + "completions/mean_terminated_length": 160.45, + "completions/min_length": 196.2, + "completions/min_terminated_length": 155.4, + "entropy": 0.5438663156237453, + "epoch": 1.09375, + "frac_reward_zero_std": 0.2, + "grad_norm": 4.7369794845581055, + "kl": 0.27658827155828475, + "learning_rate": 1.52e-06, + "loss": 0.017269280552864075, + "num_tokens": 574485.0, + "reward": 1.1061000227928162, + "reward_std": 0.29246323108673095, + "rewards/reward_func/mean": 1.1061000227928162, + "rewards/reward_func/std": 0.29246323108673095, + "step": 175, + "step_time": 15.657095247798134, + "tools/call_frequency": 3.25, + "tools/failure_frequency": 0.0 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.7, + "completions/max_length": 227.6, + "completions/max_terminated_length": 153.8, + "completions/mean_length": 210.05, + "completions/mean_terminated_length": 150.3, + "completions/min_length": 182.4, + "completions/min_terminated_length": 146.8, + "entropy": 0.45502517083659766, + "epoch": 1.125, + "frac_reward_zero_std": 0.2, + "grad_norm": 2.233670473098755, + "kl": 0.24543451368808747, + "learning_rate": 1.42e-06, + "loss": -0.03468523025512695, + "num_tokens": 591329.0, + "reward": 0.9866333484649659, + "reward_std": 0.43823108077049255, + "rewards/reward_func/mean": 0.9866333484649659, + "rewards/reward_func/std": 0.43823106288909913, + "step": 180, + "step_time": 16.63430393260496, + "tools/call_frequency": 3.05, + "tools/failure_frequency": 0.018181818723678588 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.45, + "completions/max_length": 238.8, + "completions/max_terminated_length": 235.8, + "completions/mean_length": 214.35, + "completions/mean_terminated_length": 215.93333740234374, + "completions/min_length": 179.6, + "completions/min_terminated_length": 192.4, + "entropy": 0.3232035285793245, + "epoch": 1.15625, + "frac_reward_zero_std": 0.2, + "grad_norm": 0.042836207896471024, + "kl": 0.2335926942527294, + "learning_rate": 1.32e-06, + "loss": -0.053522664308547976, + "num_tokens": 607959.0, + "reward": 1.1267000198364259, + "reward_std": 0.5779333353042603, + "rewards/reward_func/mean": 1.1267000198364259, + "rewards/reward_func/std": 0.5779333412647247, + "step": 185, + "step_time": 16.85379633680277, + "tools/call_frequency": 3.05, + "tools/failure_frequency": 0.0 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.55, + "completions/max_length": 234.8, + "completions/max_terminated_length": 162.2, + "completions/mean_length": 214.2, + "completions/mean_terminated_length": 153.75, + "completions/min_length": 188.8, + "completions/min_terminated_length": 147.4, + "entropy": 0.6248208525590598, + "epoch": 1.1875, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.5707623958587646, + "kl": 0.26286336220800877, + "learning_rate": 1.2200000000000002e-06, + "loss": -0.04159345626831055, + "num_tokens": 624606.0, + "reward": 0.7690666794776917, + "reward_std": 0.5251105308532715, + "rewards/reward_func/mean": 0.7690666794776917, + "rewards/reward_func/std": 0.5251105427742004, + "step": 190, + "step_time": 15.895430696196854, + "tools/call_frequency": 2.95, + "tools/failure_frequency": 0.0 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.75, + "completions/max_length": 231.2, + "completions/max_terminated_length": 140.8, + "completions/mean_length": 212.5, + "completions/mean_terminated_length": 130.53333435058593, + "completions/min_length": 182.6, + "completions/min_terminated_length": 113.6, + "entropy": 1.0036215352360158, + "epoch": 1.21875, + "frac_reward_zero_std": 0.4, + "grad_norm": 0.07126332074403763, + "kl": 0.2945852160453796, + "learning_rate": 1.12e-06, + "loss": 0.08146535158157349, + "num_tokens": 641178.0, + "reward": 0.7420000076293946, + "reward_std": 0.2832352787256241, + "rewards/reward_func/mean": 0.7420000076293946, + "rewards/reward_func/std": 0.28323529064655306, + "step": 195, + "step_time": 16.828464100402197, + "tools/call_frequency": 2.9, + "tools/failure_frequency": 0.0 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.8, + "completions/max_length": 228.2, + "completions/max_terminated_length": 139.8, + "completions/mean_length": 217.85, + "completions/mean_terminated_length": 138.8, + "completions/min_length": 205.4, + "completions/min_terminated_length": 137.8, + "entropy": 0.7422479507047683, + "epoch": 1.25, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.2798051834106445, + "kl": 0.3032657243311405, + "learning_rate": 1.02e-06, + "loss": 0.017710180580616, + "num_tokens": 657673.0, + "reward": 0.8391000092029571, + "reward_std": 0.7726872444152832, + "rewards/reward_func/mean": 0.8391000092029571, + "rewards/reward_func/std": 0.7726872682571411, + "step": 200, + "step_time": 17.576660793198972, + "tools/call_frequency": 3.05, + "tools/failure_frequency": 0.0 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.75, + "completions/max_length": 221.8, + "completions/max_terminated_length": 146.4, + "completions/mean_length": 207.7, + "completions/mean_terminated_length": 143.7, + "completions/min_length": 176.2, + "completions/min_terminated_length": 141.0, + "entropy": 0.28722639731131494, + "epoch": 1.28125, + "frac_reward_zero_std": 0.8, + "grad_norm": 0.05100347474217415, + "kl": 0.23604922145605087, + "learning_rate": 9.200000000000001e-07, + "loss": -0.04504288733005524, + "num_tokens": 673877.0, + "reward": 1.34099999666214, + "reward_std": 0.09399999976158142, + "rewards/reward_func/mean": 1.34099999666214, + "rewards/reward_func/std": 0.09399999976158142, + "step": 205, + "step_time": 17.11068033759657, + "tools/call_frequency": 2.95, + "tools/failure_frequency": 0.0 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.75, + "completions/max_length": 238.2, + "completions/max_terminated_length": 176.8, + "completions/mean_length": 221.0, + "completions/mean_terminated_length": 175.9, + "completions/min_length": 206.6, + "completions/min_terminated_length": 175.0, + "entropy": 0.7828503699507564, + "epoch": 1.3125, + "frac_reward_zero_std": 0.4, + "grad_norm": 3.9798154830932617, + "kl": 0.2740111470222473, + "learning_rate": 8.200000000000001e-07, + "loss": 0.008357369899749756, + "num_tokens": 690789.0, + "reward": 1.0448000192642213, + "reward_std": 0.33592903017997744, + "rewards/reward_func/mean": 1.0448000192642213, + "rewards/reward_func/std": 0.33592904210090635, + "step": 210, + "step_time": 16.50285193720192, + "tools/call_frequency": 3.0, + "tools/failure_frequency": 0.0 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.75, + "completions/max_length": 234.6, + "completions/max_terminated_length": 103.8, + "completions/mean_length": 204.35, + "completions/mean_terminated_length": 91.8, + "completions/min_length": 156.4, + "completions/min_terminated_length": 79.8, + "entropy": 0.8003528213594109, + "epoch": 1.34375, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.159558296203613, + "kl": 0.2852329473942518, + "learning_rate": 7.2e-07, + "loss": -0.08619436025619506, + "num_tokens": 707222.0, + "reward": 1.0130333423614502, + "reward_std": 0.6909683525562287, + "rewards/reward_func/mean": 1.0130333423614502, + "rewards/reward_func/std": 0.6909683525562287, + "step": 215, + "step_time": 17.172416884609266, + "tools/call_frequency": 2.75, + "tools/failure_frequency": 0.0 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.7, + "completions/max_length": 227.6, + "completions/max_terminated_length": 194.0, + "completions/mean_length": 210.55, + "completions/mean_terminated_length": 191.4, + "completions/min_length": 179.4, + "completions/min_terminated_length": 188.8, + "entropy": 0.5576354823075235, + "epoch": 1.375, + "frac_reward_zero_std": 0.4, + "grad_norm": 3.594050168991089, + "kl": 0.278278523683548, + "learning_rate": 6.200000000000001e-07, + "loss": -0.04620848298072815, + "num_tokens": 723693.0, + "reward": 1.0568666815757752, + "reward_std": 0.4007272839546204, + "rewards/reward_func/mean": 1.0568666815757752, + "rewards/reward_func/std": 0.4007272839546204, + "step": 220, + "step_time": 16.7619311846036, + "tools/call_frequency": 2.9, + "tools/failure_frequency": 0.01666666716337204 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.45, + "completions/max_length": 238.8, + "completions/max_terminated_length": 201.4, + "completions/mean_length": 202.6, + "completions/mean_terminated_length": 174.96666870117187, + "completions/min_length": 148.0, + "completions/min_terminated_length": 148.0, + "entropy": 0.8566485311836004, + "epoch": 1.40625, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.667319297790527, + "kl": 0.2987564954906702, + "learning_rate": 5.2e-07, + "loss": -0.07293931245803834, + "num_tokens": 740154.0, + "reward": 0.9942000150680542, + "reward_std": 0.6604385733604431, + "rewards/reward_func/mean": 0.9942000150680542, + "rewards/reward_func/std": 0.6604385554790497, + "step": 225, + "step_time": 16.59093914159166, + "tools/call_frequency": 2.9, + "tools/failure_frequency": 0.0 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.8, + "completions/max_length": 230.2, + "completions/max_terminated_length": 118.0, + "completions/mean_length": 205.65, + "completions/mean_terminated_length": 118.0, + "completions/min_length": 155.0, + "completions/min_terminated_length": 118.0, + "entropy": 0.6852329360786825, + "epoch": 1.4375, + "frac_reward_zero_std": 0.6, + "grad_norm": 0.13374075293540955, + "kl": 0.27822317034006117, + "learning_rate": 4.2000000000000006e-07, + "loss": -0.07140348553657531, + "num_tokens": 756746.0, + "reward": 1.137933337688446, + "reward_std": 0.24845077395439147, + "rewards/reward_func/mean": 1.137933337688446, + "rewards/reward_func/std": 0.24845077395439147, + "step": 230, + "step_time": 16.51037070681341, + "tools/call_frequency": 2.65, + "tools/failure_frequency": 0.0 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.5, + "completions/max_length": 229.8, + "completions/max_terminated_length": 215.8, + "completions/mean_length": 214.75, + "completions/mean_terminated_length": 210.7, + "completions/min_length": 204.4, + "completions/min_terminated_length": 206.6, + "entropy": 0.8697535959538072, + "epoch": 1.46875, + "frac_reward_zero_std": 0.2, + "grad_norm": 2.334895133972168, + "kl": 0.2648452676832676, + "learning_rate": 3.2e-07, + "loss": 0.014858978986740112, + "num_tokens": 773619.0, + "reward": 0.7043333411216736, + "reward_std": 0.402424693107605, + "rewards/reward_func/mean": 0.7043333411216736, + "rewards/reward_func/std": 0.402424693107605, + "step": 235, + "step_time": 16.278191728607634, + "tools/call_frequency": 3.2, + "tools/failure_frequency": 0.0 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.7, + "completions/max_length": 234.0, + "completions/max_terminated_length": 148.6, + "completions/mean_length": 207.45, + "completions/mean_terminated_length": 142.8, + "completions/min_length": 165.0, + "completions/min_terminated_length": 137.0, + "entropy": 1.037268871301785, + "epoch": 1.5, + "frac_reward_zero_std": 0.0, + "grad_norm": 5.430893421173096, + "kl": 0.2893723286688328, + "learning_rate": 2.2e-07, + "loss": -0.0661659598350525, + "num_tokens": 789973.0, + "reward": 0.7955000281333924, + "reward_std": 0.6784387767314911, + "rewards/reward_func/mean": 0.7955000281333924, + "rewards/reward_func/std": 0.6784387946128845, + "step": 240, + "step_time": 17.391802603405086, + "tools/call_frequency": 3.05, + "tools/failure_frequency": 0.0 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.8, + "completions/max_length": 231.2, + "completions/max_terminated_length": 138.6, + "completions/mean_length": 219.2, + "completions/mean_terminated_length": 136.3, + "completions/min_length": 211.2, + "completions/min_terminated_length": 134.0, + "entropy": 0.47988032912835477, + "epoch": 1.53125, + "frac_reward_zero_std": 0.6, + "grad_norm": 0.18447190523147583, + "kl": 0.23510150127112867, + "learning_rate": 1.2000000000000002e-07, + "loss": 0.005726324021816253, + "num_tokens": 806865.0, + "reward": 1.1196000099182128, + "reward_std": 0.15212990045547486, + "rewards/reward_func/mean": 1.1196000099182128, + "rewards/reward_func/std": 0.15212990045547486, + "step": 245, + "step_time": 17.48798955640523, + "tools/call_frequency": 3.0, + "tools/failure_frequency": 0.0 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.7, + "completions/max_length": 224.4, + "completions/max_terminated_length": 170.0, + "completions/mean_length": 210.15, + "completions/mean_terminated_length": 167.0, + "completions/min_length": 194.4, + "completions/min_terminated_length": 164.0, + "entropy": 0.7179721655789763, + "epoch": 1.5625, + "frac_reward_zero_std": 0.2, + "grad_norm": 4.354219436645508, + "kl": 0.26825664229691026, + "learning_rate": 2e-08, + "loss": 0.0007836699485778808, + "num_tokens": 823319.0, + "reward": 0.9574000000953674, + "reward_std": 0.6371999979019165, + "rewards/reward_func/mean": 0.9574000000953674, + "rewards/reward_func/std": 0.6371999979019165, + "step": 250, + "step_time": 17.161368461008532, + "tools/call_frequency": 3.3, + "tools/failure_frequency": 0.0 + } + ], + "logging_steps": 5, + "max_steps": 250, + "num_input_tokens_seen": 823319, + "num_train_epochs": 2, + "save_steps": 125, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-250/training_args.bin b/checkpoint-250/training_args.bin new file mode 100644 index 0000000..2f7d0b0 --- /dev/null +++ b/checkpoint-250/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:623158c464bdeaec4fa82a52817596e39771e95ece6659a737aa9e9d07d03327 +size 7185 diff --git a/completions/completions_00005.parquet b/completions/completions_00005.parquet new file mode 100644 index 0000000..c988b32 --- /dev/null +++ b/completions/completions_00005.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b68e115f077b56d92a8687696043f68cf4d262206fbbc1ceed637b932511d25b +size 23966 diff --git a/completions/completions_00010.parquet b/completions/completions_00010.parquet new file mode 100644 index 0000000..3f51773 --- /dev/null +++ b/completions/completions_00010.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c3069f8445a69ba6dbb4859ea555ae1502c389e67a54572235e71ad1d67d588a +size 21823 diff --git a/completions/completions_00015.parquet b/completions/completions_00015.parquet new file mode 100644 index 0000000..d2c82a5 --- /dev/null +++ b/completions/completions_00015.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b1b7eb25a9ada0e3bcbe12c628fde6b177a446a889be8e5681f37c1feeb87f0d +size 22907 diff --git a/completions/completions_00020.parquet b/completions/completions_00020.parquet new file mode 100644 index 0000000..f5c3bd3 --- /dev/null +++ b/completions/completions_00020.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a3fd829f54108e3882389b9e442c07a35d5d07e0367dc3be83d302a465ac1111 +size 23546 diff --git a/completions/completions_00025.parquet b/completions/completions_00025.parquet new file mode 100644 index 0000000..763882c --- /dev/null +++ b/completions/completions_00025.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:60b6b7b89e0297abd8244bdc13e7a75d60b34766299cfd9db3de5f9b25373dad +size 24284 diff --git a/completions/completions_00030.parquet b/completions/completions_00030.parquet new file mode 100644 index 0000000..ae65ecc --- /dev/null +++ b/completions/completions_00030.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2b8be5f615ecd18a856e38c4fc12d3f16df7e6f9fb2d4f573c6db0d11eb4e5d8 +size 24890 diff --git a/completions/completions_00035.parquet b/completions/completions_00035.parquet new file mode 100644 index 0000000..99f6e6f --- /dev/null +++ b/completions/completions_00035.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d02e8831c4091b1dbd8b0e07448cd8f22de4b3efe52ba57d6f6b6f311fa9560c +size 24136 diff --git a/completions/completions_00040.parquet b/completions/completions_00040.parquet new file mode 100644 index 0000000..b6ae34d --- /dev/null +++ b/completions/completions_00040.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d20e76df2ff675580ce50d9bcb6fd4e6225342d09303f180762cabab1e24d09c +size 23457 diff --git a/completions/completions_00045.parquet b/completions/completions_00045.parquet new file mode 100644 index 0000000..79b7e2e --- /dev/null +++ b/completions/completions_00045.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b42b35b140b93dfe80a48387ebaff03cfe2ae1af27020dbf01ed2b2ce92dc464 +size 24536 diff --git a/completions/completions_00050.parquet b/completions/completions_00050.parquet new file mode 100644 index 0000000..d5e9f33 --- /dev/null +++ b/completions/completions_00050.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:10826040482a7a035f66f510420b4737fc33944af16909a4a04148272a4158af +size 23472 diff --git a/completions/completions_00055.parquet b/completions/completions_00055.parquet new file mode 100644 index 0000000..da4a7bb --- /dev/null +++ b/completions/completions_00055.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6873b01aeb8204e7742e55ce05030ae49384e5730f49dfed84096525919bea00 +size 24304 diff --git a/completions/completions_00060.parquet b/completions/completions_00060.parquet new file mode 100644 index 0000000..b830362 --- /dev/null +++ b/completions/completions_00060.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de8626c9427a3c1b2cc33599dfc77755310fb356c4960e0f3e2975351aa7225b +size 22618 diff --git a/completions/completions_00065.parquet b/completions/completions_00065.parquet new file mode 100644 index 0000000..2a8ecb2 --- /dev/null +++ b/completions/completions_00065.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:696b5af00da3c47c2729346a733fb873d978694c2a5959ecdee947d051cfe54a +size 24365 diff --git a/completions/completions_00070.parquet b/completions/completions_00070.parquet new file mode 100644 index 0000000..631eda6 --- /dev/null +++ b/completions/completions_00070.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d0874277f72d414e338612ec8eadd8acd4aea6f03be1582c71194bf838b69db1 +size 24000 diff --git a/completions/completions_00075.parquet b/completions/completions_00075.parquet new file mode 100644 index 0000000..f01db24 --- /dev/null +++ b/completions/completions_00075.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:08a237f024f5f74a46c8bcced98366ccda0e7630e8b479c1df3bbdfb6851b4e9 +size 24879 diff --git a/completions/completions_00080.parquet b/completions/completions_00080.parquet new file mode 100644 index 0000000..bf75e3b --- /dev/null +++ b/completions/completions_00080.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4850fbf4fbb8b1944b6c971e91da5f510681ad9664e35e56828e9b50cea0db0e +size 24416 diff --git a/completions/completions_00085.parquet b/completions/completions_00085.parquet new file mode 100644 index 0000000..ae564a7 --- /dev/null +++ b/completions/completions_00085.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:072d189c7cc2ca7e12abaf7cab8175b6d8d5dd4f93a34aeddcfb9f3630584163 +size 24320 diff --git a/completions/completions_00090.parquet b/completions/completions_00090.parquet new file mode 100644 index 0000000..3927210 --- /dev/null +++ b/completions/completions_00090.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:30acbe037313f6f3ffae9cb3d097d58f86c5c4df5a3455a5613e8a120fe28ff3 +size 24495 diff --git a/completions/completions_00095.parquet b/completions/completions_00095.parquet new file mode 100644 index 0000000..f670cf4 --- /dev/null +++ b/completions/completions_00095.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a1ac198276f2647c859589db083a8bc23b707dd3e780deee44591fdbeae2d821 +size 24939 diff --git a/completions/completions_00100.parquet b/completions/completions_00100.parquet new file mode 100644 index 0000000..244f22f --- /dev/null +++ b/completions/completions_00100.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fbfbcd01e3e49dc20f3c4ec6f5bae29692d9ff1932d27696cd773a9eaa31f733 +size 23622 diff --git a/completions/completions_00105.parquet b/completions/completions_00105.parquet new file mode 100644 index 0000000..660d296 --- /dev/null +++ b/completions/completions_00105.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ca57fb90811d63b27b46fa9f9b87e40101195a39f726df56b9ca70043357f768 +size 24260 diff --git a/completions/completions_00110.parquet b/completions/completions_00110.parquet new file mode 100644 index 0000000..9ee0c22 --- /dev/null +++ b/completions/completions_00110.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9d834799c7968349955dfdeadd84971539773c304f681ac3d1632477d39bd47e +size 24949 diff --git a/completions/completions_00115.parquet b/completions/completions_00115.parquet new file mode 100644 index 0000000..6013060 --- /dev/null +++ b/completions/completions_00115.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ef13f61defd07f329d24be7ac6761b10e5175729fae5db04e1373ef3bb8389b6 +size 24970 diff --git a/completions/completions_00120.parquet b/completions/completions_00120.parquet new file mode 100644 index 0000000..9e44524 --- /dev/null +++ b/completions/completions_00120.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d11d282244295c8c553d003ec90504d91d25855b55e6b8260ab930ac1f081ba2 +size 24570 diff --git a/completions/completions_00125.parquet b/completions/completions_00125.parquet new file mode 100644 index 0000000..dc748c6 --- /dev/null +++ b/completions/completions_00125.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7b8f35966c5bc1c9fd9fe0d3ce6cd8eb9c8b65395f7aae5182d67c4f08f0ab54 +size 25114 diff --git a/completions/completions_00130.parquet b/completions/completions_00130.parquet new file mode 100644 index 0000000..045cad5 --- /dev/null +++ b/completions/completions_00130.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:88b54d548b96ad67e2cf580dd46b0cee2f707f7d117741ed1e93b9b2cb0e457f +size 23209 diff --git a/completions/completions_00135.parquet b/completions/completions_00135.parquet new file mode 100644 index 0000000..2caf020 --- /dev/null +++ b/completions/completions_00135.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:749f829e0806c9f9432492597b18573f0ff93c4f3f30c0745b6354de72518c60 +size 24387 diff --git a/completions/completions_00140.parquet b/completions/completions_00140.parquet new file mode 100644 index 0000000..5d8f5e0 --- /dev/null +++ b/completions/completions_00140.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b92f197fe307553295b686c5b5bd8a43322139886c0cedef75665de87f66c440 +size 24489 diff --git a/completions/completions_00145.parquet b/completions/completions_00145.parquet new file mode 100644 index 0000000..36a02af --- /dev/null +++ b/completions/completions_00145.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e7028e21d8f6dd9e32cb076cf66228d646906fb904c70ed84f80426d786ec5ed +size 25681 diff --git a/completions/completions_00150.parquet b/completions/completions_00150.parquet new file mode 100644 index 0000000..c23ad11 --- /dev/null +++ b/completions/completions_00150.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:da2ca6a5fe353e3c7e435c775fb487f923367b9a2e5769605628185afc7d3ba0 +size 25240 diff --git a/completions/completions_00155.parquet b/completions/completions_00155.parquet new file mode 100644 index 0000000..0a94acb --- /dev/null +++ b/completions/completions_00155.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0690b76b28598fa815fe1f1c312144318210684badc5711a60d8e929154616cd +size 23230 diff --git a/completions/completions_00160.parquet b/completions/completions_00160.parquet new file mode 100644 index 0000000..b2ba2ed --- /dev/null +++ b/completions/completions_00160.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4fc270c3a3ed5a7bde9eeb02321c46c616e15042f917a763be2705a1fa80bad9 +size 24247 diff --git a/completions/completions_00165.parquet b/completions/completions_00165.parquet new file mode 100644 index 0000000..79497b2 --- /dev/null +++ b/completions/completions_00165.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5ae2c4bcabbb25bb7e70be2a3c27fab24a0cd675f11a15fe9361d4e73e75c0f3 +size 24118 diff --git a/completions/completions_00170.parquet b/completions/completions_00170.parquet new file mode 100644 index 0000000..55dfb5c --- /dev/null +++ b/completions/completions_00170.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4e954edee445dfed87e8c114b9c0b52f02ebdb8fbe9096b3b150a21689f391c4 +size 24839 diff --git a/completions/completions_00175.parquet b/completions/completions_00175.parquet new file mode 100644 index 0000000..cf1f81a --- /dev/null +++ b/completions/completions_00175.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3a8c8730b512d568a457f95a12eaab65511d8bcc5a6fbb6df79e7f9919aaac8d +size 25469 diff --git a/completions/completions_00180.parquet b/completions/completions_00180.parquet new file mode 100644 index 0000000..7cdba2e --- /dev/null +++ b/completions/completions_00180.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e8d4b6a551b0c66d356a5919397b6cfe8da2453e193cd12cc77c925ad2a7c940 +size 24879 diff --git a/completions/completions_00185.parquet b/completions/completions_00185.parquet new file mode 100644 index 0000000..034efef --- /dev/null +++ b/completions/completions_00185.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:af62c422876a65648f0c8357d7aaea0605ff56d7b7d329e657eeeb49310a8cab +size 25086 diff --git a/completions/completions_00190.parquet b/completions/completions_00190.parquet new file mode 100644 index 0000000..86cf4f5 --- /dev/null +++ b/completions/completions_00190.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:104bdb0177fb32febf5ad69d99c089aef23e0cdf858fef9f51f6837975877e94 +size 24995 diff --git a/completions/completions_00195.parquet b/completions/completions_00195.parquet new file mode 100644 index 0000000..d492a83 --- /dev/null +++ b/completions/completions_00195.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:79dad7982eb1657e24c2d30b56e8065fbe658ee1bbcbb3825273cadcbd278bf1 +size 23769 diff --git a/completions/completions_00200.parquet b/completions/completions_00200.parquet new file mode 100644 index 0000000..e05d9c1 --- /dev/null +++ b/completions/completions_00200.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:03b88568b249769abd964dccb3a029392ff4188a6f2e2e0d61e49e45b8734cb3 +size 24790 diff --git a/completions/completions_00205.parquet b/completions/completions_00205.parquet new file mode 100644 index 0000000..5298d90 --- /dev/null +++ b/completions/completions_00205.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3ce7ed5ff6eda7d382bade212a53fcb422b3e192e220ce3758f753cf75259c8d +size 23958 diff --git a/completions/completions_00210.parquet b/completions/completions_00210.parquet new file mode 100644 index 0000000..de4f423 --- /dev/null +++ b/completions/completions_00210.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:19b48165efd963ddf763d266c16377a9f1a10ac8d1ea4ab6fec47be41ca39897 +size 26131 diff --git a/completions/completions_00215.parquet b/completions/completions_00215.parquet new file mode 100644 index 0000000..d842a94 --- /dev/null +++ b/completions/completions_00215.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3c0ccb348d1be8c1e0a1b9287c1f7be4d87d75b59f14bcce50147d6c0970d71d +size 23995 diff --git a/completions/completions_00220.parquet b/completions/completions_00220.parquet new file mode 100644 index 0000000..7b6b210 --- /dev/null +++ b/completions/completions_00220.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:09658361715b206e755041e17a00d8be401b27d49b5fb02665d2dd406a7dca14 +size 25117 diff --git a/completions/completions_00225.parquet b/completions/completions_00225.parquet new file mode 100644 index 0000000..79f1e51 --- /dev/null +++ b/completions/completions_00225.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6e73eda9b2dfe5b8e3e99ca7669bf7f028a5afdd65e64b08432831449b4a8d97 +size 25627 diff --git a/completions/completions_00230.parquet b/completions/completions_00230.parquet new file mode 100644 index 0000000..fea9524 --- /dev/null +++ b/completions/completions_00230.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:14aed69014783ad4eb182afdbae30c06cbe8da270bd87a6a70382d3aa8870fdf +size 24619 diff --git a/completions/completions_00235.parquet b/completions/completions_00235.parquet new file mode 100644 index 0000000..ccf09fb --- /dev/null +++ b/completions/completions_00235.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:49fcedabb97e8a9db277be5a8442ff7a4bdf53c9dbb09f1081e8f1d656cd7f9c +size 25408 diff --git a/completions/completions_00240.parquet b/completions/completions_00240.parquet new file mode 100644 index 0000000..92418f6 --- /dev/null +++ b/completions/completions_00240.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:54a2bc4b721e6d58161fe6f2405d5861c5a7aa33ecad498dc3923ce85a84f6b8 +size 25454 diff --git a/completions/completions_00245.parquet b/completions/completions_00245.parquet new file mode 100644 index 0000000..ef8adf0 --- /dev/null +++ b/completions/completions_00245.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6ce3a8098ceb8361a7bdb1d7a3a2f0cbc26e3876257dca9219ae45d2a8cd2eb1 +size 24477 diff --git a/completions/completions_00250.parquet b/completions/completions_00250.parquet new file mode 100644 index 0000000..92ca3ac --- /dev/null +++ b/completions/completions_00250.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2e999027fab4f80f916ee346b91deeab0d7bf5d6c537c2ecbcfe706ea807b7de +size 25162 diff --git a/config.json b/config.json new file mode 100644 index 0000000..9b2e878 --- /dev/null +++ b/config.json @@ -0,0 +1,63 @@ +{ + "architectures": [ + "Qwen3ForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "bos_token_id": null, + "dtype": "float32", + "eos_token_id": 151645, + "head_dim": 128, + "hidden_act": "silu", + "hidden_size": 1024, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_types": [ + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention" + ], + "max_position_embeddings": 40960, + "max_window_layers": 28, + "model_type": "qwen3", + "num_attention_heads": 16, + "num_hidden_layers": 28, + "num_key_value_heads": 8, + "pad_token_id": 151643, + "rms_norm_eps": 1e-06, + "rope_parameters": { + "rope_theta": 1000000, + "rope_type": "default" + }, + "sliding_window": null, + "tie_word_embeddings": true, + "transformers_version": "5.6.2", + "use_cache": false, + "use_sliding_window": false, + "vocab_size": 151936 +} diff --git a/eval/trained_eval_rows.csv b/eval/trained_eval_rows.csv new file mode 100644 index 0000000..32ac4ae --- /dev/null +++ b/eval/trained_eval_rows.csv @@ -0,0 +1,151 @@ +agent,seed,case_id,difficulty,reward,primary_reward,auxiliary_reward,contradictions_total,contradictions_triggered,contradictions_surfaced,questions_used,evidence_presented,evidence_timing_successes,blind_evidence_count,useless_questions_ratio,avg_question_length,model_repo,invalid_tool_calls +random,20260425,timeline_255d67,easy,0.0,0.0,-0.4,1,0,0,6,2,0,2,1.0,5.0,, +random,20260426,knowledge_b28f8c,medium,0.0,0.0,-0.4,2,0,0,6,2,0,2,1.0,5.0,, +random,20260427,workplace_c98377,easy,0.0,0.0,-0.4,1,0,0,3,5,0,5,1.0,5.0,, +random,20260428,motive_66ff59,hard,0.0,0.0,-0.4,3,0,0,7,1,0,1,1.0,5.0,, +random,20260429,timeline_19bb78,easy,0.0,0.0,-0.4,1,0,0,4,4,0,4,1.0,5.0,, +random,20260430,timeline_a97690,medium,0.0,0.0,-0.4,2,0,0,6,2,0,2,1.0,5.0,, +random,20260431,alibi_67ffcd,medium,0.0,0.0,-0.4,2,0,0,6,2,0,2,1.0,5.0,, +random,20260432,alibi_423bca,medium,0.0,0.0,-0.4,2,0,0,5,3,0,3,1.0,5.0,, +random,20260433,knowledge_960d07,medium,0.0,0.0,-0.4,2,0,0,8,0,0,0,1.0,5.0,, +random,20260434,alibi_e829c1,easy,0.0,0.0,-0.4,1,0,0,7,1,0,1,1.0,5.0,, +random,20260435,motive_85e25b,hard,0.0,0.0,-0.4,3,0,0,5,3,0,3,1.0,5.0,, +random,20260436,knowledge_a599e3,medium,0.0,0.0,-0.4,2,0,0,4,4,0,4,1.0,5.0,, +random,20260437,motive_8bca20,easy,0.0,0.0,-0.4,1,0,0,6,2,0,2,1.0,5.0,, +random,20260438,corporate_6b1664,medium,0.0,0.0,-0.4,2,0,0,6,2,0,2,1.0,5.0,, +random,20260439,alibi_a6c582,easy,0.0,0.0,-0.4,1,0,0,8,0,0,0,1.0,5.0,, +random,20260440,workplace_835476,easy,0.0,0.0,-0.4,1,0,0,5,3,0,3,1.0,5.0,, +random,20260441,possession_a079c5,hard,0.0,0.0,-0.4,3,0,0,8,0,0,0,1.0,5.0,, +random,20260442,possession_9cc45d,hard,0.0,0.0,-0.4,3,0,0,5,3,0,3,1.0,5.0,, +random,20260443,possession_259aa5,easy,0.0,0.0,-0.4,1,0,0,4,4,0,4,1.0,5.0,, +random,20260444,corporate_76724c,medium,0.0,0.0,-0.4,2,0,0,5,3,0,3,1.0,5.0,, +random,20260445,timeline_767821,medium,0.0,0.0,-0.4,2,0,0,5,3,0,3,1.0,5.0,, +random,20260446,motive_c0d166,medium,0.0,0.0,-0.4,2,0,0,5,3,0,3,1.0,5.0,, +random,20260447,corporate_307934,hard,0.0,0.0,-0.4,3,0,0,7,1,0,1,1.0,5.0,, +random,20260448,timeline_592816,hard,0.0,0.0,-0.4,3,0,0,6,2,0,2,1.0,5.0,, +random,20260449,knowledge_b26824,medium,0.0,0.0,-0.4,2,0,0,5,3,0,3,1.0,5.0,, +random,20260450,knowledge_697785,hard,0.0,0.0,-0.4,3,0,0,7,1,0,1,1.0,5.0,, +random,20260451,timeline_81dafd,medium,0.0,0.0,-0.4,2,0,0,4,4,0,4,1.0,5.0,, +random,20260452,corporate_8eb7d7,medium,0.0,0.0,-0.4,2,0,0,4,4,0,4,1.0,5.0,, +random,20260453,possession_dbb5fe,medium,0.0,0.0,-0.4,2,0,0,5,3,0,3,1.0,5.0,, +random,20260454,alibi_a4666f,hard,0.0,0.0,-0.4,3,0,0,8,0,0,0,1.0,5.0,, +keyword_spam,20260425,timeline_255d67,easy,0.030000000000000006,0.0,0.15000000000000002,1,1,0,5,0,0,0,0.6,4.2,, +keyword_spam,20260426,knowledge_b28f8c,medium,0.12000000000000002,0.0,0.6000000000000001,2,2,0,5,0,0,0,0.4,4.2,, +keyword_spam,20260427,workplace_c98377,easy,0.0,0.0,-0.2,1,0,0,5,0,0,0,0.8,4.2,, +keyword_spam,20260428,motive_66ff59,hard,0.17000000000000004,0.0,0.8500000000000001,3,3,0,5,0,0,0,0.2,4.2,, +keyword_spam,20260429,timeline_19bb78,easy,0.030000000000000006,0.0,0.15000000000000002,1,1,0,5,0,0,0,0.6,4.2,, +keyword_spam,20260430,timeline_a97690,medium,0.13,0.0,0.65,2,2,0,5,0,0,0,0.2,4.2,, +keyword_spam,20260431,alibi_67ffcd,medium,0.04000000000000001,0.0,0.20000000000000004,2,1,0,5,0,0,0,0.4,4.2,, +keyword_spam,20260432,alibi_423bca,medium,0.04000000000000001,0.0,0.20000000000000004,2,1,0,5,0,0,0,0.4,4.2,, +keyword_spam,20260433,knowledge_960d07,medium,0.12000000000000002,0.0,0.6000000000000001,2,2,0,5,0,0,0,0.4,4.2,, +keyword_spam,20260434,alibi_e829c1,easy,0.04000000000000001,0.0,0.20000000000000004,1,1,0,5,0,0,0,0.4,4.2,, +keyword_spam,20260435,motive_85e25b,hard,0.17000000000000004,0.0,0.8500000000000001,3,3,0,5,0,0,0,0.2,4.2,, +keyword_spam,20260436,knowledge_a599e3,medium,0.12000000000000002,0.0,0.6000000000000001,2,2,0,5,0,0,0,0.4,4.2,, +keyword_spam,20260437,motive_8bca20,easy,0.0,0.0,-0.15000000000000002,1,0,0,5,0,0,0,0.6,4.2,, +keyword_spam,20260438,corporate_6b1664,medium,0.020000000000000007,0.0,0.10000000000000003,2,1,0,5,0,0,0,0.8,4.2,, +keyword_spam,20260439,alibi_a6c582,easy,0.04000000000000001,0.0,0.20000000000000004,1,1,0,5,0,0,0,0.4,4.2,, +keyword_spam,20260440,workplace_835476,easy,0.0,0.0,-0.2,1,0,0,5,0,0,0,0.8,4.2,, +keyword_spam,20260441,possession_a079c5,hard,0.030000000000000006,0.0,0.15000000000000002,3,1,0,5,0,0,0,0.6,4.2,, +keyword_spam,20260442,possession_9cc45d,hard,0.030000000000000006,0.0,0.15000000000000002,3,1,0,5,0,0,0,0.6,4.2,, +keyword_spam,20260443,possession_259aa5,easy,0.0,0.0,-0.2,1,0,0,5,0,0,0,0.8,4.2,, +keyword_spam,20260444,corporate_76724c,medium,0.020000000000000007,0.0,0.10000000000000003,2,1,0,5,0,0,0,0.8,4.2,, +keyword_spam,20260445,timeline_767821,medium,0.13,0.0,0.65,2,2,0,5,0,0,0,0.2,4.2,, +keyword_spam,20260446,motive_c0d166,medium,0.13,0.0,0.65,2,2,0,5,0,0,0,0.2,4.2,, +keyword_spam,20260447,corporate_307934,hard,0.020000000000000007,0.0,0.10000000000000003,3,1,0,5,0,0,0,0.8,4.2,, +keyword_spam,20260448,timeline_592816,hard,0.19,0.0,0.95,3,3,0,5,0,0,0,0.2,4.2,, +keyword_spam,20260449,knowledge_b26824,medium,0.12000000000000002,0.0,0.6000000000000001,2,2,0,5,0,0,0,0.4,4.2,, +keyword_spam,20260450,knowledge_697785,hard,0.12000000000000002,0.0,0.6000000000000001,3,2,0,5,0,0,0,0.4,4.2,, +keyword_spam,20260451,timeline_81dafd,medium,0.13,0.0,0.65,2,2,0,5,0,0,0,0.2,4.2,, +keyword_spam,20260452,corporate_8eb7d7,medium,0.020000000000000007,0.0,0.10000000000000003,2,1,0,5,0,0,0,0.8,4.2,, +keyword_spam,20260453,possession_dbb5fe,medium,0.030000000000000006,0.0,0.15000000000000002,2,1,0,5,0,0,0,0.6,4.2,, +keyword_spam,20260454,alibi_a4666f,hard,0.15000000000000002,0.0,0.75,3,2,0,5,0,0,0,0.2,4.2,, +present_all,20260425,timeline_255d67,easy,0.0,0.0,-0.25,1,0,0,0,5,0,5,0.0,0.0,, +present_all,20260426,knowledge_b28f8c,medium,0.0,0.0,-0.2,2,0,0,0,4,0,4,0.0,0.0,, +present_all,20260427,workplace_c98377,easy,0.0,0.0,-0.25,1,0,0,0,5,0,5,0.0,0.0,, +present_all,20260428,motive_66ff59,hard,0.0,0.0,-0.35000000000000003,3,0,0,0,7,0,7,0.0,0.0,, +present_all,20260429,timeline_19bb78,easy,0.0,0.0,-0.25,1,0,0,0,5,0,5,0.0,0.0,, +present_all,20260430,timeline_a97690,medium,0.0,0.0,-0.25,2,0,0,0,5,0,5,0.0,0.0,, +present_all,20260431,alibi_67ffcd,medium,0.0,0.0,-0.2,2,0,0,0,4,0,4,0.0,0.0,, +present_all,20260432,alibi_423bca,medium,0.0,0.0,-0.2,2,0,0,0,4,0,4,0.0,0.0,, +present_all,20260433,knowledge_960d07,medium,0.0,0.0,-0.2,2,0,0,0,4,0,4,0.0,0.0,, +present_all,20260434,alibi_e829c1,easy,0.0,0.0,-0.2,1,0,0,0,4,0,4,0.0,0.0,, +present_all,20260435,motive_85e25b,hard,0.0,0.0,-0.35000000000000003,3,0,0,0,7,0,7,0.0,0.0,, +present_all,20260436,knowledge_a599e3,medium,0.0,0.0,-0.2,2,0,0,0,4,0,4,0.0,0.0,, +present_all,20260437,motive_8bca20,easy,0.0,0.0,-0.25,1,0,0,0,5,0,5,0.0,0.0,, +present_all,20260438,corporate_6b1664,medium,0.0,0.0,-0.25,2,0,0,0,5,0,5,0.0,0.0,, +present_all,20260439,alibi_a6c582,easy,0.0,0.0,-0.2,1,0,0,0,4,0,4,0.0,0.0,, +present_all,20260440,workplace_835476,easy,0.0,0.0,-0.25,1,0,0,0,5,0,5,0.0,0.0,, +present_all,20260441,possession_a079c5,hard,0.0,0.0,-0.30000000000000004,3,0,0,0,6,0,6,0.0,0.0,, +present_all,20260442,possession_9cc45d,hard,0.0,0.0,-0.30000000000000004,3,0,0,0,6,0,6,0.0,0.0,, +present_all,20260443,possession_259aa5,easy,0.0,0.0,-0.2,1,0,0,0,4,0,4,0.0,0.0,, +present_all,20260444,corporate_76724c,medium,0.0,0.0,-0.25,2,0,0,0,5,0,5,0.0,0.0,, +present_all,20260445,timeline_767821,medium,0.0,0.0,-0.25,2,0,0,0,5,0,5,0.0,0.0,, +present_all,20260446,motive_c0d166,medium,0.0,0.0,-0.25,2,0,0,0,5,0,5,0.0,0.0,, +present_all,20260447,corporate_307934,hard,0.0,0.0,-0.35000000000000003,3,0,0,0,7,0,7,0.0,0.0,, +present_all,20260448,timeline_592816,hard,0.0,0.0,-0.35000000000000003,3,0,0,0,7,0,7,0.0,0.0,, +present_all,20260449,knowledge_b26824,medium,0.0,0.0,-0.2,2,0,0,0,4,0,4,0.0,0.0,, +present_all,20260450,knowledge_697785,hard,0.0,0.0,-0.30000000000000004,3,0,0,0,6,0,6,0.0,0.0,, +present_all,20260451,timeline_81dafd,medium,0.0,0.0,-0.25,2,0,0,0,5,0,5,0.0,0.0,, +present_all,20260452,corporate_8eb7d7,medium,0.0,0.0,-0.25,2,0,0,0,5,0,5,0.0,0.0,, +present_all,20260453,possession_dbb5fe,medium,0.0,0.0,-0.2,2,0,0,0,4,0,4,0.0,0.0,, +present_all,20260454,alibi_a4666f,hard,0.0,0.0,-0.30000000000000004,3,0,0,0,6,0,6,0.0,0.0,, +scripted_oracle,20260425,timeline_255d67,easy,0.8800000000000001,1.0,0.4,1,1,1,1,1,1,0,0.0,1.0,, +scripted_oracle,20260426,knowledge_b28f8c,medium,0.9600000000000001,1.0,0.8,2,2,2,2,2,2,0,0.0,1.5,, +scripted_oracle,20260427,workplace_c98377,easy,0.8800000000000001,1.0,0.4,1,1,1,1,1,1,0,0.0,1.0,, +scripted_oracle,20260428,motive_66ff59,hard,1.0,1.0,1.2000000000000002,3,3,3,3,3,3,0,0.0,1.0,, +scripted_oracle,20260429,timeline_19bb78,easy,0.8800000000000001,1.0,0.4,1,1,1,1,1,1,0,0.0,1.0,, +scripted_oracle,20260430,timeline_a97690,medium,0.49000000000000005,0.5,0.45,2,1,1,2,2,1,1,0.0,1.0,, +scripted_oracle,20260431,alibi_67ffcd,medium,0.9600000000000001,1.0,0.8,2,2,2,2,2,2,0,0.0,3.0,, +scripted_oracle,20260432,alibi_423bca,medium,0.9600000000000001,1.0,0.8,2,2,2,2,2,2,0,0.0,3.0,, +scripted_oracle,20260433,knowledge_960d07,medium,0.9600000000000001,1.0,0.8,2,2,2,2,2,2,0,0.0,1.5,, +scripted_oracle,20260434,alibi_e829c1,easy,0.8800000000000001,1.0,0.4,1,1,1,1,1,1,0,0.0,3.0,, +scripted_oracle,20260435,motive_85e25b,hard,1.0,1.0,1.2000000000000002,3,3,3,3,3,3,0,0.0,1.0,, +scripted_oracle,20260436,knowledge_a599e3,medium,0.9600000000000001,1.0,0.8,2,2,2,2,2,2,0,0.0,1.5,, +scripted_oracle,20260437,motive_8bca20,easy,0.8800000000000001,1.0,0.4,1,1,1,1,1,1,0,0.0,1.0,, +scripted_oracle,20260438,corporate_6b1664,medium,0.9600000000000001,1.0,0.8,2,2,2,2,2,2,0,0.0,1.0,, +scripted_oracle,20260439,alibi_a6c582,easy,0.8800000000000001,1.0,0.4,1,1,1,1,1,1,0,0.0,3.0,, +scripted_oracle,20260440,workplace_835476,easy,0.8800000000000001,1.0,0.4,1,1,1,1,1,1,0,0.0,1.0,, +scripted_oracle,20260441,possession_a079c5,hard,1.0,1.0,1.2000000000000002,3,3,3,3,3,3,0,0.0,1.0,, +scripted_oracle,20260442,possession_9cc45d,hard,1.0,1.0,1.2000000000000002,3,3,3,3,3,3,0,0.0,1.0,, +scripted_oracle,20260443,possession_259aa5,easy,0.8800000000000001,1.0,0.4,1,1,1,1,1,1,0,0.0,1.0,, +scripted_oracle,20260444,corporate_76724c,medium,0.9600000000000001,1.0,0.8,2,2,2,2,2,2,0,0.0,1.0,, +scripted_oracle,20260445,timeline_767821,medium,0.49000000000000005,0.5,0.45,2,1,1,2,2,1,1,0.0,1.0,, +scripted_oracle,20260446,motive_c0d166,medium,0.9600000000000001,1.0,0.8,2,2,2,2,2,2,0,0.0,1.0,, +scripted_oracle,20260447,corporate_307934,hard,1.0,1.0,1.2000000000000002,3,3,3,3,3,3,0,0.0,1.0,, +scripted_oracle,20260448,timeline_592816,hard,1.0,1.0,1.2000000000000002,3,3,3,3,3,3,0,0.0,1.0,, +scripted_oracle,20260449,knowledge_b26824,medium,0.9600000000000001,1.0,0.8,2,2,2,2,2,2,0,0.0,1.5,, +scripted_oracle,20260450,knowledge_697785,hard,1.0,1.0,1.2000000000000002,3,3,3,3,3,3,0,0.0,1.6666666666666667,, +scripted_oracle,20260451,timeline_81dafd,medium,0.49000000000000005,0.5,0.45,2,1,1,2,2,1,1,0.0,1.0,, +scripted_oracle,20260452,corporate_8eb7d7,medium,0.9600000000000001,1.0,0.8,2,2,2,2,2,2,0,0.0,1.0,, +scripted_oracle,20260453,possession_dbb5fe,medium,0.9600000000000001,1.0,0.8,2,2,2,2,2,2,0,0.0,1.0,, +scripted_oracle,20260454,alibi_a4666f,hard,1.0,1.0,1.2000000000000002,3,3,3,3,3,3,0,0.0,2.3333333333333335,, +trained_sft_grpo_run2,20260425,timeline_255d67,easy,0.010000000000000004,0.0,0.05000000000000002,1,1,0,7,0,0,0,0.7142857142857143,39.0,heavycoderhh/counsel-env-qwen3-0.6b-grpo-run2,0.0 +trained_sft_grpo_run2,20260426,knowledge_b28f8c,medium,0.42000000000000004,0.5,0.09999999999999998,2,1,1,7,1,1,0,0.8571428571428571,45.0,heavycoderhh/counsel-env-qwen3-0.6b-grpo-run2,0.0 +trained_sft_grpo_run2,20260427,workplace_c98377,easy,0.8200000000000001,1.0,0.09999999999999998,1,1,1,7,1,1,0,0.8571428571428571,31.0,heavycoderhh/counsel-env-qwen3-0.6b-grpo-run2,0.0 +trained_sft_grpo_run2,20260428,motive_66ff59,hard,0.2866666666666667,0.3333333333333333,0.09999999999999998,3,1,1,7,1,1,0,0.8571428571428571,31.0,heavycoderhh/counsel-env-qwen3-0.6b-grpo-run2,0.0 +trained_sft_grpo_run2,20260429,timeline_19bb78,easy,0.010000000000000004,0.0,0.05000000000000002,1,1,0,7,0,0,0,0.7142857142857143,31.428571428571427,heavycoderhh/counsel-env-qwen3-0.6b-grpo-run2,0.0 +trained_sft_grpo_run2,20260430,timeline_a97690,medium,0.010000000000000004,0.0,0.05000000000000002,2,1,0,7,0,0,0,0.7142857142857143,36.0,heavycoderhh/counsel-env-qwen3-0.6b-grpo-run2,0.0 +trained_sft_grpo_run2,20260431,alibi_67ffcd,medium,0.42000000000000004,0.5,0.09999999999999998,2,1,1,7,1,1,0,0.8571428571428571,35.0,heavycoderhh/counsel-env-qwen3-0.6b-grpo-run2,0.0 +trained_sft_grpo_run2,20260432,alibi_423bca,medium,0.42000000000000004,0.5,0.09999999999999998,2,1,1,7,1,1,0,0.8571428571428571,39.0,heavycoderhh/counsel-env-qwen3-0.6b-grpo-run2,0.0 +trained_sft_grpo_run2,20260433,knowledge_960d07,medium,0.42000000000000004,0.5,0.09999999999999998,2,1,1,7,1,1,0,0.8571428571428571,39.0,heavycoderhh/counsel-env-qwen3-0.6b-grpo-run2,0.0 +trained_sft_grpo_run2,20260434,alibi_e829c1,easy,0.8200000000000001,1.0,0.09999999999999998,1,1,1,7,1,1,0,0.8571428571428571,37.0,heavycoderhh/counsel-env-qwen3-0.6b-grpo-run2,0.0 +trained_sft_grpo_run2,20260435,motive_85e25b,hard,0.2866666666666667,0.3333333333333333,0.09999999999999998,3,1,1,7,1,1,0,0.8571428571428571,32.0,heavycoderhh/counsel-env-qwen3-0.6b-grpo-run2,0.0 +trained_sft_grpo_run2,20260436,knowledge_a599e3,medium,0.42000000000000004,0.5,0.09999999999999998,2,1,1,7,1,1,0,0.8571428571428571,39.0,heavycoderhh/counsel-env-qwen3-0.6b-grpo-run2,0.0 +trained_sft_grpo_run2,20260437,motive_8bca20,easy,0.8200000000000001,1.0,0.09999999999999998,1,1,1,7,1,1,0,0.8571428571428571,30.0,heavycoderhh/counsel-env-qwen3-0.6b-grpo-run2,0.0 +trained_sft_grpo_run2,20260438,corporate_6b1664,medium,0.42000000000000004,0.5,0.09999999999999998,2,1,1,7,1,1,0,0.8571428571428571,27.0,heavycoderhh/counsel-env-qwen3-0.6b-grpo-run2,0.0 +trained_sft_grpo_run2,20260439,alibi_a6c582,easy,0.8200000000000001,1.0,0.09999999999999998,1,1,1,7,1,1,0,0.8571428571428571,38.0,heavycoderhh/counsel-env-qwen3-0.6b-grpo-run2,0.0 +trained_sft_grpo_run2,20260440,workplace_835476,easy,0.8200000000000001,1.0,0.09999999999999998,1,1,1,7,1,1,0,0.8571428571428571,30.0,heavycoderhh/counsel-env-qwen3-0.6b-grpo-run2,0.0 +trained_sft_grpo_run2,20260441,possession_a079c5,hard,0.2866666666666667,0.3333333333333333,0.09999999999999998,3,1,1,7,1,1,0,0.8571428571428571,47.0,heavycoderhh/counsel-env-qwen3-0.6b-grpo-run2,0.0 +trained_sft_grpo_run2,20260442,possession_9cc45d,hard,0.2866666666666667,0.3333333333333333,0.09999999999999998,3,1,1,7,1,1,0,0.8571428571428571,43.0,heavycoderhh/counsel-env-qwen3-0.6b-grpo-run2,0.0 +trained_sft_grpo_run2,20260443,possession_259aa5,easy,0.8200000000000001,1.0,0.09999999999999998,1,1,1,7,1,1,0,0.8571428571428571,42.0,heavycoderhh/counsel-env-qwen3-0.6b-grpo-run2,0.0 +trained_sft_grpo_run2,20260444,corporate_76724c,medium,0.42000000000000004,0.5,0.09999999999999998,2,1,1,7,1,1,0,0.8571428571428571,27.0,heavycoderhh/counsel-env-qwen3-0.6b-grpo-run2,0.0 +trained_sft_grpo_run2,20260445,timeline_767821,medium,0.010000000000000004,0.0,0.05000000000000002,2,1,0,7,0,0,0,0.7142857142857143,44.0,heavycoderhh/counsel-env-qwen3-0.6b-grpo-run2,0.0 +trained_sft_grpo_run2,20260446,motive_c0d166,medium,0.42000000000000004,0.5,0.09999999999999998,2,1,1,7,1,1,0,0.8571428571428571,35.0,heavycoderhh/counsel-env-qwen3-0.6b-grpo-run2,0.0 +trained_sft_grpo_run2,20260447,corporate_307934,hard,0.2866666666666667,0.3333333333333333,0.09999999999999998,3,1,1,7,1,1,0,0.8571428571428571,27.0,heavycoderhh/counsel-env-qwen3-0.6b-grpo-run2,0.0 +trained_sft_grpo_run2,20260448,timeline_592816,hard,0.010000000000000004,0.0,0.05000000000000002,3,1,0,7,0,0,0,0.7142857142857143,39.0,heavycoderhh/counsel-env-qwen3-0.6b-grpo-run2,0.0 +trained_sft_grpo_run2,20260449,knowledge_b26824,medium,0.42000000000000004,0.5,0.09999999999999998,2,1,1,7,1,1,0,0.8571428571428571,38.0,heavycoderhh/counsel-env-qwen3-0.6b-grpo-run2,0.0 +trained_sft_grpo_run2,20260450,knowledge_697785,hard,0.2866666666666667,0.3333333333333333,0.09999999999999998,3,1,1,7,1,1,0,0.8571428571428571,43.0,heavycoderhh/counsel-env-qwen3-0.6b-grpo-run2,0.0 +trained_sft_grpo_run2,20260451,timeline_81dafd,medium,0.010000000000000004,0.0,0.05000000000000002,2,1,0,7,0,0,0,0.7142857142857143,38.0,heavycoderhh/counsel-env-qwen3-0.6b-grpo-run2,0.0 +trained_sft_grpo_run2,20260452,corporate_8eb7d7,medium,0.42000000000000004,0.5,0.09999999999999998,2,1,1,7,1,1,0,0.8571428571428571,27.0,heavycoderhh/counsel-env-qwen3-0.6b-grpo-run2,0.0 +trained_sft_grpo_run2,20260453,possession_dbb5fe,medium,0.42000000000000004,0.5,0.09999999999999998,2,1,1,7,1,1,0,0.8571428571428571,45.0,heavycoderhh/counsel-env-qwen3-0.6b-grpo-run2,0.0 +trained_sft_grpo_run2,20260454,alibi_a4666f,hard,0.2866666666666667,0.3333333333333333,0.09999999999999998,3,1,1,7,1,1,0,0.8571428571428571,39.0,heavycoderhh/counsel-env-qwen3-0.6b-grpo-run2,0.0 diff --git a/eval/trained_eval_rows.jsonl b/eval/trained_eval_rows.jsonl new file mode 100644 index 0000000..0568b20 --- /dev/null +++ b/eval/trained_eval_rows.jsonl @@ -0,0 +1,150 @@ +{"agent": "random", "auxiliary_reward": -0.4, "avg_question_length": 5.0, "blind_evidence_count": 2, "case_id": "timeline_255d67", "contradictions_surfaced": 0, "contradictions_total": 1, "contradictions_triggered": 0, "difficulty": "easy", "evidence_presented": 2, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 6, "reward": 0.0, "seed": 20260425, "useless_questions_ratio": 1.0} +{"agent": "random", "auxiliary_reward": -0.4, "avg_question_length": 5.0, "blind_evidence_count": 2, "case_id": "knowledge_b28f8c", "contradictions_surfaced": 0, "contradictions_total": 2, "contradictions_triggered": 0, "difficulty": "medium", "evidence_presented": 2, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 6, "reward": 0.0, "seed": 20260426, "useless_questions_ratio": 1.0} +{"agent": "random", "auxiliary_reward": -0.4, "avg_question_length": 5.0, "blind_evidence_count": 5, "case_id": "workplace_c98377", "contradictions_surfaced": 0, "contradictions_total": 1, "contradictions_triggered": 0, "difficulty": "easy", "evidence_presented": 5, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 3, "reward": 0.0, "seed": 20260427, "useless_questions_ratio": 1.0} +{"agent": "random", "auxiliary_reward": -0.4, "avg_question_length": 5.0, "blind_evidence_count": 1, "case_id": "motive_66ff59", "contradictions_surfaced": 0, "contradictions_total": 3, "contradictions_triggered": 0, "difficulty": "hard", "evidence_presented": 1, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 7, "reward": 0.0, "seed": 20260428, "useless_questions_ratio": 1.0} +{"agent": "random", "auxiliary_reward": -0.4, "avg_question_length": 5.0, "blind_evidence_count": 4, "case_id": "timeline_19bb78", "contradictions_surfaced": 0, "contradictions_total": 1, "contradictions_triggered": 0, "difficulty": "easy", "evidence_presented": 4, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 4, "reward": 0.0, "seed": 20260429, "useless_questions_ratio": 1.0} +{"agent": "random", "auxiliary_reward": -0.4, "avg_question_length": 5.0, "blind_evidence_count": 2, "case_id": "timeline_a97690", "contradictions_surfaced": 0, "contradictions_total": 2, "contradictions_triggered": 0, "difficulty": "medium", "evidence_presented": 2, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 6, "reward": 0.0, "seed": 20260430, "useless_questions_ratio": 1.0} +{"agent": "random", "auxiliary_reward": -0.4, "avg_question_length": 5.0, "blind_evidence_count": 2, "case_id": "alibi_67ffcd", "contradictions_surfaced": 0, "contradictions_total": 2, "contradictions_triggered": 0, "difficulty": "medium", "evidence_presented": 2, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 6, "reward": 0.0, "seed": 20260431, "useless_questions_ratio": 1.0} +{"agent": "random", "auxiliary_reward": -0.4, "avg_question_length": 5.0, "blind_evidence_count": 3, "case_id": "alibi_423bca", "contradictions_surfaced": 0, "contradictions_total": 2, "contradictions_triggered": 0, "difficulty": "medium", "evidence_presented": 3, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 5, "reward": 0.0, "seed": 20260432, "useless_questions_ratio": 1.0} +{"agent": "random", "auxiliary_reward": -0.4, "avg_question_length": 5.0, "blind_evidence_count": 0, "case_id": "knowledge_960d07", "contradictions_surfaced": 0, "contradictions_total": 2, "contradictions_triggered": 0, "difficulty": "medium", "evidence_presented": 0, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 8, "reward": 0.0, "seed": 20260433, "useless_questions_ratio": 1.0} +{"agent": "random", "auxiliary_reward": -0.4, "avg_question_length": 5.0, "blind_evidence_count": 1, "case_id": "alibi_e829c1", "contradictions_surfaced": 0, "contradictions_total": 1, "contradictions_triggered": 0, "difficulty": "easy", "evidence_presented": 1, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 7, "reward": 0.0, "seed": 20260434, "useless_questions_ratio": 1.0} +{"agent": "random", "auxiliary_reward": -0.4, "avg_question_length": 5.0, "blind_evidence_count": 3, "case_id": "motive_85e25b", "contradictions_surfaced": 0, "contradictions_total": 3, "contradictions_triggered": 0, "difficulty": "hard", "evidence_presented": 3, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 5, "reward": 0.0, "seed": 20260435, "useless_questions_ratio": 1.0} +{"agent": "random", "auxiliary_reward": -0.4, "avg_question_length": 5.0, "blind_evidence_count": 4, "case_id": "knowledge_a599e3", "contradictions_surfaced": 0, "contradictions_total": 2, "contradictions_triggered": 0, "difficulty": "medium", "evidence_presented": 4, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 4, "reward": 0.0, "seed": 20260436, "useless_questions_ratio": 1.0} +{"agent": "random", "auxiliary_reward": -0.4, "avg_question_length": 5.0, "blind_evidence_count": 2, "case_id": "motive_8bca20", "contradictions_surfaced": 0, "contradictions_total": 1, "contradictions_triggered": 0, "difficulty": "easy", "evidence_presented": 2, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 6, "reward": 0.0, "seed": 20260437, "useless_questions_ratio": 1.0} +{"agent": "random", "auxiliary_reward": -0.4, "avg_question_length": 5.0, "blind_evidence_count": 2, "case_id": "corporate_6b1664", "contradictions_surfaced": 0, "contradictions_total": 2, "contradictions_triggered": 0, "difficulty": "medium", "evidence_presented": 2, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 6, "reward": 0.0, "seed": 20260438, "useless_questions_ratio": 1.0} +{"agent": "random", "auxiliary_reward": -0.4, "avg_question_length": 5.0, "blind_evidence_count": 0, "case_id": "alibi_a6c582", "contradictions_surfaced": 0, "contradictions_total": 1, "contradictions_triggered": 0, "difficulty": "easy", "evidence_presented": 0, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 8, "reward": 0.0, "seed": 20260439, "useless_questions_ratio": 1.0} +{"agent": "random", "auxiliary_reward": -0.4, "avg_question_length": 5.0, "blind_evidence_count": 3, "case_id": "workplace_835476", "contradictions_surfaced": 0, "contradictions_total": 1, "contradictions_triggered": 0, "difficulty": "easy", "evidence_presented": 3, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 5, "reward": 0.0, "seed": 20260440, "useless_questions_ratio": 1.0} +{"agent": "random", "auxiliary_reward": -0.4, "avg_question_length": 5.0, "blind_evidence_count": 0, "case_id": "possession_a079c5", "contradictions_surfaced": 0, "contradictions_total": 3, "contradictions_triggered": 0, "difficulty": "hard", "evidence_presented": 0, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 8, "reward": 0.0, "seed": 20260441, "useless_questions_ratio": 1.0} +{"agent": "random", "auxiliary_reward": -0.4, "avg_question_length": 5.0, "blind_evidence_count": 3, "case_id": "possession_9cc45d", "contradictions_surfaced": 0, "contradictions_total": 3, "contradictions_triggered": 0, "difficulty": "hard", "evidence_presented": 3, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 5, "reward": 0.0, "seed": 20260442, "useless_questions_ratio": 1.0} +{"agent": "random", "auxiliary_reward": -0.4, "avg_question_length": 5.0, "blind_evidence_count": 4, "case_id": "possession_259aa5", "contradictions_surfaced": 0, "contradictions_total": 1, "contradictions_triggered": 0, "difficulty": "easy", "evidence_presented": 4, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 4, "reward": 0.0, "seed": 20260443, "useless_questions_ratio": 1.0} +{"agent": "random", "auxiliary_reward": -0.4, "avg_question_length": 5.0, "blind_evidence_count": 3, "case_id": "corporate_76724c", "contradictions_surfaced": 0, "contradictions_total": 2, "contradictions_triggered": 0, "difficulty": "medium", "evidence_presented": 3, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 5, "reward": 0.0, "seed": 20260444, "useless_questions_ratio": 1.0} +{"agent": "random", "auxiliary_reward": -0.4, "avg_question_length": 5.0, "blind_evidence_count": 3, "case_id": "timeline_767821", "contradictions_surfaced": 0, "contradictions_total": 2, "contradictions_triggered": 0, "difficulty": "medium", "evidence_presented": 3, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 5, "reward": 0.0, "seed": 20260445, "useless_questions_ratio": 1.0} +{"agent": "random", "auxiliary_reward": -0.4, "avg_question_length": 5.0, "blind_evidence_count": 3, "case_id": "motive_c0d166", "contradictions_surfaced": 0, "contradictions_total": 2, "contradictions_triggered": 0, "difficulty": "medium", "evidence_presented": 3, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 5, "reward": 0.0, "seed": 20260446, "useless_questions_ratio": 1.0} +{"agent": "random", "auxiliary_reward": -0.4, "avg_question_length": 5.0, "blind_evidence_count": 1, "case_id": "corporate_307934", "contradictions_surfaced": 0, "contradictions_total": 3, "contradictions_triggered": 0, "difficulty": "hard", "evidence_presented": 1, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 7, "reward": 0.0, "seed": 20260447, "useless_questions_ratio": 1.0} +{"agent": "random", "auxiliary_reward": -0.4, "avg_question_length": 5.0, "blind_evidence_count": 2, "case_id": "timeline_592816", "contradictions_surfaced": 0, "contradictions_total": 3, "contradictions_triggered": 0, "difficulty": "hard", "evidence_presented": 2, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 6, "reward": 0.0, "seed": 20260448, "useless_questions_ratio": 1.0} +{"agent": "random", "auxiliary_reward": -0.4, "avg_question_length": 5.0, "blind_evidence_count": 3, "case_id": "knowledge_b26824", "contradictions_surfaced": 0, "contradictions_total": 2, "contradictions_triggered": 0, "difficulty": "medium", "evidence_presented": 3, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 5, "reward": 0.0, "seed": 20260449, "useless_questions_ratio": 1.0} +{"agent": "random", "auxiliary_reward": -0.4, "avg_question_length": 5.0, "blind_evidence_count": 1, "case_id": "knowledge_697785", "contradictions_surfaced": 0, "contradictions_total": 3, "contradictions_triggered": 0, "difficulty": "hard", "evidence_presented": 1, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 7, "reward": 0.0, "seed": 20260450, "useless_questions_ratio": 1.0} +{"agent": "random", "auxiliary_reward": -0.4, "avg_question_length": 5.0, "blind_evidence_count": 4, "case_id": "timeline_81dafd", "contradictions_surfaced": 0, "contradictions_total": 2, "contradictions_triggered": 0, "difficulty": "medium", "evidence_presented": 4, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 4, "reward": 0.0, "seed": 20260451, "useless_questions_ratio": 1.0} +{"agent": "random", "auxiliary_reward": -0.4, "avg_question_length": 5.0, "blind_evidence_count": 4, "case_id": "corporate_8eb7d7", "contradictions_surfaced": 0, "contradictions_total": 2, "contradictions_triggered": 0, "difficulty": "medium", "evidence_presented": 4, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 4, "reward": 0.0, "seed": 20260452, "useless_questions_ratio": 1.0} +{"agent": "random", "auxiliary_reward": -0.4, "avg_question_length": 5.0, "blind_evidence_count": 3, "case_id": "possession_dbb5fe", "contradictions_surfaced": 0, "contradictions_total": 2, "contradictions_triggered": 0, "difficulty": "medium", "evidence_presented": 3, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 5, "reward": 0.0, "seed": 20260453, "useless_questions_ratio": 1.0} +{"agent": "random", "auxiliary_reward": -0.4, "avg_question_length": 5.0, "blind_evidence_count": 0, "case_id": "alibi_a4666f", "contradictions_surfaced": 0, "contradictions_total": 3, "contradictions_triggered": 0, "difficulty": "hard", "evidence_presented": 0, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 8, "reward": 0.0, "seed": 20260454, "useless_questions_ratio": 1.0} +{"agent": "keyword_spam", "auxiliary_reward": 0.15000000000000002, "avg_question_length": 4.2, "blind_evidence_count": 0, "case_id": "timeline_255d67", "contradictions_surfaced": 0, "contradictions_total": 1, "contradictions_triggered": 1, "difficulty": "easy", "evidence_presented": 0, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 5, "reward": 0.030000000000000006, "seed": 20260425, "useless_questions_ratio": 0.6} +{"agent": "keyword_spam", "auxiliary_reward": 0.6000000000000001, "avg_question_length": 4.2, "blind_evidence_count": 0, "case_id": "knowledge_b28f8c", "contradictions_surfaced": 0, "contradictions_total": 2, "contradictions_triggered": 2, "difficulty": "medium", "evidence_presented": 0, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 5, "reward": 0.12000000000000002, "seed": 20260426, "useless_questions_ratio": 0.4} +{"agent": "keyword_spam", "auxiliary_reward": -0.2, "avg_question_length": 4.2, "blind_evidence_count": 0, "case_id": "workplace_c98377", "contradictions_surfaced": 0, "contradictions_total": 1, "contradictions_triggered": 0, "difficulty": "easy", "evidence_presented": 0, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 5, "reward": 0.0, "seed": 20260427, "useless_questions_ratio": 0.8} +{"agent": "keyword_spam", "auxiliary_reward": 0.8500000000000001, "avg_question_length": 4.2, "blind_evidence_count": 0, "case_id": "motive_66ff59", "contradictions_surfaced": 0, "contradictions_total": 3, "contradictions_triggered": 3, "difficulty": "hard", "evidence_presented": 0, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 5, "reward": 0.17000000000000004, "seed": 20260428, "useless_questions_ratio": 0.2} +{"agent": "keyword_spam", "auxiliary_reward": 0.15000000000000002, "avg_question_length": 4.2, "blind_evidence_count": 0, "case_id": "timeline_19bb78", "contradictions_surfaced": 0, "contradictions_total": 1, "contradictions_triggered": 1, "difficulty": "easy", "evidence_presented": 0, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 5, "reward": 0.030000000000000006, "seed": 20260429, "useless_questions_ratio": 0.6} +{"agent": "keyword_spam", "auxiliary_reward": 0.65, "avg_question_length": 4.2, "blind_evidence_count": 0, "case_id": "timeline_a97690", "contradictions_surfaced": 0, "contradictions_total": 2, "contradictions_triggered": 2, "difficulty": "medium", "evidence_presented": 0, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 5, "reward": 0.13, "seed": 20260430, "useless_questions_ratio": 0.2} +{"agent": "keyword_spam", "auxiliary_reward": 0.20000000000000004, "avg_question_length": 4.2, "blind_evidence_count": 0, "case_id": "alibi_67ffcd", "contradictions_surfaced": 0, "contradictions_total": 2, "contradictions_triggered": 1, "difficulty": "medium", "evidence_presented": 0, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 5, "reward": 0.04000000000000001, "seed": 20260431, "useless_questions_ratio": 0.4} +{"agent": "keyword_spam", "auxiliary_reward": 0.20000000000000004, "avg_question_length": 4.2, "blind_evidence_count": 0, "case_id": "alibi_423bca", "contradictions_surfaced": 0, "contradictions_total": 2, "contradictions_triggered": 1, "difficulty": "medium", "evidence_presented": 0, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 5, "reward": 0.04000000000000001, "seed": 20260432, "useless_questions_ratio": 0.4} +{"agent": "keyword_spam", "auxiliary_reward": 0.6000000000000001, "avg_question_length": 4.2, "blind_evidence_count": 0, "case_id": "knowledge_960d07", "contradictions_surfaced": 0, "contradictions_total": 2, "contradictions_triggered": 2, "difficulty": "medium", "evidence_presented": 0, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 5, "reward": 0.12000000000000002, "seed": 20260433, "useless_questions_ratio": 0.4} +{"agent": "keyword_spam", "auxiliary_reward": 0.20000000000000004, "avg_question_length": 4.2, "blind_evidence_count": 0, "case_id": "alibi_e829c1", "contradictions_surfaced": 0, "contradictions_total": 1, "contradictions_triggered": 1, "difficulty": "easy", "evidence_presented": 0, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 5, "reward": 0.04000000000000001, "seed": 20260434, "useless_questions_ratio": 0.4} +{"agent": "keyword_spam", "auxiliary_reward": 0.8500000000000001, "avg_question_length": 4.2, "blind_evidence_count": 0, "case_id": "motive_85e25b", "contradictions_surfaced": 0, "contradictions_total": 3, "contradictions_triggered": 3, "difficulty": "hard", "evidence_presented": 0, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 5, "reward": 0.17000000000000004, "seed": 20260435, "useless_questions_ratio": 0.2} +{"agent": "keyword_spam", "auxiliary_reward": 0.6000000000000001, "avg_question_length": 4.2, "blind_evidence_count": 0, "case_id": "knowledge_a599e3", "contradictions_surfaced": 0, "contradictions_total": 2, "contradictions_triggered": 2, "difficulty": "medium", "evidence_presented": 0, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 5, "reward": 0.12000000000000002, "seed": 20260436, "useless_questions_ratio": 0.4} +{"agent": "keyword_spam", "auxiliary_reward": -0.15000000000000002, "avg_question_length": 4.2, "blind_evidence_count": 0, "case_id": "motive_8bca20", "contradictions_surfaced": 0, "contradictions_total": 1, "contradictions_triggered": 0, "difficulty": "easy", "evidence_presented": 0, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 5, "reward": 0.0, "seed": 20260437, "useless_questions_ratio": 0.6} +{"agent": "keyword_spam", "auxiliary_reward": 0.10000000000000003, "avg_question_length": 4.2, "blind_evidence_count": 0, "case_id": "corporate_6b1664", "contradictions_surfaced": 0, "contradictions_total": 2, "contradictions_triggered": 1, "difficulty": "medium", "evidence_presented": 0, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 5, "reward": 0.020000000000000007, "seed": 20260438, "useless_questions_ratio": 0.8} +{"agent": "keyword_spam", "auxiliary_reward": 0.20000000000000004, "avg_question_length": 4.2, "blind_evidence_count": 0, "case_id": "alibi_a6c582", "contradictions_surfaced": 0, "contradictions_total": 1, "contradictions_triggered": 1, "difficulty": "easy", "evidence_presented": 0, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 5, "reward": 0.04000000000000001, "seed": 20260439, "useless_questions_ratio": 0.4} +{"agent": "keyword_spam", "auxiliary_reward": -0.2, "avg_question_length": 4.2, "blind_evidence_count": 0, "case_id": "workplace_835476", "contradictions_surfaced": 0, "contradictions_total": 1, "contradictions_triggered": 0, "difficulty": "easy", "evidence_presented": 0, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 5, "reward": 0.0, "seed": 20260440, "useless_questions_ratio": 0.8} +{"agent": "keyword_spam", "auxiliary_reward": 0.15000000000000002, "avg_question_length": 4.2, "blind_evidence_count": 0, "case_id": "possession_a079c5", "contradictions_surfaced": 0, "contradictions_total": 3, "contradictions_triggered": 1, "difficulty": "hard", "evidence_presented": 0, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 5, "reward": 0.030000000000000006, "seed": 20260441, "useless_questions_ratio": 0.6} +{"agent": "keyword_spam", "auxiliary_reward": 0.15000000000000002, "avg_question_length": 4.2, "blind_evidence_count": 0, "case_id": "possession_9cc45d", "contradictions_surfaced": 0, "contradictions_total": 3, "contradictions_triggered": 1, "difficulty": "hard", "evidence_presented": 0, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 5, "reward": 0.030000000000000006, "seed": 20260442, "useless_questions_ratio": 0.6} +{"agent": "keyword_spam", "auxiliary_reward": -0.2, "avg_question_length": 4.2, "blind_evidence_count": 0, "case_id": "possession_259aa5", "contradictions_surfaced": 0, "contradictions_total": 1, "contradictions_triggered": 0, "difficulty": "easy", "evidence_presented": 0, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 5, "reward": 0.0, "seed": 20260443, "useless_questions_ratio": 0.8} +{"agent": "keyword_spam", "auxiliary_reward": 0.10000000000000003, "avg_question_length": 4.2, "blind_evidence_count": 0, "case_id": "corporate_76724c", "contradictions_surfaced": 0, "contradictions_total": 2, "contradictions_triggered": 1, "difficulty": "medium", "evidence_presented": 0, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 5, "reward": 0.020000000000000007, "seed": 20260444, "useless_questions_ratio": 0.8} +{"agent": "keyword_spam", "auxiliary_reward": 0.65, "avg_question_length": 4.2, "blind_evidence_count": 0, "case_id": "timeline_767821", "contradictions_surfaced": 0, "contradictions_total": 2, "contradictions_triggered": 2, "difficulty": "medium", "evidence_presented": 0, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 5, "reward": 0.13, "seed": 20260445, "useless_questions_ratio": 0.2} +{"agent": "keyword_spam", "auxiliary_reward": 0.65, "avg_question_length": 4.2, "blind_evidence_count": 0, "case_id": "motive_c0d166", "contradictions_surfaced": 0, "contradictions_total": 2, "contradictions_triggered": 2, "difficulty": "medium", "evidence_presented": 0, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 5, "reward": 0.13, "seed": 20260446, "useless_questions_ratio": 0.2} +{"agent": "keyword_spam", "auxiliary_reward": 0.10000000000000003, "avg_question_length": 4.2, "blind_evidence_count": 0, "case_id": "corporate_307934", "contradictions_surfaced": 0, "contradictions_total": 3, "contradictions_triggered": 1, "difficulty": "hard", "evidence_presented": 0, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 5, "reward": 0.020000000000000007, "seed": 20260447, "useless_questions_ratio": 0.8} +{"agent": "keyword_spam", "auxiliary_reward": 0.95, "avg_question_length": 4.2, "blind_evidence_count": 0, "case_id": "timeline_592816", "contradictions_surfaced": 0, "contradictions_total": 3, "contradictions_triggered": 3, "difficulty": "hard", "evidence_presented": 0, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 5, "reward": 0.19, "seed": 20260448, "useless_questions_ratio": 0.2} +{"agent": "keyword_spam", "auxiliary_reward": 0.6000000000000001, "avg_question_length": 4.2, "blind_evidence_count": 0, "case_id": "knowledge_b26824", "contradictions_surfaced": 0, "contradictions_total": 2, "contradictions_triggered": 2, "difficulty": "medium", "evidence_presented": 0, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 5, "reward": 0.12000000000000002, "seed": 20260449, "useless_questions_ratio": 0.4} +{"agent": "keyword_spam", "auxiliary_reward": 0.6000000000000001, "avg_question_length": 4.2, "blind_evidence_count": 0, "case_id": "knowledge_697785", "contradictions_surfaced": 0, "contradictions_total": 3, "contradictions_triggered": 2, "difficulty": "hard", "evidence_presented": 0, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 5, "reward": 0.12000000000000002, "seed": 20260450, "useless_questions_ratio": 0.4} +{"agent": "keyword_spam", "auxiliary_reward": 0.65, "avg_question_length": 4.2, "blind_evidence_count": 0, "case_id": "timeline_81dafd", "contradictions_surfaced": 0, "contradictions_total": 2, "contradictions_triggered": 2, "difficulty": "medium", "evidence_presented": 0, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 5, "reward": 0.13, "seed": 20260451, "useless_questions_ratio": 0.2} +{"agent": "keyword_spam", "auxiliary_reward": 0.10000000000000003, "avg_question_length": 4.2, "blind_evidence_count": 0, "case_id": "corporate_8eb7d7", "contradictions_surfaced": 0, "contradictions_total": 2, "contradictions_triggered": 1, "difficulty": "medium", "evidence_presented": 0, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 5, "reward": 0.020000000000000007, "seed": 20260452, "useless_questions_ratio": 0.8} +{"agent": "keyword_spam", "auxiliary_reward": 0.15000000000000002, "avg_question_length": 4.2, "blind_evidence_count": 0, "case_id": "possession_dbb5fe", "contradictions_surfaced": 0, "contradictions_total": 2, "contradictions_triggered": 1, "difficulty": "medium", "evidence_presented": 0, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 5, "reward": 0.030000000000000006, "seed": 20260453, "useless_questions_ratio": 0.6} +{"agent": "keyword_spam", "auxiliary_reward": 0.75, "avg_question_length": 4.2, "blind_evidence_count": 0, "case_id": "alibi_a4666f", "contradictions_surfaced": 0, "contradictions_total": 3, "contradictions_triggered": 2, "difficulty": "hard", "evidence_presented": 0, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 5, "reward": 0.15000000000000002, "seed": 20260454, "useless_questions_ratio": 0.2} +{"agent": "present_all", "auxiliary_reward": -0.25, "avg_question_length": 0.0, "blind_evidence_count": 5, "case_id": "timeline_255d67", "contradictions_surfaced": 0, "contradictions_total": 1, "contradictions_triggered": 0, "difficulty": "easy", "evidence_presented": 5, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 0, "reward": 0.0, "seed": 20260425, "useless_questions_ratio": 0.0} +{"agent": "present_all", "auxiliary_reward": -0.2, "avg_question_length": 0.0, "blind_evidence_count": 4, "case_id": "knowledge_b28f8c", "contradictions_surfaced": 0, "contradictions_total": 2, "contradictions_triggered": 0, "difficulty": "medium", "evidence_presented": 4, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 0, "reward": 0.0, "seed": 20260426, "useless_questions_ratio": 0.0} +{"agent": "present_all", "auxiliary_reward": -0.25, "avg_question_length": 0.0, "blind_evidence_count": 5, "case_id": "workplace_c98377", "contradictions_surfaced": 0, "contradictions_total": 1, "contradictions_triggered": 0, "difficulty": "easy", "evidence_presented": 5, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 0, "reward": 0.0, "seed": 20260427, "useless_questions_ratio": 0.0} +{"agent": "present_all", "auxiliary_reward": -0.35000000000000003, "avg_question_length": 0.0, "blind_evidence_count": 7, "case_id": "motive_66ff59", "contradictions_surfaced": 0, "contradictions_total": 3, "contradictions_triggered": 0, "difficulty": "hard", "evidence_presented": 7, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 0, "reward": 0.0, "seed": 20260428, "useless_questions_ratio": 0.0} +{"agent": "present_all", "auxiliary_reward": -0.25, "avg_question_length": 0.0, "blind_evidence_count": 5, "case_id": "timeline_19bb78", "contradictions_surfaced": 0, "contradictions_total": 1, "contradictions_triggered": 0, "difficulty": "easy", "evidence_presented": 5, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 0, "reward": 0.0, "seed": 20260429, "useless_questions_ratio": 0.0} +{"agent": "present_all", "auxiliary_reward": -0.25, "avg_question_length": 0.0, "blind_evidence_count": 5, "case_id": "timeline_a97690", "contradictions_surfaced": 0, "contradictions_total": 2, "contradictions_triggered": 0, "difficulty": "medium", "evidence_presented": 5, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 0, "reward": 0.0, "seed": 20260430, "useless_questions_ratio": 0.0} +{"agent": "present_all", "auxiliary_reward": -0.2, "avg_question_length": 0.0, "blind_evidence_count": 4, "case_id": "alibi_67ffcd", "contradictions_surfaced": 0, "contradictions_total": 2, "contradictions_triggered": 0, "difficulty": "medium", "evidence_presented": 4, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 0, "reward": 0.0, "seed": 20260431, "useless_questions_ratio": 0.0} +{"agent": "present_all", "auxiliary_reward": -0.2, "avg_question_length": 0.0, "blind_evidence_count": 4, "case_id": "alibi_423bca", "contradictions_surfaced": 0, "contradictions_total": 2, "contradictions_triggered": 0, "difficulty": "medium", "evidence_presented": 4, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 0, "reward": 0.0, "seed": 20260432, "useless_questions_ratio": 0.0} +{"agent": "present_all", "auxiliary_reward": -0.2, "avg_question_length": 0.0, "blind_evidence_count": 4, "case_id": "knowledge_960d07", "contradictions_surfaced": 0, "contradictions_total": 2, "contradictions_triggered": 0, "difficulty": "medium", "evidence_presented": 4, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 0, "reward": 0.0, "seed": 20260433, "useless_questions_ratio": 0.0} +{"agent": "present_all", "auxiliary_reward": -0.2, "avg_question_length": 0.0, "blind_evidence_count": 4, "case_id": "alibi_e829c1", "contradictions_surfaced": 0, "contradictions_total": 1, "contradictions_triggered": 0, "difficulty": "easy", "evidence_presented": 4, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 0, "reward": 0.0, "seed": 20260434, "useless_questions_ratio": 0.0} +{"agent": "present_all", "auxiliary_reward": -0.35000000000000003, "avg_question_length": 0.0, "blind_evidence_count": 7, "case_id": "motive_85e25b", "contradictions_surfaced": 0, "contradictions_total": 3, "contradictions_triggered": 0, "difficulty": "hard", "evidence_presented": 7, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 0, "reward": 0.0, "seed": 20260435, "useless_questions_ratio": 0.0} +{"agent": "present_all", "auxiliary_reward": -0.2, "avg_question_length": 0.0, "blind_evidence_count": 4, "case_id": "knowledge_a599e3", "contradictions_surfaced": 0, "contradictions_total": 2, "contradictions_triggered": 0, "difficulty": "medium", "evidence_presented": 4, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 0, "reward": 0.0, "seed": 20260436, "useless_questions_ratio": 0.0} +{"agent": "present_all", "auxiliary_reward": -0.25, "avg_question_length": 0.0, "blind_evidence_count": 5, "case_id": "motive_8bca20", "contradictions_surfaced": 0, "contradictions_total": 1, "contradictions_triggered": 0, "difficulty": "easy", "evidence_presented": 5, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 0, "reward": 0.0, "seed": 20260437, "useless_questions_ratio": 0.0} +{"agent": "present_all", "auxiliary_reward": -0.25, "avg_question_length": 0.0, "blind_evidence_count": 5, "case_id": "corporate_6b1664", "contradictions_surfaced": 0, "contradictions_total": 2, "contradictions_triggered": 0, "difficulty": "medium", "evidence_presented": 5, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 0, "reward": 0.0, "seed": 20260438, "useless_questions_ratio": 0.0} +{"agent": "present_all", "auxiliary_reward": -0.2, "avg_question_length": 0.0, "blind_evidence_count": 4, "case_id": "alibi_a6c582", "contradictions_surfaced": 0, "contradictions_total": 1, "contradictions_triggered": 0, "difficulty": "easy", "evidence_presented": 4, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 0, "reward": 0.0, "seed": 20260439, "useless_questions_ratio": 0.0} +{"agent": "present_all", "auxiliary_reward": -0.25, "avg_question_length": 0.0, "blind_evidence_count": 5, "case_id": "workplace_835476", "contradictions_surfaced": 0, "contradictions_total": 1, "contradictions_triggered": 0, "difficulty": "easy", "evidence_presented": 5, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 0, "reward": 0.0, "seed": 20260440, "useless_questions_ratio": 0.0} +{"agent": "present_all", "auxiliary_reward": -0.30000000000000004, "avg_question_length": 0.0, "blind_evidence_count": 6, "case_id": "possession_a079c5", "contradictions_surfaced": 0, "contradictions_total": 3, "contradictions_triggered": 0, "difficulty": "hard", "evidence_presented": 6, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 0, "reward": 0.0, "seed": 20260441, "useless_questions_ratio": 0.0} +{"agent": "present_all", "auxiliary_reward": -0.30000000000000004, "avg_question_length": 0.0, "blind_evidence_count": 6, "case_id": "possession_9cc45d", "contradictions_surfaced": 0, "contradictions_total": 3, "contradictions_triggered": 0, "difficulty": "hard", "evidence_presented": 6, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 0, "reward": 0.0, "seed": 20260442, "useless_questions_ratio": 0.0} +{"agent": "present_all", "auxiliary_reward": -0.2, "avg_question_length": 0.0, "blind_evidence_count": 4, "case_id": "possession_259aa5", "contradictions_surfaced": 0, "contradictions_total": 1, "contradictions_triggered": 0, "difficulty": "easy", "evidence_presented": 4, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 0, "reward": 0.0, "seed": 20260443, "useless_questions_ratio": 0.0} +{"agent": "present_all", "auxiliary_reward": -0.25, "avg_question_length": 0.0, "blind_evidence_count": 5, "case_id": "corporate_76724c", "contradictions_surfaced": 0, "contradictions_total": 2, "contradictions_triggered": 0, "difficulty": "medium", "evidence_presented": 5, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 0, "reward": 0.0, "seed": 20260444, "useless_questions_ratio": 0.0} +{"agent": "present_all", "auxiliary_reward": -0.25, "avg_question_length": 0.0, "blind_evidence_count": 5, "case_id": "timeline_767821", "contradictions_surfaced": 0, "contradictions_total": 2, "contradictions_triggered": 0, "difficulty": "medium", "evidence_presented": 5, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 0, "reward": 0.0, "seed": 20260445, "useless_questions_ratio": 0.0} +{"agent": "present_all", "auxiliary_reward": -0.25, "avg_question_length": 0.0, "blind_evidence_count": 5, "case_id": "motive_c0d166", "contradictions_surfaced": 0, "contradictions_total": 2, "contradictions_triggered": 0, "difficulty": "medium", "evidence_presented": 5, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 0, "reward": 0.0, "seed": 20260446, "useless_questions_ratio": 0.0} +{"agent": "present_all", "auxiliary_reward": -0.35000000000000003, "avg_question_length": 0.0, "blind_evidence_count": 7, "case_id": "corporate_307934", "contradictions_surfaced": 0, "contradictions_total": 3, "contradictions_triggered": 0, "difficulty": "hard", "evidence_presented": 7, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 0, "reward": 0.0, "seed": 20260447, "useless_questions_ratio": 0.0} +{"agent": "present_all", "auxiliary_reward": -0.35000000000000003, "avg_question_length": 0.0, "blind_evidence_count": 7, "case_id": "timeline_592816", "contradictions_surfaced": 0, "contradictions_total": 3, "contradictions_triggered": 0, "difficulty": "hard", "evidence_presented": 7, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 0, "reward": 0.0, "seed": 20260448, "useless_questions_ratio": 0.0} +{"agent": "present_all", "auxiliary_reward": -0.2, "avg_question_length": 0.0, "blind_evidence_count": 4, "case_id": "knowledge_b26824", "contradictions_surfaced": 0, "contradictions_total": 2, "contradictions_triggered": 0, "difficulty": "medium", "evidence_presented": 4, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 0, "reward": 0.0, "seed": 20260449, "useless_questions_ratio": 0.0} +{"agent": "present_all", "auxiliary_reward": -0.30000000000000004, "avg_question_length": 0.0, "blind_evidence_count": 6, "case_id": "knowledge_697785", "contradictions_surfaced": 0, "contradictions_total": 3, "contradictions_triggered": 0, "difficulty": "hard", "evidence_presented": 6, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 0, "reward": 0.0, "seed": 20260450, "useless_questions_ratio": 0.0} +{"agent": "present_all", "auxiliary_reward": -0.25, "avg_question_length": 0.0, "blind_evidence_count": 5, "case_id": "timeline_81dafd", "contradictions_surfaced": 0, "contradictions_total": 2, "contradictions_triggered": 0, "difficulty": "medium", "evidence_presented": 5, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 0, "reward": 0.0, "seed": 20260451, "useless_questions_ratio": 0.0} +{"agent": "present_all", "auxiliary_reward": -0.25, "avg_question_length": 0.0, "blind_evidence_count": 5, "case_id": "corporate_8eb7d7", "contradictions_surfaced": 0, "contradictions_total": 2, "contradictions_triggered": 0, "difficulty": "medium", "evidence_presented": 5, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 0, "reward": 0.0, "seed": 20260452, "useless_questions_ratio": 0.0} +{"agent": "present_all", "auxiliary_reward": -0.2, "avg_question_length": 0.0, "blind_evidence_count": 4, "case_id": "possession_dbb5fe", "contradictions_surfaced": 0, "contradictions_total": 2, "contradictions_triggered": 0, "difficulty": "medium", "evidence_presented": 4, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 0, "reward": 0.0, "seed": 20260453, "useless_questions_ratio": 0.0} +{"agent": "present_all", "auxiliary_reward": -0.30000000000000004, "avg_question_length": 0.0, "blind_evidence_count": 6, "case_id": "alibi_a4666f", "contradictions_surfaced": 0, "contradictions_total": 3, "contradictions_triggered": 0, "difficulty": "hard", "evidence_presented": 6, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 0, "reward": 0.0, "seed": 20260454, "useless_questions_ratio": 0.0} +{"agent": "scripted_oracle", "auxiliary_reward": 0.4, "avg_question_length": 1.0, "blind_evidence_count": 0, "case_id": "timeline_255d67", "contradictions_surfaced": 1, "contradictions_total": 1, "contradictions_triggered": 1, "difficulty": "easy", "evidence_presented": 1, "evidence_timing_successes": 1, "primary_reward": 1.0, "questions_used": 1, "reward": 0.8800000000000001, "seed": 20260425, "useless_questions_ratio": 0.0} +{"agent": "scripted_oracle", "auxiliary_reward": 0.8, "avg_question_length": 1.5, "blind_evidence_count": 0, "case_id": "knowledge_b28f8c", "contradictions_surfaced": 2, "contradictions_total": 2, "contradictions_triggered": 2, "difficulty": "medium", "evidence_presented": 2, "evidence_timing_successes": 2, "primary_reward": 1.0, "questions_used": 2, "reward": 0.9600000000000001, "seed": 20260426, "useless_questions_ratio": 0.0} +{"agent": "scripted_oracle", "auxiliary_reward": 0.4, "avg_question_length": 1.0, "blind_evidence_count": 0, "case_id": "workplace_c98377", "contradictions_surfaced": 1, "contradictions_total": 1, "contradictions_triggered": 1, "difficulty": "easy", "evidence_presented": 1, "evidence_timing_successes": 1, "primary_reward": 1.0, "questions_used": 1, "reward": 0.8800000000000001, "seed": 20260427, "useless_questions_ratio": 0.0} +{"agent": "scripted_oracle", "auxiliary_reward": 1.2000000000000002, "avg_question_length": 1.0, "blind_evidence_count": 0, "case_id": "motive_66ff59", "contradictions_surfaced": 3, "contradictions_total": 3, "contradictions_triggered": 3, "difficulty": "hard", "evidence_presented": 3, "evidence_timing_successes": 3, "primary_reward": 1.0, "questions_used": 3, "reward": 1.0, "seed": 20260428, "useless_questions_ratio": 0.0} +{"agent": "scripted_oracle", "auxiliary_reward": 0.4, "avg_question_length": 1.0, "blind_evidence_count": 0, "case_id": "timeline_19bb78", "contradictions_surfaced": 1, "contradictions_total": 1, "contradictions_triggered": 1, "difficulty": "easy", "evidence_presented": 1, "evidence_timing_successes": 1, "primary_reward": 1.0, "questions_used": 1, "reward": 0.8800000000000001, "seed": 20260429, "useless_questions_ratio": 0.0} +{"agent": "scripted_oracle", "auxiliary_reward": 0.45, "avg_question_length": 1.0, "blind_evidence_count": 1, "case_id": "timeline_a97690", "contradictions_surfaced": 1, "contradictions_total": 2, "contradictions_triggered": 1, "difficulty": "medium", "evidence_presented": 2, "evidence_timing_successes": 1, "primary_reward": 0.5, "questions_used": 2, "reward": 0.49000000000000005, "seed": 20260430, "useless_questions_ratio": 0.0} +{"agent": "scripted_oracle", "auxiliary_reward": 0.8, "avg_question_length": 3.0, "blind_evidence_count": 0, "case_id": "alibi_67ffcd", "contradictions_surfaced": 2, "contradictions_total": 2, "contradictions_triggered": 2, "difficulty": "medium", "evidence_presented": 2, "evidence_timing_successes": 2, "primary_reward": 1.0, "questions_used": 2, "reward": 0.9600000000000001, "seed": 20260431, "useless_questions_ratio": 0.0} +{"agent": "scripted_oracle", "auxiliary_reward": 0.8, "avg_question_length": 3.0, "blind_evidence_count": 0, "case_id": "alibi_423bca", "contradictions_surfaced": 2, "contradictions_total": 2, "contradictions_triggered": 2, "difficulty": "medium", "evidence_presented": 2, "evidence_timing_successes": 2, "primary_reward": 1.0, "questions_used": 2, "reward": 0.9600000000000001, "seed": 20260432, "useless_questions_ratio": 0.0} +{"agent": "scripted_oracle", "auxiliary_reward": 0.8, "avg_question_length": 1.5, "blind_evidence_count": 0, "case_id": "knowledge_960d07", "contradictions_surfaced": 2, "contradictions_total": 2, "contradictions_triggered": 2, "difficulty": "medium", "evidence_presented": 2, "evidence_timing_successes": 2, "primary_reward": 1.0, "questions_used": 2, "reward": 0.9600000000000001, "seed": 20260433, "useless_questions_ratio": 0.0} +{"agent": "scripted_oracle", "auxiliary_reward": 0.4, "avg_question_length": 3.0, "blind_evidence_count": 0, "case_id": "alibi_e829c1", "contradictions_surfaced": 1, "contradictions_total": 1, "contradictions_triggered": 1, "difficulty": "easy", "evidence_presented": 1, "evidence_timing_successes": 1, "primary_reward": 1.0, "questions_used": 1, "reward": 0.8800000000000001, "seed": 20260434, "useless_questions_ratio": 0.0} +{"agent": "scripted_oracle", "auxiliary_reward": 1.2000000000000002, "avg_question_length": 1.0, "blind_evidence_count": 0, "case_id": "motive_85e25b", "contradictions_surfaced": 3, "contradictions_total": 3, "contradictions_triggered": 3, "difficulty": "hard", "evidence_presented": 3, "evidence_timing_successes": 3, "primary_reward": 1.0, "questions_used": 3, "reward": 1.0, "seed": 20260435, "useless_questions_ratio": 0.0} +{"agent": "scripted_oracle", "auxiliary_reward": 0.8, "avg_question_length": 1.5, "blind_evidence_count": 0, "case_id": "knowledge_a599e3", "contradictions_surfaced": 2, "contradictions_total": 2, "contradictions_triggered": 2, "difficulty": "medium", "evidence_presented": 2, "evidence_timing_successes": 2, "primary_reward": 1.0, "questions_used": 2, "reward": 0.9600000000000001, "seed": 20260436, "useless_questions_ratio": 0.0} +{"agent": "scripted_oracle", "auxiliary_reward": 0.4, "avg_question_length": 1.0, "blind_evidence_count": 0, "case_id": "motive_8bca20", "contradictions_surfaced": 1, "contradictions_total": 1, "contradictions_triggered": 1, "difficulty": "easy", "evidence_presented": 1, "evidence_timing_successes": 1, "primary_reward": 1.0, "questions_used": 1, "reward": 0.8800000000000001, "seed": 20260437, "useless_questions_ratio": 0.0} +{"agent": "scripted_oracle", "auxiliary_reward": 0.8, "avg_question_length": 1.0, "blind_evidence_count": 0, "case_id": "corporate_6b1664", "contradictions_surfaced": 2, "contradictions_total": 2, "contradictions_triggered": 2, "difficulty": "medium", "evidence_presented": 2, "evidence_timing_successes": 2, "primary_reward": 1.0, "questions_used": 2, "reward": 0.9600000000000001, "seed": 20260438, "useless_questions_ratio": 0.0} +{"agent": "scripted_oracle", "auxiliary_reward": 0.4, "avg_question_length": 3.0, "blind_evidence_count": 0, "case_id": "alibi_a6c582", "contradictions_surfaced": 1, "contradictions_total": 1, "contradictions_triggered": 1, "difficulty": "easy", "evidence_presented": 1, "evidence_timing_successes": 1, "primary_reward": 1.0, "questions_used": 1, "reward": 0.8800000000000001, "seed": 20260439, "useless_questions_ratio": 0.0} +{"agent": "scripted_oracle", "auxiliary_reward": 0.4, "avg_question_length": 1.0, "blind_evidence_count": 0, "case_id": "workplace_835476", "contradictions_surfaced": 1, "contradictions_total": 1, "contradictions_triggered": 1, "difficulty": "easy", "evidence_presented": 1, "evidence_timing_successes": 1, "primary_reward": 1.0, "questions_used": 1, "reward": 0.8800000000000001, "seed": 20260440, "useless_questions_ratio": 0.0} +{"agent": "scripted_oracle", "auxiliary_reward": 1.2000000000000002, "avg_question_length": 1.0, "blind_evidence_count": 0, "case_id": "possession_a079c5", "contradictions_surfaced": 3, "contradictions_total": 3, "contradictions_triggered": 3, "difficulty": "hard", "evidence_presented": 3, "evidence_timing_successes": 3, "primary_reward": 1.0, "questions_used": 3, "reward": 1.0, "seed": 20260441, "useless_questions_ratio": 0.0} +{"agent": "scripted_oracle", "auxiliary_reward": 1.2000000000000002, "avg_question_length": 1.0, "blind_evidence_count": 0, "case_id": "possession_9cc45d", "contradictions_surfaced": 3, "contradictions_total": 3, "contradictions_triggered": 3, "difficulty": "hard", "evidence_presented": 3, "evidence_timing_successes": 3, "primary_reward": 1.0, "questions_used": 3, "reward": 1.0, "seed": 20260442, "useless_questions_ratio": 0.0} +{"agent": "scripted_oracle", "auxiliary_reward": 0.4, "avg_question_length": 1.0, "blind_evidence_count": 0, "case_id": "possession_259aa5", "contradictions_surfaced": 1, "contradictions_total": 1, "contradictions_triggered": 1, "difficulty": "easy", "evidence_presented": 1, "evidence_timing_successes": 1, "primary_reward": 1.0, "questions_used": 1, "reward": 0.8800000000000001, "seed": 20260443, "useless_questions_ratio": 0.0} +{"agent": "scripted_oracle", "auxiliary_reward": 0.8, "avg_question_length": 1.0, "blind_evidence_count": 0, "case_id": "corporate_76724c", "contradictions_surfaced": 2, "contradictions_total": 2, "contradictions_triggered": 2, "difficulty": "medium", "evidence_presented": 2, "evidence_timing_successes": 2, "primary_reward": 1.0, "questions_used": 2, "reward": 0.9600000000000001, "seed": 20260444, "useless_questions_ratio": 0.0} +{"agent": "scripted_oracle", "auxiliary_reward": 0.45, "avg_question_length": 1.0, "blind_evidence_count": 1, "case_id": "timeline_767821", "contradictions_surfaced": 1, "contradictions_total": 2, "contradictions_triggered": 1, "difficulty": "medium", "evidence_presented": 2, "evidence_timing_successes": 1, "primary_reward": 0.5, "questions_used": 2, "reward": 0.49000000000000005, "seed": 20260445, "useless_questions_ratio": 0.0} +{"agent": "scripted_oracle", "auxiliary_reward": 0.8, "avg_question_length": 1.0, "blind_evidence_count": 0, "case_id": "motive_c0d166", "contradictions_surfaced": 2, "contradictions_total": 2, "contradictions_triggered": 2, "difficulty": "medium", "evidence_presented": 2, "evidence_timing_successes": 2, "primary_reward": 1.0, "questions_used": 2, "reward": 0.9600000000000001, "seed": 20260446, "useless_questions_ratio": 0.0} +{"agent": "scripted_oracle", "auxiliary_reward": 1.2000000000000002, "avg_question_length": 1.0, "blind_evidence_count": 0, "case_id": "corporate_307934", "contradictions_surfaced": 3, "contradictions_total": 3, "contradictions_triggered": 3, "difficulty": "hard", "evidence_presented": 3, "evidence_timing_successes": 3, "primary_reward": 1.0, "questions_used": 3, "reward": 1.0, "seed": 20260447, "useless_questions_ratio": 0.0} +{"agent": "scripted_oracle", "auxiliary_reward": 1.2000000000000002, "avg_question_length": 1.0, "blind_evidence_count": 0, "case_id": "timeline_592816", "contradictions_surfaced": 3, "contradictions_total": 3, "contradictions_triggered": 3, "difficulty": "hard", "evidence_presented": 3, "evidence_timing_successes": 3, "primary_reward": 1.0, "questions_used": 3, "reward": 1.0, "seed": 20260448, "useless_questions_ratio": 0.0} +{"agent": "scripted_oracle", "auxiliary_reward": 0.8, "avg_question_length": 1.5, "blind_evidence_count": 0, "case_id": "knowledge_b26824", "contradictions_surfaced": 2, "contradictions_total": 2, "contradictions_triggered": 2, "difficulty": "medium", "evidence_presented": 2, "evidence_timing_successes": 2, "primary_reward": 1.0, "questions_used": 2, "reward": 0.9600000000000001, "seed": 20260449, "useless_questions_ratio": 0.0} +{"agent": "scripted_oracle", "auxiliary_reward": 1.2000000000000002, "avg_question_length": 1.6666666666666667, "blind_evidence_count": 0, "case_id": "knowledge_697785", "contradictions_surfaced": 3, "contradictions_total": 3, "contradictions_triggered": 3, "difficulty": "hard", "evidence_presented": 3, "evidence_timing_successes": 3, "primary_reward": 1.0, "questions_used": 3, "reward": 1.0, "seed": 20260450, "useless_questions_ratio": 0.0} +{"agent": "scripted_oracle", "auxiliary_reward": 0.45, "avg_question_length": 1.0, "blind_evidence_count": 1, "case_id": "timeline_81dafd", "contradictions_surfaced": 1, "contradictions_total": 2, "contradictions_triggered": 1, "difficulty": "medium", "evidence_presented": 2, "evidence_timing_successes": 1, "primary_reward": 0.5, "questions_used": 2, "reward": 0.49000000000000005, "seed": 20260451, "useless_questions_ratio": 0.0} +{"agent": "scripted_oracle", "auxiliary_reward": 0.8, "avg_question_length": 1.0, "blind_evidence_count": 0, "case_id": "corporate_8eb7d7", "contradictions_surfaced": 2, "contradictions_total": 2, "contradictions_triggered": 2, "difficulty": "medium", "evidence_presented": 2, "evidence_timing_successes": 2, "primary_reward": 1.0, "questions_used": 2, "reward": 0.9600000000000001, "seed": 20260452, "useless_questions_ratio": 0.0} +{"agent": "scripted_oracle", "auxiliary_reward": 0.8, "avg_question_length": 1.0, "blind_evidence_count": 0, "case_id": "possession_dbb5fe", "contradictions_surfaced": 2, "contradictions_total": 2, "contradictions_triggered": 2, "difficulty": "medium", "evidence_presented": 2, "evidence_timing_successes": 2, "primary_reward": 1.0, "questions_used": 2, "reward": 0.9600000000000001, "seed": 20260453, "useless_questions_ratio": 0.0} +{"agent": "scripted_oracle", "auxiliary_reward": 1.2000000000000002, "avg_question_length": 2.3333333333333335, "blind_evidence_count": 0, "case_id": "alibi_a4666f", "contradictions_surfaced": 3, "contradictions_total": 3, "contradictions_triggered": 3, "difficulty": "hard", "evidence_presented": 3, "evidence_timing_successes": 3, "primary_reward": 1.0, "questions_used": 3, "reward": 1.0, "seed": 20260454, "useless_questions_ratio": 0.0} +{"agent": "trained_sft_grpo_run2", "auxiliary_reward": 0.05000000000000002, "avg_question_length": 39.0, "blind_evidence_count": 0, "case_id": "timeline_255d67", "contradictions_surfaced": 0, "contradictions_total": 1, "contradictions_triggered": 1, "difficulty": "easy", "evidence_presented": 0, "evidence_timing_successes": 0, "invalid_tool_calls": 0, "model_repo": "heavycoderhh/counsel-env-qwen3-0.6b-grpo-run2", "primary_reward": 0.0, "questions_used": 7, "reward": 0.010000000000000004, "seed": 20260425, "useless_questions_ratio": 0.7142857142857143} +{"agent": "trained_sft_grpo_run2", "auxiliary_reward": 0.09999999999999998, "avg_question_length": 45.0, "blind_evidence_count": 0, "case_id": "knowledge_b28f8c", "contradictions_surfaced": 1, "contradictions_total": 2, "contradictions_triggered": 1, "difficulty": "medium", "evidence_presented": 1, "evidence_timing_successes": 1, "invalid_tool_calls": 0, "model_repo": "heavycoderhh/counsel-env-qwen3-0.6b-grpo-run2", "primary_reward": 0.5, "questions_used": 7, "reward": 0.42000000000000004, "seed": 20260426, "useless_questions_ratio": 0.8571428571428571} +{"agent": "trained_sft_grpo_run2", "auxiliary_reward": 0.09999999999999998, "avg_question_length": 31.0, "blind_evidence_count": 0, "case_id": "workplace_c98377", "contradictions_surfaced": 1, "contradictions_total": 1, "contradictions_triggered": 1, "difficulty": "easy", "evidence_presented": 1, "evidence_timing_successes": 1, "invalid_tool_calls": 0, "model_repo": "heavycoderhh/counsel-env-qwen3-0.6b-grpo-run2", "primary_reward": 1.0, "questions_used": 7, "reward": 0.8200000000000001, "seed": 20260427, "useless_questions_ratio": 0.8571428571428571} +{"agent": "trained_sft_grpo_run2", "auxiliary_reward": 0.09999999999999998, "avg_question_length": 31.0, "blind_evidence_count": 0, "case_id": "motive_66ff59", "contradictions_surfaced": 1, "contradictions_total": 3, "contradictions_triggered": 1, "difficulty": "hard", "evidence_presented": 1, "evidence_timing_successes": 1, "invalid_tool_calls": 0, "model_repo": "heavycoderhh/counsel-env-qwen3-0.6b-grpo-run2", "primary_reward": 0.3333333333333333, "questions_used": 7, "reward": 0.2866666666666667, "seed": 20260428, "useless_questions_ratio": 0.8571428571428571} +{"agent": "trained_sft_grpo_run2", "auxiliary_reward": 0.05000000000000002, "avg_question_length": 31.428571428571427, "blind_evidence_count": 0, "case_id": "timeline_19bb78", "contradictions_surfaced": 0, "contradictions_total": 1, "contradictions_triggered": 1, "difficulty": "easy", "evidence_presented": 0, "evidence_timing_successes": 0, "invalid_tool_calls": 0, "model_repo": "heavycoderhh/counsel-env-qwen3-0.6b-grpo-run2", "primary_reward": 0.0, "questions_used": 7, "reward": 0.010000000000000004, "seed": 20260429, "useless_questions_ratio": 0.7142857142857143} +{"agent": "trained_sft_grpo_run2", "auxiliary_reward": 0.05000000000000002, "avg_question_length": 36.0, "blind_evidence_count": 0, "case_id": "timeline_a97690", "contradictions_surfaced": 0, "contradictions_total": 2, "contradictions_triggered": 1, "difficulty": "medium", "evidence_presented": 0, "evidence_timing_successes": 0, "invalid_tool_calls": 0, "model_repo": "heavycoderhh/counsel-env-qwen3-0.6b-grpo-run2", "primary_reward": 0.0, "questions_used": 7, "reward": 0.010000000000000004, "seed": 20260430, "useless_questions_ratio": 0.7142857142857143} +{"agent": "trained_sft_grpo_run2", "auxiliary_reward": 0.09999999999999998, "avg_question_length": 35.0, "blind_evidence_count": 0, "case_id": "alibi_67ffcd", "contradictions_surfaced": 1, "contradictions_total": 2, "contradictions_triggered": 1, "difficulty": "medium", "evidence_presented": 1, "evidence_timing_successes": 1, "invalid_tool_calls": 0, "model_repo": "heavycoderhh/counsel-env-qwen3-0.6b-grpo-run2", "primary_reward": 0.5, "questions_used": 7, "reward": 0.42000000000000004, "seed": 20260431, "useless_questions_ratio": 0.8571428571428571} +{"agent": "trained_sft_grpo_run2", "auxiliary_reward": 0.09999999999999998, "avg_question_length": 39.0, "blind_evidence_count": 0, "case_id": "alibi_423bca", "contradictions_surfaced": 1, "contradictions_total": 2, "contradictions_triggered": 1, "difficulty": "medium", "evidence_presented": 1, "evidence_timing_successes": 1, "invalid_tool_calls": 0, "model_repo": "heavycoderhh/counsel-env-qwen3-0.6b-grpo-run2", "primary_reward": 0.5, "questions_used": 7, "reward": 0.42000000000000004, "seed": 20260432, "useless_questions_ratio": 0.8571428571428571} +{"agent": "trained_sft_grpo_run2", "auxiliary_reward": 0.09999999999999998, "avg_question_length": 39.0, "blind_evidence_count": 0, "case_id": "knowledge_960d07", "contradictions_surfaced": 1, "contradictions_total": 2, "contradictions_triggered": 1, "difficulty": "medium", "evidence_presented": 1, "evidence_timing_successes": 1, "invalid_tool_calls": 0, "model_repo": "heavycoderhh/counsel-env-qwen3-0.6b-grpo-run2", "primary_reward": 0.5, "questions_used": 7, "reward": 0.42000000000000004, "seed": 20260433, "useless_questions_ratio": 0.8571428571428571} +{"agent": "trained_sft_grpo_run2", "auxiliary_reward": 0.09999999999999998, "avg_question_length": 37.0, "blind_evidence_count": 0, "case_id": "alibi_e829c1", "contradictions_surfaced": 1, "contradictions_total": 1, "contradictions_triggered": 1, "difficulty": "easy", "evidence_presented": 1, "evidence_timing_successes": 1, "invalid_tool_calls": 0, "model_repo": "heavycoderhh/counsel-env-qwen3-0.6b-grpo-run2", "primary_reward": 1.0, "questions_used": 7, "reward": 0.8200000000000001, "seed": 20260434, "useless_questions_ratio": 0.8571428571428571} +{"agent": "trained_sft_grpo_run2", "auxiliary_reward": 0.09999999999999998, "avg_question_length": 32.0, "blind_evidence_count": 0, "case_id": "motive_85e25b", "contradictions_surfaced": 1, "contradictions_total": 3, "contradictions_triggered": 1, "difficulty": "hard", "evidence_presented": 1, "evidence_timing_successes": 1, "invalid_tool_calls": 0, "model_repo": "heavycoderhh/counsel-env-qwen3-0.6b-grpo-run2", "primary_reward": 0.3333333333333333, "questions_used": 7, "reward": 0.2866666666666667, "seed": 20260435, "useless_questions_ratio": 0.8571428571428571} +{"agent": "trained_sft_grpo_run2", "auxiliary_reward": 0.09999999999999998, "avg_question_length": 39.0, "blind_evidence_count": 0, "case_id": "knowledge_a599e3", "contradictions_surfaced": 1, "contradictions_total": 2, "contradictions_triggered": 1, "difficulty": "medium", "evidence_presented": 1, "evidence_timing_successes": 1, "invalid_tool_calls": 0, "model_repo": "heavycoderhh/counsel-env-qwen3-0.6b-grpo-run2", "primary_reward": 0.5, "questions_used": 7, "reward": 0.42000000000000004, "seed": 20260436, "useless_questions_ratio": 0.8571428571428571} +{"agent": "trained_sft_grpo_run2", "auxiliary_reward": 0.09999999999999998, "avg_question_length": 30.0, "blind_evidence_count": 0, "case_id": "motive_8bca20", "contradictions_surfaced": 1, "contradictions_total": 1, "contradictions_triggered": 1, "difficulty": "easy", "evidence_presented": 1, "evidence_timing_successes": 1, "invalid_tool_calls": 0, "model_repo": "heavycoderhh/counsel-env-qwen3-0.6b-grpo-run2", "primary_reward": 1.0, "questions_used": 7, "reward": 0.8200000000000001, "seed": 20260437, "useless_questions_ratio": 0.8571428571428571} +{"agent": "trained_sft_grpo_run2", "auxiliary_reward": 0.09999999999999998, "avg_question_length": 27.0, "blind_evidence_count": 0, "case_id": "corporate_6b1664", "contradictions_surfaced": 1, "contradictions_total": 2, "contradictions_triggered": 1, "difficulty": "medium", "evidence_presented": 1, "evidence_timing_successes": 1, "invalid_tool_calls": 0, "model_repo": "heavycoderhh/counsel-env-qwen3-0.6b-grpo-run2", "primary_reward": 0.5, "questions_used": 7, "reward": 0.42000000000000004, "seed": 20260438, "useless_questions_ratio": 0.8571428571428571} +{"agent": "trained_sft_grpo_run2", "auxiliary_reward": 0.09999999999999998, "avg_question_length": 38.0, "blind_evidence_count": 0, "case_id": "alibi_a6c582", "contradictions_surfaced": 1, "contradictions_total": 1, "contradictions_triggered": 1, "difficulty": "easy", "evidence_presented": 1, "evidence_timing_successes": 1, "invalid_tool_calls": 0, "model_repo": "heavycoderhh/counsel-env-qwen3-0.6b-grpo-run2", "primary_reward": 1.0, "questions_used": 7, "reward": 0.8200000000000001, "seed": 20260439, "useless_questions_ratio": 0.8571428571428571} +{"agent": "trained_sft_grpo_run2", "auxiliary_reward": 0.09999999999999998, "avg_question_length": 30.0, "blind_evidence_count": 0, "case_id": "workplace_835476", "contradictions_surfaced": 1, "contradictions_total": 1, "contradictions_triggered": 1, "difficulty": "easy", "evidence_presented": 1, "evidence_timing_successes": 1, "invalid_tool_calls": 0, "model_repo": "heavycoderhh/counsel-env-qwen3-0.6b-grpo-run2", "primary_reward": 1.0, "questions_used": 7, "reward": 0.8200000000000001, "seed": 20260440, "useless_questions_ratio": 0.8571428571428571} +{"agent": "trained_sft_grpo_run2", "auxiliary_reward": 0.09999999999999998, "avg_question_length": 47.0, "blind_evidence_count": 0, "case_id": "possession_a079c5", "contradictions_surfaced": 1, "contradictions_total": 3, "contradictions_triggered": 1, "difficulty": "hard", "evidence_presented": 1, "evidence_timing_successes": 1, "invalid_tool_calls": 0, "model_repo": "heavycoderhh/counsel-env-qwen3-0.6b-grpo-run2", "primary_reward": 0.3333333333333333, "questions_used": 7, "reward": 0.2866666666666667, "seed": 20260441, "useless_questions_ratio": 0.8571428571428571} +{"agent": "trained_sft_grpo_run2", "auxiliary_reward": 0.09999999999999998, "avg_question_length": 43.0, "blind_evidence_count": 0, "case_id": "possession_9cc45d", "contradictions_surfaced": 1, "contradictions_total": 3, "contradictions_triggered": 1, "difficulty": "hard", "evidence_presented": 1, "evidence_timing_successes": 1, "invalid_tool_calls": 0, "model_repo": "heavycoderhh/counsel-env-qwen3-0.6b-grpo-run2", "primary_reward": 0.3333333333333333, "questions_used": 7, "reward": 0.2866666666666667, "seed": 20260442, "useless_questions_ratio": 0.8571428571428571} +{"agent": "trained_sft_grpo_run2", "auxiliary_reward": 0.09999999999999998, "avg_question_length": 42.0, "blind_evidence_count": 0, "case_id": "possession_259aa5", "contradictions_surfaced": 1, "contradictions_total": 1, "contradictions_triggered": 1, "difficulty": "easy", "evidence_presented": 1, "evidence_timing_successes": 1, "invalid_tool_calls": 0, "model_repo": "heavycoderhh/counsel-env-qwen3-0.6b-grpo-run2", "primary_reward": 1.0, "questions_used": 7, "reward": 0.8200000000000001, "seed": 20260443, "useless_questions_ratio": 0.8571428571428571} +{"agent": "trained_sft_grpo_run2", "auxiliary_reward": 0.09999999999999998, "avg_question_length": 27.0, "blind_evidence_count": 0, "case_id": "corporate_76724c", "contradictions_surfaced": 1, "contradictions_total": 2, "contradictions_triggered": 1, "difficulty": "medium", "evidence_presented": 1, "evidence_timing_successes": 1, "invalid_tool_calls": 0, "model_repo": "heavycoderhh/counsel-env-qwen3-0.6b-grpo-run2", "primary_reward": 0.5, "questions_used": 7, "reward": 0.42000000000000004, "seed": 20260444, "useless_questions_ratio": 0.8571428571428571} +{"agent": "trained_sft_grpo_run2", "auxiliary_reward": 0.05000000000000002, "avg_question_length": 44.0, "blind_evidence_count": 0, "case_id": "timeline_767821", "contradictions_surfaced": 0, "contradictions_total": 2, "contradictions_triggered": 1, "difficulty": "medium", "evidence_presented": 0, "evidence_timing_successes": 0, "invalid_tool_calls": 0, "model_repo": "heavycoderhh/counsel-env-qwen3-0.6b-grpo-run2", "primary_reward": 0.0, "questions_used": 7, "reward": 0.010000000000000004, "seed": 20260445, "useless_questions_ratio": 0.7142857142857143} +{"agent": "trained_sft_grpo_run2", "auxiliary_reward": 0.09999999999999998, "avg_question_length": 35.0, "blind_evidence_count": 0, "case_id": "motive_c0d166", "contradictions_surfaced": 1, "contradictions_total": 2, "contradictions_triggered": 1, "difficulty": "medium", "evidence_presented": 1, "evidence_timing_successes": 1, "invalid_tool_calls": 0, "model_repo": "heavycoderhh/counsel-env-qwen3-0.6b-grpo-run2", "primary_reward": 0.5, "questions_used": 7, "reward": 0.42000000000000004, "seed": 20260446, "useless_questions_ratio": 0.8571428571428571} +{"agent": "trained_sft_grpo_run2", "auxiliary_reward": 0.09999999999999998, "avg_question_length": 27.0, "blind_evidence_count": 0, "case_id": "corporate_307934", "contradictions_surfaced": 1, "contradictions_total": 3, "contradictions_triggered": 1, "difficulty": "hard", "evidence_presented": 1, "evidence_timing_successes": 1, "invalid_tool_calls": 0, "model_repo": "heavycoderhh/counsel-env-qwen3-0.6b-grpo-run2", "primary_reward": 0.3333333333333333, "questions_used": 7, "reward": 0.2866666666666667, "seed": 20260447, "useless_questions_ratio": 0.8571428571428571} +{"agent": "trained_sft_grpo_run2", "auxiliary_reward": 0.05000000000000002, "avg_question_length": 39.0, "blind_evidence_count": 0, "case_id": "timeline_592816", "contradictions_surfaced": 0, "contradictions_total": 3, "contradictions_triggered": 1, "difficulty": "hard", "evidence_presented": 0, "evidence_timing_successes": 0, "invalid_tool_calls": 0, "model_repo": "heavycoderhh/counsel-env-qwen3-0.6b-grpo-run2", "primary_reward": 0.0, "questions_used": 7, "reward": 0.010000000000000004, "seed": 20260448, "useless_questions_ratio": 0.7142857142857143} +{"agent": "trained_sft_grpo_run2", "auxiliary_reward": 0.09999999999999998, "avg_question_length": 38.0, "blind_evidence_count": 0, "case_id": "knowledge_b26824", "contradictions_surfaced": 1, "contradictions_total": 2, "contradictions_triggered": 1, "difficulty": "medium", "evidence_presented": 1, "evidence_timing_successes": 1, "invalid_tool_calls": 0, "model_repo": "heavycoderhh/counsel-env-qwen3-0.6b-grpo-run2", "primary_reward": 0.5, "questions_used": 7, "reward": 0.42000000000000004, "seed": 20260449, "useless_questions_ratio": 0.8571428571428571} +{"agent": "trained_sft_grpo_run2", "auxiliary_reward": 0.09999999999999998, "avg_question_length": 43.0, "blind_evidence_count": 0, "case_id": "knowledge_697785", "contradictions_surfaced": 1, "contradictions_total": 3, "contradictions_triggered": 1, "difficulty": "hard", "evidence_presented": 1, "evidence_timing_successes": 1, "invalid_tool_calls": 0, "model_repo": "heavycoderhh/counsel-env-qwen3-0.6b-grpo-run2", "primary_reward": 0.3333333333333333, "questions_used": 7, "reward": 0.2866666666666667, "seed": 20260450, "useless_questions_ratio": 0.8571428571428571} +{"agent": "trained_sft_grpo_run2", "auxiliary_reward": 0.05000000000000002, "avg_question_length": 38.0, "blind_evidence_count": 0, "case_id": "timeline_81dafd", "contradictions_surfaced": 0, "contradictions_total": 2, "contradictions_triggered": 1, "difficulty": "medium", "evidence_presented": 0, "evidence_timing_successes": 0, "invalid_tool_calls": 0, "model_repo": "heavycoderhh/counsel-env-qwen3-0.6b-grpo-run2", "primary_reward": 0.0, "questions_used": 7, "reward": 0.010000000000000004, "seed": 20260451, "useless_questions_ratio": 0.7142857142857143} +{"agent": "trained_sft_grpo_run2", "auxiliary_reward": 0.09999999999999998, "avg_question_length": 27.0, "blind_evidence_count": 0, "case_id": "corporate_8eb7d7", "contradictions_surfaced": 1, "contradictions_total": 2, "contradictions_triggered": 1, "difficulty": "medium", "evidence_presented": 1, "evidence_timing_successes": 1, "invalid_tool_calls": 0, "model_repo": "heavycoderhh/counsel-env-qwen3-0.6b-grpo-run2", "primary_reward": 0.5, "questions_used": 7, "reward": 0.42000000000000004, "seed": 20260452, "useless_questions_ratio": 0.8571428571428571} +{"agent": "trained_sft_grpo_run2", "auxiliary_reward": 0.09999999999999998, "avg_question_length": 45.0, "blind_evidence_count": 0, "case_id": "possession_dbb5fe", "contradictions_surfaced": 1, "contradictions_total": 2, "contradictions_triggered": 1, "difficulty": "medium", "evidence_presented": 1, "evidence_timing_successes": 1, "invalid_tool_calls": 0, "model_repo": "heavycoderhh/counsel-env-qwen3-0.6b-grpo-run2", "primary_reward": 0.5, "questions_used": 7, "reward": 0.42000000000000004, "seed": 20260453, "useless_questions_ratio": 0.8571428571428571} +{"agent": "trained_sft_grpo_run2", "auxiliary_reward": 0.09999999999999998, "avg_question_length": 39.0, "blind_evidence_count": 0, "case_id": "alibi_a4666f", "contradictions_surfaced": 1, "contradictions_total": 3, "contradictions_triggered": 1, "difficulty": "hard", "evidence_presented": 1, "evidence_timing_successes": 1, "invalid_tool_calls": 0, "model_repo": "heavycoderhh/counsel-env-qwen3-0.6b-grpo-run2", "primary_reward": 0.3333333333333333, "questions_used": 7, "reward": 0.2866666666666667, "seed": 20260454, "useless_questions_ratio": 0.8571428571428571} diff --git a/eval/trained_eval_summary.json b/eval/trained_eval_summary.json new file mode 100644 index 0000000..b33faa2 --- /dev/null +++ b/eval/trained_eval_summary.json @@ -0,0 +1,52 @@ +[ + { + "agent": "keyword_spam", + "avg_evidence_timing": 0, + "avg_primary_reward": 0.0, + "avg_reward": 0.07300000000000001, + "avg_surface_rate": 0.0, + "avg_trigger_rate": 0.6777777777777778, + "avg_useless_ratio": 0.48000000000000004, + "episodes": 30 + }, + { + "agent": "present_all", + "avg_evidence_timing": 0, + "avg_primary_reward": 0.0, + "avg_reward": 0.0, + "avg_surface_rate": 0.0, + "avg_trigger_rate": 0.0, + "avg_useless_ratio": 0.0, + "episodes": 30 + }, + { + "agent": "random", + "avg_evidence_timing": 0, + "avg_primary_reward": 0.0, + "avg_reward": 0.0, + "avg_surface_rate": 0.0, + "avg_trigger_rate": 0.0, + "avg_useless_ratio": 1.0, + "episodes": 30 + }, + { + "agent": "scripted_oracle", + "avg_evidence_timing": 1.9, + "avg_primary_reward": 0.95, + "avg_reward": 0.9023333333333334, + "avg_surface_rate": 0.95, + "avg_trigger_rate": 0.95, + "avg_useless_ratio": 0.0, + "episodes": 30 + }, + { + "agent": "trained_sft_grpo_run2", + "avg_evidence_timing": 0.8, + "avg_primary_reward": 0.4611111111111111, + "avg_reward": 0.3868888888888889, + "avg_surface_rate": 0.4611111111111111, + "avg_trigger_rate": 0.5888888888888889, + "avg_useless_ratio": 0.8285714285714285, + "episodes": 30 + } +] \ No newline at end of file diff --git a/eval/trained_eval_transcripts.md b/eval/trained_eval_transcripts.md new file mode 100644 index 0000000..f98a53b --- /dev/null +++ b/eval/trained_eval_transcripts.md @@ -0,0 +1,137 @@ +# Agent: trained_sft_grpo_run2 + +## timeline_255d67 (easy) + +Esme Dubois is accused of assault at the public library on Walnut Street on May 28th. The victim claims the assault occurred at 11:45 PM. The defendant claims they were elsewhere at that time. + +### Transcript +- Step 1 `ask_question`: Esme Dubois is accused of assault at the public library on Walnut Street on May 28th. The victim claims the assault occurred at 11:45 PM on the victim's location. The defendant claims the victim was elsewhere at that time. + - Witness: The assault happened at 11:00 PM, not 11:45 PM. + - Triggered: c1_time_of_assault; Surfaced: none +- Step 2 `present_evidence`: victor_statement + - Witness: Invalid exhibit ID. + - Triggered: c1_time_of_assault; Surfaced: none +- Step 3 `ask_question`: Esme Dubois is accused of theft at the public library on Walnut Street on May 28th. The victim claims the theft occurred at 11:45 PM on the victim's location. The defendant claims the victim was elsewhere at that time. + - Witness: The assault happened at 11:00 PM, not 11:45 PM. + - Triggered: c1_time_of_assault; Surfaced: none +- Step 4 `ask_question`: Esme Dubois is accused of theft at the public library on Walnut Street on May 28th. The victim claims the theft occurred at 11:45 PM on the victim's location. The defendant claims the victim was elsewhere at that time. + - Witness: You've already asked that question. + - Triggered: c1_time_of_assault; Surfaced: none +- Step 5 `ask_question`: Esme Dubois is accused of theft at the public library on Walnut Street on May 28th. The victim claims the theft occurred at 11:45 PM on the victim's location. The defendant claims the victim was elsewhere at that time. + - Witness: You've already asked that question. + - Triggered: c1_time_of_assault; Surfaced: none +- Step 6 `ask_question`: Esme Dubois is accused of theft at the public library on Walnut Street on May 28th. The victim claims the theft occurred at 11:45 PM on the victim's location. The defendant claims the victim was elsewhere at that time. + - Witness: You've already asked that question. + - Triggered: c1_time_of_assault; Surfaced: none +- Step 7 `ask_question`: Esme Dubois is accused of theft at the public library on Walnut Street on May 28th. The victim claims the theft occurred at 11:45 PM on the victim's location. The defendant claims the victim was elsewhere at that time. + - Witness: You've already asked that question. + - Triggered: c1_time_of_assault; Surfaced: none +- Step 8 `ask_question`: Esme Dubois is accused of theft at the public library on Walnut Street on May 28th. The victim claims the theft occurred at 11:45 PM on the victim's location. The defendant claims the victim was elsewhere at that time. + - Witness: You've already asked that question. + - Triggered: c1_time_of_assault; Surfaced: none +- Step 9 `rest_case`: rest_case + - Witness: [Counsel rests.] + - Triggered: c1_time_of_assault; Surfaced: none + +### Reward +```json +{"primary_reward": 0.0, "auxiliary_reward_raw": 0.05000000000000002, "total_reward": 0.010000000000000004, "contradictions_total": 1.0, "contradictions_triggered": 1.0, "contradictions_surfaced": 0.0, "keyword_questions": 2.0, "evidence_timing_successes": 0.0, "blind_evidence_count": 0.0, "duplicate_questions": 5.0, "irrelevant_questions": 0.0, "inadmissible_actions": 1.0, "useless_questions_ratio": 0.7142857142857143, "avg_question_length": 39.0} +``` + +--- + +# Agent: trained_sft_grpo_run2 + +## knowledge_b28f8c (medium) + +Priya Sharma is charged with embezzlement from the diner at the corner of 9th and Broad. The state alleges Priya Sharma systematically diverted funds in coordination with the manager, Kavita Iyer, between December 5th and the present. The defendant denies knowing Kavita Iyer. + +### Transcript +- Step 1 `ask_question`: CASE BRIEF: +Priya Sharma is charged with embezzlement from the diner at the corner of 9th and Broad. The state alleges Priya Sharma systematically diverted funds in coordination with the manager, Kavita Iyer, between December 5th and the present. The defendant denies knowing Kavita Iyer. + - Witness: I have never met Kavita Iyer in my life. + - Triggered: c1_know_victim; Surfaced: none +- Step 2 `present_evidence`: email_chain + - Witness: [Witness stammers] I... I'm not sure what to say. + - Triggered: c1_know_victim; Surfaced: c1_know_victim +- Step 3 `ask_question`: CASE BRIEF: +Priya Sharma is charged with embezzlement from the diner at the corner of 9th and Broad. The state alleges Priya Sharma systematically diverted funds in coordination with the manager, Kavita Iyer, between December 5th and the present. The defendant denies knowing Kavita Iyer. + - Witness: You've already asked that question. + - Triggered: c1_know_victim; Surfaced: c1_know_victim +- Step 4 `ask_question`: CASE BRIEF: +Priya Sharma is charged with embezzlement from the diner at the corner of 9th and Broad. The state alleges Priya Sharma systematically diverted funds in coordination with the manager, Kavita Iyer, between December 5th and the present. The defendant denies knowing Kavita Iyer. + - Witness: You've already asked that question. + - Triggered: c1_know_victim; Surfaced: c1_know_victim +- Step 5 `ask_question`: CASE BRIEF: +Priya Sharma is charged with embezzlement from the diner at the corner of 9th and Broad. The state alleges Priya Sharma systematically diverted funds in coordination with the manager, Kavita Iyer, between December 5th and the present. The defendant denies knowing Kavita Iyer. + - Witness: You've already asked that question. + - Triggered: c1_know_victim; Surfaced: c1_know_victim +- Step 6 `ask_question`: CASE BRIEF: +Priya Sharma is charged with embezzlement from the diner at the corner of 9th and Broad. The state alleges Priya Sharma systematically diverted funds in coordination with the manager, Kavita Iyer, between December 5th and the present. The defendant denies knowing Kavita Iyer. + - Witness: You've already asked that question. + - Triggered: c1_know_victim; Surfaced: c1_know_victim +- Step 7 `ask_question`: CASE BRIEF: +Priya Sharma is charged with embezzlement from the diner at the corner of 9th and Broad. The state alleges Priya Sharma systematically diverted funds in coordination with the manager, Kavita Iyer, between December 5th and the present. The defendant denies knowing Kavita Iyer. + - Witness: You've already asked that question. + - Triggered: c1_know_victim; Surfaced: c1_know_victim +- Step 8 `ask_question`: CASE BRIEF: +Priya Sharma is charged with embezzlement from the diner at the corner of 9th and Broad. The state alleges Priya Sharma systematically diverted funds in coordination with the manager, Kavita Iyer, between December 5th and the present. The defendant denies knowing Kavita Iyer. + - Witness: You've already asked that question. + - Triggered: c1_know_victim; Surfaced: c1_know_victim +- Step 9 `rest_case`: rest_case + - Witness: [Counsel rests.] + - Triggered: c1_know_victim; Surfaced: c1_know_victim + +### Reward +```json +{"primary_reward": 0.5, "auxiliary_reward_raw": 0.09999999999999998, "total_reward": 0.42000000000000004, "contradictions_total": 2.0, "contradictions_triggered": 1.0, "contradictions_surfaced": 1.0, "keyword_questions": 1.0, "evidence_timing_successes": 1.0, "blind_evidence_count": 0.0, "duplicate_questions": 6.0, "irrelevant_questions": 0.0, "inadmissible_actions": 0.0, "useless_questions_ratio": 0.8571428571428571, "avg_question_length": 45.0} +``` + +--- + +# Agent: trained_sft_grpo_run2 + +## workplace_c98377 (easy) + +Priya Sharma is questioned in an HR investigation after Ravi Joshi reported retaliation at the HR office on March 14th. The manager denies sending threatening messages or being nearby. + +### Transcript +- Step 1 `ask_question`: CASE BRIEF: +Priya Sharma is questioned in an HR investigation after Ravi Joshi reported retaliation at the HR office on March 14th. The manager denies sending threatening messages or being nearby. + - Witness: I never sent Ravi Joshi any threatening message. + - Triggered: c1_message_denial; Surfaced: none +- Step 2 `present_evidence`: message_export + - Witness: [Witness stammers] I... I'm not sure what to say. + - Triggered: c1_message_denial; Surfaced: c1_message_denial +- Step 3 `ask_question`: CASE BRIEF: +Priya Sharma is questioned in an HR investigation after Ravi Joshi reported retaliation at the HR office on March 14th. The manager denies sending threatening messages or being nearby. + - Witness: You've already asked that question. + - Triggered: c1_message_denial; Surfaced: c1_message_denial +- Step 4 `ask_question`: CASE BRIEF: +Priya Sharma is questioned in an HR investigation after Ravi Joshi reported retaliation at the HR office on March 14th. The manager denies sending threatening messages or being nearby. + - Witness: You've already asked that question. + - Triggered: c1_message_denial; Surfaced: c1_message_denial +- Step 5 `ask_question`: CASE BRIEF: +Priya Sharma is questioned in an HR investigation after Ravi Joshi reported retaliation at the HR office on March 14th. The manager denies sending threatening messages or being nearby. + - Witness: You've already asked that question. + - Triggered: c1_message_denial; Surfaced: c1_message_denial +- Step 6 `ask_question`: CASE BRIEF: +Priya Sharma is questioned in an HR investigation after Ravi Joshi reported retaliation at the HR office on March 14th. The manager denies sending threatening messages or being nearby. + - Witness: You've already asked that question. + - Triggered: c1_message_denial; Surfaced: c1_message_denial +- Step 7 `ask_question`: CASE BRIEF: +Priya Sharma is questioned in an HR investigation after Ravi Joshi reported retaliation at the HR office on March 14th. The manager denies sending threatening messages or being nearby. + - Witness: You've already asked that question. + - Triggered: c1_message_denial; Surfaced: c1_message_denial +- Step 8 `ask_question`: CASE BRIEF: +Priya Sharma is questioned in an HR investigation after Ravi Joshi reported retaliation at the HR office on March 14th. The manager denies sending threatening messages or being nearby. + - Witness: You've already asked that question. + - Triggered: c1_message_denial; Surfaced: c1_message_denial +- Step 9 `rest_case`: rest_case + - Witness: [Counsel rests.] + - Triggered: c1_message_denial; Surfaced: c1_message_denial + +### Reward +```json +{"primary_reward": 1.0, "auxiliary_reward_raw": 0.09999999999999998, "total_reward": 0.8200000000000001, "contradictions_total": 1.0, "contradictions_triggered": 1.0, "contradictions_surfaced": 1.0, "keyword_questions": 1.0, "evidence_timing_successes": 1.0, "blind_evidence_count": 0.0, "duplicate_questions": 6.0, "irrelevant_questions": 0.0, "inadmissible_actions": 0.0, "useless_questions_ratio": 0.8571428571428571, "avg_question_length": 31.0} +``` \ No newline at end of file diff --git a/generation_config.json b/generation_config.json new file mode 100644 index 0000000..0ecce6e --- /dev/null +++ b/generation_config.json @@ -0,0 +1,12 @@ +{ + "do_sample": true, + "eos_token_id": [ + 151645, + 151643 + ], + "pad_token_id": 151643, + "temperature": 0.6, + "top_k": 20, + "top_p": 0.95, + "transformers_version": "5.6.2" +} diff --git a/model.safetensors b/model.safetensors new file mode 100644 index 0000000..3a98e07 --- /dev/null +++ b/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5eb33e25e043c4103e6cee6aa637be513e7c2c0b7afb8cdd2325c7891721e22f +size 2384234968 diff --git a/sft_warm_start/chat_template.jinja b/sft_warm_start/chat_template.jinja new file mode 100644 index 0000000..01be9b3 --- /dev/null +++ b/sft_warm_start/chat_template.jinja @@ -0,0 +1,89 @@ +{%- if tools %} + {{- '<|im_start|>system\n' }} + {%- if messages[0].role == 'system' %} + {{- messages[0].content + '\n\n' }} + {%- endif %} + {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within XML tags:\n" }} + {%- for tool in tools %} + {{- "\n" }} + {{- tool | tojson }} + {%- endfor %} + {{- "\n\n\nFor each function call, return a json object with function name and arguments within XML tags:\n\n{\"name\": , \"arguments\": }\n<|im_end|>\n" }} +{%- else %} + {%- if messages[0].role == 'system' %} + {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }} + {%- endif %} +{%- endif %} +{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %} +{%- for message in messages[::-1] %} + {%- set index = (messages|length - 1) - loop.index0 %} + {%- if ns.multi_step_tool and message.role == "user" and message.content is string and not(message.content.startswith('') and message.content.endswith('')) %} + {%- set ns.multi_step_tool = false %} + {%- set ns.last_query_index = index %} + {%- endif %} +{%- endfor %} +{%- for message in messages %} + {%- if message.content is string %} + {%- set content = message.content %} + {%- else %} + {%- set content = '' %} + {%- endif %} + {%- if (message.role == "user") or (message.role == "system" and not loop.first) %} + {{- '<|im_start|>' + message.role + '\n' + content + '<|im_end|>' + '\n' }} + {%- elif message.role == "assistant" %} + {%- set reasoning_content = '' %} + {%- if message.reasoning_content is string %} + {%- set reasoning_content = message.reasoning_content %} + {%- else %} + {%- if '' in content %} + {%- set reasoning_content = content.split('')[0].rstrip('\n').split('')[-1].lstrip('\n') %} + {%- set content = content.split('')[-1].lstrip('\n') %} + {%- endif %} + {%- endif %} + {%- if loop.index0 > ns.last_query_index %} + {%- if loop.last or (not loop.last and reasoning_content) %} + {{- '<|im_start|>' + message.role + '\n\n' + reasoning_content.strip('\n') + '\n\n\n' + content.lstrip('\n') }} + {%- else %} + {{- '<|im_start|>' + message.role + '\n' + content }} + {%- endif %} + {%- else %} + {{- '<|im_start|>' + message.role + '\n' + content }} + {%- endif %} + {%- if message.tool_calls %} + {%- for tool_call in message.tool_calls %} + {%- if (loop.first and content) or (not loop.first) %} + {{- '\n' }} + {%- endif %} + {%- if tool_call.function %} + {%- set tool_call = tool_call.function %} + {%- endif %} + {{- '\n{"name": "' }} + {{- tool_call.name }} + {{- '", "arguments": ' }} + {%- if tool_call.arguments is string %} + {{- tool_call.arguments }} + {%- else %} + {{- tool_call.arguments | tojson }} + {%- endif %} + {{- '}\n' }} + {%- endfor %} + {%- endif %} + {{- '<|im_end|>\n' }} + {%- elif message.role == "tool" %} + {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %} + {{- '<|im_start|>user' }} + {%- endif %} + {{- '\n\n' }} + {{- content }} + {{- '\n' }} + {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %} + {{- '<|im_end|>\n' }} + {%- endif %} + {%- endif %} +{%- endfor %} +{%- if add_generation_prompt %} + {{- '<|im_start|>assistant\n' }} + {%- if enable_thinking is defined and enable_thinking is false %} + {{- '\n\n\n\n' }} + {%- endif %} +{%- endif %} \ No newline at end of file diff --git a/sft_warm_start/config.json b/sft_warm_start/config.json new file mode 100644 index 0000000..237a3a1 --- /dev/null +++ b/sft_warm_start/config.json @@ -0,0 +1,63 @@ +{ + "architectures": [ + "Qwen3ForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "bos_token_id": null, + "dtype": "bfloat16", + "eos_token_id": 151645, + "head_dim": 128, + "hidden_act": "silu", + "hidden_size": 1024, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_types": [ + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention" + ], + "max_position_embeddings": 40960, + "max_window_layers": 28, + "model_type": "qwen3", + "num_attention_heads": 16, + "num_hidden_layers": 28, + "num_key_value_heads": 8, + "pad_token_id": 151643, + "rms_norm_eps": 1e-06, + "rope_parameters": { + "rope_theta": 1000000, + "rope_type": "default" + }, + "sliding_window": null, + "tie_word_embeddings": true, + "transformers_version": "5.6.2", + "use_cache": false, + "use_sliding_window": false, + "vocab_size": 151936 +} diff --git a/sft_warm_start/generation_config.json b/sft_warm_start/generation_config.json new file mode 100644 index 0000000..0ecce6e --- /dev/null +++ b/sft_warm_start/generation_config.json @@ -0,0 +1,12 @@ +{ + "do_sample": true, + "eos_token_id": [ + 151645, + 151643 + ], + "pad_token_id": 151643, + "temperature": 0.6, + "top_k": 20, + "top_p": 0.95, + "transformers_version": "5.6.2" +} diff --git a/sft_warm_start/model.safetensors b/sft_warm_start/model.safetensors new file mode 100644 index 0000000..bc3c022 --- /dev/null +++ b/sft_warm_start/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3aa735583653d63e0b164d0872f4bd2aa4147f31ba523b70fff69fb169b24869 +size 1192135096 diff --git a/sft_warm_start/tokenizer.json b/sft_warm_start/tokenizer.json new file mode 100644 index 0000000..258f1f5 --- /dev/null +++ b/sft_warm_start/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f3cdba37323e45d3a891de61878a08412fb1d85f4ade068eeffc03740ec9e275 +size 11422748 diff --git a/sft_warm_start/tokenizer_config.json b/sft_warm_start/tokenizer_config.json new file mode 100644 index 0000000..af5f35b --- /dev/null +++ b/sft_warm_start/tokenizer_config.json @@ -0,0 +1,75 @@ +{ + "add_prefix_space": false, + "backend": "tokenizers", + "bos_token": null, + "clean_up_tokenization_spaces": false, + "eos_token": "<|im_end|>", + "errors": "replace", + "extra_special_tokens": [ + "<|im_start|>", + "<|im_end|>", + "<|object_ref_start|>", + "<|object_ref_end|>", + "<|box_start|>", + "<|box_end|>", + "<|quad_start|>", + "<|quad_end|>", + "<|vision_start|>", + "<|vision_end|>", + "<|vision_pad|>", + "<|image_pad|>", + "<|video_pad|>" + ], + "is_local": false, + "local_files_only": false, + "model_max_length": 131072, + "pad_token": "<|endoftext|>", + "padding_side": "left", + "response_schema": { + "properties": { + "content": { + "type": "string" + }, + "reasoning_content": { + "type": "string" + }, + "role": { + "const": "assistant" + }, + "tool_calls": { + "items": { + "properties": { + "function": { + "properties": { + "arguments": { + "additionalProperties": {}, + "type": "object" + }, + "name": { + "type": "string" + } + }, + "type": "object" + }, + "type": { + "const": "function" + } + }, + "type": "object", + "x-parser": "json", + "x-parser-args": { + "transform": "{type: 'function', function: @}" + } + }, + "type": "array", + "x-regex-iterator": "\\s*(.+?)\\s*" + } + }, + "type": "object", + "x-regex": "^(?:\\n?(?:(?P.*?\\S.*?)\\n?|[\\s]*)\\s*)?(?P.*?)(?:\\n(?=))?(?=(?:|<\\|im_end\\|>|$))(?P(?:.+?\\s*)+)?\\s*(?:<\\|im_end\\|>|$)" + }, + "split_special_tokens": false, + "tokenizer_class": "Qwen2Tokenizer", + "truncation_side": "left", + "unk_token": null +} diff --git a/sft_warm_start/training_args.bin b/sft_warm_start/training_args.bin new file mode 100644 index 0000000..55c43cb --- /dev/null +++ b/sft_warm_start/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:44e78eec69cef56830cc59b94a060abe906ff4571c26fb4566a6bdcb27e4499d +size 5265 diff --git a/tokenizer.json b/tokenizer.json new file mode 100644 index 0000000..c7afbed --- /dev/null +++ b/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:be75606093db2094d7cd20f3c2f385c212750648bd6ea4fb2bf507a6a4c55506 +size 11422650 diff --git a/tokenizer_config.json b/tokenizer_config.json new file mode 100644 index 0000000..f8a0cff --- /dev/null +++ b/tokenizer_config.json @@ -0,0 +1,78 @@ +{ + "add_prefix_space": false, + "backend": "tokenizers", + "bos_token": null, + "clean_up_tokenization_spaces": false, + "eos_token": "<|im_end|>", + "errors": "replace", + "extra_special_tokens": [ + "<|im_start|>", + "<|im_end|>", + "<|object_ref_start|>", + "<|object_ref_end|>", + "<|box_start|>", + "<|box_end|>", + "<|quad_start|>", + "<|quad_end|>", + "<|vision_start|>", + "<|vision_end|>", + "<|vision_pad|>", + "<|image_pad|>", + "<|video_pad|>" + ], + "is_local": true, + "local_files_only": false, + "max_length": 3072, + "model_max_length": 131072, + "pad_token": "<|endoftext|>", + "padding_side": "left", + "response_schema": { + "properties": { + "content": { + "type": "string" + }, + "reasoning_content": { + "type": "string" + }, + "role": { + "const": "assistant" + }, + "tool_calls": { + "items": { + "properties": { + "function": { + "properties": { + "arguments": { + "additionalProperties": {}, + "type": "object" + }, + "name": { + "type": "string" + } + }, + "type": "object" + }, + "type": { + "const": "function" + } + }, + "type": "object", + "x-parser": "json", + "x-parser-args": { + "transform": "{type: 'function', function: @}" + } + }, + "type": "array", + "x-regex-iterator": "\\s*(.+?)\\s*" + } + }, + "type": "object", + "x-regex": "^(?:\\n?(?:(?P.*?\\S.*?)\\n?|[\\s]*)\\s*)?(?P.*?)(?:\\n(?=))?(?=(?:|<\\|im_end\\|>|$))(?P(?:.+?\\s*)+)?\\s*(?:<\\|im_end\\|>|$)" + }, + "split_special_tokens": false, + "stride": 0, + "tokenizer_class": "Qwen2Tokenizer", + "truncation_side": "left", + "truncation_strategy": "longest_first", + "unk_token": null +} diff --git a/training_args.bin b/training_args.bin new file mode 100644 index 0000000..2f7d0b0 --- /dev/null +++ b/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:623158c464bdeaec4fa82a52817596e39771e95ece6659a737aa9e9d07d03327 +size 7185 diff --git a/training_summary.json b/training_summary.json new file mode 100644 index 0000000..7fb312e --- /dev/null +++ b/training_summary.json @@ -0,0 +1,23 @@ +{ + "artifact_repo": "heavycoderhh/counsel-env-qwen3-0.6b-grpo-run2", + "env_url": "https://heavycoderhh-counsel-env.hf.space", + "evidence_pressure": 1.4, + "grpo_dataset_size": 160, + "grpo_learning_rate": 5e-06, + "grpo_max_steps": 250, + "max_completion_length": 320, + "metrics": { + "total_flos": 0.0, + "train_loss": -0.00972448107972741, + "train_runtime": 4103.8977, + "train_samples_per_second": 0.244, + "train_steps_per_second": 0.061 + }, + "model": "heavycoderhh/counsel-env-qwen3-0.6b-grpo", + "num_generations": 4, + "sft_dataset_size": 320, + "sft_dir": "/tmp/counsel-sft-grpo-output/sft_warm_start", + "sft_epochs": 1.0, + "sft_learning_rate": 1e-05, + "space_repo": "heavycoderhh/counsel-env" +} \ No newline at end of file