commit 2ea3d793ee09253a2d0bd70ea827742873552d54 Author: ModelHub XC Date: Tue Jun 16 07:21:17 2026 +0800 初始化项目,由ModelHub XC社区提供模型 Model: heavycoderhh/counsel-env-qwen3-0.6b-grpo Source: Original Platform diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 0000000..fd0b216 --- /dev/null +++ b/.gitattributes @@ -0,0 +1,38 @@ +*.7z filter=lfs diff=lfs merge=lfs -text +*.arrow filter=lfs diff=lfs merge=lfs -text +*.bin filter=lfs diff=lfs merge=lfs -text +*.bz2 filter=lfs diff=lfs merge=lfs -text +*.ckpt filter=lfs diff=lfs merge=lfs -text +*.ftz filter=lfs diff=lfs merge=lfs -text +*.gz filter=lfs diff=lfs merge=lfs -text +*.h5 filter=lfs diff=lfs merge=lfs -text +*.joblib filter=lfs diff=lfs merge=lfs -text +*.lfs.* filter=lfs diff=lfs merge=lfs -text +*.mlmodel filter=lfs diff=lfs merge=lfs -text +*.model filter=lfs diff=lfs merge=lfs -text +*.msgpack filter=lfs diff=lfs merge=lfs -text +*.npy filter=lfs diff=lfs merge=lfs -text +*.npz filter=lfs diff=lfs merge=lfs -text +*.onnx filter=lfs diff=lfs merge=lfs -text +*.ot filter=lfs diff=lfs merge=lfs -text +*.parquet filter=lfs diff=lfs merge=lfs -text +*.pb filter=lfs diff=lfs merge=lfs -text +*.pickle filter=lfs diff=lfs merge=lfs -text +*.pkl filter=lfs diff=lfs merge=lfs -text +*.pt filter=lfs diff=lfs merge=lfs -text +*.pth filter=lfs diff=lfs merge=lfs -text +*.rar filter=lfs diff=lfs merge=lfs -text +*.safetensors filter=lfs diff=lfs merge=lfs -text +saved_model/**/* filter=lfs diff=lfs merge=lfs -text +*.tar.* filter=lfs diff=lfs merge=lfs -text +*.tar filter=lfs diff=lfs merge=lfs -text +*.tflite filter=lfs diff=lfs merge=lfs -text +*.tgz filter=lfs diff=lfs merge=lfs -text +*.wasm filter=lfs diff=lfs merge=lfs -text +*.xz filter=lfs diff=lfs merge=lfs -text +*.zip filter=lfs diff=lfs merge=lfs -text +*.zst filter=lfs diff=lfs merge=lfs -text +*tfevents* filter=lfs diff=lfs merge=lfs -text +checkpoint-100/tokenizer.json filter=lfs diff=lfs merge=lfs -text +checkpoint-200/tokenizer.json 
filter=lfs diff=lfs merge=lfs -text +tokenizer.json filter=lfs diff=lfs merge=lfs -text diff --git a/README.md b/README.md new file mode 100644 index 0000000..c1aba24 --- /dev/null +++ b/README.md @@ -0,0 +1,68 @@ +--- +base_model: Qwen/Qwen3-0.6B +library_name: transformers +model_name: counsel-grpo-output +tags: +- generated_from_trainer +- trl +- grpo +- hf_jobs +licence: license +--- + +# Model Card for counsel-grpo-output + +This model is a fine-tuned version of [Qwen/Qwen3-0.6B](https://huggingface.co/Qwen/Qwen3-0.6B). +It has been trained using [TRL](https://github.com/huggingface/trl). + +## Quick start + +```python +from transformers import pipeline + +question = "If you had a time machine, but could only go to the past or the future once and never return, which would you choose and why?" +generator = pipeline("text-generation", model="heavycoderhh/counsel-env-qwen3-0.6b-grpo", device="cuda") +output = generator([{"role": "user", "content": question}], max_new_tokens=128, return_full_text=False)[0] +print(output["generated_text"]) +``` + +## Training procedure + + + + + +This model was trained with GRPO, a method introduced in [DeepSeekMath: Pushing the Limits of Mathematical Reasoning in Open Language Models](https://huggingface.co/papers/2402.03300). + +### Framework versions + +- TRL: 1.2.0 +- Transformers: 5.6.2 +- PyTorch: 2.11.0 +- Datasets: 4.8.4 +- Tokenizers: 0.22.2 + +## Citations + +Cite GRPO as: + +```bibtex +@article{shao2024deepseekmath, + title = {{DeepSeekMath: Pushing the Limits of Mathematical Reasoning in Open Language Models}}, + author = {Zhihong Shao and Peiyi Wang and Qihao Zhu and Runxin Xu and Junxiao Song and Mingchuan Zhang and Y. K. Li and Y. 
Wu and Daya Guo}, + year = 2024, + eprint = {arXiv:2402.03300}, +} +``` + +Cite TRL as: + +```bibtex +@software{vonwerra2020trl, + title = {{TRL: Transformers Reinforcement Learning}}, + author = {von Werra, Leandro and Belkada, Younes and Tunstall, Lewis and Beeching, Edward and Thrush, Tristan and Lambert, Nathan and Huang, Shengyi and Rasul, Kashif and Gallouédec, Quentin}, + license = {Apache-2.0}, + url = {https://github.com/huggingface/trl}, + year = {2020} +} +``` \ No newline at end of file diff --git a/chat_template.jinja b/chat_template.jinja new file mode 100644 index 0000000..01be9b3 --- /dev/null +++ b/chat_template.jinja @@ -0,0 +1,89 @@ +{%- if tools %} + {{- '<|im_start|>system\n' }} + {%- if messages[0].role == 'system' %} + {{- messages[0].content + '\n\n' }} + {%- endif %} + {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within XML tags:\n" }} + {%- for tool in tools %} + {{- "\n" }} + {{- tool | tojson }} + {%- endfor %} + {{- "\n\n\nFor each function call, return a json object with function name and arguments within XML tags:\n\n{\"name\": , \"arguments\": }\n<|im_end|>\n" }} +{%- else %} + {%- if messages[0].role == 'system' %} + {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }} + {%- endif %} +{%- endif %} +{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %} +{%- for message in messages[::-1] %} + {%- set index = (messages|length - 1) - loop.index0 %} + {%- if ns.multi_step_tool and message.role == "user" and message.content is string and not(message.content.startswith('') and message.content.endswith('')) %} + {%- set ns.multi_step_tool = false %} + {%- set ns.last_query_index = index %} + {%- endif %} +{%- endfor %} +{%- for message in messages %} + {%- if message.content is string %} + {%- set content = message.content %} + {%- else %} + {%- set content = '' %} + {%- endif %} + {%- if (message.role == "user") 
or (message.role == "system" and not loop.first) %} + {{- '<|im_start|>' + message.role + '\n' + content + '<|im_end|>' + '\n' }} + {%- elif message.role == "assistant" %} + {%- set reasoning_content = '' %} + {%- if message.reasoning_content is string %} + {%- set reasoning_content = message.reasoning_content %} + {%- else %} + {%- if '' in content %} + {%- set reasoning_content = content.split('')[0].rstrip('\n').split('')[-1].lstrip('\n') %} + {%- set content = content.split('')[-1].lstrip('\n') %} + {%- endif %} + {%- endif %} + {%- if loop.index0 > ns.last_query_index %} + {%- if loop.last or (not loop.last and reasoning_content) %} + {{- '<|im_start|>' + message.role + '\n\n' + reasoning_content.strip('\n') + '\n\n\n' + content.lstrip('\n') }} + {%- else %} + {{- '<|im_start|>' + message.role + '\n' + content }} + {%- endif %} + {%- else %} + {{- '<|im_start|>' + message.role + '\n' + content }} + {%- endif %} + {%- if message.tool_calls %} + {%- for tool_call in message.tool_calls %} + {%- if (loop.first and content) or (not loop.first) %} + {{- '\n' }} + {%- endif %} + {%- if tool_call.function %} + {%- set tool_call = tool_call.function %} + {%- endif %} + {{- '\n{"name": "' }} + {{- tool_call.name }} + {{- '", "arguments": ' }} + {%- if tool_call.arguments is string %} + {{- tool_call.arguments }} + {%- else %} + {{- tool_call.arguments | tojson }} + {%- endif %} + {{- '}\n' }} + {%- endfor %} + {%- endif %} + {{- '<|im_end|>\n' }} + {%- elif message.role == "tool" %} + {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %} + {{- '<|im_start|>user' }} + {%- endif %} + {{- '\n\n' }} + {{- content }} + {{- '\n' }} + {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %} + {{- '<|im_end|>\n' }} + {%- endif %} + {%- endif %} +{%- endfor %} +{%- if add_generation_prompt %} + {{- '<|im_start|>assistant\n' }} + {%- if enable_thinking is defined and enable_thinking is false %} + {{- '\n\n\n\n' }} + {%- endif %} +{%- endif %} \ No newline at 
end of file diff --git a/checkpoint-100/chat_template.jinja b/checkpoint-100/chat_template.jinja new file mode 100644 index 0000000..01be9b3 --- /dev/null +++ b/checkpoint-100/chat_template.jinja @@ -0,0 +1,89 @@ +{%- if tools %} + {{- '<|im_start|>system\n' }} + {%- if messages[0].role == 'system' %} + {{- messages[0].content + '\n\n' }} + {%- endif %} + {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within XML tags:\n" }} + {%- for tool in tools %} + {{- "\n" }} + {{- tool | tojson }} + {%- endfor %} + {{- "\n\n\nFor each function call, return a json object with function name and arguments within XML tags:\n\n{\"name\": , \"arguments\": }\n<|im_end|>\n" }} +{%- else %} + {%- if messages[0].role == 'system' %} + {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }} + {%- endif %} +{%- endif %} +{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %} +{%- for message in messages[::-1] %} + {%- set index = (messages|length - 1) - loop.index0 %} + {%- if ns.multi_step_tool and message.role == "user" and message.content is string and not(message.content.startswith('') and message.content.endswith('')) %} + {%- set ns.multi_step_tool = false %} + {%- set ns.last_query_index = index %} + {%- endif %} +{%- endfor %} +{%- for message in messages %} + {%- if message.content is string %} + {%- set content = message.content %} + {%- else %} + {%- set content = '' %} + {%- endif %} + {%- if (message.role == "user") or (message.role == "system" and not loop.first) %} + {{- '<|im_start|>' + message.role + '\n' + content + '<|im_end|>' + '\n' }} + {%- elif message.role == "assistant" %} + {%- set reasoning_content = '' %} + {%- if message.reasoning_content is string %} + {%- set reasoning_content = message.reasoning_content %} + {%- else %} + {%- if '' in content %} + {%- set reasoning_content = content.split('')[0].rstrip('\n').split('')[-1].lstrip('\n') 
%} + {%- set content = content.split('')[-1].lstrip('\n') %} + {%- endif %} + {%- endif %} + {%- if loop.index0 > ns.last_query_index %} + {%- if loop.last or (not loop.last and reasoning_content) %} + {{- '<|im_start|>' + message.role + '\n\n' + reasoning_content.strip('\n') + '\n\n\n' + content.lstrip('\n') }} + {%- else %} + {{- '<|im_start|>' + message.role + '\n' + content }} + {%- endif %} + {%- else %} + {{- '<|im_start|>' + message.role + '\n' + content }} + {%- endif %} + {%- if message.tool_calls %} + {%- for tool_call in message.tool_calls %} + {%- if (loop.first and content) or (not loop.first) %} + {{- '\n' }} + {%- endif %} + {%- if tool_call.function %} + {%- set tool_call = tool_call.function %} + {%- endif %} + {{- '\n{"name": "' }} + {{- tool_call.name }} + {{- '", "arguments": ' }} + {%- if tool_call.arguments is string %} + {{- tool_call.arguments }} + {%- else %} + {{- tool_call.arguments | tojson }} + {%- endif %} + {{- '}\n' }} + {%- endfor %} + {%- endif %} + {{- '<|im_end|>\n' }} + {%- elif message.role == "tool" %} + {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %} + {{- '<|im_start|>user' }} + {%- endif %} + {{- '\n\n' }} + {{- content }} + {{- '\n' }} + {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %} + {{- '<|im_end|>\n' }} + {%- endif %} + {%- endif %} +{%- endfor %} +{%- if add_generation_prompt %} + {{- '<|im_start|>assistant\n' }} + {%- if enable_thinking is defined and enable_thinking is false %} + {{- '\n\n\n\n' }} + {%- endif %} +{%- endif %} \ No newline at end of file diff --git a/checkpoint-100/config.json b/checkpoint-100/config.json new file mode 100644 index 0000000..9b2e878 --- /dev/null +++ b/checkpoint-100/config.json @@ -0,0 +1,63 @@ +{ + "architectures": [ + "Qwen3ForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "bos_token_id": null, + "dtype": "float32", + "eos_token_id": 151645, + "head_dim": 128, + "hidden_act": "silu", + "hidden_size": 1024, + 
"initializer_range": 0.02, + "intermediate_size": 3072, + "layer_types": [ + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention" + ], + "max_position_embeddings": 40960, + "max_window_layers": 28, + "model_type": "qwen3", + "num_attention_heads": 16, + "num_hidden_layers": 28, + "num_key_value_heads": 8, + "pad_token_id": 151643, + "rms_norm_eps": 1e-06, + "rope_parameters": { + "rope_theta": 1000000, + "rope_type": "default" + }, + "sliding_window": null, + "tie_word_embeddings": true, + "transformers_version": "5.6.2", + "use_cache": false, + "use_sliding_window": false, + "vocab_size": 151936 +} diff --git a/checkpoint-100/generation_config.json b/checkpoint-100/generation_config.json new file mode 100644 index 0000000..0ecce6e --- /dev/null +++ b/checkpoint-100/generation_config.json @@ -0,0 +1,12 @@ +{ + "do_sample": true, + "eos_token_id": [ + 151645, + 151643 + ], + "pad_token_id": 151643, + "temperature": 0.6, + "top_k": 20, + "top_p": 0.95, + "transformers_version": "5.6.2" +} diff --git a/checkpoint-100/model.safetensors b/checkpoint-100/model.safetensors new file mode 100644 index 0000000..3dae6c2 --- /dev/null +++ b/checkpoint-100/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8d6d4fdb7c0c098116c54803446e70c9228a6cdc56ed3830ea72a8c070fd4bb3 +size 2384234968 diff --git a/checkpoint-100/optimizer.pt b/checkpoint-100/optimizer.pt new file mode 100644 index 0000000..8b3fe1a --- /dev/null +++ 
b/checkpoint-100/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ec322709818f7f042ee53f6a8d9cddad701506a6aba2fabbcc3dde446f17f939 +size 4768669395 diff --git a/checkpoint-100/rng_state.pth b/checkpoint-100/rng_state.pth new file mode 100644 index 0000000..46854d3 --- /dev/null +++ b/checkpoint-100/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:124ff4ef2c0581fffe34c8548d45e1106badea301ebd85e0d9fbe9036540ae75 +size 14645 diff --git a/checkpoint-100/scheduler.pt b/checkpoint-100/scheduler.pt new file mode 100644 index 0000000..0158101 --- /dev/null +++ b/checkpoint-100/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8af82574c5bd9fbcf30df6ea22f7df3c2ee640fc984580fab0f8dbb06ae38110 +size 1465 diff --git a/checkpoint-100/tokenizer.json b/checkpoint-100/tokenizer.json new file mode 100644 index 0000000..c7afbed --- /dev/null +++ b/checkpoint-100/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:be75606093db2094d7cd20f3c2f385c212750648bd6ea4fb2bf507a6a4c55506 +size 11422650 diff --git a/checkpoint-100/tokenizer_config.json b/checkpoint-100/tokenizer_config.json new file mode 100644 index 0000000..af5f35b --- /dev/null +++ b/checkpoint-100/tokenizer_config.json @@ -0,0 +1,75 @@ +{ + "add_prefix_space": false, + "backend": "tokenizers", + "bos_token": null, + "clean_up_tokenization_spaces": false, + "eos_token": "<|im_end|>", + "errors": "replace", + "extra_special_tokens": [ + "<|im_start|>", + "<|im_end|>", + "<|object_ref_start|>", + "<|object_ref_end|>", + "<|box_start|>", + "<|box_end|>", + "<|quad_start|>", + "<|quad_end|>", + "<|vision_start|>", + "<|vision_end|>", + "<|vision_pad|>", + "<|image_pad|>", + "<|video_pad|>" + ], + "is_local": false, + "local_files_only": false, + "model_max_length": 131072, + "pad_token": "<|endoftext|>", + "padding_side": "left", + "response_schema": { + "properties": { + "content": { + 
"type": "string" + }, + "reasoning_content": { + "type": "string" + }, + "role": { + "const": "assistant" + }, + "tool_calls": { + "items": { + "properties": { + "function": { + "properties": { + "arguments": { + "additionalProperties": {}, + "type": "object" + }, + "name": { + "type": "string" + } + }, + "type": "object" + }, + "type": { + "const": "function" + } + }, + "type": "object", + "x-parser": "json", + "x-parser-args": { + "transform": "{type: 'function', function: @}" + } + }, + "type": "array", + "x-regex-iterator": "\\s*(.+?)\\s*" + } + }, + "type": "object", + "x-regex": "^(?:\\n?(?:(?P.*?\\S.*?)\\n?|[\\s]*)\\s*)?(?P.*?)(?:\\n(?=))?(?=(?:|<\\|im_end\\|>|$))(?P(?:.+?\\s*)+)?\\s*(?:<\\|im_end\\|>|$)" + }, + "split_special_tokens": false, + "tokenizer_class": "Qwen2Tokenizer", + "truncation_side": "left", + "unk_token": null +} diff --git a/checkpoint-100/trainer_state.json b/checkpoint-100/trainer_state.json new file mode 100644 index 0000000..1aba94c --- /dev/null +++ b/checkpoint-100/trainer_state.json @@ -0,0 +1,634 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.390625, + "eval_steps": 500, + "global_step": 100, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 152.4, + "completions/max_terminated_length": 152.4, + "completions/mean_length": 117.8, + "completions/mean_terminated_length": 117.8, + "completions/min_length": 93.8, + "completions/min_terminated_length": 93.8, + "entropy": 0.20053473562002183, + "epoch": 0.01953125, + "frac_reward_zero_std": 0.0, + "grad_norm": 20.259794235229492, + "kl": 0.1606330933049321, + "learning_rate": 9.800000000000001e-06, + "loss": 0.020250317454338074, + "num_tokens": 
11544.0, + "reward": 0.4544999921694398, + "reward_std": 0.2574530947953463, + "rewards/reward_func/mean": 0.4544999921694398, + "rewards/reward_func/std": 0.2574530977755785, + "step": 5, + "step_time": 8.736884885399922, + "tools/call_frequency": 3.1, + "tools/failure_frequency": 0.0 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 205.8, + "completions/max_terminated_length": 205.8, + "completions/mean_length": 174.4, + "completions/mean_terminated_length": 174.4, + "completions/min_length": 140.6, + "completions/min_terminated_length": 140.6, + "entropy": 0.45406929701566695, + "epoch": 0.0390625, + "frac_reward_zero_std": 0.0, + "grad_norm": 20.727792739868164, + "kl": 0.5858453318476677, + "learning_rate": 9.55e-06, + "loss": 0.00982179045677185, + "num_tokens": 24254.0, + "reward": 0.08549999967217445, + "reward_std": 0.14715017192065716, + "rewards/reward_func/mean": 0.08549999967217445, + "rewards/reward_func/std": 0.14715017192065716, + "step": 10, + "step_time": 11.54933958799811, + "tools/call_frequency": 3.45, + "tools/failure_frequency": 0.0 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.35, + "completions/max_length": 320.8, + "completions/max_terminated_length": 233.6, + "completions/mean_length": 232.35, + "completions/mean_terminated_length": 189.8, + "completions/min_length": 141.0, + "completions/min_terminated_length": 141.0, + "entropy": 0.5534275218844413, + "epoch": 0.05859375, + "frac_reward_zero_std": 0.2, + "grad_norm": 6.85633659362793, + "kl": 0.6838902086019516, + "learning_rate": 9.3e-06, + "loss": 0.026860833168029785, + "num_tokens": 38189.0, + "reward": 0.10483333505690098, + "reward_std": 0.16720814146101476, + 
"rewards/reward_func/mean": 0.10483333505690098, + "rewards/reward_func/std": 0.1672081384807825, + "step": 15, + "step_time": 18.40421949800075, + "tools/call_frequency": 4.35, + "tools/failure_frequency": 0.0 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.4, + "completions/max_length": 413.0, + "completions/max_terminated_length": 351.2, + "completions/mean_length": 340.5, + "completions/mean_terminated_length": 306.28333435058596, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.5062903374433517, + "epoch": 0.078125, + "frac_reward_zero_std": 0.4, + "grad_norm": 6.486283302307129, + "kl": 0.5253867790102958, + "learning_rate": 9.050000000000001e-06, + "loss": -0.045569175481796266, + "num_tokens": 54196.0, + "reward": 0.09649999756366015, + "reward_std": 0.10897674113512039, + "rewards/reward_func/mean": 0.09649999756366015, + "rewards/reward_func/std": 0.10897674113512039, + "step": 20, + "step_time": 29.913843032000294, + "tools/call_frequency": 3.95, + "tools/failure_frequency": 0.22069264352321624 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.65, + "completions/max_length": 480.4, + "completions/max_terminated_length": 241.0, + "completions/mean_length": 361.25, + "completions/mean_terminated_length": 225.56666870117186, + "completions/min_length": 247.4, + "completions/min_terminated_length": 202.8, + "entropy": 0.4256091395393014, + "epoch": 0.09765625, + "frac_reward_zero_std": 0.2, + "grad_norm": 4.082395553588867, + "kl": 0.43983293175697324, + "learning_rate": 8.8e-06, + "loss": 0.02868407070636749, + "num_tokens": 70641.0, + "reward": 0.0399999987334013, + "reward_std": 0.060056403279304504, + 
"rewards/reward_func/mean": 0.0399999987334013, + "rewards/reward_func/std": 0.060056403279304504, + "step": 25, + "step_time": 35.14591444559992, + "tools/call_frequency": 3.7, + "tools/failure_frequency": 0.18648459613323212 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.4, + "completions/max_length": 342.0, + "completions/max_terminated_length": 286.6, + "completions/mean_length": 267.45, + "completions/mean_terminated_length": 208.86666870117188, + "completions/min_length": 93.2, + "completions/min_terminated_length": 93.2, + "entropy": 0.27065443359315394, + "epoch": 0.1171875, + "frac_reward_zero_std": 0.0, + "grad_norm": 5.178186893463135, + "kl": 0.6691923886537552, + "learning_rate": 8.550000000000001e-06, + "loss": -0.30545821189880373, + "num_tokens": 85232.0, + "reward": 0.029999999329447746, + "reward_std": 0.019999999552965164, + "rewards/reward_func/mean": 0.029999999329447746, + "rewards/reward_func/std": 0.019999999552965164, + "step": 30, + "step_time": 21.171669380999084, + "tools/call_frequency": 5.3, + "tools/failure_frequency": 0.30696970224380493 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.4, + "completions/max_length": 314.6, + "completions/max_terminated_length": 300.6, + "completions/mean_length": 275.95, + "completions/mean_terminated_length": 258.2, + "completions/min_length": 191.6, + "completions/min_terminated_length": 193.2, + "entropy": 0.12053632475435734, + "epoch": 0.13671875, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.0749428272247314, + "kl": 0.6381830915808677, + "learning_rate": 8.3e-06, + "loss": -0.07461150288581848, + "num_tokens": 100010.0, + "reward": 0.023999999463558196, + "reward_std": 0.021856406703591347, + 
"rewards/reward_func/mean": 0.023999999463558196, + "rewards/reward_func/std": 0.021856406703591347, + "step": 35, + "step_time": 18.075483031600744, + "tools/call_frequency": 7.2, + "tools/failure_frequency": 0.3397361934185028 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.55, + "completions/max_length": 406.2, + "completions/max_terminated_length": 194.6, + "completions/mean_length": 318.55, + "completions/mean_terminated_length": 139.6666717529297, + "completions/min_length": 207.2, + "completions/min_terminated_length": 81.6, + "entropy": 0.25037811435759066, + "epoch": 0.15625, + "frac_reward_zero_std": 0.2, + "grad_norm": 5.45590877532959, + "kl": 0.632529079169035, + "learning_rate": 8.050000000000001e-06, + "loss": -0.08673273324966431, + "num_tokens": 115597.0, + "reward": 0.013999999687075614, + "reward_std": 0.016618802025914193, + "rewards/reward_func/mean": 0.013999999687075614, + "rewards/reward_func/std": 0.016618802025914193, + "step": 40, + "step_time": 25.49324105279957, + "tools/call_frequency": 4.8, + "tools/failure_frequency": 0.2961344659328461 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.7, + "completions/max_length": 488.8, + "completions/max_terminated_length": 167.4, + "completions/mean_length": 351.65, + "completions/mean_terminated_length": 126.56666870117188, + "completions/min_length": 218.6, + "completions/min_terminated_length": 102.6, + "entropy": 0.30095756258815526, + "epoch": 0.17578125, + "frac_reward_zero_std": 0.2, + "grad_norm": 2.2991526126861572, + "kl": 0.549846900999546, + "learning_rate": 7.800000000000002e-06, + "loss": 0.01944226920604706, + "num_tokens": 131856.0, + "reward": 0.017999999597668646, + "reward_std": 
0.016618802025914193, + "rewards/reward_func/mean": 0.017999999597668646, + "rewards/reward_func/std": 0.016618802025914193, + "step": 45, + "step_time": 35.21552088999961, + "tools/call_frequency": 3.3, + "tools/failure_frequency": 0.2740601599216461 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.6, + "completions/max_length": 381.0, + "completions/max_terminated_length": 284.0, + "completions/mean_length": 294.35, + "completions/mean_terminated_length": 227.53333435058593, + "completions/min_length": 160.8, + "completions/min_terminated_length": 160.8, + "entropy": 0.23832304682582617, + "epoch": 0.1953125, + "frac_reward_zero_std": 0.2, + "grad_norm": 0.2268025130033493, + "kl": 0.6058375500142574, + "learning_rate": 7.5500000000000006e-06, + "loss": -0.08301904201507568, + "num_tokens": 146914.0, + "reward": 0.023999999463558196, + "reward_std": 0.018475209176540375, + "rewards/reward_func/mean": 0.023999999463558196, + "rewards/reward_func/std": 0.018475209176540375, + "step": 50, + "step_time": 25.430086624599788, + "tools/call_frequency": 5.5, + "tools/failure_frequency": 0.30512219071388247 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.5, + "completions/max_length": 339.2, + "completions/max_terminated_length": 290.2, + "completions/mean_length": 287.3, + "completions/mean_terminated_length": 284.8333374023438, + "completions/min_length": 262.0, + "completions/min_terminated_length": 279.2, + "entropy": 0.05971384542062878, + "epoch": 0.21484375, + "frac_reward_zero_std": 0.4, + "grad_norm": 0.22013917565345764, + "kl": 0.5083232939243316, + "learning_rate": 7.3e-06, + "loss": 0.0854921281337738, + "num_tokens": 161945.0, + "reward": 0.031999999284744264, + 
"reward_std": 0.012618802115321159, + "rewards/reward_func/mean": 0.031999999284744264, + "rewards/reward_func/std": 0.012618802115321159, + "step": 55, + "step_time": 21.9785563156016, + "tools/call_frequency": 8.3, + "tools/failure_frequency": 0.3500013709068298 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.6, + "completions/max_length": 264.6, + "completions/max_terminated_length": 219.6, + "completions/mean_length": 249.7, + "completions/mean_terminated_length": 217.26666870117188, + "completions/min_length": 213.4, + "completions/min_terminated_length": 215.6, + "entropy": 0.030067690974101426, + "epoch": 0.234375, + "frac_reward_zero_std": 0.2, + "grad_norm": 2.1762969493865967, + "kl": 0.5388822212815285, + "learning_rate": 7.05e-06, + "loss": -0.06041957139968872, + "num_tokens": 176143.0, + "reward": 0.029999999329447746, + "reward_std": 0.016618802025914193, + "rewards/reward_func/mean": 0.029999999329447746, + "rewards/reward_func/std": 0.016618802025914193, + "step": 60, + "step_time": 16.232493841598625, + "tools/call_frequency": 8.85, + "tools/failure_frequency": 0.3545756280422211 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.4, + "completions/max_length": 288.2, + "completions/max_terminated_length": 148.2, + "completions/mean_length": 213.95, + "completions/mean_terminated_length": 137.4000030517578, + "completions/min_length": 181.4, + "completions/min_terminated_length": 129.6, + "entropy": 0.07268587870057672, + "epoch": 0.25390625, + "frac_reward_zero_std": 0.4, + "grad_norm": 0.1308048814535141, + "kl": 0.8497638419270516, + "learning_rate": 6.800000000000001e-06, + "loss": -0.2692496538162231, + "num_tokens": 189629.0, + "reward": 
0.03799999877810478, + "reward_std": 0.010309400968253613, + "rewards/reward_func/mean": 0.03799999877810478, + "rewards/reward_func/std": 0.010309400968253613, + "step": 65, + "step_time": 17.90531996159916, + "tools/call_frequency": 5.85, + "tools/failure_frequency": 0.3754902005195618 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.3, + "completions/max_length": 274.6, + "completions/max_terminated_length": 256.6, + "completions/mean_length": 248.25, + "completions/mean_terminated_length": 241.73333740234375, + "completions/min_length": 226.0, + "completions/min_terminated_length": 226.0, + "entropy": 0.05142406928353012, + "epoch": 0.2734375, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09222911298274994, + "kl": 0.57062017172575, + "learning_rate": 6.550000000000001e-06, + "loss": 0.02273084670305252, + "num_tokens": 203795.0, + "reward": 0.05999999865889549, + "reward_std": 0.0, + "rewards/reward_func/mean": 0.05999999865889549, + "rewards/reward_func/std": 0.0, + "step": 70, + "step_time": 16.734460433397906, + "tools/call_frequency": 9.3, + "tools/failure_frequency": 0.1542351394891739 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.35, + "completions/max_length": 303.4, + "completions/max_terminated_length": 242.2, + "completions/mean_length": 244.05, + "completions/mean_terminated_length": 231.43333435058594, + "completions/min_length": 215.6, + "completions/min_terminated_length": 222.2, + "entropy": 0.03808064609766006, + "epoch": 0.29296875, + "frac_reward_zero_std": 0.8, + "grad_norm": 0.2428537756204605, + "kl": 0.49708728566765786, + "learning_rate": 6.300000000000001e-06, + "loss": 0.08911536335945129, + "num_tokens": 217958.0, + "reward": 
0.0569999985396862, + "reward_std": 0.006000000238418579, + "rewards/reward_func/mean": 0.0569999985396862, + "rewards/reward_func/std": 0.005999999865889549, + "step": 75, + "step_time": 18.7449420207995, + "tools/call_frequency": 9.25, + "tools/failure_frequency": 0.09236167445778846 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.4, + "completions/max_length": 298.6, + "completions/max_terminated_length": 238.2, + "completions/mean_length": 243.95, + "completions/mean_terminated_length": 229.53333435058593, + "completions/min_length": 217.8, + "completions/min_terminated_length": 221.8, + "entropy": 0.03462462190072983, + "epoch": 0.3125, + "frac_reward_zero_std": 0.6, + "grad_norm": 3.802891969680786, + "kl": 0.48679155968129634, + "learning_rate": 6.0500000000000005e-06, + "loss": 0.0853162169456482, + "num_tokens": 232107.0, + "reward": 0.05399999842047691, + "reward_std": 0.012000000476837159, + "rewards/reward_func/mean": 0.05399999842047691, + "rewards/reward_func/std": 0.011999999731779098, + "step": 80, + "step_time": 18.624738503398838, + "tools/call_frequency": 9.4, + "tools/failure_frequency": 0.1012024775147438 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.55, + "completions/max_length": 315.8, + "completions/max_terminated_length": 245.4, + "completions/mean_length": 287.0, + "completions/mean_terminated_length": 223.70000305175782, + "completions/min_length": 235.4, + "completions/min_terminated_length": 188.8, + "entropy": 0.018990062386728825, + "epoch": 0.33203125, + "frac_reward_zero_std": 0.8, + "grad_norm": 0.24913819134235382, + "kl": 0.36854752600193025, + "learning_rate": 5.8e-06, + "loss": -0.06027485728263855, + "num_tokens": 247068.0, + "reward": 
0.0569999985396862, + "reward_std": 0.006000000238418579, + "rewards/reward_func/mean": 0.0569999985396862, + "rewards/reward_func/std": 0.005999999865889549, + "step": 85, + "step_time": 19.23757492739969, + "tools/call_frequency": 7.25, + "tools/failure_frequency": 0.38577277064323423 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.6, + "completions/max_length": 336.4, + "completions/max_terminated_length": 313.2, + "completions/mean_length": 315.8, + "completions/mean_terminated_length": 307.6, + "completions/min_length": 291.8, + "completions/min_terminated_length": 302.0, + "entropy": 0.005942635895917192, + "epoch": 0.3515625, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03795464709401131, + "kl": 0.27816674262285235, + "learning_rate": 5.550000000000001e-06, + "loss": 0.011118948459625244, + "num_tokens": 262655.0, + "reward": 0.05999999865889549, + "reward_std": 0.0, + "rewards/reward_func/mean": 0.05999999865889549, + "rewards/reward_func/std": 0.0, + "step": 90, + "step_time": 20.623258255800465, + "tools/call_frequency": 7.0, + "tools/failure_frequency": 0.378443706035614 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.3, + "completions/max_length": 326.2, + "completions/max_terminated_length": 326.2, + "completions/mean_length": 305.55, + "completions/mean_terminated_length": 311.4333435058594, + "completions/min_length": 286.4, + "completions/min_terminated_length": 300.0, + "entropy": 0.00408092990401201, + "epoch": 0.37109375, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03232214227318764, + "kl": 0.2662045076489449, + "learning_rate": 5.300000000000001e-06, + "loss": 0.010630997270345688, + "num_tokens": 277935.0, + "reward": 0.05999999865889549, + 
"reward_std": 0.0, + "rewards/reward_func/mean": 0.05999999865889549, + "rewards/reward_func/std": 0.0, + "step": 95, + "step_time": 19.27180065259963, + "tools/call_frequency": 7.5, + "tools/failure_frequency": 0.40017797946929934 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.4, + "completions/max_length": 370.4, + "completions/max_terminated_length": 320.0, + "completions/mean_length": 305.75, + "completions/mean_terminated_length": 274.26666870117185, + "completions/min_length": 213.2, + "completions/min_terminated_length": 213.8, + "entropy": 0.020784856198588386, + "epoch": 0.390625, + "frac_reward_zero_std": 0.6, + "grad_norm": 2.253676414489746, + "kl": 0.30799318477511406, + "learning_rate": 5.050000000000001e-06, + "loss": -0.07253098487854004, + "num_tokens": 293245.0, + "reward": 0.050999998673796655, + "reward_std": 0.012928203493356705, + "rewards/reward_func/mean": 0.050999998673796655, + "rewards/reward_func/std": 0.012928203120827675, + "step": 100, + "step_time": 22.40414413039907, + "tools/call_frequency": 5.7, + "tools/failure_frequency": 0.3851664662361145 + } + ], + "logging_steps": 5, + "max_steps": 200, + "num_input_tokens_seen": 293245, + "num_train_epochs": 1, + "save_steps": 100, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-100/training_args.bin b/checkpoint-100/training_args.bin new file mode 100644 index 0000000..5bea77b --- /dev/null +++ b/checkpoint-100/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:ec2ebe18213235a9211aa4bf0c3778da36ad2990f51c4dddec3ffae0a6be7033 +size 7185 diff --git a/checkpoint-200/chat_template.jinja b/checkpoint-200/chat_template.jinja new file mode 100644 index 0000000..01be9b3 --- /dev/null +++ b/checkpoint-200/chat_template.jinja @@ -0,0 +1,89 @@ +{%- if tools %} + {{- '<|im_start|>system\n' }} + {%- if messages[0].role == 'system' %} + {{- messages[0].content + '\n\n' }} + {%- endif %} + {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within XML tags:\n" }} + {%- for tool in tools %} + {{- "\n" }} + {{- tool | tojson }} + {%- endfor %} + {{- "\n\n\nFor each function call, return a json object with function name and arguments within XML tags:\n\n{\"name\": , \"arguments\": }\n<|im_end|>\n" }} +{%- else %} + {%- if messages[0].role == 'system' %} + {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }} + {%- endif %} +{%- endif %} +{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %} +{%- for message in messages[::-1] %} + {%- set index = (messages|length - 1) - loop.index0 %} + {%- if ns.multi_step_tool and message.role == "user" and message.content is string and not(message.content.startswith('') and message.content.endswith('')) %} + {%- set ns.multi_step_tool = false %} + {%- set ns.last_query_index = index %} + {%- endif %} +{%- endfor %} +{%- for message in messages %} + {%- if message.content is string %} + {%- set content = message.content %} + {%- else %} + {%- set content = '' %} + {%- endif %} + {%- if (message.role == "user") or (message.role == "system" and not loop.first) %} + {{- '<|im_start|>' + message.role + '\n' + content + '<|im_end|>' + '\n' }} + {%- elif message.role == "assistant" %} + {%- set reasoning_content = '' %} + {%- if message.reasoning_content is string %} + {%- set reasoning_content = message.reasoning_content %} + {%- else %} + {%- if '' in content %} + {%- set 
reasoning_content = content.split('')[0].rstrip('\n').split('')[-1].lstrip('\n') %} + {%- set content = content.split('')[-1].lstrip('\n') %} + {%- endif %} + {%- endif %} + {%- if loop.index0 > ns.last_query_index %} + {%- if loop.last or (not loop.last and reasoning_content) %} + {{- '<|im_start|>' + message.role + '\n\n' + reasoning_content.strip('\n') + '\n\n\n' + content.lstrip('\n') }} + {%- else %} + {{- '<|im_start|>' + message.role + '\n' + content }} + {%- endif %} + {%- else %} + {{- '<|im_start|>' + message.role + '\n' + content }} + {%- endif %} + {%- if message.tool_calls %} + {%- for tool_call in message.tool_calls %} + {%- if (loop.first and content) or (not loop.first) %} + {{- '\n' }} + {%- endif %} + {%- if tool_call.function %} + {%- set tool_call = tool_call.function %} + {%- endif %} + {{- '\n{"name": "' }} + {{- tool_call.name }} + {{- '", "arguments": ' }} + {%- if tool_call.arguments is string %} + {{- tool_call.arguments }} + {%- else %} + {{- tool_call.arguments | tojson }} + {%- endif %} + {{- '}\n' }} + {%- endfor %} + {%- endif %} + {{- '<|im_end|>\n' }} + {%- elif message.role == "tool" %} + {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %} + {{- '<|im_start|>user' }} + {%- endif %} + {{- '\n\n' }} + {{- content }} + {{- '\n' }} + {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %} + {{- '<|im_end|>\n' }} + {%- endif %} + {%- endif %} +{%- endfor %} +{%- if add_generation_prompt %} + {{- '<|im_start|>assistant\n' }} + {%- if enable_thinking is defined and enable_thinking is false %} + {{- '\n\n\n\n' }} + {%- endif %} +{%- endif %} \ No newline at end of file diff --git a/checkpoint-200/config.json b/checkpoint-200/config.json new file mode 100644 index 0000000..9b2e878 --- /dev/null +++ b/checkpoint-200/config.json @@ -0,0 +1,63 @@ +{ + "architectures": [ + "Qwen3ForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "bos_token_id": null, + "dtype": "float32", + "eos_token_id": 
151645, + "head_dim": 128, + "hidden_act": "silu", + "hidden_size": 1024, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_types": [ + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention" + ], + "max_position_embeddings": 40960, + "max_window_layers": 28, + "model_type": "qwen3", + "num_attention_heads": 16, + "num_hidden_layers": 28, + "num_key_value_heads": 8, + "pad_token_id": 151643, + "rms_norm_eps": 1e-06, + "rope_parameters": { + "rope_theta": 1000000, + "rope_type": "default" + }, + "sliding_window": null, + "tie_word_embeddings": true, + "transformers_version": "5.6.2", + "use_cache": false, + "use_sliding_window": false, + "vocab_size": 151936 +} diff --git a/checkpoint-200/generation_config.json b/checkpoint-200/generation_config.json new file mode 100644 index 0000000..0ecce6e --- /dev/null +++ b/checkpoint-200/generation_config.json @@ -0,0 +1,12 @@ +{ + "do_sample": true, + "eos_token_id": [ + 151645, + 151643 + ], + "pad_token_id": 151643, + "temperature": 0.6, + "top_k": 20, + "top_p": 0.95, + "transformers_version": "5.6.2" +} diff --git a/checkpoint-200/model.safetensors b/checkpoint-200/model.safetensors new file mode 100644 index 0000000..cdba477 --- /dev/null +++ b/checkpoint-200/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a74651f230a1644b02fec7706be28f5209ebe746a4876a1df7c07a294a345f11 +size 2384234968 diff --git a/checkpoint-200/optimizer.pt b/checkpoint-200/optimizer.pt new file mode 
100644 index 0000000..c5c2dbb --- /dev/null +++ b/checkpoint-200/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:930f9c60a28c026305674220377bf9d08a9e5cc2f2bfbeac67bf8402953d1cb2 +size 4768669395 diff --git a/checkpoint-200/rng_state.pth b/checkpoint-200/rng_state.pth new file mode 100644 index 0000000..f1c6968 --- /dev/null +++ b/checkpoint-200/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d8e9bbdccf9538bc08ddd052112e929691fdd087026390ef5e8ad8aa61f690da +size 14645 diff --git a/checkpoint-200/scheduler.pt b/checkpoint-200/scheduler.pt new file mode 100644 index 0000000..5f21895 --- /dev/null +++ b/checkpoint-200/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:39a7cdd550d6ad45e0803294bb002a4956fd107348f20422f2f106918ad1bbd8 +size 1465 diff --git a/checkpoint-200/tokenizer.json b/checkpoint-200/tokenizer.json new file mode 100644 index 0000000..c7afbed --- /dev/null +++ b/checkpoint-200/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:be75606093db2094d7cd20f3c2f385c212750648bd6ea4fb2bf507a6a4c55506 +size 11422650 diff --git a/checkpoint-200/tokenizer_config.json b/checkpoint-200/tokenizer_config.json new file mode 100644 index 0000000..af5f35b --- /dev/null +++ b/checkpoint-200/tokenizer_config.json @@ -0,0 +1,75 @@ +{ + "add_prefix_space": false, + "backend": "tokenizers", + "bos_token": null, + "clean_up_tokenization_spaces": false, + "eos_token": "<|im_end|>", + "errors": "replace", + "extra_special_tokens": [ + "<|im_start|>", + "<|im_end|>", + "<|object_ref_start|>", + "<|object_ref_end|>", + "<|box_start|>", + "<|box_end|>", + "<|quad_start|>", + "<|quad_end|>", + "<|vision_start|>", + "<|vision_end|>", + "<|vision_pad|>", + "<|image_pad|>", + "<|video_pad|>" + ], + "is_local": false, + "local_files_only": false, + "model_max_length": 131072, + "pad_token": "<|endoftext|>", + "padding_side": "left", + 
"response_schema": { + "properties": { + "content": { + "type": "string" + }, + "reasoning_content": { + "type": "string" + }, + "role": { + "const": "assistant" + }, + "tool_calls": { + "items": { + "properties": { + "function": { + "properties": { + "arguments": { + "additionalProperties": {}, + "type": "object" + }, + "name": { + "type": "string" + } + }, + "type": "object" + }, + "type": { + "const": "function" + } + }, + "type": "object", + "x-parser": "json", + "x-parser-args": { + "transform": "{type: 'function', function: @}" + } + }, + "type": "array", + "x-regex-iterator": "\\s*(.+?)\\s*" + } + }, + "type": "object", + "x-regex": "^(?:\\n?(?:(?P.*?\\S.*?)\\n?|[\\s]*)\\s*)?(?P.*?)(?:\\n(?=))?(?=(?:|<\\|im_end\\|>|$))(?P(?:.+?\\s*)+)?\\s*(?:<\\|im_end\\|>|$)" + }, + "split_special_tokens": false, + "tokenizer_class": "Qwen2Tokenizer", + "truncation_side": "left", + "unk_token": null +} diff --git a/checkpoint-200/trainer_state.json b/checkpoint-200/trainer_state.json new file mode 100644 index 0000000..6c1463a --- /dev/null +++ b/checkpoint-200/trainer_state.json @@ -0,0 +1,1234 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.78125, + "eval_steps": 500, + "global_step": 200, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 152.4, + "completions/max_terminated_length": 152.4, + "completions/mean_length": 117.8, + "completions/mean_terminated_length": 117.8, + "completions/min_length": 93.8, + "completions/min_terminated_length": 93.8, + "entropy": 0.20053473562002183, + "epoch": 0.01953125, + "frac_reward_zero_std": 0.0, + "grad_norm": 20.259794235229492, + "kl": 0.1606330933049321, + "learning_rate": 
9.800000000000001e-06, + "loss": 0.020250317454338074, + "num_tokens": 11544.0, + "reward": 0.4544999921694398, + "reward_std": 0.2574530947953463, + "rewards/reward_func/mean": 0.4544999921694398, + "rewards/reward_func/std": 0.2574530977755785, + "step": 5, + "step_time": 8.736884885399922, + "tools/call_frequency": 3.1, + "tools/failure_frequency": 0.0 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 205.8, + "completions/max_terminated_length": 205.8, + "completions/mean_length": 174.4, + "completions/mean_terminated_length": 174.4, + "completions/min_length": 140.6, + "completions/min_terminated_length": 140.6, + "entropy": 0.45406929701566695, + "epoch": 0.0390625, + "frac_reward_zero_std": 0.0, + "grad_norm": 20.727792739868164, + "kl": 0.5858453318476677, + "learning_rate": 9.55e-06, + "loss": 0.00982179045677185, + "num_tokens": 24254.0, + "reward": 0.08549999967217445, + "reward_std": 0.14715017192065716, + "rewards/reward_func/mean": 0.08549999967217445, + "rewards/reward_func/std": 0.14715017192065716, + "step": 10, + "step_time": 11.54933958799811, + "tools/call_frequency": 3.45, + "tools/failure_frequency": 0.0 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.35, + "completions/max_length": 320.8, + "completions/max_terminated_length": 233.6, + "completions/mean_length": 232.35, + "completions/mean_terminated_length": 189.8, + "completions/min_length": 141.0, + "completions/min_terminated_length": 141.0, + "entropy": 0.5534275218844413, + "epoch": 0.05859375, + "frac_reward_zero_std": 0.2, + "grad_norm": 6.85633659362793, + "kl": 0.6838902086019516, + "learning_rate": 9.3e-06, + "loss": 0.026860833168029785, + "num_tokens": 38189.0, + 
"reward": 0.10483333505690098, + "reward_std": 0.16720814146101476, + "rewards/reward_func/mean": 0.10483333505690098, + "rewards/reward_func/std": 0.1672081384807825, + "step": 15, + "step_time": 18.40421949800075, + "tools/call_frequency": 4.35, + "tools/failure_frequency": 0.0 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.4, + "completions/max_length": 413.0, + "completions/max_terminated_length": 351.2, + "completions/mean_length": 340.5, + "completions/mean_terminated_length": 306.28333435058596, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.5062903374433517, + "epoch": 0.078125, + "frac_reward_zero_std": 0.4, + "grad_norm": 6.486283302307129, + "kl": 0.5253867790102958, + "learning_rate": 9.050000000000001e-06, + "loss": -0.045569175481796266, + "num_tokens": 54196.0, + "reward": 0.09649999756366015, + "reward_std": 0.10897674113512039, + "rewards/reward_func/mean": 0.09649999756366015, + "rewards/reward_func/std": 0.10897674113512039, + "step": 20, + "step_time": 29.913843032000294, + "tools/call_frequency": 3.95, + "tools/failure_frequency": 0.22069264352321624 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.65, + "completions/max_length": 480.4, + "completions/max_terminated_length": 241.0, + "completions/mean_length": 361.25, + "completions/mean_terminated_length": 225.56666870117186, + "completions/min_length": 247.4, + "completions/min_terminated_length": 202.8, + "entropy": 0.4256091395393014, + "epoch": 0.09765625, + "frac_reward_zero_std": 0.2, + "grad_norm": 4.082395553588867, + "kl": 0.43983293175697324, + "learning_rate": 8.8e-06, + "loss": 0.02868407070636749, + "num_tokens": 70641.0, + "reward": 
0.0399999987334013, + "reward_std": 0.060056403279304504, + "rewards/reward_func/mean": 0.0399999987334013, + "rewards/reward_func/std": 0.060056403279304504, + "step": 25, + "step_time": 35.14591444559992, + "tools/call_frequency": 3.7, + "tools/failure_frequency": 0.18648459613323212 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.4, + "completions/max_length": 342.0, + "completions/max_terminated_length": 286.6, + "completions/mean_length": 267.45, + "completions/mean_terminated_length": 208.86666870117188, + "completions/min_length": 93.2, + "completions/min_terminated_length": 93.2, + "entropy": 0.27065443359315394, + "epoch": 0.1171875, + "frac_reward_zero_std": 0.0, + "grad_norm": 5.178186893463135, + "kl": 0.6691923886537552, + "learning_rate": 8.550000000000001e-06, + "loss": -0.30545821189880373, + "num_tokens": 85232.0, + "reward": 0.029999999329447746, + "reward_std": 0.019999999552965164, + "rewards/reward_func/mean": 0.029999999329447746, + "rewards/reward_func/std": 0.019999999552965164, + "step": 30, + "step_time": 21.171669380999084, + "tools/call_frequency": 5.3, + "tools/failure_frequency": 0.30696970224380493 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.4, + "completions/max_length": 314.6, + "completions/max_terminated_length": 300.6, + "completions/mean_length": 275.95, + "completions/mean_terminated_length": 258.2, + "completions/min_length": 191.6, + "completions/min_terminated_length": 193.2, + "entropy": 0.12053632475435734, + "epoch": 0.13671875, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.0749428272247314, + "kl": 0.6381830915808677, + "learning_rate": 8.3e-06, + "loss": -0.07461150288581848, + "num_tokens": 100010.0, + "reward": 
0.023999999463558196, + "reward_std": 0.021856406703591347, + "rewards/reward_func/mean": 0.023999999463558196, + "rewards/reward_func/std": 0.021856406703591347, + "step": 35, + "step_time": 18.075483031600744, + "tools/call_frequency": 7.2, + "tools/failure_frequency": 0.3397361934185028 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.55, + "completions/max_length": 406.2, + "completions/max_terminated_length": 194.6, + "completions/mean_length": 318.55, + "completions/mean_terminated_length": 139.6666717529297, + "completions/min_length": 207.2, + "completions/min_terminated_length": 81.6, + "entropy": 0.25037811435759066, + "epoch": 0.15625, + "frac_reward_zero_std": 0.2, + "grad_norm": 5.45590877532959, + "kl": 0.632529079169035, + "learning_rate": 8.050000000000001e-06, + "loss": -0.08673273324966431, + "num_tokens": 115597.0, + "reward": 0.013999999687075614, + "reward_std": 0.016618802025914193, + "rewards/reward_func/mean": 0.013999999687075614, + "rewards/reward_func/std": 0.016618802025914193, + "step": 40, + "step_time": 25.49324105279957, + "tools/call_frequency": 4.8, + "tools/failure_frequency": 0.2961344659328461 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.7, + "completions/max_length": 488.8, + "completions/max_terminated_length": 167.4, + "completions/mean_length": 351.65, + "completions/mean_terminated_length": 126.56666870117188, + "completions/min_length": 218.6, + "completions/min_terminated_length": 102.6, + "entropy": 0.30095756258815526, + "epoch": 0.17578125, + "frac_reward_zero_std": 0.2, + "grad_norm": 2.2991526126861572, + "kl": 0.549846900999546, + "learning_rate": 7.800000000000002e-06, + "loss": 0.01944226920604706, + "num_tokens": 131856.0, 
+ "reward": 0.017999999597668646, + "reward_std": 0.016618802025914193, + "rewards/reward_func/mean": 0.017999999597668646, + "rewards/reward_func/std": 0.016618802025914193, + "step": 45, + "step_time": 35.21552088999961, + "tools/call_frequency": 3.3, + "tools/failure_frequency": 0.2740601599216461 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.6, + "completions/max_length": 381.0, + "completions/max_terminated_length": 284.0, + "completions/mean_length": 294.35, + "completions/mean_terminated_length": 227.53333435058593, + "completions/min_length": 160.8, + "completions/min_terminated_length": 160.8, + "entropy": 0.23832304682582617, + "epoch": 0.1953125, + "frac_reward_zero_std": 0.2, + "grad_norm": 0.2268025130033493, + "kl": 0.6058375500142574, + "learning_rate": 7.5500000000000006e-06, + "loss": -0.08301904201507568, + "num_tokens": 146914.0, + "reward": 0.023999999463558196, + "reward_std": 0.018475209176540375, + "rewards/reward_func/mean": 0.023999999463558196, + "rewards/reward_func/std": 0.018475209176540375, + "step": 50, + "step_time": 25.430086624599788, + "tools/call_frequency": 5.5, + "tools/failure_frequency": 0.30512219071388247 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.5, + "completions/max_length": 339.2, + "completions/max_terminated_length": 290.2, + "completions/mean_length": 287.3, + "completions/mean_terminated_length": 284.8333374023438, + "completions/min_length": 262.0, + "completions/min_terminated_length": 279.2, + "entropy": 0.05971384542062878, + "epoch": 0.21484375, + "frac_reward_zero_std": 0.4, + "grad_norm": 0.22013917565345764, + "kl": 0.5083232939243316, + "learning_rate": 7.3e-06, + "loss": 0.0854921281337738, + "num_tokens": 
161945.0, + "reward": 0.031999999284744264, + "reward_std": 0.012618802115321159, + "rewards/reward_func/mean": 0.031999999284744264, + "rewards/reward_func/std": 0.012618802115321159, + "step": 55, + "step_time": 21.9785563156016, + "tools/call_frequency": 8.3, + "tools/failure_frequency": 0.3500013709068298 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.6, + "completions/max_length": 264.6, + "completions/max_terminated_length": 219.6, + "completions/mean_length": 249.7, + "completions/mean_terminated_length": 217.26666870117188, + "completions/min_length": 213.4, + "completions/min_terminated_length": 215.6, + "entropy": 0.030067690974101426, + "epoch": 0.234375, + "frac_reward_zero_std": 0.2, + "grad_norm": 2.1762969493865967, + "kl": 0.5388822212815285, + "learning_rate": 7.05e-06, + "loss": -0.06041957139968872, + "num_tokens": 176143.0, + "reward": 0.029999999329447746, + "reward_std": 0.016618802025914193, + "rewards/reward_func/mean": 0.029999999329447746, + "rewards/reward_func/std": 0.016618802025914193, + "step": 60, + "step_time": 16.232493841598625, + "tools/call_frequency": 8.85, + "tools/failure_frequency": 0.3545756280422211 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.4, + "completions/max_length": 288.2, + "completions/max_terminated_length": 148.2, + "completions/mean_length": 213.95, + "completions/mean_terminated_length": 137.4000030517578, + "completions/min_length": 181.4, + "completions/min_terminated_length": 129.6, + "entropy": 0.07268587870057672, + "epoch": 0.25390625, + "frac_reward_zero_std": 0.4, + "grad_norm": 0.1308048814535141, + "kl": 0.8497638419270516, + "learning_rate": 6.800000000000001e-06, + "loss": -0.2692496538162231, + 
"num_tokens": 189629.0, + "reward": 0.03799999877810478, + "reward_std": 0.010309400968253613, + "rewards/reward_func/mean": 0.03799999877810478, + "rewards/reward_func/std": 0.010309400968253613, + "step": 65, + "step_time": 17.90531996159916, + "tools/call_frequency": 5.85, + "tools/failure_frequency": 0.3754902005195618 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.3, + "completions/max_length": 274.6, + "completions/max_terminated_length": 256.6, + "completions/mean_length": 248.25, + "completions/mean_terminated_length": 241.73333740234375, + "completions/min_length": 226.0, + "completions/min_terminated_length": 226.0, + "entropy": 0.05142406928353012, + "epoch": 0.2734375, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09222911298274994, + "kl": 0.57062017172575, + "learning_rate": 6.550000000000001e-06, + "loss": 0.02273084670305252, + "num_tokens": 203795.0, + "reward": 0.05999999865889549, + "reward_std": 0.0, + "rewards/reward_func/mean": 0.05999999865889549, + "rewards/reward_func/std": 0.0, + "step": 70, + "step_time": 16.734460433397906, + "tools/call_frequency": 9.3, + "tools/failure_frequency": 0.1542351394891739 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.35, + "completions/max_length": 303.4, + "completions/max_terminated_length": 242.2, + "completions/mean_length": 244.05, + "completions/mean_terminated_length": 231.43333435058594, + "completions/min_length": 215.6, + "completions/min_terminated_length": 222.2, + "entropy": 0.03808064609766006, + "epoch": 0.29296875, + "frac_reward_zero_std": 0.8, + "grad_norm": 0.2428537756204605, + "kl": 0.49708728566765786, + "learning_rate": 6.300000000000001e-06, + "loss": 0.08911536335945129, + "num_tokens": 
217958.0, + "reward": 0.0569999985396862, + "reward_std": 0.006000000238418579, + "rewards/reward_func/mean": 0.0569999985396862, + "rewards/reward_func/std": 0.005999999865889549, + "step": 75, + "step_time": 18.7449420207995, + "tools/call_frequency": 9.25, + "tools/failure_frequency": 0.09236167445778846 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.4, + "completions/max_length": 298.6, + "completions/max_terminated_length": 238.2, + "completions/mean_length": 243.95, + "completions/mean_terminated_length": 229.53333435058593, + "completions/min_length": 217.8, + "completions/min_terminated_length": 221.8, + "entropy": 0.03462462190072983, + "epoch": 0.3125, + "frac_reward_zero_std": 0.6, + "grad_norm": 3.802891969680786, + "kl": 0.48679155968129634, + "learning_rate": 6.0500000000000005e-06, + "loss": 0.0853162169456482, + "num_tokens": 232107.0, + "reward": 0.05399999842047691, + "reward_std": 0.012000000476837159, + "rewards/reward_func/mean": 0.05399999842047691, + "rewards/reward_func/std": 0.011999999731779098, + "step": 80, + "step_time": 18.624738503398838, + "tools/call_frequency": 9.4, + "tools/failure_frequency": 0.1012024775147438 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.55, + "completions/max_length": 315.8, + "completions/max_terminated_length": 245.4, + "completions/mean_length": 287.0, + "completions/mean_terminated_length": 223.70000305175782, + "completions/min_length": 235.4, + "completions/min_terminated_length": 188.8, + "entropy": 0.018990062386728825, + "epoch": 0.33203125, + "frac_reward_zero_std": 0.8, + "grad_norm": 0.24913819134235382, + "kl": 0.36854752600193025, + "learning_rate": 5.8e-06, + "loss": -0.06027485728263855, + "num_tokens": 
247068.0, + "reward": 0.0569999985396862, + "reward_std": 0.006000000238418579, + "rewards/reward_func/mean": 0.0569999985396862, + "rewards/reward_func/std": 0.005999999865889549, + "step": 85, + "step_time": 19.23757492739969, + "tools/call_frequency": 7.25, + "tools/failure_frequency": 0.38577277064323423 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.6, + "completions/max_length": 336.4, + "completions/max_terminated_length": 313.2, + "completions/mean_length": 315.8, + "completions/mean_terminated_length": 307.6, + "completions/min_length": 291.8, + "completions/min_terminated_length": 302.0, + "entropy": 0.005942635895917192, + "epoch": 0.3515625, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03795464709401131, + "kl": 0.27816674262285235, + "learning_rate": 5.550000000000001e-06, + "loss": 0.011118948459625244, + "num_tokens": 262655.0, + "reward": 0.05999999865889549, + "reward_std": 0.0, + "rewards/reward_func/mean": 0.05999999865889549, + "rewards/reward_func/std": 0.0, + "step": 90, + "step_time": 20.623258255800465, + "tools/call_frequency": 7.0, + "tools/failure_frequency": 0.378443706035614 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.3, + "completions/max_length": 326.2, + "completions/max_terminated_length": 326.2, + "completions/mean_length": 305.55, + "completions/mean_terminated_length": 311.4333435058594, + "completions/min_length": 286.4, + "completions/min_terminated_length": 300.0, + "entropy": 0.00408092990401201, + "epoch": 0.37109375, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03232214227318764, + "kl": 0.2662045076489449, + "learning_rate": 5.300000000000001e-06, + "loss": 0.010630997270345688, + "num_tokens": 277935.0, + "reward": 
0.05999999865889549, + "reward_std": 0.0, + "rewards/reward_func/mean": 0.05999999865889549, + "rewards/reward_func/std": 0.0, + "step": 95, + "step_time": 19.27180065259963, + "tools/call_frequency": 7.5, + "tools/failure_frequency": 0.40017797946929934 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.4, + "completions/max_length": 370.4, + "completions/max_terminated_length": 320.0, + "completions/mean_length": 305.75, + "completions/mean_terminated_length": 274.26666870117185, + "completions/min_length": 213.2, + "completions/min_terminated_length": 213.8, + "entropy": 0.020784856198588386, + "epoch": 0.390625, + "frac_reward_zero_std": 0.6, + "grad_norm": 2.253676414489746, + "kl": 0.30799318477511406, + "learning_rate": 5.050000000000001e-06, + "loss": -0.07253098487854004, + "num_tokens": 293245.0, + "reward": 0.050999998673796655, + "reward_std": 0.012928203493356705, + "rewards/reward_func/mean": 0.050999998673796655, + "rewards/reward_func/std": 0.012928203120827675, + "step": 100, + "step_time": 22.40414413039907, + "tools/call_frequency": 5.7, + "tools/failure_frequency": 0.3851664662361145 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.65, + "completions/max_length": 337.8, + "completions/max_terminated_length": 273.4, + "completions/mean_length": 313.45, + "completions/mean_terminated_length": 271.1, + "completions/min_length": 262.0, + "completions/min_terminated_length": 268.8, + "entropy": 0.04189231124473736, + "epoch": 0.41015625, + "frac_reward_zero_std": 0.8, + "grad_norm": 0.11439066380262375, + "kl": 0.3796409673988819, + "learning_rate": 4.800000000000001e-06, + "loss": -0.05525785088539124, + "num_tokens": 308812.0, + "reward": 0.0569999985396862, + 
"reward_std": 0.006000000238418579, + "rewards/reward_func/mean": 0.0569999985396862, + "rewards/reward_func/std": 0.005999999865889549, + "step": 105, + "step_time": 20.08870074040169, + "tools/call_frequency": 6.3, + "tools/failure_frequency": 0.3948905646800995 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.65, + "completions/max_length": 342.4, + "completions/max_terminated_length": 329.0, + "completions/mean_length": 324.9, + "completions/mean_terminated_length": 314.3, + "completions/min_length": 294.4, + "completions/min_terminated_length": 299.6, + "entropy": 0.027399512368720024, + "epoch": 0.4296875, + "frac_reward_zero_std": 0.6, + "grad_norm": 0.04786663129925728, + "kl": 0.35288467034697535, + "learning_rate": 4.5500000000000005e-06, + "loss": -0.010480480641126633, + "num_tokens": 324626.0, + "reward": 0.05399999842047691, + "reward_std": 0.012000000476837159, + "rewards/reward_func/mean": 0.05399999842047691, + "rewards/reward_func/std": 0.011999999731779098, + "step": 110, + "step_time": 20.751387215401337, + "tools/call_frequency": 6.05, + "tools/failure_frequency": 0.3629230856895447 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.65, + "completions/max_length": 335.0, + "completions/max_terminated_length": 250.6, + "completions/mean_length": 324.05, + "completions/mean_terminated_length": 247.85, + "completions/min_length": 306.4, + "completions/min_terminated_length": 244.4, + "entropy": 0.011146409685898107, + "epoch": 0.44921875, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02133430540561676, + "kl": 0.31119330078363416, + "learning_rate": 4.3e-06, + "loss": 0.012449412047863007, + "num_tokens": 340312.0, + "reward": 0.05999999865889549, + 
"reward_std": 0.0, + "rewards/reward_func/mean": 0.05999999865889549, + "rewards/reward_func/std": 0.0, + "step": 115, + "step_time": 19.889900923999086, + "tools/call_frequency": 6.7, + "tools/failure_frequency": 0.364724200963974 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.7, + "completions/max_length": 338.4, + "completions/max_terminated_length": 254.8, + "completions/mean_length": 326.0, + "completions/mean_terminated_length": 253.13333740234376, + "completions/min_length": 310.8, + "completions/min_terminated_length": 250.0, + "entropy": 0.0049434174259658905, + "epoch": 0.46875, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07528958469629288, + "kl": 0.2956920139491558, + "learning_rate": 4.05e-06, + "loss": 0.011839952319860458, + "num_tokens": 356100.0, + "reward": 0.05999999865889549, + "reward_std": 0.0, + "rewards/reward_func/mean": 0.05999999865889549, + "rewards/reward_func/std": 0.0, + "step": 120, + "step_time": 20.338967355799106, + "tools/call_frequency": 6.6, + "tools/failure_frequency": 0.37093899846076966 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.45, + "completions/max_length": 328.4, + "completions/max_terminated_length": 266.6, + "completions/mean_length": 292.65, + "completions/mean_terminated_length": 243.46666870117187, + "completions/min_length": 212.6, + "completions/min_terminated_length": 220.8, + "entropy": 0.010821938829030842, + "epoch": 0.48828125, + "frac_reward_zero_std": 0.6, + "grad_norm": 0.02426336519420147, + "kl": 0.3538802661001682, + "learning_rate": 3.8000000000000005e-06, + "loss": -0.12166147232055664, + "num_tokens": 371141.0, + "reward": 0.05399999842047691, + "reward_std": 0.012000000476837159, + 
"rewards/reward_func/mean": 0.05399999842047691, + "rewards/reward_func/std": 0.011999999731779098, + "step": 125, + "step_time": 19.236361916999158, + "tools/call_frequency": 6.5, + "tools/failure_frequency": 0.3801395773887634 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.15, + "completions/max_length": 323.4, + "completions/max_terminated_length": 316.6, + "completions/mean_length": 311.1, + "completions/mean_terminated_length": 309.3333374023438, + "completions/min_length": 301.6, + "completions/min_terminated_length": 302.8, + "entropy": 0.003383969687274657, + "epoch": 0.5078125, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.025379447266459465, + "kl": 0.28744339048862455, + "learning_rate": 3.5500000000000003e-06, + "loss": 0.011490081995725631, + "num_tokens": 386449.0, + "reward": 0.05999999865889549, + "reward_std": 0.0, + "rewards/reward_func/mean": 0.05999999865889549, + "rewards/reward_func/std": 0.0, + "step": 130, + "step_time": 17.496410710996862, + "tools/call_frequency": 7.45, + "tools/failure_frequency": 0.3971812605857849 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.5, + "completions/max_length": 336.2, + "completions/max_terminated_length": 320.8, + "completions/mean_length": 319.85, + "completions/mean_terminated_length": 315.1333374023437, + "completions/min_length": 303.8, + "completions/min_terminated_length": 310.4, + "entropy": 0.0029613528924528508, + "epoch": 0.52734375, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.029231328517198563, + "kl": 0.28883620277047156, + "learning_rate": 3.3000000000000006e-06, + "loss": 0.011552013456821442, + "num_tokens": 402092.0, + "reward": 0.05999999865889549, + "reward_std": 0.0, + 
"rewards/reward_func/mean": 0.05999999865889549, + "rewards/reward_func/std": 0.0, + "step": 135, + "step_time": 19.767896748798375, + "tools/call_frequency": 6.95, + "tools/failure_frequency": 0.3816740334033966 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.5, + "completions/max_length": 336.8, + "completions/max_terminated_length": 303.4, + "completions/mean_length": 314.65, + "completions/mean_terminated_length": 297.0666748046875, + "completions/min_length": 291.8, + "completions/min_terminated_length": 291.8, + "entropy": 0.0022313889967335854, + "epoch": 0.546875, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.006076246965676546, + "kl": 0.2769774109125137, + "learning_rate": 3.05e-06, + "loss": 0.011077215522527694, + "num_tokens": 417523.0, + "reward": 0.05999999865889549, + "reward_std": 0.0, + "rewards/reward_func/mean": 0.05999999865889549, + "rewards/reward_func/std": 0.0, + "step": 140, + "step_time": 19.793050527399465, + "tools/call_frequency": 7.05, + "tools/failure_frequency": 0.36877211928367615 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.35, + "completions/max_length": 334.8, + "completions/max_terminated_length": 321.0, + "completions/mean_length": 317.05, + "completions/mean_terminated_length": 311.78333740234376, + "completions/min_length": 299.8, + "completions/min_terminated_length": 302.2, + "entropy": 0.0023406802676618097, + "epoch": 0.56640625, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.016859112307429314, + "kl": 0.27094974666833876, + "learning_rate": 2.8000000000000003e-06, + "loss": 0.010821019113063813, + "num_tokens": 433028.0, + "reward": 0.05999999865889549, + "reward_std": 0.0, + "rewards/reward_func/mean": 0.05999999865889549, 
+ "rewards/reward_func/std": 0.0, + "step": 145, + "step_time": 19.179231669600995, + "tools/call_frequency": 7.1, + "tools/failure_frequency": 0.3943915367126465 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.4, + "completions/max_length": 333.4, + "completions/max_terminated_length": 324.2, + "completions/mean_length": 321.25, + "completions/mean_terminated_length": 315.7, + "completions/min_length": 299.6, + "completions/min_terminated_length": 304.6, + "entropy": 0.002267455725814216, + "epoch": 0.5859375, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.026256700977683067, + "kl": 0.27512960955500604, + "learning_rate": 2.55e-06, + "loss": 0.010997948795557022, + "num_tokens": 448676.0, + "reward": 0.05999999865889549, + "reward_std": 0.0, + "rewards/reward_func/mean": 0.05999999865889549, + "rewards/reward_func/std": 0.0, + "step": 150, + "step_time": 18.90324738700001, + "tools/call_frequency": 6.9, + "tools/failure_frequency": 0.39089863300323485 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.55, + "completions/max_length": 336.4, + "completions/max_terminated_length": 320.8, + "completions/mean_length": 322.25, + "completions/mean_terminated_length": 316.3666687011719, + "completions/min_length": 304.2, + "completions/min_terminated_length": 311.8, + "entropy": 0.001911457245296333, + "epoch": 0.60546875, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03210851177573204, + "kl": 0.2622846975922585, + "learning_rate": 2.3000000000000004e-06, + "loss": 0.010500120371580124, + "num_tokens": 464367.0, + "reward": 0.05999999865889549, + "reward_std": 0.0, + "rewards/reward_func/mean": 0.05999999865889549, + "rewards/reward_func/std": 0.0, + "step": 155, + "step_time": 
19.15542162460188, + "tools/call_frequency": 6.8, + "tools/failure_frequency": 0.38988603949546813 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.6, + "completions/max_length": 333.0, + "completions/max_terminated_length": 253.4, + "completions/mean_length": 318.85, + "completions/mean_terminated_length": 250.6, + "completions/min_length": 307.4, + "completions/min_terminated_length": 247.0, + "entropy": 0.003048306703567505, + "epoch": 0.625, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03355943784117699, + "kl": 0.2786222040653229, + "learning_rate": 2.05e-06, + "loss": 0.011140207946300506, + "num_tokens": 479978.0, + "reward": 0.05999999865889549, + "reward_std": 0.0, + "rewards/reward_func/mean": 0.05999999865889549, + "rewards/reward_func/std": 0.0, + "step": 160, + "step_time": 19.03406370099983, + "tools/call_frequency": 6.85, + "tools/failure_frequency": 0.4012160897254944 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.4, + "completions/max_length": 337.4, + "completions/max_terminated_length": 318.8, + "completions/mean_length": 320.15, + "completions/mean_terminated_length": 311.73333740234375, + "completions/min_length": 303.2, + "completions/min_terminated_length": 304.6, + "entropy": 0.0024299834709381684, + "epoch": 0.64453125, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00971182156354189, + "kl": 0.26761081293225286, + "learning_rate": 1.8000000000000001e-06, + "loss": 0.010710306465625763, + "num_tokens": 495637.0, + "reward": 0.05999999865889549, + "reward_std": 0.0, + "rewards/reward_func/mean": 0.05999999865889549, + "rewards/reward_func/std": 0.0, + "step": 165, + "step_time": 19.779936843597532, + "tools/call_frequency": 6.9, + 
"tools/failure_frequency": 0.39147052764892576 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.45, + "completions/max_length": 335.0, + "completions/max_terminated_length": 317.4, + "completions/mean_length": 320.7, + "completions/mean_terminated_length": 313.7166687011719, + "completions/min_length": 303.6, + "completions/min_terminated_length": 308.6, + "entropy": 0.0015991570777259766, + "epoch": 0.6640625, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0209937933832407, + "kl": 0.2613979808986187, + "learning_rate": 1.5500000000000002e-06, + "loss": 0.010462091118097306, + "num_tokens": 511254.0, + "reward": 0.05999999865889549, + "reward_std": 0.0, + "rewards/reward_func/mean": 0.05999999865889549, + "rewards/reward_func/std": 0.0, + "step": 170, + "step_time": 18.80054657560031, + "tools/call_frequency": 7.0, + "tools/failure_frequency": 0.3783483743667603 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.6, + "completions/max_length": 336.0, + "completions/max_terminated_length": 325.4, + "completions/mean_length": 321.4, + "completions/mean_terminated_length": 317.1, + "completions/min_length": 298.4, + "completions/min_terminated_length": 308.8, + "entropy": 0.0019770772167248653, + "epoch": 0.68359375, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.030799515545368195, + "kl": 0.2681220337748528, + "learning_rate": 1.3e-06, + "loss": 0.01071011796593666, + "num_tokens": 526902.0, + "reward": 0.05999999865889549, + "reward_std": 0.0, + "rewards/reward_func/mean": 0.05999999865889549, + "rewards/reward_func/std": 0.0, + "step": 175, + "step_time": 20.021315099197817, + "tools/call_frequency": 6.75, + "tools/failure_frequency": 0.370084285736084 + }, + { + 
"clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.4, + "completions/max_length": 332.4, + "completions/max_terminated_length": 318.2, + "completions/mean_length": 317.5, + "completions/mean_terminated_length": 311.3333374023438, + "completions/min_length": 299.8, + "completions/min_terminated_length": 304.4, + "entropy": 0.0017659664415987208, + "epoch": 0.703125, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.012540716677904129, + "kl": 0.26256203353405, + "learning_rate": 1.0500000000000001e-06, + "loss": 0.010497495532035828, + "num_tokens": 542478.0, + "reward": 0.05999999865889549, + "reward_std": 0.0, + "rewards/reward_func/mean": 0.05999999865889549, + "rewards/reward_func/std": 0.0, + "step": 180, + "step_time": 19.730400246601494, + "tools/call_frequency": 6.95, + "tools/failure_frequency": 0.38820105195045473 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.55, + "completions/max_length": 337.2, + "completions/max_terminated_length": 259.8, + "completions/mean_length": 323.55, + "completions/mean_terminated_length": 254.73333740234375, + "completions/min_length": 306.2, + "completions/min_terminated_length": 249.4, + "entropy": 0.001894339967839187, + "epoch": 0.72265625, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.019277706742286682, + "kl": 0.25229193195700644, + "learning_rate": 8.000000000000001e-07, + "loss": 0.010086478292942047, + "num_tokens": 558253.0, + "reward": 0.05999999865889549, + "reward_std": 0.0, + "rewards/reward_func/mean": 0.05999999865889549, + "rewards/reward_func/std": 0.0, + "step": 185, + "step_time": 19.658330725000997, + "tools/call_frequency": 6.7, + "tools/failure_frequency": 0.3805657267570496 + }, + { + "clip_ratio/high_max": 0.0, + 
"clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.55, + "completions/max_length": 336.0, + "completions/max_terminated_length": 317.4, + "completions/mean_length": 322.9, + "completions/mean_terminated_length": 313.75, + "completions/min_length": 300.0, + "completions/min_terminated_length": 309.2, + "entropy": 0.0026780343818245456, + "epoch": 0.7421875, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01523889135569334, + "kl": 0.26018148586153983, + "learning_rate": 5.5e-07, + "loss": 0.010390447080135345, + "num_tokens": 573937.0, + "reward": 0.05999999865889549, + "reward_std": 0.0, + "rewards/reward_func/mean": 0.05999999865889549, + "rewards/reward_func/std": 0.0, + "step": 190, + "step_time": 19.327826142799314, + "tools/call_frequency": 6.7, + "tools/failure_frequency": 0.3721375405788422 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.55, + "completions/max_length": 336.2, + "completions/max_terminated_length": 315.6, + "completions/mean_length": 320.45, + "completions/mean_terminated_length": 308.2, + "completions/min_length": 297.2, + "completions/min_terminated_length": 300.6, + "entropy": 0.0018196408695075662, + "epoch": 0.76171875, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.010020147077739239, + "kl": 0.28274188563227654, + "learning_rate": 3.0000000000000004e-07, + "loss": 0.011302116513252258, + "num_tokens": 589569.0, + "reward": 0.05999999865889549, + "reward_std": 0.0, + "rewards/reward_func/mean": 0.05999999865889549, + "rewards/reward_func/std": 0.0, + "step": 195, + "step_time": 19.777142194598856, + "tools/call_frequency": 6.85, + "tools/failure_frequency": 0.3721937298774719 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + 
"clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.55, + "completions/max_length": 336.2, + "completions/max_terminated_length": 319.8, + "completions/mean_length": 320.25, + "completions/mean_terminated_length": 315.95, + "completions/min_length": 299.6, + "completions/min_terminated_length": 313.0, + "entropy": 0.0022158807900268585, + "epoch": 0.78125, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.022795701399445534, + "kl": 0.2784128800034523, + "learning_rate": 5.0000000000000004e-08, + "loss": 0.011128613352775573, + "num_tokens": 605249.0, + "reward": 0.05999999865889549, + "reward_std": 0.0, + "rewards/reward_func/mean": 0.05999999865889549, + "rewards/reward_func/std": 0.0, + "step": 200, + "step_time": 19.714544834000115, + "tools/call_frequency": 6.95, + "tools/failure_frequency": 0.38814000487327577 + } + ], + "logging_steps": 5, + "max_steps": 200, + "num_input_tokens_seen": 605249, + "num_train_epochs": 1, + "save_steps": 100, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-200/training_args.bin b/checkpoint-200/training_args.bin new file mode 100644 index 0000000..5bea77b --- /dev/null +++ b/checkpoint-200/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ec2ebe18213235a9211aa4bf0c3778da36ad2990f51c4dddec3ffae0a6be7033 +size 7185 diff --git a/completions/completions_00005.parquet b/completions/completions_00005.parquet new file mode 100644 index 0000000..00ed681 --- /dev/null +++ b/completions/completions_00005.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:40ab7c2da42d233760f178da49197fd96ab585bb5c5d1db708f471dc41028203 +size 17624 diff --git 
a/completions/completions_00010.parquet b/completions/completions_00010.parquet new file mode 100644 index 0000000..9d13540 --- /dev/null +++ b/completions/completions_00010.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7db68983abc099fefc535827202de2cb4aeaa566e1c983efdbcf8972f1e5cb30 +size 20558 diff --git a/completions/completions_00015.parquet b/completions/completions_00015.parquet new file mode 100644 index 0000000..bf55901 --- /dev/null +++ b/completions/completions_00015.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3a1800427305524b201307a5970dc21a24d19d3fdafdee262e2387f7b34cf983 +size 24290 diff --git a/completions/completions_00020.parquet b/completions/completions_00020.parquet new file mode 100644 index 0000000..a8d9204 --- /dev/null +++ b/completions/completions_00020.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d482666c7fb92be81e526c3059fc05bb4c92dc8b5e7e9dd5fec5123c75e1ec1d +size 21178 diff --git a/completions/completions_00025.parquet b/completions/completions_00025.parquet new file mode 100644 index 0000000..209ca5c --- /dev/null +++ b/completions/completions_00025.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0ae4d08fd1f018e00b737a96f040274e93cc7d67f08cba09703b555a1a9b951e +size 22322 diff --git a/completions/completions_00030.parquet b/completions/completions_00030.parquet new file mode 100644 index 0000000..502d9eb --- /dev/null +++ b/completions/completions_00030.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:858b7d342929fdf3548afd2b111fb23d0761293bced38847e819daa6ab55e0d5 +size 22211 diff --git a/completions/completions_00035.parquet b/completions/completions_00035.parquet new file mode 100644 index 0000000..f6a3342 --- /dev/null +++ b/completions/completions_00035.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:8512e7fda86859f68b1bd03422d382cdc0f92a31f1d1c6333b3c0e3d33d3b53a +size 22642 diff --git a/completions/completions_00040.parquet b/completions/completions_00040.parquet new file mode 100644 index 0000000..1ba4e29 --- /dev/null +++ b/completions/completions_00040.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d98816af41b2c123868346fa968a70a76279aa5cdf21562b2489d7369cfb0f78 +size 19277 diff --git a/completions/completions_00045.parquet b/completions/completions_00045.parquet new file mode 100644 index 0000000..f7445a9 --- /dev/null +++ b/completions/completions_00045.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:75141dd7b55994f1234eeb624f867eefe4a31586def8b732cbea4d9b8fd632b8 +size 22253 diff --git a/completions/completions_00050.parquet b/completions/completions_00050.parquet new file mode 100644 index 0000000..959ce7d --- /dev/null +++ b/completions/completions_00050.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e58a8048eab063416839085dd682602db54e6a20c8fd6b7c8cb326d54c27ac78 +size 22194 diff --git a/completions/completions_00055.parquet b/completions/completions_00055.parquet new file mode 100644 index 0000000..74ab96c --- /dev/null +++ b/completions/completions_00055.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b515992f42d97e23f62e1744146560c554897c9b15e66a474debff3b350646fd +size 22358 diff --git a/completions/completions_00060.parquet b/completions/completions_00060.parquet new file mode 100644 index 0000000..aec77f9 --- /dev/null +++ b/completions/completions_00060.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6a0ece6069a589efc109aecb5be6d13798087043e77aff2ca58233e69e0a3f96 +size 22223 diff --git a/completions/completions_00065.parquet b/completions/completions_00065.parquet new file mode 100644 index 0000000..596fde8 --- /dev/null +++ b/completions/completions_00065.parquet @@ -0,0 +1,3 
@@ +version https://git-lfs.github.com/spec/v1 +oid sha256:44dada6eeeafb0b2b767fe650e604491abbbac436d654703407384a88047bf67 +size 22889 diff --git a/completions/completions_00070.parquet b/completions/completions_00070.parquet new file mode 100644 index 0000000..a197d70 --- /dev/null +++ b/completions/completions_00070.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:03e1253fc444f5efdad07ef30ca8f20a8b83f1f9ce0af8d39009d7a973ac1f70 +size 21900 diff --git a/completions/completions_00075.parquet b/completions/completions_00075.parquet new file mode 100644 index 0000000..ce95efc --- /dev/null +++ b/completions/completions_00075.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:780d61d130ca7e9e05a7f2b99f06a15fb2c17eeba1c257c1ceaff9f46e82dc69 +size 21867 diff --git a/completions/completions_00080.parquet b/completions/completions_00080.parquet new file mode 100644 index 0000000..b62a52a --- /dev/null +++ b/completions/completions_00080.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:da5bef0e99a5a7aad1e056c03e775098453ecbe0fb28d7bffa60f87fcefe1fb9 +size 21989 diff --git a/completions/completions_00085.parquet b/completions/completions_00085.parquet new file mode 100644 index 0000000..ebb969e --- /dev/null +++ b/completions/completions_00085.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4cd61a7bd01ca7ab93b6c3291c1d9dc85606bed2b8db79bfce272fe9cf305f2c +size 22572 diff --git a/completions/completions_00090.parquet b/completions/completions_00090.parquet new file mode 100644 index 0000000..05d3fb0 --- /dev/null +++ b/completions/completions_00090.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2e7cb80425b1b572a140591eb5cfe57e98953833003a2ade65b11a5dd422cc93 +size 23665 diff --git a/completions/completions_00095.parquet b/completions/completions_00095.parquet new file mode 100644 index 0000000..3945bdc --- /dev/null +++ 
b/completions/completions_00095.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7823bad70898155273e09b219c9c0d95d9c656fa2f11fcb3fa9e6de578c6b7e3 +size 23250 diff --git a/completions/completions_00100.parquet b/completions/completions_00100.parquet new file mode 100644 index 0000000..581383a --- /dev/null +++ b/completions/completions_00100.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5a513a1560745ea59d087078dff5f0c0de3a80d1fa2a33ccf5bff76d14ac5f8f +size 21353 diff --git a/completions/completions_00105.parquet b/completions/completions_00105.parquet new file mode 100644 index 0000000..8b53909 --- /dev/null +++ b/completions/completions_00105.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b601fe61b78bb3d894f54f5081fd0f57dc1c783fd3504b48f0279a896e769816 +size 23294 diff --git a/completions/completions_00110.parquet b/completions/completions_00110.parquet new file mode 100644 index 0000000..c139a33 --- /dev/null +++ b/completions/completions_00110.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e9e0dc22f2e430371e6db825b9a5735844c317c7fe36914a2ed7755dddf904e7 +size 22884 diff --git a/completions/completions_00115.parquet b/completions/completions_00115.parquet new file mode 100644 index 0000000..6783827 --- /dev/null +++ b/completions/completions_00115.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ebeaec160d6064acd5e730fe4137817967eca9506d018cd4bc14c87e201186e9 +size 23448 diff --git a/completions/completions_00120.parquet b/completions/completions_00120.parquet new file mode 100644 index 0000000..0b4755a --- /dev/null +++ b/completions/completions_00120.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:25f477ade091dc47cfe6c8b966e04cf6c3923083226d0f944d57a793d059b529 +size 23378 diff --git a/completions/completions_00125.parquet b/completions/completions_00125.parquet new file 
mode 100644 index 0000000..1d58fd4 --- /dev/null +++ b/completions/completions_00125.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8538a35efa7f965f554810d3ea9eab724cae895c5e0587f13f8817b0028243ab +size 23185 diff --git a/completions/completions_00130.parquet b/completions/completions_00130.parquet new file mode 100644 index 0000000..e3d1d58 --- /dev/null +++ b/completions/completions_00130.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9812ce3ffac78c475eeda65574ae001fb1684e882291a2df47173e6815a1859e +size 23326 diff --git a/completions/completions_00135.parquet b/completions/completions_00135.parquet new file mode 100644 index 0000000..edbfee0 --- /dev/null +++ b/completions/completions_00135.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c1d7862ab580a97abb370c2f50afba94756b178c876390ba0d5a13d4b65c976d +size 23342 diff --git a/completions/completions_00140.parquet b/completions/completions_00140.parquet new file mode 100644 index 0000000..1d280a9 --- /dev/null +++ b/completions/completions_00140.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7f4caaf47a7d8a855606891bf41e1d88b5324d75cad3037f14a1101e8894e636 +size 23026 diff --git a/completions/completions_00145.parquet b/completions/completions_00145.parquet new file mode 100644 index 0000000..54e2529 --- /dev/null +++ b/completions/completions_00145.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8ff215067e86e4a01f8d179f2904c5d8148ff2b8938211cf1ea49e6aef9a02ec +size 23378 diff --git a/completions/completions_00150.parquet b/completions/completions_00150.parquet new file mode 100644 index 0000000..40481ed --- /dev/null +++ b/completions/completions_00150.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0617ad84dcdfc1a69afadb7c863b453c20c8f782ef25c7254c2da3fd11e2255c +size 23055 diff --git 
a/completions/completions_00155.parquet b/completions/completions_00155.parquet new file mode 100644 index 0000000..5b3f75b --- /dev/null +++ b/completions/completions_00155.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5ca7fb852427d6566f33a2325ea65b917f5b7292204ace354c2c7fa4e4a80f60 +size 22866 diff --git a/completions/completions_00160.parquet b/completions/completions_00160.parquet new file mode 100644 index 0000000..1c6ad0e --- /dev/null +++ b/completions/completions_00160.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9907c048edeee6fadc783145ce64c01b376c539a103c0e644ce2e677d0f287fe +size 23442 diff --git a/completions/completions_00165.parquet b/completions/completions_00165.parquet new file mode 100644 index 0000000..c979ec6 --- /dev/null +++ b/completions/completions_00165.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:49ddf82301b0ec8abad785429f6fe5440d5bed881f733c8170a956bdbc8e4390 +size 23430 diff --git a/completions/completions_00170.parquet b/completions/completions_00170.parquet new file mode 100644 index 0000000..b0b03b2 --- /dev/null +++ b/completions/completions_00170.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c26b9bab096aaefd00e656d5ae16ff0f0fc82d2389e725c87cf0efee12e1f342 +size 22986 diff --git a/completions/completions_00175.parquet b/completions/completions_00175.parquet new file mode 100644 index 0000000..1997399 --- /dev/null +++ b/completions/completions_00175.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7600cab610b3bb10df6ffe52d4e13559f3e3be83731c67540b75ecd843553818 +size 23545 diff --git a/completions/completions_00180.parquet b/completions/completions_00180.parquet new file mode 100644 index 0000000..c0564c5 --- /dev/null +++ b/completions/completions_00180.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:5fa5443d40c2bdacae85a44f26e3398bc1cd449d91477c55c8873d82b2404bc8 +size 22972 diff --git a/completions/completions_00185.parquet b/completions/completions_00185.parquet new file mode 100644 index 0000000..be03d56 --- /dev/null +++ b/completions/completions_00185.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:529623709fb1db4175d4f69d548412559e4f6cdc8fe3a35b121cfbe888c69ec0 +size 22926 diff --git a/completions/completions_00190.parquet b/completions/completions_00190.parquet new file mode 100644 index 0000000..67982c6 --- /dev/null +++ b/completions/completions_00190.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:62ad17103876fa42974b867351fe7f390fed3d4fded4f561267fb3a55ff20090 +size 23019 diff --git a/completions/completions_00195.parquet b/completions/completions_00195.parquet new file mode 100644 index 0000000..0c88054 --- /dev/null +++ b/completions/completions_00195.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:34cad03b71c628ce49b96ba1ffae17b54fcd1ad3ee470b698cb50cf9c0290234 +size 23208 diff --git a/completions/completions_00200.parquet b/completions/completions_00200.parquet new file mode 100644 index 0000000..89107ab --- /dev/null +++ b/completions/completions_00200.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4bc43295443585b75cfb120dd85ca83ca491fbdc46388e43497cbad0958748ed +size 23493 diff --git a/config.json b/config.json new file mode 100644 index 0000000..9b2e878 --- /dev/null +++ b/config.json @@ -0,0 +1,63 @@ +{ + "architectures": [ + "Qwen3ForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "bos_token_id": null, + "dtype": "float32", + "eos_token_id": 151645, + "head_dim": 128, + "hidden_act": "silu", + "hidden_size": 1024, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_types": [ + "full_attention", + "full_attention", + "full_attention", + "full_attention", + 
"full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention" + ], + "max_position_embeddings": 40960, + "max_window_layers": 28, + "model_type": "qwen3", + "num_attention_heads": 16, + "num_hidden_layers": 28, + "num_key_value_heads": 8, + "pad_token_id": 151643, + "rms_norm_eps": 1e-06, + "rope_parameters": { + "rope_theta": 1000000, + "rope_type": "default" + }, + "sliding_window": null, + "tie_word_embeddings": true, + "transformers_version": "5.6.2", + "use_cache": false, + "use_sliding_window": false, + "vocab_size": 151936 +} diff --git a/eval/trained_eval_rows.csv b/eval/trained_eval_rows.csv new file mode 100644 index 0000000..f594d5d --- /dev/null +++ b/eval/trained_eval_rows.csv @@ -0,0 +1,151 @@ +agent,seed,case_id,difficulty,reward,primary_reward,auxiliary_reward,contradictions_total,contradictions_triggered,contradictions_surfaced,questions_used,evidence_presented,evidence_timing_successes,blind_evidence_count,useless_questions_ratio,avg_question_length,model_repo,invalid_tool_calls +random,20260425,timeline_255d67,easy,0.0,0.0,-0.4,1,0,0,6,2,0,2,1.0,5.0,, +random,20260426,knowledge_b28f8c,medium,0.0,0.0,-0.4,2,0,0,6,2,0,2,1.0,5.0,, +random,20260427,workplace_c98377,easy,0.0,0.0,-0.4,1,0,0,3,5,0,5,1.0,5.0,, +random,20260428,motive_66ff59,hard,0.0,0.0,-0.4,3,0,0,7,1,0,1,1.0,5.0,, +random,20260429,timeline_19bb78,easy,0.0,0.0,-0.4,1,0,0,4,4,0,4,1.0,5.0,, +random,20260430,timeline_a97690,medium,0.0,0.0,-0.4,2,0,0,6,2,0,2,1.0,5.0,, +random,20260431,alibi_67ffcd,medium,0.0,0.0,-0.4,2,0,0,6,2,0,2,1.0,5.0,, 
+random,20260432,alibi_423bca,medium,0.0,0.0,-0.4,2,0,0,5,3,0,3,1.0,5.0,, +random,20260433,knowledge_960d07,medium,0.0,0.0,-0.4,2,0,0,8,0,0,0,1.0,5.0,, +random,20260434,alibi_e829c1,easy,0.0,0.0,-0.4,1,0,0,7,1,0,1,1.0,5.0,, +random,20260435,motive_85e25b,hard,0.0,0.0,-0.4,3,0,0,5,3,0,3,1.0,5.0,, +random,20260436,knowledge_a599e3,medium,0.0,0.0,-0.4,2,0,0,4,4,0,4,1.0,5.0,, +random,20260437,motive_8bca20,easy,0.0,0.0,-0.4,1,0,0,6,2,0,2,1.0,5.0,, +random,20260438,corporate_6b1664,medium,0.0,0.0,-0.4,2,0,0,6,2,0,2,1.0,5.0,, +random,20260439,alibi_a6c582,easy,0.0,0.0,-0.4,1,0,0,8,0,0,0,1.0,5.0,, +random,20260440,workplace_835476,easy,0.0,0.0,-0.4,1,0,0,5,3,0,3,1.0,5.0,, +random,20260441,possession_a079c5,hard,0.0,0.0,-0.4,3,0,0,8,0,0,0,1.0,5.0,, +random,20260442,possession_9cc45d,hard,0.0,0.0,-0.4,3,0,0,5,3,0,3,1.0,5.0,, +random,20260443,possession_259aa5,easy,0.0,0.0,-0.4,1,0,0,4,4,0,4,1.0,5.0,, +random,20260444,corporate_76724c,medium,0.0,0.0,-0.4,2,0,0,5,3,0,3,1.0,5.0,, +random,20260445,timeline_767821,medium,0.0,0.0,-0.4,2,0,0,5,3,0,3,1.0,5.0,, +random,20260446,motive_c0d166,medium,0.0,0.0,-0.4,2,0,0,5,3,0,3,1.0,5.0,, +random,20260447,corporate_307934,hard,0.0,0.0,-0.4,3,0,0,7,1,0,1,1.0,5.0,, +random,20260448,timeline_592816,hard,0.0,0.0,-0.4,3,0,0,6,2,0,2,1.0,5.0,, +random,20260449,knowledge_b26824,medium,0.0,0.0,-0.4,2,0,0,5,3,0,3,1.0,5.0,, +random,20260450,knowledge_697785,hard,0.0,0.0,-0.4,3,0,0,7,1,0,1,1.0,5.0,, +random,20260451,timeline_81dafd,medium,0.0,0.0,-0.4,2,0,0,4,4,0,4,1.0,5.0,, +random,20260452,corporate_8eb7d7,medium,0.0,0.0,-0.4,2,0,0,4,4,0,4,1.0,5.0,, +random,20260453,possession_dbb5fe,medium,0.0,0.0,-0.4,2,0,0,5,3,0,3,1.0,5.0,, +random,20260454,alibi_a4666f,hard,0.0,0.0,-0.4,3,0,0,8,0,0,0,1.0,5.0,, +keyword_spam,20260425,timeline_255d67,easy,0.030000000000000006,0.0,0.15000000000000002,1,1,0,5,0,0,0,0.6,4.2,, +keyword_spam,20260426,knowledge_b28f8c,medium,0.12000000000000002,0.0,0.6000000000000001,2,2,0,5,0,0,0,0.4,4.2,, 
+keyword_spam,20260427,workplace_c98377,easy,0.0,0.0,-0.2,1,0,0,5,0,0,0,0.8,4.2,, +keyword_spam,20260428,motive_66ff59,hard,0.17000000000000004,0.0,0.8500000000000001,3,3,0,5,0,0,0,0.2,4.2,, +keyword_spam,20260429,timeline_19bb78,easy,0.030000000000000006,0.0,0.15000000000000002,1,1,0,5,0,0,0,0.6,4.2,, +keyword_spam,20260430,timeline_a97690,medium,0.13,0.0,0.65,2,2,0,5,0,0,0,0.2,4.2,, +keyword_spam,20260431,alibi_67ffcd,medium,0.04000000000000001,0.0,0.20000000000000004,2,1,0,5,0,0,0,0.4,4.2,, +keyword_spam,20260432,alibi_423bca,medium,0.04000000000000001,0.0,0.20000000000000004,2,1,0,5,0,0,0,0.4,4.2,, +keyword_spam,20260433,knowledge_960d07,medium,0.12000000000000002,0.0,0.6000000000000001,2,2,0,5,0,0,0,0.4,4.2,, +keyword_spam,20260434,alibi_e829c1,easy,0.04000000000000001,0.0,0.20000000000000004,1,1,0,5,0,0,0,0.4,4.2,, +keyword_spam,20260435,motive_85e25b,hard,0.17000000000000004,0.0,0.8500000000000001,3,3,0,5,0,0,0,0.2,4.2,, +keyword_spam,20260436,knowledge_a599e3,medium,0.12000000000000002,0.0,0.6000000000000001,2,2,0,5,0,0,0,0.4,4.2,, +keyword_spam,20260437,motive_8bca20,easy,0.0,0.0,-0.15000000000000002,1,0,0,5,0,0,0,0.6,4.2,, +keyword_spam,20260438,corporate_6b1664,medium,0.020000000000000007,0.0,0.10000000000000003,2,1,0,5,0,0,0,0.8,4.2,, +keyword_spam,20260439,alibi_a6c582,easy,0.04000000000000001,0.0,0.20000000000000004,1,1,0,5,0,0,0,0.4,4.2,, +keyword_spam,20260440,workplace_835476,easy,0.0,0.0,-0.2,1,0,0,5,0,0,0,0.8,4.2,, +keyword_spam,20260441,possession_a079c5,hard,0.030000000000000006,0.0,0.15000000000000002,3,1,0,5,0,0,0,0.6,4.2,, +keyword_spam,20260442,possession_9cc45d,hard,0.030000000000000006,0.0,0.15000000000000002,3,1,0,5,0,0,0,0.6,4.2,, +keyword_spam,20260443,possession_259aa5,easy,0.0,0.0,-0.2,1,0,0,5,0,0,0,0.8,4.2,, +keyword_spam,20260444,corporate_76724c,medium,0.020000000000000007,0.0,0.10000000000000003,2,1,0,5,0,0,0,0.8,4.2,, +keyword_spam,20260445,timeline_767821,medium,0.13,0.0,0.65,2,2,0,5,0,0,0,0.2,4.2,, 
+keyword_spam,20260446,motive_c0d166,medium,0.13,0.0,0.65,2,2,0,5,0,0,0,0.2,4.2,, +keyword_spam,20260447,corporate_307934,hard,0.020000000000000007,0.0,0.10000000000000003,3,1,0,5,0,0,0,0.8,4.2,, +keyword_spam,20260448,timeline_592816,hard,0.19,0.0,0.95,3,3,0,5,0,0,0,0.2,4.2,, +keyword_spam,20260449,knowledge_b26824,medium,0.12000000000000002,0.0,0.6000000000000001,2,2,0,5,0,0,0,0.4,4.2,, +keyword_spam,20260450,knowledge_697785,hard,0.12000000000000002,0.0,0.6000000000000001,3,2,0,5,0,0,0,0.4,4.2,, +keyword_spam,20260451,timeline_81dafd,medium,0.13,0.0,0.65,2,2,0,5,0,0,0,0.2,4.2,, +keyword_spam,20260452,corporate_8eb7d7,medium,0.020000000000000007,0.0,0.10000000000000003,2,1,0,5,0,0,0,0.8,4.2,, +keyword_spam,20260453,possession_dbb5fe,medium,0.030000000000000006,0.0,0.15000000000000002,2,1,0,5,0,0,0,0.6,4.2,, +keyword_spam,20260454,alibi_a4666f,hard,0.15000000000000002,0.0,0.75,3,2,0,5,0,0,0,0.2,4.2,, +present_all,20260425,timeline_255d67,easy,0.0,0.0,-0.25,1,0,0,0,5,0,5,0.0,0.0,, +present_all,20260426,knowledge_b28f8c,medium,0.0,0.0,-0.2,2,0,0,0,4,0,4,0.0,0.0,, +present_all,20260427,workplace_c98377,easy,0.0,0.0,-0.25,1,0,0,0,5,0,5,0.0,0.0,, +present_all,20260428,motive_66ff59,hard,0.0,0.0,-0.35000000000000003,3,0,0,0,7,0,7,0.0,0.0,, +present_all,20260429,timeline_19bb78,easy,0.0,0.0,-0.25,1,0,0,0,5,0,5,0.0,0.0,, +present_all,20260430,timeline_a97690,medium,0.0,0.0,-0.25,2,0,0,0,5,0,5,0.0,0.0,, +present_all,20260431,alibi_67ffcd,medium,0.0,0.0,-0.2,2,0,0,0,4,0,4,0.0,0.0,, +present_all,20260432,alibi_423bca,medium,0.0,0.0,-0.2,2,0,0,0,4,0,4,0.0,0.0,, +present_all,20260433,knowledge_960d07,medium,0.0,0.0,-0.2,2,0,0,0,4,0,4,0.0,0.0,, +present_all,20260434,alibi_e829c1,easy,0.0,0.0,-0.2,1,0,0,0,4,0,4,0.0,0.0,, +present_all,20260435,motive_85e25b,hard,0.0,0.0,-0.35000000000000003,3,0,0,0,7,0,7,0.0,0.0,, +present_all,20260436,knowledge_a599e3,medium,0.0,0.0,-0.2,2,0,0,0,4,0,4,0.0,0.0,, +present_all,20260437,motive_8bca20,easy,0.0,0.0,-0.25,1,0,0,0,5,0,5,0.0,0.0,, 
+present_all,20260438,corporate_6b1664,medium,0.0,0.0,-0.25,2,0,0,0,5,0,5,0.0,0.0,, +present_all,20260439,alibi_a6c582,easy,0.0,0.0,-0.2,1,0,0,0,4,0,4,0.0,0.0,, +present_all,20260440,workplace_835476,easy,0.0,0.0,-0.25,1,0,0,0,5,0,5,0.0,0.0,, +present_all,20260441,possession_a079c5,hard,0.0,0.0,-0.30000000000000004,3,0,0,0,6,0,6,0.0,0.0,, +present_all,20260442,possession_9cc45d,hard,0.0,0.0,-0.30000000000000004,3,0,0,0,6,0,6,0.0,0.0,, +present_all,20260443,possession_259aa5,easy,0.0,0.0,-0.2,1,0,0,0,4,0,4,0.0,0.0,, +present_all,20260444,corporate_76724c,medium,0.0,0.0,-0.25,2,0,0,0,5,0,5,0.0,0.0,, +present_all,20260445,timeline_767821,medium,0.0,0.0,-0.25,2,0,0,0,5,0,5,0.0,0.0,, +present_all,20260446,motive_c0d166,medium,0.0,0.0,-0.25,2,0,0,0,5,0,5,0.0,0.0,, +present_all,20260447,corporate_307934,hard,0.0,0.0,-0.35000000000000003,3,0,0,0,7,0,7,0.0,0.0,, +present_all,20260448,timeline_592816,hard,0.0,0.0,-0.35000000000000003,3,0,0,0,7,0,7,0.0,0.0,, +present_all,20260449,knowledge_b26824,medium,0.0,0.0,-0.2,2,0,0,0,4,0,4,0.0,0.0,, +present_all,20260450,knowledge_697785,hard,0.0,0.0,-0.30000000000000004,3,0,0,0,6,0,6,0.0,0.0,, +present_all,20260451,timeline_81dafd,medium,0.0,0.0,-0.25,2,0,0,0,5,0,5,0.0,0.0,, +present_all,20260452,corporate_8eb7d7,medium,0.0,0.0,-0.25,2,0,0,0,5,0,5,0.0,0.0,, +present_all,20260453,possession_dbb5fe,medium,0.0,0.0,-0.2,2,0,0,0,4,0,4,0.0,0.0,, +present_all,20260454,alibi_a4666f,hard,0.0,0.0,-0.30000000000000004,3,0,0,0,6,0,6,0.0,0.0,, +scripted_oracle,20260425,timeline_255d67,easy,0.8800000000000001,1.0,0.4,1,1,1,1,1,1,0,0.0,1.0,, +scripted_oracle,20260426,knowledge_b28f8c,medium,0.9600000000000001,1.0,0.8,2,2,2,2,2,2,0,0.0,1.5,, +scripted_oracle,20260427,workplace_c98377,easy,0.8800000000000001,1.0,0.4,1,1,1,1,1,1,0,0.0,1.0,, +scripted_oracle,20260428,motive_66ff59,hard,1.0,1.0,1.2000000000000002,3,3,3,3,3,3,0,0.0,1.0,, +scripted_oracle,20260429,timeline_19bb78,easy,0.8800000000000001,1.0,0.4,1,1,1,1,1,1,0,0.0,1.0,, 
+scripted_oracle,20260430,timeline_a97690,medium,0.49000000000000005,0.5,0.45,2,1,1,2,2,1,1,0.0,1.0,, +scripted_oracle,20260431,alibi_67ffcd,medium,0.9600000000000001,1.0,0.8,2,2,2,2,2,2,0,0.0,3.0,, +scripted_oracle,20260432,alibi_423bca,medium,0.9600000000000001,1.0,0.8,2,2,2,2,2,2,0,0.0,3.0,, +scripted_oracle,20260433,knowledge_960d07,medium,0.9600000000000001,1.0,0.8,2,2,2,2,2,2,0,0.0,1.5,, +scripted_oracle,20260434,alibi_e829c1,easy,0.8800000000000001,1.0,0.4,1,1,1,1,1,1,0,0.0,3.0,, +scripted_oracle,20260435,motive_85e25b,hard,1.0,1.0,1.2000000000000002,3,3,3,3,3,3,0,0.0,1.0,, +scripted_oracle,20260436,knowledge_a599e3,medium,0.9600000000000001,1.0,0.8,2,2,2,2,2,2,0,0.0,1.5,, +scripted_oracle,20260437,motive_8bca20,easy,0.8800000000000001,1.0,0.4,1,1,1,1,1,1,0,0.0,1.0,, +scripted_oracle,20260438,corporate_6b1664,medium,0.9600000000000001,1.0,0.8,2,2,2,2,2,2,0,0.0,1.0,, +scripted_oracle,20260439,alibi_a6c582,easy,0.8800000000000001,1.0,0.4,1,1,1,1,1,1,0,0.0,3.0,, +scripted_oracle,20260440,workplace_835476,easy,0.8800000000000001,1.0,0.4,1,1,1,1,1,1,0,0.0,1.0,, +scripted_oracle,20260441,possession_a079c5,hard,1.0,1.0,1.2000000000000002,3,3,3,3,3,3,0,0.0,1.0,, +scripted_oracle,20260442,possession_9cc45d,hard,1.0,1.0,1.2000000000000002,3,3,3,3,3,3,0,0.0,1.0,, +scripted_oracle,20260443,possession_259aa5,easy,0.8800000000000001,1.0,0.4,1,1,1,1,1,1,0,0.0,1.0,, +scripted_oracle,20260444,corporate_76724c,medium,0.9600000000000001,1.0,0.8,2,2,2,2,2,2,0,0.0,1.0,, +scripted_oracle,20260445,timeline_767821,medium,0.49000000000000005,0.5,0.45,2,1,1,2,2,1,1,0.0,1.0,, +scripted_oracle,20260446,motive_c0d166,medium,0.9600000000000001,1.0,0.8,2,2,2,2,2,2,0,0.0,1.0,, +scripted_oracle,20260447,corporate_307934,hard,1.0,1.0,1.2000000000000002,3,3,3,3,3,3,0,0.0,1.0,, +scripted_oracle,20260448,timeline_592816,hard,1.0,1.0,1.2000000000000002,3,3,3,3,3,3,0,0.0,1.0,, +scripted_oracle,20260449,knowledge_b26824,medium,0.9600000000000001,1.0,0.8,2,2,2,2,2,2,0,0.0,1.5,, 
+scripted_oracle,20260450,knowledge_697785,hard,1.0,1.0,1.2000000000000002,3,3,3,3,3,3,0,0.0,1.6666666666666667,, +scripted_oracle,20260451,timeline_81dafd,medium,0.49000000000000005,0.5,0.45,2,1,1,2,2,1,1,0.0,1.0,, +scripted_oracle,20260452,corporate_8eb7d7,medium,0.9600000000000001,1.0,0.8,2,2,2,2,2,2,0,0.0,1.0,, +scripted_oracle,20260453,possession_dbb5fe,medium,0.9600000000000001,1.0,0.8,2,2,2,2,2,2,0,0.0,1.0,, +scripted_oracle,20260454,alibi_a4666f,hard,1.0,1.0,1.2000000000000002,3,3,3,3,3,3,0,0.0,2.3333333333333335,, +trained_grpo_200,20260425,timeline_255d67,easy,0.0,0.0,0.0,1,0,0,0,0,0,0,0.0,0.0,heavycoderhh/counsel-env-qwen3-0.6b-grpo,1.0 +trained_grpo_200,20260426,knowledge_b28f8c,medium,0.06000000000000001,0.0,0.30000000000000004,2,1,0,1,0,0,0,0.0,43.0,heavycoderhh/counsel-env-qwen3-0.6b-grpo,0.0 +trained_grpo_200,20260427,workplace_c98377,easy,0.06000000000000001,0.0,0.30000000000000004,1,1,0,1,0,0,0,0.0,29.0,heavycoderhh/counsel-env-qwen3-0.6b-grpo,0.0 +trained_grpo_200,20260428,motive_66ff59,hard,0.06000000000000001,0.0,0.30000000000000004,3,1,0,1,0,0,0,0.0,33.0,heavycoderhh/counsel-env-qwen3-0.6b-grpo,0.0 +trained_grpo_200,20260429,timeline_19bb78,easy,0.06000000000000001,0.0,0.30000000000000004,1,1,0,1,0,0,0,0.0,35.0,heavycoderhh/counsel-env-qwen3-0.6b-grpo,0.0 +trained_grpo_200,20260430,timeline_a97690,medium,0.06000000000000001,0.0,0.30000000000000004,2,1,0,1,0,0,0,0.0,36.0,heavycoderhh/counsel-env-qwen3-0.6b-grpo,0.0 +trained_grpo_200,20260431,alibi_67ffcd,medium,0.06000000000000001,0.0,0.30000000000000004,2,1,0,1,0,0,0,0.0,38.0,heavycoderhh/counsel-env-qwen3-0.6b-grpo,0.0 +trained_grpo_200,20260432,alibi_423bca,medium,0.06000000000000001,0.0,0.30000000000000004,2,1,0,1,0,0,0,0.0,39.0,heavycoderhh/counsel-env-qwen3-0.6b-grpo,0.0 +trained_grpo_200,20260433,knowledge_960d07,medium,0.06000000000000001,0.0,0.30000000000000004,2,1,0,1,0,0,0,0.0,41.0,heavycoderhh/counsel-env-qwen3-0.6b-grpo,0.0 
+trained_grpo_200,20260434,alibi_e829c1,easy,0.06000000000000001,0.0,0.30000000000000004,1,1,0,1,0,0,0,0.0,39.0,heavycoderhh/counsel-env-qwen3-0.6b-grpo,0.0 +trained_grpo_200,20260435,motive_85e25b,hard,0.06000000000000001,0.0,0.30000000000000004,3,1,0,1,0,0,0,0.0,34.0,heavycoderhh/counsel-env-qwen3-0.6b-grpo,0.0 +trained_grpo_200,20260436,knowledge_a599e3,medium,0.06000000000000001,0.0,0.30000000000000004,2,1,0,1,0,0,0,0.0,41.0,heavycoderhh/counsel-env-qwen3-0.6b-grpo,0.0 +trained_grpo_200,20260437,motive_8bca20,easy,0.0,0.0,0.0,1,0,0,0,0,0,0,0.0,0.0,heavycoderhh/counsel-env-qwen3-0.6b-grpo,1.0 +trained_grpo_200,20260438,corporate_6b1664,medium,0.06000000000000001,0.0,0.30000000000000004,2,1,0,1,0,0,0,0.0,27.0,heavycoderhh/counsel-env-qwen3-0.6b-grpo,0.0 +trained_grpo_200,20260439,alibi_a6c582,easy,0.06000000000000001,0.0,0.30000000000000004,1,1,0,1,0,0,0,0.0,38.0,heavycoderhh/counsel-env-qwen3-0.6b-grpo,0.0 +trained_grpo_200,20260440,workplace_835476,easy,0.0,0.0,0.0,1,0,0,0,0,0,0,0.0,0.0,heavycoderhh/counsel-env-qwen3-0.6b-grpo,1.0 +trained_grpo_200,20260441,possession_a079c5,hard,0.06000000000000001,0.0,0.30000000000000004,3,1,0,1,0,0,0,0.0,49.0,heavycoderhh/counsel-env-qwen3-0.6b-grpo,0.0 +trained_grpo_200,20260442,possession_9cc45d,hard,0.0,0.0,0.0,3,0,0,0,0,0,0,0.0,0.0,heavycoderhh/counsel-env-qwen3-0.6b-grpo,1.0 +trained_grpo_200,20260443,possession_259aa5,easy,0.06000000000000001,0.0,0.30000000000000004,1,1,0,1,0,0,0,0.0,44.0,heavycoderhh/counsel-env-qwen3-0.6b-grpo,0.0 +trained_grpo_200,20260444,corporate_76724c,medium,0.06000000000000001,0.0,0.30000000000000004,2,1,0,1,0,0,0,0.0,29.0,heavycoderhh/counsel-env-qwen3-0.6b-grpo,0.0 +trained_grpo_200,20260445,timeline_767821,medium,0.06000000000000001,0.0,0.30000000000000004,2,1,0,1,0,0,0,0.0,37.0,heavycoderhh/counsel-env-qwen3-0.6b-grpo,0.0 +trained_grpo_200,20260446,motive_c0d166,medium,0.06000000000000001,0.0,0.30000000000000004,2,1,0,1,0,0,0,0.0,34.0,heavycoderhh/counsel-env-qwen3-0.6b-grpo,0.0 
+trained_grpo_200,20260447,corporate_307934,hard,0.06000000000000001,0.0,0.30000000000000004,3,1,0,1,0,0,0,0.0,27.0,heavycoderhh/counsel-env-qwen3-0.6b-grpo,0.0 +trained_grpo_200,20260448,timeline_592816,hard,0.06000000000000001,0.0,0.30000000000000004,3,1,0,1,0,0,0,0.0,35.0,heavycoderhh/counsel-env-qwen3-0.6b-grpo,0.0 +trained_grpo_200,20260449,knowledge_b26824,medium,0.06000000000000001,0.0,0.30000000000000004,2,1,0,1,0,0,0,0.0,40.0,heavycoderhh/counsel-env-qwen3-0.6b-grpo,0.0 +trained_grpo_200,20260450,knowledge_697785,hard,0.06000000000000001,0.0,0.30000000000000004,3,1,0,1,0,0,0,0.0,41.0,heavycoderhh/counsel-env-qwen3-0.6b-grpo,0.0 +trained_grpo_200,20260451,timeline_81dafd,medium,0.06000000000000001,0.0,0.30000000000000004,2,1,0,1,0,0,0,0.0,34.0,heavycoderhh/counsel-env-qwen3-0.6b-grpo,0.0 +trained_grpo_200,20260452,corporate_8eb7d7,medium,0.0,0.0,0.0,2,0,0,0,0,0,0,0.0,0.0,heavycoderhh/counsel-env-qwen3-0.6b-grpo,1.0 +trained_grpo_200,20260453,possession_dbb5fe,medium,0.06000000000000001,0.0,0.30000000000000004,2,1,0,1,0,0,0,0.0,47.0,heavycoderhh/counsel-env-qwen3-0.6b-grpo,0.0 +trained_grpo_200,20260454,alibi_a4666f,hard,0.06000000000000001,0.0,0.30000000000000004,3,1,0,1,0,0,0,0.0,37.0,heavycoderhh/counsel-env-qwen3-0.6b-grpo,0.0 diff --git a/eval/trained_eval_rows.jsonl b/eval/trained_eval_rows.jsonl new file mode 100644 index 0000000..5aafd95 --- /dev/null +++ b/eval/trained_eval_rows.jsonl @@ -0,0 +1,150 @@ +{"agent": "random", "auxiliary_reward": -0.4, "avg_question_length": 5.0, "blind_evidence_count": 2, "case_id": "timeline_255d67", "contradictions_surfaced": 0, "contradictions_total": 1, "contradictions_triggered": 0, "difficulty": "easy", "evidence_presented": 2, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 6, "reward": 0.0, "seed": 20260425, "useless_questions_ratio": 1.0} +{"agent": "random", "auxiliary_reward": -0.4, "avg_question_length": 5.0, "blind_evidence_count": 2, "case_id": "knowledge_b28f8c", 
"contradictions_surfaced": 0, "contradictions_total": 2, "contradictions_triggered": 0, "difficulty": "medium", "evidence_presented": 2, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 6, "reward": 0.0, "seed": 20260426, "useless_questions_ratio": 1.0} +{"agent": "random", "auxiliary_reward": -0.4, "avg_question_length": 5.0, "blind_evidence_count": 5, "case_id": "workplace_c98377", "contradictions_surfaced": 0, "contradictions_total": 1, "contradictions_triggered": 0, "difficulty": "easy", "evidence_presented": 5, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 3, "reward": 0.0, "seed": 20260427, "useless_questions_ratio": 1.0} +{"agent": "random", "auxiliary_reward": -0.4, "avg_question_length": 5.0, "blind_evidence_count": 1, "case_id": "motive_66ff59", "contradictions_surfaced": 0, "contradictions_total": 3, "contradictions_triggered": 0, "difficulty": "hard", "evidence_presented": 1, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 7, "reward": 0.0, "seed": 20260428, "useless_questions_ratio": 1.0} +{"agent": "random", "auxiliary_reward": -0.4, "avg_question_length": 5.0, "blind_evidence_count": 4, "case_id": "timeline_19bb78", "contradictions_surfaced": 0, "contradictions_total": 1, "contradictions_triggered": 0, "difficulty": "easy", "evidence_presented": 4, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 4, "reward": 0.0, "seed": 20260429, "useless_questions_ratio": 1.0} +{"agent": "random", "auxiliary_reward": -0.4, "avg_question_length": 5.0, "blind_evidence_count": 2, "case_id": "timeline_a97690", "contradictions_surfaced": 0, "contradictions_total": 2, "contradictions_triggered": 0, "difficulty": "medium", "evidence_presented": 2, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 6, "reward": 0.0, "seed": 20260430, "useless_questions_ratio": 1.0} +{"agent": "random", "auxiliary_reward": -0.4, "avg_question_length": 5.0, 
"blind_evidence_count": 2, "case_id": "alibi_67ffcd", "contradictions_surfaced": 0, "contradictions_total": 2, "contradictions_triggered": 0, "difficulty": "medium", "evidence_presented": 2, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 6, "reward": 0.0, "seed": 20260431, "useless_questions_ratio": 1.0} +{"agent": "random", "auxiliary_reward": -0.4, "avg_question_length": 5.0, "blind_evidence_count": 3, "case_id": "alibi_423bca", "contradictions_surfaced": 0, "contradictions_total": 2, "contradictions_triggered": 0, "difficulty": "medium", "evidence_presented": 3, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 5, "reward": 0.0, "seed": 20260432, "useless_questions_ratio": 1.0} +{"agent": "random", "auxiliary_reward": -0.4, "avg_question_length": 5.0, "blind_evidence_count": 0, "case_id": "knowledge_960d07", "contradictions_surfaced": 0, "contradictions_total": 2, "contradictions_triggered": 0, "difficulty": "medium", "evidence_presented": 0, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 8, "reward": 0.0, "seed": 20260433, "useless_questions_ratio": 1.0} +{"agent": "random", "auxiliary_reward": -0.4, "avg_question_length": 5.0, "blind_evidence_count": 1, "case_id": "alibi_e829c1", "contradictions_surfaced": 0, "contradictions_total": 1, "contradictions_triggered": 0, "difficulty": "easy", "evidence_presented": 1, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 7, "reward": 0.0, "seed": 20260434, "useless_questions_ratio": 1.0} +{"agent": "random", "auxiliary_reward": -0.4, "avg_question_length": 5.0, "blind_evidence_count": 3, "case_id": "motive_85e25b", "contradictions_surfaced": 0, "contradictions_total": 3, "contradictions_triggered": 0, "difficulty": "hard", "evidence_presented": 3, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 5, "reward": 0.0, "seed": 20260435, "useless_questions_ratio": 1.0} +{"agent": "random", 
"auxiliary_reward": -0.4, "avg_question_length": 5.0, "blind_evidence_count": 4, "case_id": "knowledge_a599e3", "contradictions_surfaced": 0, "contradictions_total": 2, "contradictions_triggered": 0, "difficulty": "medium", "evidence_presented": 4, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 4, "reward": 0.0, "seed": 20260436, "useless_questions_ratio": 1.0} +{"agent": "random", "auxiliary_reward": -0.4, "avg_question_length": 5.0, "blind_evidence_count": 2, "case_id": "motive_8bca20", "contradictions_surfaced": 0, "contradictions_total": 1, "contradictions_triggered": 0, "difficulty": "easy", "evidence_presented": 2, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 6, "reward": 0.0, "seed": 20260437, "useless_questions_ratio": 1.0} +{"agent": "random", "auxiliary_reward": -0.4, "avg_question_length": 5.0, "blind_evidence_count": 2, "case_id": "corporate_6b1664", "contradictions_surfaced": 0, "contradictions_total": 2, "contradictions_triggered": 0, "difficulty": "medium", "evidence_presented": 2, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 6, "reward": 0.0, "seed": 20260438, "useless_questions_ratio": 1.0} +{"agent": "random", "auxiliary_reward": -0.4, "avg_question_length": 5.0, "blind_evidence_count": 0, "case_id": "alibi_a6c582", "contradictions_surfaced": 0, "contradictions_total": 1, "contradictions_triggered": 0, "difficulty": "easy", "evidence_presented": 0, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 8, "reward": 0.0, "seed": 20260439, "useless_questions_ratio": 1.0} +{"agent": "random", "auxiliary_reward": -0.4, "avg_question_length": 5.0, "blind_evidence_count": 3, "case_id": "workplace_835476", "contradictions_surfaced": 0, "contradictions_total": 1, "contradictions_triggered": 0, "difficulty": "easy", "evidence_presented": 3, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 5, "reward": 0.0, "seed": 20260440, 
"useless_questions_ratio": 1.0} +{"agent": "random", "auxiliary_reward": -0.4, "avg_question_length": 5.0, "blind_evidence_count": 0, "case_id": "possession_a079c5", "contradictions_surfaced": 0, "contradictions_total": 3, "contradictions_triggered": 0, "difficulty": "hard", "evidence_presented": 0, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 8, "reward": 0.0, "seed": 20260441, "useless_questions_ratio": 1.0} +{"agent": "random", "auxiliary_reward": -0.4, "avg_question_length": 5.0, "blind_evidence_count": 3, "case_id": "possession_9cc45d", "contradictions_surfaced": 0, "contradictions_total": 3, "contradictions_triggered": 0, "difficulty": "hard", "evidence_presented": 3, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 5, "reward": 0.0, "seed": 20260442, "useless_questions_ratio": 1.0} +{"agent": "random", "auxiliary_reward": -0.4, "avg_question_length": 5.0, "blind_evidence_count": 4, "case_id": "possession_259aa5", "contradictions_surfaced": 0, "contradictions_total": 1, "contradictions_triggered": 0, "difficulty": "easy", "evidence_presented": 4, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 4, "reward": 0.0, "seed": 20260443, "useless_questions_ratio": 1.0} +{"agent": "random", "auxiliary_reward": -0.4, "avg_question_length": 5.0, "blind_evidence_count": 3, "case_id": "corporate_76724c", "contradictions_surfaced": 0, "contradictions_total": 2, "contradictions_triggered": 0, "difficulty": "medium", "evidence_presented": 3, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 5, "reward": 0.0, "seed": 20260444, "useless_questions_ratio": 1.0} +{"agent": "random", "auxiliary_reward": -0.4, "avg_question_length": 5.0, "blind_evidence_count": 3, "case_id": "timeline_767821", "contradictions_surfaced": 0, "contradictions_total": 2, "contradictions_triggered": 0, "difficulty": "medium", "evidence_presented": 3, "evidence_timing_successes": 0, "primary_reward": 0.0, 
"questions_used": 5, "reward": 0.0, "seed": 20260445, "useless_questions_ratio": 1.0} +{"agent": "random", "auxiliary_reward": -0.4, "avg_question_length": 5.0, "blind_evidence_count": 3, "case_id": "motive_c0d166", "contradictions_surfaced": 0, "contradictions_total": 2, "contradictions_triggered": 0, "difficulty": "medium", "evidence_presented": 3, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 5, "reward": 0.0, "seed": 20260446, "useless_questions_ratio": 1.0} +{"agent": "random", "auxiliary_reward": -0.4, "avg_question_length": 5.0, "blind_evidence_count": 1, "case_id": "corporate_307934", "contradictions_surfaced": 0, "contradictions_total": 3, "contradictions_triggered": 0, "difficulty": "hard", "evidence_presented": 1, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 7, "reward": 0.0, "seed": 20260447, "useless_questions_ratio": 1.0} +{"agent": "random", "auxiliary_reward": -0.4, "avg_question_length": 5.0, "blind_evidence_count": 2, "case_id": "timeline_592816", "contradictions_surfaced": 0, "contradictions_total": 3, "contradictions_triggered": 0, "difficulty": "hard", "evidence_presented": 2, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 6, "reward": 0.0, "seed": 20260448, "useless_questions_ratio": 1.0} +{"agent": "random", "auxiliary_reward": -0.4, "avg_question_length": 5.0, "blind_evidence_count": 3, "case_id": "knowledge_b26824", "contradictions_surfaced": 0, "contradictions_total": 2, "contradictions_triggered": 0, "difficulty": "medium", "evidence_presented": 3, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 5, "reward": 0.0, "seed": 20260449, "useless_questions_ratio": 1.0} +{"agent": "random", "auxiliary_reward": -0.4, "avg_question_length": 5.0, "blind_evidence_count": 1, "case_id": "knowledge_697785", "contradictions_surfaced": 0, "contradictions_total": 3, "contradictions_triggered": 0, "difficulty": "hard", "evidence_presented": 1, 
"evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 7, "reward": 0.0, "seed": 20260450, "useless_questions_ratio": 1.0} +{"agent": "random", "auxiliary_reward": -0.4, "avg_question_length": 5.0, "blind_evidence_count": 4, "case_id": "timeline_81dafd", "contradictions_surfaced": 0, "contradictions_total": 2, "contradictions_triggered": 0, "difficulty": "medium", "evidence_presented": 4, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 4, "reward": 0.0, "seed": 20260451, "useless_questions_ratio": 1.0} +{"agent": "random", "auxiliary_reward": -0.4, "avg_question_length": 5.0, "blind_evidence_count": 4, "case_id": "corporate_8eb7d7", "contradictions_surfaced": 0, "contradictions_total": 2, "contradictions_triggered": 0, "difficulty": "medium", "evidence_presented": 4, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 4, "reward": 0.0, "seed": 20260452, "useless_questions_ratio": 1.0} +{"agent": "random", "auxiliary_reward": -0.4, "avg_question_length": 5.0, "blind_evidence_count": 3, "case_id": "possession_dbb5fe", "contradictions_surfaced": 0, "contradictions_total": 2, "contradictions_triggered": 0, "difficulty": "medium", "evidence_presented": 3, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 5, "reward": 0.0, "seed": 20260453, "useless_questions_ratio": 1.0} +{"agent": "random", "auxiliary_reward": -0.4, "avg_question_length": 5.0, "blind_evidence_count": 0, "case_id": "alibi_a4666f", "contradictions_surfaced": 0, "contradictions_total": 3, "contradictions_triggered": 0, "difficulty": "hard", "evidence_presented": 0, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 8, "reward": 0.0, "seed": 20260454, "useless_questions_ratio": 1.0} +{"agent": "keyword_spam", "auxiliary_reward": 0.15000000000000002, "avg_question_length": 4.2, "blind_evidence_count": 0, "case_id": "timeline_255d67", "contradictions_surfaced": 0, "contradictions_total": 1, 
"contradictions_triggered": 1, "difficulty": "easy", "evidence_presented": 0, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 5, "reward": 0.030000000000000006, "seed": 20260425, "useless_questions_ratio": 0.6} +{"agent": "keyword_spam", "auxiliary_reward": 0.6000000000000001, "avg_question_length": 4.2, "blind_evidence_count": 0, "case_id": "knowledge_b28f8c", "contradictions_surfaced": 0, "contradictions_total": 2, "contradictions_triggered": 2, "difficulty": "medium", "evidence_presented": 0, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 5, "reward": 0.12000000000000002, "seed": 20260426, "useless_questions_ratio": 0.4} +{"agent": "keyword_spam", "auxiliary_reward": -0.2, "avg_question_length": 4.2, "blind_evidence_count": 0, "case_id": "workplace_c98377", "contradictions_surfaced": 0, "contradictions_total": 1, "contradictions_triggered": 0, "difficulty": "easy", "evidence_presented": 0, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 5, "reward": 0.0, "seed": 20260427, "useless_questions_ratio": 0.8} +{"agent": "keyword_spam", "auxiliary_reward": 0.8500000000000001, "avg_question_length": 4.2, "blind_evidence_count": 0, "case_id": "motive_66ff59", "contradictions_surfaced": 0, "contradictions_total": 3, "contradictions_triggered": 3, "difficulty": "hard", "evidence_presented": 0, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 5, "reward": 0.17000000000000004, "seed": 20260428, "useless_questions_ratio": 0.2} +{"agent": "keyword_spam", "auxiliary_reward": 0.15000000000000002, "avg_question_length": 4.2, "blind_evidence_count": 0, "case_id": "timeline_19bb78", "contradictions_surfaced": 0, "contradictions_total": 1, "contradictions_triggered": 1, "difficulty": "easy", "evidence_presented": 0, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 5, "reward": 0.030000000000000006, "seed": 20260429, "useless_questions_ratio": 0.6} +{"agent": 
"keyword_spam", "auxiliary_reward": 0.65, "avg_question_length": 4.2, "blind_evidence_count": 0, "case_id": "timeline_a97690", "contradictions_surfaced": 0, "contradictions_total": 2, "contradictions_triggered": 2, "difficulty": "medium", "evidence_presented": 0, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 5, "reward": 0.13, "seed": 20260430, "useless_questions_ratio": 0.2} +{"agent": "keyword_spam", "auxiliary_reward": 0.20000000000000004, "avg_question_length": 4.2, "blind_evidence_count": 0, "case_id": "alibi_67ffcd", "contradictions_surfaced": 0, "contradictions_total": 2, "contradictions_triggered": 1, "difficulty": "medium", "evidence_presented": 0, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 5, "reward": 0.04000000000000001, "seed": 20260431, "useless_questions_ratio": 0.4} +{"agent": "keyword_spam", "auxiliary_reward": 0.20000000000000004, "avg_question_length": 4.2, "blind_evidence_count": 0, "case_id": "alibi_423bca", "contradictions_surfaced": 0, "contradictions_total": 2, "contradictions_triggered": 1, "difficulty": "medium", "evidence_presented": 0, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 5, "reward": 0.04000000000000001, "seed": 20260432, "useless_questions_ratio": 0.4} +{"agent": "keyword_spam", "auxiliary_reward": 0.6000000000000001, "avg_question_length": 4.2, "blind_evidence_count": 0, "case_id": "knowledge_960d07", "contradictions_surfaced": 0, "contradictions_total": 2, "contradictions_triggered": 2, "difficulty": "medium", "evidence_presented": 0, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 5, "reward": 0.12000000000000002, "seed": 20260433, "useless_questions_ratio": 0.4} +{"agent": "keyword_spam", "auxiliary_reward": 0.20000000000000004, "avg_question_length": 4.2, "blind_evidence_count": 0, "case_id": "alibi_e829c1", "contradictions_surfaced": 0, "contradictions_total": 1, "contradictions_triggered": 1, "difficulty": 
"easy", "evidence_presented": 0, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 5, "reward": 0.04000000000000001, "seed": 20260434, "useless_questions_ratio": 0.4} +{"agent": "keyword_spam", "auxiliary_reward": 0.8500000000000001, "avg_question_length": 4.2, "blind_evidence_count": 0, "case_id": "motive_85e25b", "contradictions_surfaced": 0, "contradictions_total": 3, "contradictions_triggered": 3, "difficulty": "hard", "evidence_presented": 0, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 5, "reward": 0.17000000000000004, "seed": 20260435, "useless_questions_ratio": 0.2} +{"agent": "keyword_spam", "auxiliary_reward": 0.6000000000000001, "avg_question_length": 4.2, "blind_evidence_count": 0, "case_id": "knowledge_a599e3", "contradictions_surfaced": 0, "contradictions_total": 2, "contradictions_triggered": 2, "difficulty": "medium", "evidence_presented": 0, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 5, "reward": 0.12000000000000002, "seed": 20260436, "useless_questions_ratio": 0.4} +{"agent": "keyword_spam", "auxiliary_reward": -0.15000000000000002, "avg_question_length": 4.2, "blind_evidence_count": 0, "case_id": "motive_8bca20", "contradictions_surfaced": 0, "contradictions_total": 1, "contradictions_triggered": 0, "difficulty": "easy", "evidence_presented": 0, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 5, "reward": 0.0, "seed": 20260437, "useless_questions_ratio": 0.6} +{"agent": "keyword_spam", "auxiliary_reward": 0.10000000000000003, "avg_question_length": 4.2, "blind_evidence_count": 0, "case_id": "corporate_6b1664", "contradictions_surfaced": 0, "contradictions_total": 2, "contradictions_triggered": 1, "difficulty": "medium", "evidence_presented": 0, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 5, "reward": 0.020000000000000007, "seed": 20260438, "useless_questions_ratio": 0.8} +{"agent": "keyword_spam", 
"auxiliary_reward": 0.20000000000000004, "avg_question_length": 4.2, "blind_evidence_count": 0, "case_id": "alibi_a6c582", "contradictions_surfaced": 0, "contradictions_total": 1, "contradictions_triggered": 1, "difficulty": "easy", "evidence_presented": 0, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 5, "reward": 0.04000000000000001, "seed": 20260439, "useless_questions_ratio": 0.4} +{"agent": "keyword_spam", "auxiliary_reward": -0.2, "avg_question_length": 4.2, "blind_evidence_count": 0, "case_id": "workplace_835476", "contradictions_surfaced": 0, "contradictions_total": 1, "contradictions_triggered": 0, "difficulty": "easy", "evidence_presented": 0, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 5, "reward": 0.0, "seed": 20260440, "useless_questions_ratio": 0.8} +{"agent": "keyword_spam", "auxiliary_reward": 0.15000000000000002, "avg_question_length": 4.2, "blind_evidence_count": 0, "case_id": "possession_a079c5", "contradictions_surfaced": 0, "contradictions_total": 3, "contradictions_triggered": 1, "difficulty": "hard", "evidence_presented": 0, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 5, "reward": 0.030000000000000006, "seed": 20260441, "useless_questions_ratio": 0.6} +{"agent": "keyword_spam", "auxiliary_reward": 0.15000000000000002, "avg_question_length": 4.2, "blind_evidence_count": 0, "case_id": "possession_9cc45d", "contradictions_surfaced": 0, "contradictions_total": 3, "contradictions_triggered": 1, "difficulty": "hard", "evidence_presented": 0, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 5, "reward": 0.030000000000000006, "seed": 20260442, "useless_questions_ratio": 0.6} +{"agent": "keyword_spam", "auxiliary_reward": -0.2, "avg_question_length": 4.2, "blind_evidence_count": 0, "case_id": "possession_259aa5", "contradictions_surfaced": 0, "contradictions_total": 1, "contradictions_triggered": 0, "difficulty": "easy", "evidence_presented": 
0, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 5, "reward": 0.0, "seed": 20260443, "useless_questions_ratio": 0.8} +{"agent": "keyword_spam", "auxiliary_reward": 0.10000000000000003, "avg_question_length": 4.2, "blind_evidence_count": 0, "case_id": "corporate_76724c", "contradictions_surfaced": 0, "contradictions_total": 2, "contradictions_triggered": 1, "difficulty": "medium", "evidence_presented": 0, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 5, "reward": 0.020000000000000007, "seed": 20260444, "useless_questions_ratio": 0.8} +{"agent": "keyword_spam", "auxiliary_reward": 0.65, "avg_question_length": 4.2, "blind_evidence_count": 0, "case_id": "timeline_767821", "contradictions_surfaced": 0, "contradictions_total": 2, "contradictions_triggered": 2, "difficulty": "medium", "evidence_presented": 0, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 5, "reward": 0.13, "seed": 20260445, "useless_questions_ratio": 0.2} +{"agent": "keyword_spam", "auxiliary_reward": 0.65, "avg_question_length": 4.2, "blind_evidence_count": 0, "case_id": "motive_c0d166", "contradictions_surfaced": 0, "contradictions_total": 2, "contradictions_triggered": 2, "difficulty": "medium", "evidence_presented": 0, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 5, "reward": 0.13, "seed": 20260446, "useless_questions_ratio": 0.2} +{"agent": "keyword_spam", "auxiliary_reward": 0.10000000000000003, "avg_question_length": 4.2, "blind_evidence_count": 0, "case_id": "corporate_307934", "contradictions_surfaced": 0, "contradictions_total": 3, "contradictions_triggered": 1, "difficulty": "hard", "evidence_presented": 0, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 5, "reward": 0.020000000000000007, "seed": 20260447, "useless_questions_ratio": 0.8} +{"agent": "keyword_spam", "auxiliary_reward": 0.95, "avg_question_length": 4.2, "blind_evidence_count": 0, "case_id": 
"timeline_592816", "contradictions_surfaced": 0, "contradictions_total": 3, "contradictions_triggered": 3, "difficulty": "hard", "evidence_presented": 0, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 5, "reward": 0.19, "seed": 20260448, "useless_questions_ratio": 0.2} +{"agent": "keyword_spam", "auxiliary_reward": 0.6000000000000001, "avg_question_length": 4.2, "blind_evidence_count": 0, "case_id": "knowledge_b26824", "contradictions_surfaced": 0, "contradictions_total": 2, "contradictions_triggered": 2, "difficulty": "medium", "evidence_presented": 0, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 5, "reward": 0.12000000000000002, "seed": 20260449, "useless_questions_ratio": 0.4} +{"agent": "keyword_spam", "auxiliary_reward": 0.6000000000000001, "avg_question_length": 4.2, "blind_evidence_count": 0, "case_id": "knowledge_697785", "contradictions_surfaced": 0, "contradictions_total": 3, "contradictions_triggered": 2, "difficulty": "hard", "evidence_presented": 0, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 5, "reward": 0.12000000000000002, "seed": 20260450, "useless_questions_ratio": 0.4} +{"agent": "keyword_spam", "auxiliary_reward": 0.65, "avg_question_length": 4.2, "blind_evidence_count": 0, "case_id": "timeline_81dafd", "contradictions_surfaced": 0, "contradictions_total": 2, "contradictions_triggered": 2, "difficulty": "medium", "evidence_presented": 0, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 5, "reward": 0.13, "seed": 20260451, "useless_questions_ratio": 0.2} +{"agent": "keyword_spam", "auxiliary_reward": 0.10000000000000003, "avg_question_length": 4.2, "blind_evidence_count": 0, "case_id": "corporate_8eb7d7", "contradictions_surfaced": 0, "contradictions_total": 2, "contradictions_triggered": 1, "difficulty": "medium", "evidence_presented": 0, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 5, "reward": 
0.020000000000000007, "seed": 20260452, "useless_questions_ratio": 0.8} +{"agent": "keyword_spam", "auxiliary_reward": 0.15000000000000002, "avg_question_length": 4.2, "blind_evidence_count": 0, "case_id": "possession_dbb5fe", "contradictions_surfaced": 0, "contradictions_total": 2, "contradictions_triggered": 1, "difficulty": "medium", "evidence_presented": 0, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 5, "reward": 0.030000000000000006, "seed": 20260453, "useless_questions_ratio": 0.6} +{"agent": "keyword_spam", "auxiliary_reward": 0.75, "avg_question_length": 4.2, "blind_evidence_count": 0, "case_id": "alibi_a4666f", "contradictions_surfaced": 0, "contradictions_total": 3, "contradictions_triggered": 2, "difficulty": "hard", "evidence_presented": 0, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 5, "reward": 0.15000000000000002, "seed": 20260454, "useless_questions_ratio": 0.2} +{"agent": "present_all", "auxiliary_reward": -0.25, "avg_question_length": 0.0, "blind_evidence_count": 5, "case_id": "timeline_255d67", "contradictions_surfaced": 0, "contradictions_total": 1, "contradictions_triggered": 0, "difficulty": "easy", "evidence_presented": 5, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 0, "reward": 0.0, "seed": 20260425, "useless_questions_ratio": 0.0} +{"agent": "present_all", "auxiliary_reward": -0.2, "avg_question_length": 0.0, "blind_evidence_count": 4, "case_id": "knowledge_b28f8c", "contradictions_surfaced": 0, "contradictions_total": 2, "contradictions_triggered": 0, "difficulty": "medium", "evidence_presented": 4, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 0, "reward": 0.0, "seed": 20260426, "useless_questions_ratio": 0.0} +{"agent": "present_all", "auxiliary_reward": -0.25, "avg_question_length": 0.0, "blind_evidence_count": 5, "case_id": "workplace_c98377", "contradictions_surfaced": 0, "contradictions_total": 1, 
"contradictions_triggered": 0, "difficulty": "easy", "evidence_presented": 5, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 0, "reward": 0.0, "seed": 20260427, "useless_questions_ratio": 0.0} +{"agent": "present_all", "auxiliary_reward": -0.35000000000000003, "avg_question_length": 0.0, "blind_evidence_count": 7, "case_id": "motive_66ff59", "contradictions_surfaced": 0, "contradictions_total": 3, "contradictions_triggered": 0, "difficulty": "hard", "evidence_presented": 7, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 0, "reward": 0.0, "seed": 20260428, "useless_questions_ratio": 0.0} +{"agent": "present_all", "auxiliary_reward": -0.25, "avg_question_length": 0.0, "blind_evidence_count": 5, "case_id": "timeline_19bb78", "contradictions_surfaced": 0, "contradictions_total": 1, "contradictions_triggered": 0, "difficulty": "easy", "evidence_presented": 5, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 0, "reward": 0.0, "seed": 20260429, "useless_questions_ratio": 0.0} +{"agent": "present_all", "auxiliary_reward": -0.25, "avg_question_length": 0.0, "blind_evidence_count": 5, "case_id": "timeline_a97690", "contradictions_surfaced": 0, "contradictions_total": 2, "contradictions_triggered": 0, "difficulty": "medium", "evidence_presented": 5, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 0, "reward": 0.0, "seed": 20260430, "useless_questions_ratio": 0.0} +{"agent": "present_all", "auxiliary_reward": -0.2, "avg_question_length": 0.0, "blind_evidence_count": 4, "case_id": "alibi_67ffcd", "contradictions_surfaced": 0, "contradictions_total": 2, "contradictions_triggered": 0, "difficulty": "medium", "evidence_presented": 4, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 0, "reward": 0.0, "seed": 20260431, "useless_questions_ratio": 0.0} +{"agent": "present_all", "auxiliary_reward": -0.2, "avg_question_length": 0.0, "blind_evidence_count": 4, 
"case_id": "alibi_423bca", "contradictions_surfaced": 0, "contradictions_total": 2, "contradictions_triggered": 0, "difficulty": "medium", "evidence_presented": 4, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 0, "reward": 0.0, "seed": 20260432, "useless_questions_ratio": 0.0} +{"agent": "present_all", "auxiliary_reward": -0.2, "avg_question_length": 0.0, "blind_evidence_count": 4, "case_id": "knowledge_960d07", "contradictions_surfaced": 0, "contradictions_total": 2, "contradictions_triggered": 0, "difficulty": "medium", "evidence_presented": 4, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 0, "reward": 0.0, "seed": 20260433, "useless_questions_ratio": 0.0} +{"agent": "present_all", "auxiliary_reward": -0.2, "avg_question_length": 0.0, "blind_evidence_count": 4, "case_id": "alibi_e829c1", "contradictions_surfaced": 0, "contradictions_total": 1, "contradictions_triggered": 0, "difficulty": "easy", "evidence_presented": 4, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 0, "reward": 0.0, "seed": 20260434, "useless_questions_ratio": 0.0} +{"agent": "present_all", "auxiliary_reward": -0.35000000000000003, "avg_question_length": 0.0, "blind_evidence_count": 7, "case_id": "motive_85e25b", "contradictions_surfaced": 0, "contradictions_total": 3, "contradictions_triggered": 0, "difficulty": "hard", "evidence_presented": 7, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 0, "reward": 0.0, "seed": 20260435, "useless_questions_ratio": 0.0} +{"agent": "present_all", "auxiliary_reward": -0.2, "avg_question_length": 0.0, "blind_evidence_count": 4, "case_id": "knowledge_a599e3", "contradictions_surfaced": 0, "contradictions_total": 2, "contradictions_triggered": 0, "difficulty": "medium", "evidence_presented": 4, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 0, "reward": 0.0, "seed": 20260436, "useless_questions_ratio": 0.0} +{"agent": "present_all", 
"auxiliary_reward": -0.25, "avg_question_length": 0.0, "blind_evidence_count": 5, "case_id": "motive_8bca20", "contradictions_surfaced": 0, "contradictions_total": 1, "contradictions_triggered": 0, "difficulty": "easy", "evidence_presented": 5, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 0, "reward": 0.0, "seed": 20260437, "useless_questions_ratio": 0.0} +{"agent": "present_all", "auxiliary_reward": -0.25, "avg_question_length": 0.0, "blind_evidence_count": 5, "case_id": "corporate_6b1664", "contradictions_surfaced": 0, "contradictions_total": 2, "contradictions_triggered": 0, "difficulty": "medium", "evidence_presented": 5, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 0, "reward": 0.0, "seed": 20260438, "useless_questions_ratio": 0.0} +{"agent": "present_all", "auxiliary_reward": -0.2, "avg_question_length": 0.0, "blind_evidence_count": 4, "case_id": "alibi_a6c582", "contradictions_surfaced": 0, "contradictions_total": 1, "contradictions_triggered": 0, "difficulty": "easy", "evidence_presented": 4, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 0, "reward": 0.0, "seed": 20260439, "useless_questions_ratio": 0.0} +{"agent": "present_all", "auxiliary_reward": -0.25, "avg_question_length": 0.0, "blind_evidence_count": 5, "case_id": "workplace_835476", "contradictions_surfaced": 0, "contradictions_total": 1, "contradictions_triggered": 0, "difficulty": "easy", "evidence_presented": 5, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 0, "reward": 0.0, "seed": 20260440, "useless_questions_ratio": 0.0} +{"agent": "present_all", "auxiliary_reward": -0.30000000000000004, "avg_question_length": 0.0, "blind_evidence_count": 6, "case_id": "possession_a079c5", "contradictions_surfaced": 0, "contradictions_total": 3, "contradictions_triggered": 0, "difficulty": "hard", "evidence_presented": 6, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 0, 
"reward": 0.0, "seed": 20260441, "useless_questions_ratio": 0.0} +{"agent": "present_all", "auxiliary_reward": -0.30000000000000004, "avg_question_length": 0.0, "blind_evidence_count": 6, "case_id": "possession_9cc45d", "contradictions_surfaced": 0, "contradictions_total": 3, "contradictions_triggered": 0, "difficulty": "hard", "evidence_presented": 6, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 0, "reward": 0.0, "seed": 20260442, "useless_questions_ratio": 0.0} +{"agent": "present_all", "auxiliary_reward": -0.2, "avg_question_length": 0.0, "blind_evidence_count": 4, "case_id": "possession_259aa5", "contradictions_surfaced": 0, "contradictions_total": 1, "contradictions_triggered": 0, "difficulty": "easy", "evidence_presented": 4, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 0, "reward": 0.0, "seed": 20260443, "useless_questions_ratio": 0.0} +{"agent": "present_all", "auxiliary_reward": -0.25, "avg_question_length": 0.0, "blind_evidence_count": 5, "case_id": "corporate_76724c", "contradictions_surfaced": 0, "contradictions_total": 2, "contradictions_triggered": 0, "difficulty": "medium", "evidence_presented": 5, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 0, "reward": 0.0, "seed": 20260444, "useless_questions_ratio": 0.0} +{"agent": "present_all", "auxiliary_reward": -0.25, "avg_question_length": 0.0, "blind_evidence_count": 5, "case_id": "timeline_767821", "contradictions_surfaced": 0, "contradictions_total": 2, "contradictions_triggered": 0, "difficulty": "medium", "evidence_presented": 5, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 0, "reward": 0.0, "seed": 20260445, "useless_questions_ratio": 0.0} +{"agent": "present_all", "auxiliary_reward": -0.25, "avg_question_length": 0.0, "blind_evidence_count": 5, "case_id": "motive_c0d166", "contradictions_surfaced": 0, "contradictions_total": 2, "contradictions_triggered": 0, "difficulty": "medium", 
"evidence_presented": 5, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 0, "reward": 0.0, "seed": 20260446, "useless_questions_ratio": 0.0} +{"agent": "present_all", "auxiliary_reward": -0.35000000000000003, "avg_question_length": 0.0, "blind_evidence_count": 7, "case_id": "corporate_307934", "contradictions_surfaced": 0, "contradictions_total": 3, "contradictions_triggered": 0, "difficulty": "hard", "evidence_presented": 7, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 0, "reward": 0.0, "seed": 20260447, "useless_questions_ratio": 0.0} +{"agent": "present_all", "auxiliary_reward": -0.35000000000000003, "avg_question_length": 0.0, "blind_evidence_count": 7, "case_id": "timeline_592816", "contradictions_surfaced": 0, "contradictions_total": 3, "contradictions_triggered": 0, "difficulty": "hard", "evidence_presented": 7, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 0, "reward": 0.0, "seed": 20260448, "useless_questions_ratio": 0.0} +{"agent": "present_all", "auxiliary_reward": -0.2, "avg_question_length": 0.0, "blind_evidence_count": 4, "case_id": "knowledge_b26824", "contradictions_surfaced": 0, "contradictions_total": 2, "contradictions_triggered": 0, "difficulty": "medium", "evidence_presented": 4, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 0, "reward": 0.0, "seed": 20260449, "useless_questions_ratio": 0.0} +{"agent": "present_all", "auxiliary_reward": -0.30000000000000004, "avg_question_length": 0.0, "blind_evidence_count": 6, "case_id": "knowledge_697785", "contradictions_surfaced": 0, "contradictions_total": 3, "contradictions_triggered": 0, "difficulty": "hard", "evidence_presented": 6, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 0, "reward": 0.0, "seed": 20260450, "useless_questions_ratio": 0.0} +{"agent": "present_all", "auxiliary_reward": -0.25, "avg_question_length": 0.0, "blind_evidence_count": 5, "case_id": 
"timeline_81dafd", "contradictions_surfaced": 0, "contradictions_total": 2, "contradictions_triggered": 0, "difficulty": "medium", "evidence_presented": 5, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 0, "reward": 0.0, "seed": 20260451, "useless_questions_ratio": 0.0} +{"agent": "present_all", "auxiliary_reward": -0.25, "avg_question_length": 0.0, "blind_evidence_count": 5, "case_id": "corporate_8eb7d7", "contradictions_surfaced": 0, "contradictions_total": 2, "contradictions_triggered": 0, "difficulty": "medium", "evidence_presented": 5, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 0, "reward": 0.0, "seed": 20260452, "useless_questions_ratio": 0.0} +{"agent": "present_all", "auxiliary_reward": -0.2, "avg_question_length": 0.0, "blind_evidence_count": 4, "case_id": "possession_dbb5fe", "contradictions_surfaced": 0, "contradictions_total": 2, "contradictions_triggered": 0, "difficulty": "medium", "evidence_presented": 4, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 0, "reward": 0.0, "seed": 20260453, "useless_questions_ratio": 0.0} +{"agent": "present_all", "auxiliary_reward": -0.30000000000000004, "avg_question_length": 0.0, "blind_evidence_count": 6, "case_id": "alibi_a4666f", "contradictions_surfaced": 0, "contradictions_total": 3, "contradictions_triggered": 0, "difficulty": "hard", "evidence_presented": 6, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 0, "reward": 0.0, "seed": 20260454, "useless_questions_ratio": 0.0} +{"agent": "scripted_oracle", "auxiliary_reward": 0.4, "avg_question_length": 1.0, "blind_evidence_count": 0, "case_id": "timeline_255d67", "contradictions_surfaced": 1, "contradictions_total": 1, "contradictions_triggered": 1, "difficulty": "easy", "evidence_presented": 1, "evidence_timing_successes": 1, "primary_reward": 1.0, "questions_used": 1, "reward": 0.8800000000000001, "seed": 20260425, "useless_questions_ratio": 0.0} +{"agent": 
"scripted_oracle", "auxiliary_reward": 0.8, "avg_question_length": 1.5, "blind_evidence_count": 0, "case_id": "knowledge_b28f8c", "contradictions_surfaced": 2, "contradictions_total": 2, "contradictions_triggered": 2, "difficulty": "medium", "evidence_presented": 2, "evidence_timing_successes": 2, "primary_reward": 1.0, "questions_used": 2, "reward": 0.9600000000000001, "seed": 20260426, "useless_questions_ratio": 0.0} +{"agent": "scripted_oracle", "auxiliary_reward": 0.4, "avg_question_length": 1.0, "blind_evidence_count": 0, "case_id": "workplace_c98377", "contradictions_surfaced": 1, "contradictions_total": 1, "contradictions_triggered": 1, "difficulty": "easy", "evidence_presented": 1, "evidence_timing_successes": 1, "primary_reward": 1.0, "questions_used": 1, "reward": 0.8800000000000001, "seed": 20260427, "useless_questions_ratio": 0.0} +{"agent": "scripted_oracle", "auxiliary_reward": 1.2000000000000002, "avg_question_length": 1.0, "blind_evidence_count": 0, "case_id": "motive_66ff59", "contradictions_surfaced": 3, "contradictions_total": 3, "contradictions_triggered": 3, "difficulty": "hard", "evidence_presented": 3, "evidence_timing_successes": 3, "primary_reward": 1.0, "questions_used": 3, "reward": 1.0, "seed": 20260428, "useless_questions_ratio": 0.0} +{"agent": "scripted_oracle", "auxiliary_reward": 0.4, "avg_question_length": 1.0, "blind_evidence_count": 0, "case_id": "timeline_19bb78", "contradictions_surfaced": 1, "contradictions_total": 1, "contradictions_triggered": 1, "difficulty": "easy", "evidence_presented": 1, "evidence_timing_successes": 1, "primary_reward": 1.0, "questions_used": 1, "reward": 0.8800000000000001, "seed": 20260429, "useless_questions_ratio": 0.0} +{"agent": "scripted_oracle", "auxiliary_reward": 0.45, "avg_question_length": 1.0, "blind_evidence_count": 1, "case_id": "timeline_a97690", "contradictions_surfaced": 1, "contradictions_total": 2, "contradictions_triggered": 1, "difficulty": "medium", "evidence_presented": 2, 
"evidence_timing_successes": 1, "primary_reward": 0.5, "questions_used": 2, "reward": 0.49000000000000005, "seed": 20260430, "useless_questions_ratio": 0.0} +{"agent": "scripted_oracle", "auxiliary_reward": 0.8, "avg_question_length": 3.0, "blind_evidence_count": 0, "case_id": "alibi_67ffcd", "contradictions_surfaced": 2, "contradictions_total": 2, "contradictions_triggered": 2, "difficulty": "medium", "evidence_presented": 2, "evidence_timing_successes": 2, "primary_reward": 1.0, "questions_used": 2, "reward": 0.9600000000000001, "seed": 20260431, "useless_questions_ratio": 0.0} +{"agent": "scripted_oracle", "auxiliary_reward": 0.8, "avg_question_length": 3.0, "blind_evidence_count": 0, "case_id": "alibi_423bca", "contradictions_surfaced": 2, "contradictions_total": 2, "contradictions_triggered": 2, "difficulty": "medium", "evidence_presented": 2, "evidence_timing_successes": 2, "primary_reward": 1.0, "questions_used": 2, "reward": 0.9600000000000001, "seed": 20260432, "useless_questions_ratio": 0.0} +{"agent": "scripted_oracle", "auxiliary_reward": 0.8, "avg_question_length": 1.5, "blind_evidence_count": 0, "case_id": "knowledge_960d07", "contradictions_surfaced": 2, "contradictions_total": 2, "contradictions_triggered": 2, "difficulty": "medium", "evidence_presented": 2, "evidence_timing_successes": 2, "primary_reward": 1.0, "questions_used": 2, "reward": 0.9600000000000001, "seed": 20260433, "useless_questions_ratio": 0.0} +{"agent": "scripted_oracle", "auxiliary_reward": 0.4, "avg_question_length": 3.0, "blind_evidence_count": 0, "case_id": "alibi_e829c1", "contradictions_surfaced": 1, "contradictions_total": 1, "contradictions_triggered": 1, "difficulty": "easy", "evidence_presented": 1, "evidence_timing_successes": 1, "primary_reward": 1.0, "questions_used": 1, "reward": 0.8800000000000001, "seed": 20260434, "useless_questions_ratio": 0.0} +{"agent": "scripted_oracle", "auxiliary_reward": 1.2000000000000002, "avg_question_length": 1.0, 
"blind_evidence_count": 0, "case_id": "motive_85e25b", "contradictions_surfaced": 3, "contradictions_total": 3, "contradictions_triggered": 3, "difficulty": "hard", "evidence_presented": 3, "evidence_timing_successes": 3, "primary_reward": 1.0, "questions_used": 3, "reward": 1.0, "seed": 20260435, "useless_questions_ratio": 0.0} +{"agent": "scripted_oracle", "auxiliary_reward": 0.8, "avg_question_length": 1.5, "blind_evidence_count": 0, "case_id": "knowledge_a599e3", "contradictions_surfaced": 2, "contradictions_total": 2, "contradictions_triggered": 2, "difficulty": "medium", "evidence_presented": 2, "evidence_timing_successes": 2, "primary_reward": 1.0, "questions_used": 2, "reward": 0.9600000000000001, "seed": 20260436, "useless_questions_ratio": 0.0} +{"agent": "scripted_oracle", "auxiliary_reward": 0.4, "avg_question_length": 1.0, "blind_evidence_count": 0, "case_id": "motive_8bca20", "contradictions_surfaced": 1, "contradictions_total": 1, "contradictions_triggered": 1, "difficulty": "easy", "evidence_presented": 1, "evidence_timing_successes": 1, "primary_reward": 1.0, "questions_used": 1, "reward": 0.8800000000000001, "seed": 20260437, "useless_questions_ratio": 0.0} +{"agent": "scripted_oracle", "auxiliary_reward": 0.8, "avg_question_length": 1.0, "blind_evidence_count": 0, "case_id": "corporate_6b1664", "contradictions_surfaced": 2, "contradictions_total": 2, "contradictions_triggered": 2, "difficulty": "medium", "evidence_presented": 2, "evidence_timing_successes": 2, "primary_reward": 1.0, "questions_used": 2, "reward": 0.9600000000000001, "seed": 20260438, "useless_questions_ratio": 0.0} +{"agent": "scripted_oracle", "auxiliary_reward": 0.4, "avg_question_length": 3.0, "blind_evidence_count": 0, "case_id": "alibi_a6c582", "contradictions_surfaced": 1, "contradictions_total": 1, "contradictions_triggered": 1, "difficulty": "easy", "evidence_presented": 1, "evidence_timing_successes": 1, "primary_reward": 1.0, "questions_used": 1, "reward": 
0.8800000000000001, "seed": 20260439, "useless_questions_ratio": 0.0} +{"agent": "scripted_oracle", "auxiliary_reward": 0.4, "avg_question_length": 1.0, "blind_evidence_count": 0, "case_id": "workplace_835476", "contradictions_surfaced": 1, "contradictions_total": 1, "contradictions_triggered": 1, "difficulty": "easy", "evidence_presented": 1, "evidence_timing_successes": 1, "primary_reward": 1.0, "questions_used": 1, "reward": 0.8800000000000001, "seed": 20260440, "useless_questions_ratio": 0.0} +{"agent": "scripted_oracle", "auxiliary_reward": 1.2000000000000002, "avg_question_length": 1.0, "blind_evidence_count": 0, "case_id": "possession_a079c5", "contradictions_surfaced": 3, "contradictions_total": 3, "contradictions_triggered": 3, "difficulty": "hard", "evidence_presented": 3, "evidence_timing_successes": 3, "primary_reward": 1.0, "questions_used": 3, "reward": 1.0, "seed": 20260441, "useless_questions_ratio": 0.0} +{"agent": "scripted_oracle", "auxiliary_reward": 1.2000000000000002, "avg_question_length": 1.0, "blind_evidence_count": 0, "case_id": "possession_9cc45d", "contradictions_surfaced": 3, "contradictions_total": 3, "contradictions_triggered": 3, "difficulty": "hard", "evidence_presented": 3, "evidence_timing_successes": 3, "primary_reward": 1.0, "questions_used": 3, "reward": 1.0, "seed": 20260442, "useless_questions_ratio": 0.0} +{"agent": "scripted_oracle", "auxiliary_reward": 0.4, "avg_question_length": 1.0, "blind_evidence_count": 0, "case_id": "possession_259aa5", "contradictions_surfaced": 1, "contradictions_total": 1, "contradictions_triggered": 1, "difficulty": "easy", "evidence_presented": 1, "evidence_timing_successes": 1, "primary_reward": 1.0, "questions_used": 1, "reward": 0.8800000000000001, "seed": 20260443, "useless_questions_ratio": 0.0} +{"agent": "scripted_oracle", "auxiliary_reward": 0.8, "avg_question_length": 1.0, "blind_evidence_count": 0, "case_id": "corporate_76724c", "contradictions_surfaced": 2, "contradictions_total": 2, 
"contradictions_triggered": 2, "difficulty": "medium", "evidence_presented": 2, "evidence_timing_successes": 2, "primary_reward": 1.0, "questions_used": 2, "reward": 0.9600000000000001, "seed": 20260444, "useless_questions_ratio": 0.0} +{"agent": "scripted_oracle", "auxiliary_reward": 0.45, "avg_question_length": 1.0, "blind_evidence_count": 1, "case_id": "timeline_767821", "contradictions_surfaced": 1, "contradictions_total": 2, "contradictions_triggered": 1, "difficulty": "medium", "evidence_presented": 2, "evidence_timing_successes": 1, "primary_reward": 0.5, "questions_used": 2, "reward": 0.49000000000000005, "seed": 20260445, "useless_questions_ratio": 0.0} +{"agent": "scripted_oracle", "auxiliary_reward": 0.8, "avg_question_length": 1.0, "blind_evidence_count": 0, "case_id": "motive_c0d166", "contradictions_surfaced": 2, "contradictions_total": 2, "contradictions_triggered": 2, "difficulty": "medium", "evidence_presented": 2, "evidence_timing_successes": 2, "primary_reward": 1.0, "questions_used": 2, "reward": 0.9600000000000001, "seed": 20260446, "useless_questions_ratio": 0.0} +{"agent": "scripted_oracle", "auxiliary_reward": 1.2000000000000002, "avg_question_length": 1.0, "blind_evidence_count": 0, "case_id": "corporate_307934", "contradictions_surfaced": 3, "contradictions_total": 3, "contradictions_triggered": 3, "difficulty": "hard", "evidence_presented": 3, "evidence_timing_successes": 3, "primary_reward": 1.0, "questions_used": 3, "reward": 1.0, "seed": 20260447, "useless_questions_ratio": 0.0} +{"agent": "scripted_oracle", "auxiliary_reward": 1.2000000000000002, "avg_question_length": 1.0, "blind_evidence_count": 0, "case_id": "timeline_592816", "contradictions_surfaced": 3, "contradictions_total": 3, "contradictions_triggered": 3, "difficulty": "hard", "evidence_presented": 3, "evidence_timing_successes": 3, "primary_reward": 1.0, "questions_used": 3, "reward": 1.0, "seed": 20260448, "useless_questions_ratio": 0.0} +{"agent": "scripted_oracle", 
"auxiliary_reward": 0.8, "avg_question_length": 1.5, "blind_evidence_count": 0, "case_id": "knowledge_b26824", "contradictions_surfaced": 2, "contradictions_total": 2, "contradictions_triggered": 2, "difficulty": "medium", "evidence_presented": 2, "evidence_timing_successes": 2, "primary_reward": 1.0, "questions_used": 2, "reward": 0.9600000000000001, "seed": 20260449, "useless_questions_ratio": 0.0} +{"agent": "scripted_oracle", "auxiliary_reward": 1.2000000000000002, "avg_question_length": 1.6666666666666667, "blind_evidence_count": 0, "case_id": "knowledge_697785", "contradictions_surfaced": 3, "contradictions_total": 3, "contradictions_triggered": 3, "difficulty": "hard", "evidence_presented": 3, "evidence_timing_successes": 3, "primary_reward": 1.0, "questions_used": 3, "reward": 1.0, "seed": 20260450, "useless_questions_ratio": 0.0} +{"agent": "scripted_oracle", "auxiliary_reward": 0.45, "avg_question_length": 1.0, "blind_evidence_count": 1, "case_id": "timeline_81dafd", "contradictions_surfaced": 1, "contradictions_total": 2, "contradictions_triggered": 1, "difficulty": "medium", "evidence_presented": 2, "evidence_timing_successes": 1, "primary_reward": 0.5, "questions_used": 2, "reward": 0.49000000000000005, "seed": 20260451, "useless_questions_ratio": 0.0} +{"agent": "scripted_oracle", "auxiliary_reward": 0.8, "avg_question_length": 1.0, "blind_evidence_count": 0, "case_id": "corporate_8eb7d7", "contradictions_surfaced": 2, "contradictions_total": 2, "contradictions_triggered": 2, "difficulty": "medium", "evidence_presented": 2, "evidence_timing_successes": 2, "primary_reward": 1.0, "questions_used": 2, "reward": 0.9600000000000001, "seed": 20260452, "useless_questions_ratio": 0.0} +{"agent": "scripted_oracle", "auxiliary_reward": 0.8, "avg_question_length": 1.0, "blind_evidence_count": 0, "case_id": "possession_dbb5fe", "contradictions_surfaced": 2, "contradictions_total": 2, "contradictions_triggered": 2, "difficulty": "medium", "evidence_presented": 2, 
"evidence_timing_successes": 2, "primary_reward": 1.0, "questions_used": 2, "reward": 0.9600000000000001, "seed": 20260453, "useless_questions_ratio": 0.0} +{"agent": "scripted_oracle", "auxiliary_reward": 1.2000000000000002, "avg_question_length": 2.3333333333333335, "blind_evidence_count": 0, "case_id": "alibi_a4666f", "contradictions_surfaced": 3, "contradictions_total": 3, "contradictions_triggered": 3, "difficulty": "hard", "evidence_presented": 3, "evidence_timing_successes": 3, "primary_reward": 1.0, "questions_used": 3, "reward": 1.0, "seed": 20260454, "useless_questions_ratio": 0.0} +{"agent": "trained_grpo_200", "auxiliary_reward": 0.0, "avg_question_length": 0.0, "blind_evidence_count": 0, "case_id": "timeline_255d67", "contradictions_surfaced": 0, "contradictions_total": 1, "contradictions_triggered": 0, "difficulty": "easy", "evidence_presented": 0, "evidence_timing_successes": 0, "invalid_tool_calls": 1, "model_repo": "heavycoderhh/counsel-env-qwen3-0.6b-grpo", "primary_reward": 0.0, "questions_used": 0, "reward": 0.0, "seed": 20260425, "useless_questions_ratio": 0.0} +{"agent": "trained_grpo_200", "auxiliary_reward": 0.30000000000000004, "avg_question_length": 43.0, "blind_evidence_count": 0, "case_id": "knowledge_b28f8c", "contradictions_surfaced": 0, "contradictions_total": 2, "contradictions_triggered": 1, "difficulty": "medium", "evidence_presented": 0, "evidence_timing_successes": 0, "invalid_tool_calls": 0, "model_repo": "heavycoderhh/counsel-env-qwen3-0.6b-grpo", "primary_reward": 0.0, "questions_used": 1, "reward": 0.06000000000000001, "seed": 20260426, "useless_questions_ratio": 0.0} +{"agent": "trained_grpo_200", "auxiliary_reward": 0.30000000000000004, "avg_question_length": 29.0, "blind_evidence_count": 0, "case_id": "workplace_c98377", "contradictions_surfaced": 0, "contradictions_total": 1, "contradictions_triggered": 1, "difficulty": "easy", "evidence_presented": 0, "evidence_timing_successes": 0, "invalid_tool_calls": 0, "model_repo": 
"heavycoderhh/counsel-env-qwen3-0.6b-grpo", "primary_reward": 0.0, "questions_used": 1, "reward": 0.06000000000000001, "seed": 20260427, "useless_questions_ratio": 0.0} +{"agent": "trained_grpo_200", "auxiliary_reward": 0.30000000000000004, "avg_question_length": 33.0, "blind_evidence_count": 0, "case_id": "motive_66ff59", "contradictions_surfaced": 0, "contradictions_total": 3, "contradictions_triggered": 1, "difficulty": "hard", "evidence_presented": 0, "evidence_timing_successes": 0, "invalid_tool_calls": 0, "model_repo": "heavycoderhh/counsel-env-qwen3-0.6b-grpo", "primary_reward": 0.0, "questions_used": 1, "reward": 0.06000000000000001, "seed": 20260428, "useless_questions_ratio": 0.0} +{"agent": "trained_grpo_200", "auxiliary_reward": 0.30000000000000004, "avg_question_length": 35.0, "blind_evidence_count": 0, "case_id": "timeline_19bb78", "contradictions_surfaced": 0, "contradictions_total": 1, "contradictions_triggered": 1, "difficulty": "easy", "evidence_presented": 0, "evidence_timing_successes": 0, "invalid_tool_calls": 0, "model_repo": "heavycoderhh/counsel-env-qwen3-0.6b-grpo", "primary_reward": 0.0, "questions_used": 1, "reward": 0.06000000000000001, "seed": 20260429, "useless_questions_ratio": 0.0} +{"agent": "trained_grpo_200", "auxiliary_reward": 0.30000000000000004, "avg_question_length": 36.0, "blind_evidence_count": 0, "case_id": "timeline_a97690", "contradictions_surfaced": 0, "contradictions_total": 2, "contradictions_triggered": 1, "difficulty": "medium", "evidence_presented": 0, "evidence_timing_successes": 0, "invalid_tool_calls": 0, "model_repo": "heavycoderhh/counsel-env-qwen3-0.6b-grpo", "primary_reward": 0.0, "questions_used": 1, "reward": 0.06000000000000001, "seed": 20260430, "useless_questions_ratio": 0.0} +{"agent": "trained_grpo_200", "auxiliary_reward": 0.30000000000000004, "avg_question_length": 38.0, "blind_evidence_count": 0, "case_id": "alibi_67ffcd", "contradictions_surfaced": 0, "contradictions_total": 2, 
"contradictions_triggered": 1, "difficulty": "medium", "evidence_presented": 0, "evidence_timing_successes": 0, "invalid_tool_calls": 0, "model_repo": "heavycoderhh/counsel-env-qwen3-0.6b-grpo", "primary_reward": 0.0, "questions_used": 1, "reward": 0.06000000000000001, "seed": 20260431, "useless_questions_ratio": 0.0} +{"agent": "trained_grpo_200", "auxiliary_reward": 0.30000000000000004, "avg_question_length": 39.0, "blind_evidence_count": 0, "case_id": "alibi_423bca", "contradictions_surfaced": 0, "contradictions_total": 2, "contradictions_triggered": 1, "difficulty": "medium", "evidence_presented": 0, "evidence_timing_successes": 0, "invalid_tool_calls": 0, "model_repo": "heavycoderhh/counsel-env-qwen3-0.6b-grpo", "primary_reward": 0.0, "questions_used": 1, "reward": 0.06000000000000001, "seed": 20260432, "useless_questions_ratio": 0.0} +{"agent": "trained_grpo_200", "auxiliary_reward": 0.30000000000000004, "avg_question_length": 41.0, "blind_evidence_count": 0, "case_id": "knowledge_960d07", "contradictions_surfaced": 0, "contradictions_total": 2, "contradictions_triggered": 1, "difficulty": "medium", "evidence_presented": 0, "evidence_timing_successes": 0, "invalid_tool_calls": 0, "model_repo": "heavycoderhh/counsel-env-qwen3-0.6b-grpo", "primary_reward": 0.0, "questions_used": 1, "reward": 0.06000000000000001, "seed": 20260433, "useless_questions_ratio": 0.0} +{"agent": "trained_grpo_200", "auxiliary_reward": 0.30000000000000004, "avg_question_length": 39.0, "blind_evidence_count": 0, "case_id": "alibi_e829c1", "contradictions_surfaced": 0, "contradictions_total": 1, "contradictions_triggered": 1, "difficulty": "easy", "evidence_presented": 0, "evidence_timing_successes": 0, "invalid_tool_calls": 0, "model_repo": "heavycoderhh/counsel-env-qwen3-0.6b-grpo", "primary_reward": 0.0, "questions_used": 1, "reward": 0.06000000000000001, "seed": 20260434, "useless_questions_ratio": 0.0} +{"agent": "trained_grpo_200", "auxiliary_reward": 0.30000000000000004, 
"avg_question_length": 34.0, "blind_evidence_count": 0, "case_id": "motive_85e25b", "contradictions_surfaced": 0, "contradictions_total": 3, "contradictions_triggered": 1, "difficulty": "hard", "evidence_presented": 0, "evidence_timing_successes": 0, "invalid_tool_calls": 0, "model_repo": "heavycoderhh/counsel-env-qwen3-0.6b-grpo", "primary_reward": 0.0, "questions_used": 1, "reward": 0.06000000000000001, "seed": 20260435, "useless_questions_ratio": 0.0} +{"agent": "trained_grpo_200", "auxiliary_reward": 0.30000000000000004, "avg_question_length": 41.0, "blind_evidence_count": 0, "case_id": "knowledge_a599e3", "contradictions_surfaced": 0, "contradictions_total": 2, "contradictions_triggered": 1, "difficulty": "medium", "evidence_presented": 0, "evidence_timing_successes": 0, "invalid_tool_calls": 0, "model_repo": "heavycoderhh/counsel-env-qwen3-0.6b-grpo", "primary_reward": 0.0, "questions_used": 1, "reward": 0.06000000000000001, "seed": 20260436, "useless_questions_ratio": 0.0} +{"agent": "trained_grpo_200", "auxiliary_reward": 0.0, "avg_question_length": 0.0, "blind_evidence_count": 0, "case_id": "motive_8bca20", "contradictions_surfaced": 0, "contradictions_total": 1, "contradictions_triggered": 0, "difficulty": "easy", "evidence_presented": 0, "evidence_timing_successes": 0, "invalid_tool_calls": 1, "model_repo": "heavycoderhh/counsel-env-qwen3-0.6b-grpo", "primary_reward": 0.0, "questions_used": 0, "reward": 0.0, "seed": 20260437, "useless_questions_ratio": 0.0} +{"agent": "trained_grpo_200", "auxiliary_reward": 0.30000000000000004, "avg_question_length": 27.0, "blind_evidence_count": 0, "case_id": "corporate_6b1664", "contradictions_surfaced": 0, "contradictions_total": 2, "contradictions_triggered": 1, "difficulty": "medium", "evidence_presented": 0, "evidence_timing_successes": 0, "invalid_tool_calls": 0, "model_repo": "heavycoderhh/counsel-env-qwen3-0.6b-grpo", "primary_reward": 0.0, "questions_used": 1, "reward": 0.06000000000000001, "seed": 20260438, 
"useless_questions_ratio": 0.0} +{"agent": "trained_grpo_200", "auxiliary_reward": 0.30000000000000004, "avg_question_length": 38.0, "blind_evidence_count": 0, "case_id": "alibi_a6c582", "contradictions_surfaced": 0, "contradictions_total": 1, "contradictions_triggered": 1, "difficulty": "easy", "evidence_presented": 0, "evidence_timing_successes": 0, "invalid_tool_calls": 0, "model_repo": "heavycoderhh/counsel-env-qwen3-0.6b-grpo", "primary_reward": 0.0, "questions_used": 1, "reward": 0.06000000000000001, "seed": 20260439, "useless_questions_ratio": 0.0} +{"agent": "trained_grpo_200", "auxiliary_reward": 0.0, "avg_question_length": 0.0, "blind_evidence_count": 0, "case_id": "workplace_835476", "contradictions_surfaced": 0, "contradictions_total": 1, "contradictions_triggered": 0, "difficulty": "easy", "evidence_presented": 0, "evidence_timing_successes": 0, "invalid_tool_calls": 1, "model_repo": "heavycoderhh/counsel-env-qwen3-0.6b-grpo", "primary_reward": 0.0, "questions_used": 0, "reward": 0.0, "seed": 20260440, "useless_questions_ratio": 0.0} +{"agent": "trained_grpo_200", "auxiliary_reward": 0.30000000000000004, "avg_question_length": 49.0, "blind_evidence_count": 0, "case_id": "possession_a079c5", "contradictions_surfaced": 0, "contradictions_total": 3, "contradictions_triggered": 1, "difficulty": "hard", "evidence_presented": 0, "evidence_timing_successes": 0, "invalid_tool_calls": 0, "model_repo": "heavycoderhh/counsel-env-qwen3-0.6b-grpo", "primary_reward": 0.0, "questions_used": 1, "reward": 0.06000000000000001, "seed": 20260441, "useless_questions_ratio": 0.0} +{"agent": "trained_grpo_200", "auxiliary_reward": 0.0, "avg_question_length": 0.0, "blind_evidence_count": 0, "case_id": "possession_9cc45d", "contradictions_surfaced": 0, "contradictions_total": 3, "contradictions_triggered": 0, "difficulty": "hard", "evidence_presented": 0, "evidence_timing_successes": 0, "invalid_tool_calls": 1, "model_repo": "heavycoderhh/counsel-env-qwen3-0.6b-grpo", 
"primary_reward": 0.0, "questions_used": 0, "reward": 0.0, "seed": 20260442, "useless_questions_ratio": 0.0} +{"agent": "trained_grpo_200", "auxiliary_reward": 0.30000000000000004, "avg_question_length": 44.0, "blind_evidence_count": 0, "case_id": "possession_259aa5", "contradictions_surfaced": 0, "contradictions_total": 1, "contradictions_triggered": 1, "difficulty": "easy", "evidence_presented": 0, "evidence_timing_successes": 0, "invalid_tool_calls": 0, "model_repo": "heavycoderhh/counsel-env-qwen3-0.6b-grpo", "primary_reward": 0.0, "questions_used": 1, "reward": 0.06000000000000001, "seed": 20260443, "useless_questions_ratio": 0.0} +{"agent": "trained_grpo_200", "auxiliary_reward": 0.30000000000000004, "avg_question_length": 29.0, "blind_evidence_count": 0, "case_id": "corporate_76724c", "contradictions_surfaced": 0, "contradictions_total": 2, "contradictions_triggered": 1, "difficulty": "medium", "evidence_presented": 0, "evidence_timing_successes": 0, "invalid_tool_calls": 0, "model_repo": "heavycoderhh/counsel-env-qwen3-0.6b-grpo", "primary_reward": 0.0, "questions_used": 1, "reward": 0.06000000000000001, "seed": 20260444, "useless_questions_ratio": 0.0} +{"agent": "trained_grpo_200", "auxiliary_reward": 0.30000000000000004, "avg_question_length": 37.0, "blind_evidence_count": 0, "case_id": "timeline_767821", "contradictions_surfaced": 0, "contradictions_total": 2, "contradictions_triggered": 1, "difficulty": "medium", "evidence_presented": 0, "evidence_timing_successes": 0, "invalid_tool_calls": 0, "model_repo": "heavycoderhh/counsel-env-qwen3-0.6b-grpo", "primary_reward": 0.0, "questions_used": 1, "reward": 0.06000000000000001, "seed": 20260445, "useless_questions_ratio": 0.0} +{"agent": "trained_grpo_200", "auxiliary_reward": 0.30000000000000004, "avg_question_length": 34.0, "blind_evidence_count": 0, "case_id": "motive_c0d166", "contradictions_surfaced": 0, "contradictions_total": 2, "contradictions_triggered": 1, "difficulty": "medium", 
"evidence_presented": 0, "evidence_timing_successes": 0, "invalid_tool_calls": 0, "model_repo": "heavycoderhh/counsel-env-qwen3-0.6b-grpo", "primary_reward": 0.0, "questions_used": 1, "reward": 0.06000000000000001, "seed": 20260446, "useless_questions_ratio": 0.0} +{"agent": "trained_grpo_200", "auxiliary_reward": 0.30000000000000004, "avg_question_length": 27.0, "blind_evidence_count": 0, "case_id": "corporate_307934", "contradictions_surfaced": 0, "contradictions_total": 3, "contradictions_triggered": 1, "difficulty": "hard", "evidence_presented": 0, "evidence_timing_successes": 0, "invalid_tool_calls": 0, "model_repo": "heavycoderhh/counsel-env-qwen3-0.6b-grpo", "primary_reward": 0.0, "questions_used": 1, "reward": 0.06000000000000001, "seed": 20260447, "useless_questions_ratio": 0.0} +{"agent": "trained_grpo_200", "auxiliary_reward": 0.30000000000000004, "avg_question_length": 35.0, "blind_evidence_count": 0, "case_id": "timeline_592816", "contradictions_surfaced": 0, "contradictions_total": 3, "contradictions_triggered": 1, "difficulty": "hard", "evidence_presented": 0, "evidence_timing_successes": 0, "invalid_tool_calls": 0, "model_repo": "heavycoderhh/counsel-env-qwen3-0.6b-grpo", "primary_reward": 0.0, "questions_used": 1, "reward": 0.06000000000000001, "seed": 20260448, "useless_questions_ratio": 0.0} +{"agent": "trained_grpo_200", "auxiliary_reward": 0.30000000000000004, "avg_question_length": 40.0, "blind_evidence_count": 0, "case_id": "knowledge_b26824", "contradictions_surfaced": 0, "contradictions_total": 2, "contradictions_triggered": 1, "difficulty": "medium", "evidence_presented": 0, "evidence_timing_successes": 0, "invalid_tool_calls": 0, "model_repo": "heavycoderhh/counsel-env-qwen3-0.6b-grpo", "primary_reward": 0.0, "questions_used": 1, "reward": 0.06000000000000001, "seed": 20260449, "useless_questions_ratio": 0.0} +{"agent": "trained_grpo_200", "auxiliary_reward": 0.30000000000000004, "avg_question_length": 41.0, "blind_evidence_count": 0, 
"case_id": "knowledge_697785", "contradictions_surfaced": 0, "contradictions_total": 3, "contradictions_triggered": 1, "difficulty": "hard", "evidence_presented": 0, "evidence_timing_successes": 0, "invalid_tool_calls": 0, "model_repo": "heavycoderhh/counsel-env-qwen3-0.6b-grpo", "primary_reward": 0.0, "questions_used": 1, "reward": 0.06000000000000001, "seed": 20260450, "useless_questions_ratio": 0.0} +{"agent": "trained_grpo_200", "auxiliary_reward": 0.30000000000000004, "avg_question_length": 34.0, "blind_evidence_count": 0, "case_id": "timeline_81dafd", "contradictions_surfaced": 0, "contradictions_total": 2, "contradictions_triggered": 1, "difficulty": "medium", "evidence_presented": 0, "evidence_timing_successes": 0, "invalid_tool_calls": 0, "model_repo": "heavycoderhh/counsel-env-qwen3-0.6b-grpo", "primary_reward": 0.0, "questions_used": 1, "reward": 0.06000000000000001, "seed": 20260451, "useless_questions_ratio": 0.0} +{"agent": "trained_grpo_200", "auxiliary_reward": 0.0, "avg_question_length": 0.0, "blind_evidence_count": 0, "case_id": "corporate_8eb7d7", "contradictions_surfaced": 0, "contradictions_total": 2, "contradictions_triggered": 0, "difficulty": "medium", "evidence_presented": 0, "evidence_timing_successes": 0, "invalid_tool_calls": 1, "model_repo": "heavycoderhh/counsel-env-qwen3-0.6b-grpo", "primary_reward": 0.0, "questions_used": 0, "reward": 0.0, "seed": 20260452, "useless_questions_ratio": 0.0} +{"agent": "trained_grpo_200", "auxiliary_reward": 0.30000000000000004, "avg_question_length": 47.0, "blind_evidence_count": 0, "case_id": "possession_dbb5fe", "contradictions_surfaced": 0, "contradictions_total": 2, "contradictions_triggered": 1, "difficulty": "medium", "evidence_presented": 0, "evidence_timing_successes": 0, "invalid_tool_calls": 0, "model_repo": "heavycoderhh/counsel-env-qwen3-0.6b-grpo", "primary_reward": 0.0, "questions_used": 1, "reward": 0.06000000000000001, "seed": 20260453, "useless_questions_ratio": 0.0} +{"agent": 
"trained_grpo_200", "auxiliary_reward": 0.30000000000000004, "avg_question_length": 37.0, "blind_evidence_count": 0, "case_id": "alibi_a4666f", "contradictions_surfaced": 0, "contradictions_total": 3, "contradictions_triggered": 1, "difficulty": "hard", "evidence_presented": 0, "evidence_timing_successes": 0, "invalid_tool_calls": 0, "model_repo": "heavycoderhh/counsel-env-qwen3-0.6b-grpo", "primary_reward": 0.0, "questions_used": 1, "reward": 0.06000000000000001, "seed": 20260454, "useless_questions_ratio": 0.0} diff --git a/eval/trained_eval_summary.json b/eval/trained_eval_summary.json new file mode 100644 index 0000000..b62f00e --- /dev/null +++ b/eval/trained_eval_summary.json @@ -0,0 +1,52 @@ +[ + { + "agent": "keyword_spam", + "avg_evidence_timing": 0, + "avg_primary_reward": 0.0, + "avg_reward": 0.07300000000000001, + "avg_surface_rate": 0.0, + "avg_trigger_rate": 0.6777777777777778, + "avg_useless_ratio": 0.48000000000000004, + "episodes": 30 + }, + { + "agent": "present_all", + "avg_evidence_timing": 0, + "avg_primary_reward": 0.0, + "avg_reward": 0.0, + "avg_surface_rate": 0.0, + "avg_trigger_rate": 0.0, + "avg_useless_ratio": 0.0, + "episodes": 30 + }, + { + "agent": "random", + "avg_evidence_timing": 0, + "avg_primary_reward": 0.0, + "avg_reward": 0.0, + "avg_surface_rate": 0.0, + "avg_trigger_rate": 0.0, + "avg_useless_ratio": 1.0, + "episodes": 30 + }, + { + "agent": "scripted_oracle", + "avg_evidence_timing": 1.9, + "avg_primary_reward": 0.95, + "avg_reward": 0.9023333333333334, + "avg_surface_rate": 0.95, + "avg_trigger_rate": 0.95, + "avg_useless_ratio": 0.0, + "episodes": 30 + }, + { + "agent": "trained_grpo_200", + "avg_evidence_timing": 0, + "avg_primary_reward": 0.0, + "avg_reward": 0.05000000000000001, + "avg_surface_rate": 0.0, + "avg_trigger_rate": 0.4611111111111111, + "avg_useless_ratio": 0.0, + "episodes": 30 + } +] \ No newline at end of file diff --git a/eval/trained_eval_transcripts.md b/eval/trained_eval_transcripts.md new file mode 
100644 index 0000000..dcbeae4 --- /dev/null +++ b/eval/trained_eval_transcripts.md @@ -0,0 +1,57 @@ +# Agent: trained_grpo_200 + +## timeline_255d67 (easy) + +Esme Dubois is accused of assault at the public library on Walnut Street on May 28th. The victim claims the assault occurred at 11:45 PM. The defendant claims they were elsewhere at that time. + +### Transcript +- Step 1 `rest_case`: rest_case + - Witness: [Counsel rests.] + - Triggered: none; Surfaced: none + +### Reward +```json +{"primary_reward": 0.0, "auxiliary_reward_raw": 0.0, "total_reward": 0.0, "contradictions_total": 1.0, "contradictions_triggered": 0.0, "contradictions_surfaced": 0.0, "keyword_questions": 0.0, "evidence_timing_successes": 0.0, "blind_evidence_count": 0.0, "duplicate_questions": 0.0, "irrelevant_questions": 0.0, "inadmissible_actions": 0.0, "useless_questions_ratio": 0.0, "avg_question_length": 0.0} +``` + +--- + +# Agent: trained_grpo_200 + +## knowledge_b28f8c (medium) + +Priya Sharma is charged with embezzlement from the diner at the corner of 9th and Broad. The state alleges Priya Sharma systematically diverted funds in coordination with the manager, Kavita Iyer, between December 5th and the present. The defendant denies knowing Kavita Iyer. + +### Transcript +- Step 1 `ask_question`: Priya Sharma is charged with embezzlement from the diner at the corner of 9th and Broad. The state alleges Priya Sharma systematically diverted funds in coordination with the manager, Kavita Iyer, between December 5th and the present. The defendant denies knowing Kavita Iyer. + - Witness: I have never met Kavita Iyer in my life. + - Triggered: c1_know_victim; Surfaced: none +- Step 2 `rest_case`: rest_case + - Witness: [Counsel rests.] 
+ - Triggered: c1_know_victim; Surfaced: none + +### Reward +```json +{"primary_reward": 0.0, "auxiliary_reward_raw": 0.30000000000000004, "total_reward": 0.06000000000000001, "contradictions_total": 2.0, "contradictions_triggered": 1.0, "contradictions_surfaced": 0.0, "keyword_questions": 1.0, "evidence_timing_successes": 0.0, "blind_evidence_count": 0.0, "duplicate_questions": 0.0, "irrelevant_questions": 0.0, "inadmissible_actions": 0.0, "useless_questions_ratio": 0.0, "avg_question_length": 43.0} +``` + +--- + +# Agent: trained_grpo_200 + +## workplace_c98377 (easy) + +Priya Sharma is questioned in an HR investigation after Ravi Joshi reported retaliation at the HR office on March 14th. The manager denies sending threatening messages or being nearby. + +### Transcript +- Step 1 `ask_question`: Priya Sharma is questioned in an HR investigation after Ravi Joshi reported retaliation at the HR office on March 14th. The manager denies sending threatening messages or being nearby. + - Witness: I never sent Ravi Joshi any threatening message. + - Triggered: c1_message_denial; Surfaced: none +- Step 2 `rest_case`: rest_case + - Witness: [Counsel rests.] 
+ - Triggered: c1_message_denial; Surfaced: none + +### Reward +```json +{"primary_reward": 0.0, "auxiliary_reward_raw": 0.30000000000000004, "total_reward": 0.06000000000000001, "contradictions_total": 1.0, "contradictions_triggered": 1.0, "contradictions_surfaced": 0.0, "keyword_questions": 1.0, "evidence_timing_successes": 0.0, "blind_evidence_count": 0.0, "duplicate_questions": 0.0, "irrelevant_questions": 0.0, "inadmissible_actions": 0.0, "useless_questions_ratio": 0.0, "avg_question_length": 29.0} +``` \ No newline at end of file diff --git a/generation_config.json b/generation_config.json new file mode 100644 index 0000000..0ecce6e --- /dev/null +++ b/generation_config.json @@ -0,0 +1,12 @@ +{ + "do_sample": true, + "eos_token_id": [ + 151645, + 151643 + ], + "pad_token_id": 151643, + "temperature": 0.6, + "top_k": 20, + "top_p": 0.95, + "transformers_version": "5.6.2" +} diff --git a/model.safetensors b/model.safetensors new file mode 100644 index 0000000..cdba477 --- /dev/null +++ b/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a74651f230a1644b02fec7706be28f5209ebe746a4876a1df7c07a294a345f11 +size 2384234968 diff --git a/tokenizer.json b/tokenizer.json new file mode 100644 index 0000000..c7afbed --- /dev/null +++ b/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:be75606093db2094d7cd20f3c2f385c212750648bd6ea4fb2bf507a6a4c55506 +size 11422650 diff --git a/tokenizer_config.json b/tokenizer_config.json new file mode 100644 index 0000000..af5f35b --- /dev/null +++ b/tokenizer_config.json @@ -0,0 +1,75 @@ +{ + "add_prefix_space": false, + "backend": "tokenizers", + "bos_token": null, + "clean_up_tokenization_spaces": false, + "eos_token": "<|im_end|>", + "errors": "replace", + "extra_special_tokens": [ + "<|im_start|>", + "<|im_end|>", + "<|object_ref_start|>", + "<|object_ref_end|>", + "<|box_start|>", + "<|box_end|>", + "<|quad_start|>", + "<|quad_end|>", + "<|vision_start|>", + 
"<|vision_end|>", + "<|vision_pad|>", + "<|image_pad|>", + "<|video_pad|>" + ], + "is_local": false, + "local_files_only": false, + "model_max_length": 131072, + "pad_token": "<|endoftext|>", + "padding_side": "left", + "response_schema": { + "properties": { + "content": { + "type": "string" + }, + "reasoning_content": { + "type": "string" + }, + "role": { + "const": "assistant" + }, + "tool_calls": { + "items": { + "properties": { + "function": { + "properties": { + "arguments": { + "additionalProperties": {}, + "type": "object" + }, + "name": { + "type": "string" + } + }, + "type": "object" + }, + "type": { + "const": "function" + } + }, + "type": "object", + "x-parser": "json", + "x-parser-args": { + "transform": "{type: 'function', function: @}" + } + }, + "type": "array", + "x-regex-iterator": "\\s*(.+?)\\s*" + } + }, + "type": "object", + "x-regex": "^(?:\\n?(?:(?P.*?\\S.*?)\\n?|[\\s]*)\\s*)?(?P.*?)(?:\\n(?=))?(?=(?:|<\\|im_end\\|>|$))(?P(?:.+?\\s*)+)?\\s*(?:<\\|im_end\\|>|$)" + }, + "split_special_tokens": false, + "tokenizer_class": "Qwen2Tokenizer", + "truncation_side": "left", + "unk_token": null +} diff --git a/training_args.bin b/training_args.bin new file mode 100644 index 0000000..5bea77b --- /dev/null +++ b/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ec2ebe18213235a9211aa4bf0c3778da36ad2990f51c4dddec3ffae0a6be7033 +size 7185 diff --git a/training_summary.json b/training_summary.json new file mode 100644 index 0000000..f9a444e --- /dev/null +++ b/training_summary.json @@ -0,0 +1,18 @@ +{ + "artifact_repo": "heavycoderhh/counsel-env-qwen3-0.6b-grpo", + "dataset_size": 256, + "env_url": "https://heavycoderhh-counsel-env.hf.space", + "max_completion_length": 512, + "max_steps": 200, + "metrics": { + "total_flos": 0.0, + "train_loss": -0.0162161529250443, + "train_runtime": 4111.2914, + "train_samples_per_second": 0.195, + "train_steps_per_second": 0.049 + }, + "model": "Qwen/Qwen3-0.6B", + "num_generations": 
4, + "space_repo": "heavycoderhh/counsel-env", + "use_vllm": false +} \ No newline at end of file