初始化项目,由ModelHub XC社区提供模型

Model: heavycoderhh/counsel-env-qwen3-0.6b-grpo-run2
Source: Original Platform
This commit is contained in:
ModelHub XC
2026-06-16 08:16:17 +08:00
commit d5d0e722af
93 changed files with 4106 additions and 0 deletions

151
eval/trained_eval_rows.csv Normal file
View File

@@ -0,0 +1,151 @@
agent,seed,case_id,difficulty,reward,primary_reward,auxiliary_reward,contradictions_total,contradictions_triggered,contradictions_surfaced,questions_used,evidence_presented,evidence_timing_successes,blind_evidence_count,useless_questions_ratio,avg_question_length,model_repo,invalid_tool_calls
random,20260425,timeline_255d67,easy,0.0,0.0,-0.4,1,0,0,6,2,0,2,1.0,5.0,,
random,20260426,knowledge_b28f8c,medium,0.0,0.0,-0.4,2,0,0,6,2,0,2,1.0,5.0,,
random,20260427,workplace_c98377,easy,0.0,0.0,-0.4,1,0,0,3,5,0,5,1.0,5.0,,
random,20260428,motive_66ff59,hard,0.0,0.0,-0.4,3,0,0,7,1,0,1,1.0,5.0,,
random,20260429,timeline_19bb78,easy,0.0,0.0,-0.4,1,0,0,4,4,0,4,1.0,5.0,,
random,20260430,timeline_a97690,medium,0.0,0.0,-0.4,2,0,0,6,2,0,2,1.0,5.0,,
random,20260431,alibi_67ffcd,medium,0.0,0.0,-0.4,2,0,0,6,2,0,2,1.0,5.0,,
random,20260432,alibi_423bca,medium,0.0,0.0,-0.4,2,0,0,5,3,0,3,1.0,5.0,,
random,20260433,knowledge_960d07,medium,0.0,0.0,-0.4,2,0,0,8,0,0,0,1.0,5.0,,
random,20260434,alibi_e829c1,easy,0.0,0.0,-0.4,1,0,0,7,1,0,1,1.0,5.0,,
random,20260435,motive_85e25b,hard,0.0,0.0,-0.4,3,0,0,5,3,0,3,1.0,5.0,,
random,20260436,knowledge_a599e3,medium,0.0,0.0,-0.4,2,0,0,4,4,0,4,1.0,5.0,,
random,20260437,motive_8bca20,easy,0.0,0.0,-0.4,1,0,0,6,2,0,2,1.0,5.0,,
random,20260438,corporate_6b1664,medium,0.0,0.0,-0.4,2,0,0,6,2,0,2,1.0,5.0,,
random,20260439,alibi_a6c582,easy,0.0,0.0,-0.4,1,0,0,8,0,0,0,1.0,5.0,,
random,20260440,workplace_835476,easy,0.0,0.0,-0.4,1,0,0,5,3,0,3,1.0,5.0,,
random,20260441,possession_a079c5,hard,0.0,0.0,-0.4,3,0,0,8,0,0,0,1.0,5.0,,
random,20260442,possession_9cc45d,hard,0.0,0.0,-0.4,3,0,0,5,3,0,3,1.0,5.0,,
random,20260443,possession_259aa5,easy,0.0,0.0,-0.4,1,0,0,4,4,0,4,1.0,5.0,,
random,20260444,corporate_76724c,medium,0.0,0.0,-0.4,2,0,0,5,3,0,3,1.0,5.0,,
random,20260445,timeline_767821,medium,0.0,0.0,-0.4,2,0,0,5,3,0,3,1.0,5.0,,
random,20260446,motive_c0d166,medium,0.0,0.0,-0.4,2,0,0,5,3,0,3,1.0,5.0,,
random,20260447,corporate_307934,hard,0.0,0.0,-0.4,3,0,0,7,1,0,1,1.0,5.0,,
random,20260448,timeline_592816,hard,0.0,0.0,-0.4,3,0,0,6,2,0,2,1.0,5.0,,
random,20260449,knowledge_b26824,medium,0.0,0.0,-0.4,2,0,0,5,3,0,3,1.0,5.0,,
random,20260450,knowledge_697785,hard,0.0,0.0,-0.4,3,0,0,7,1,0,1,1.0,5.0,,
random,20260451,timeline_81dafd,medium,0.0,0.0,-0.4,2,0,0,4,4,0,4,1.0,5.0,,
random,20260452,corporate_8eb7d7,medium,0.0,0.0,-0.4,2,0,0,4,4,0,4,1.0,5.0,,
random,20260453,possession_dbb5fe,medium,0.0,0.0,-0.4,2,0,0,5,3,0,3,1.0,5.0,,
random,20260454,alibi_a4666f,hard,0.0,0.0,-0.4,3,0,0,8,0,0,0,1.0,5.0,,
keyword_spam,20260425,timeline_255d67,easy,0.030000000000000006,0.0,0.15000000000000002,1,1,0,5,0,0,0,0.6,4.2,,
keyword_spam,20260426,knowledge_b28f8c,medium,0.12000000000000002,0.0,0.6000000000000001,2,2,0,5,0,0,0,0.4,4.2,,
keyword_spam,20260427,workplace_c98377,easy,0.0,0.0,-0.2,1,0,0,5,0,0,0,0.8,4.2,,
keyword_spam,20260428,motive_66ff59,hard,0.17000000000000004,0.0,0.8500000000000001,3,3,0,5,0,0,0,0.2,4.2,,
keyword_spam,20260429,timeline_19bb78,easy,0.030000000000000006,0.0,0.15000000000000002,1,1,0,5,0,0,0,0.6,4.2,,
keyword_spam,20260430,timeline_a97690,medium,0.13,0.0,0.65,2,2,0,5,0,0,0,0.2,4.2,,
keyword_spam,20260431,alibi_67ffcd,medium,0.04000000000000001,0.0,0.20000000000000004,2,1,0,5,0,0,0,0.4,4.2,,
keyword_spam,20260432,alibi_423bca,medium,0.04000000000000001,0.0,0.20000000000000004,2,1,0,5,0,0,0,0.4,4.2,,
keyword_spam,20260433,knowledge_960d07,medium,0.12000000000000002,0.0,0.6000000000000001,2,2,0,5,0,0,0,0.4,4.2,,
keyword_spam,20260434,alibi_e829c1,easy,0.04000000000000001,0.0,0.20000000000000004,1,1,0,5,0,0,0,0.4,4.2,,
keyword_spam,20260435,motive_85e25b,hard,0.17000000000000004,0.0,0.8500000000000001,3,3,0,5,0,0,0,0.2,4.2,,
keyword_spam,20260436,knowledge_a599e3,medium,0.12000000000000002,0.0,0.6000000000000001,2,2,0,5,0,0,0,0.4,4.2,,
keyword_spam,20260437,motive_8bca20,easy,0.0,0.0,-0.15000000000000002,1,0,0,5,0,0,0,0.6,4.2,,
keyword_spam,20260438,corporate_6b1664,medium,0.020000000000000007,0.0,0.10000000000000003,2,1,0,5,0,0,0,0.8,4.2,,
keyword_spam,20260439,alibi_a6c582,easy,0.04000000000000001,0.0,0.20000000000000004,1,1,0,5,0,0,0,0.4,4.2,,
keyword_spam,20260440,workplace_835476,easy,0.0,0.0,-0.2,1,0,0,5,0,0,0,0.8,4.2,,
keyword_spam,20260441,possession_a079c5,hard,0.030000000000000006,0.0,0.15000000000000002,3,1,0,5,0,0,0,0.6,4.2,,
keyword_spam,20260442,possession_9cc45d,hard,0.030000000000000006,0.0,0.15000000000000002,3,1,0,5,0,0,0,0.6,4.2,,
keyword_spam,20260443,possession_259aa5,easy,0.0,0.0,-0.2,1,0,0,5,0,0,0,0.8,4.2,,
keyword_spam,20260444,corporate_76724c,medium,0.020000000000000007,0.0,0.10000000000000003,2,1,0,5,0,0,0,0.8,4.2,,
keyword_spam,20260445,timeline_767821,medium,0.13,0.0,0.65,2,2,0,5,0,0,0,0.2,4.2,,
keyword_spam,20260446,motive_c0d166,medium,0.13,0.0,0.65,2,2,0,5,0,0,0,0.2,4.2,,
keyword_spam,20260447,corporate_307934,hard,0.020000000000000007,0.0,0.10000000000000003,3,1,0,5,0,0,0,0.8,4.2,,
keyword_spam,20260448,timeline_592816,hard,0.19,0.0,0.95,3,3,0,5,0,0,0,0.2,4.2,,
keyword_spam,20260449,knowledge_b26824,medium,0.12000000000000002,0.0,0.6000000000000001,2,2,0,5,0,0,0,0.4,4.2,,
keyword_spam,20260450,knowledge_697785,hard,0.12000000000000002,0.0,0.6000000000000001,3,2,0,5,0,0,0,0.4,4.2,,
keyword_spam,20260451,timeline_81dafd,medium,0.13,0.0,0.65,2,2,0,5,0,0,0,0.2,4.2,,
keyword_spam,20260452,corporate_8eb7d7,medium,0.020000000000000007,0.0,0.10000000000000003,2,1,0,5,0,0,0,0.8,4.2,,
keyword_spam,20260453,possession_dbb5fe,medium,0.030000000000000006,0.0,0.15000000000000002,2,1,0,5,0,0,0,0.6,4.2,,
keyword_spam,20260454,alibi_a4666f,hard,0.15000000000000002,0.0,0.75,3,2,0,5,0,0,0,0.2,4.2,,
present_all,20260425,timeline_255d67,easy,0.0,0.0,-0.25,1,0,0,0,5,0,5,0.0,0.0,,
present_all,20260426,knowledge_b28f8c,medium,0.0,0.0,-0.2,2,0,0,0,4,0,4,0.0,0.0,,
present_all,20260427,workplace_c98377,easy,0.0,0.0,-0.25,1,0,0,0,5,0,5,0.0,0.0,,
present_all,20260428,motive_66ff59,hard,0.0,0.0,-0.35000000000000003,3,0,0,0,7,0,7,0.0,0.0,,
present_all,20260429,timeline_19bb78,easy,0.0,0.0,-0.25,1,0,0,0,5,0,5,0.0,0.0,,
present_all,20260430,timeline_a97690,medium,0.0,0.0,-0.25,2,0,0,0,5,0,5,0.0,0.0,,
present_all,20260431,alibi_67ffcd,medium,0.0,0.0,-0.2,2,0,0,0,4,0,4,0.0,0.0,,
present_all,20260432,alibi_423bca,medium,0.0,0.0,-0.2,2,0,0,0,4,0,4,0.0,0.0,,
present_all,20260433,knowledge_960d07,medium,0.0,0.0,-0.2,2,0,0,0,4,0,4,0.0,0.0,,
present_all,20260434,alibi_e829c1,easy,0.0,0.0,-0.2,1,0,0,0,4,0,4,0.0,0.0,,
present_all,20260435,motive_85e25b,hard,0.0,0.0,-0.35000000000000003,3,0,0,0,7,0,7,0.0,0.0,,
present_all,20260436,knowledge_a599e3,medium,0.0,0.0,-0.2,2,0,0,0,4,0,4,0.0,0.0,,
present_all,20260437,motive_8bca20,easy,0.0,0.0,-0.25,1,0,0,0,5,0,5,0.0,0.0,,
present_all,20260438,corporate_6b1664,medium,0.0,0.0,-0.25,2,0,0,0,5,0,5,0.0,0.0,,
present_all,20260439,alibi_a6c582,easy,0.0,0.0,-0.2,1,0,0,0,4,0,4,0.0,0.0,,
present_all,20260440,workplace_835476,easy,0.0,0.0,-0.25,1,0,0,0,5,0,5,0.0,0.0,,
present_all,20260441,possession_a079c5,hard,0.0,0.0,-0.30000000000000004,3,0,0,0,6,0,6,0.0,0.0,,
present_all,20260442,possession_9cc45d,hard,0.0,0.0,-0.30000000000000004,3,0,0,0,6,0,6,0.0,0.0,,
present_all,20260443,possession_259aa5,easy,0.0,0.0,-0.2,1,0,0,0,4,0,4,0.0,0.0,,
present_all,20260444,corporate_76724c,medium,0.0,0.0,-0.25,2,0,0,0,5,0,5,0.0,0.0,,
present_all,20260445,timeline_767821,medium,0.0,0.0,-0.25,2,0,0,0,5,0,5,0.0,0.0,,
present_all,20260446,motive_c0d166,medium,0.0,0.0,-0.25,2,0,0,0,5,0,5,0.0,0.0,,
present_all,20260447,corporate_307934,hard,0.0,0.0,-0.35000000000000003,3,0,0,0,7,0,7,0.0,0.0,,
present_all,20260448,timeline_592816,hard,0.0,0.0,-0.35000000000000003,3,0,0,0,7,0,7,0.0,0.0,,
present_all,20260449,knowledge_b26824,medium,0.0,0.0,-0.2,2,0,0,0,4,0,4,0.0,0.0,,
present_all,20260450,knowledge_697785,hard,0.0,0.0,-0.30000000000000004,3,0,0,0,6,0,6,0.0,0.0,,
present_all,20260451,timeline_81dafd,medium,0.0,0.0,-0.25,2,0,0,0,5,0,5,0.0,0.0,,
present_all,20260452,corporate_8eb7d7,medium,0.0,0.0,-0.25,2,0,0,0,5,0,5,0.0,0.0,,
present_all,20260453,possession_dbb5fe,medium,0.0,0.0,-0.2,2,0,0,0,4,0,4,0.0,0.0,,
present_all,20260454,alibi_a4666f,hard,0.0,0.0,-0.30000000000000004,3,0,0,0,6,0,6,0.0,0.0,,
scripted_oracle,20260425,timeline_255d67,easy,0.8800000000000001,1.0,0.4,1,1,1,1,1,1,0,0.0,1.0,,
scripted_oracle,20260426,knowledge_b28f8c,medium,0.9600000000000001,1.0,0.8,2,2,2,2,2,2,0,0.0,1.5,,
scripted_oracle,20260427,workplace_c98377,easy,0.8800000000000001,1.0,0.4,1,1,1,1,1,1,0,0.0,1.0,,
scripted_oracle,20260428,motive_66ff59,hard,1.0,1.0,1.2000000000000002,3,3,3,3,3,3,0,0.0,1.0,,
scripted_oracle,20260429,timeline_19bb78,easy,0.8800000000000001,1.0,0.4,1,1,1,1,1,1,0,0.0,1.0,,
scripted_oracle,20260430,timeline_a97690,medium,0.49000000000000005,0.5,0.45,2,1,1,2,2,1,1,0.0,1.0,,
scripted_oracle,20260431,alibi_67ffcd,medium,0.9600000000000001,1.0,0.8,2,2,2,2,2,2,0,0.0,3.0,,
scripted_oracle,20260432,alibi_423bca,medium,0.9600000000000001,1.0,0.8,2,2,2,2,2,2,0,0.0,3.0,,
scripted_oracle,20260433,knowledge_960d07,medium,0.9600000000000001,1.0,0.8,2,2,2,2,2,2,0,0.0,1.5,,
scripted_oracle,20260434,alibi_e829c1,easy,0.8800000000000001,1.0,0.4,1,1,1,1,1,1,0,0.0,3.0,,
scripted_oracle,20260435,motive_85e25b,hard,1.0,1.0,1.2000000000000002,3,3,3,3,3,3,0,0.0,1.0,,
scripted_oracle,20260436,knowledge_a599e3,medium,0.9600000000000001,1.0,0.8,2,2,2,2,2,2,0,0.0,1.5,,
scripted_oracle,20260437,motive_8bca20,easy,0.8800000000000001,1.0,0.4,1,1,1,1,1,1,0,0.0,1.0,,
scripted_oracle,20260438,corporate_6b1664,medium,0.9600000000000001,1.0,0.8,2,2,2,2,2,2,0,0.0,1.0,,
scripted_oracle,20260439,alibi_a6c582,easy,0.8800000000000001,1.0,0.4,1,1,1,1,1,1,0,0.0,3.0,,
scripted_oracle,20260440,workplace_835476,easy,0.8800000000000001,1.0,0.4,1,1,1,1,1,1,0,0.0,1.0,,
scripted_oracle,20260441,possession_a079c5,hard,1.0,1.0,1.2000000000000002,3,3,3,3,3,3,0,0.0,1.0,,
scripted_oracle,20260442,possession_9cc45d,hard,1.0,1.0,1.2000000000000002,3,3,3,3,3,3,0,0.0,1.0,,
scripted_oracle,20260443,possession_259aa5,easy,0.8800000000000001,1.0,0.4,1,1,1,1,1,1,0,0.0,1.0,,
scripted_oracle,20260444,corporate_76724c,medium,0.9600000000000001,1.0,0.8,2,2,2,2,2,2,0,0.0,1.0,,
scripted_oracle,20260445,timeline_767821,medium,0.49000000000000005,0.5,0.45,2,1,1,2,2,1,1,0.0,1.0,,
scripted_oracle,20260446,motive_c0d166,medium,0.9600000000000001,1.0,0.8,2,2,2,2,2,2,0,0.0,1.0,,
scripted_oracle,20260447,corporate_307934,hard,1.0,1.0,1.2000000000000002,3,3,3,3,3,3,0,0.0,1.0,,
scripted_oracle,20260448,timeline_592816,hard,1.0,1.0,1.2000000000000002,3,3,3,3,3,3,0,0.0,1.0,,
scripted_oracle,20260449,knowledge_b26824,medium,0.9600000000000001,1.0,0.8,2,2,2,2,2,2,0,0.0,1.5,,
scripted_oracle,20260450,knowledge_697785,hard,1.0,1.0,1.2000000000000002,3,3,3,3,3,3,0,0.0,1.6666666666666667,,
scripted_oracle,20260451,timeline_81dafd,medium,0.49000000000000005,0.5,0.45,2,1,1,2,2,1,1,0.0,1.0,,
scripted_oracle,20260452,corporate_8eb7d7,medium,0.9600000000000001,1.0,0.8,2,2,2,2,2,2,0,0.0,1.0,,
scripted_oracle,20260453,possession_dbb5fe,medium,0.9600000000000001,1.0,0.8,2,2,2,2,2,2,0,0.0,1.0,,
scripted_oracle,20260454,alibi_a4666f,hard,1.0,1.0,1.2000000000000002,3,3,3,3,3,3,0,0.0,2.3333333333333335,,
trained_sft_grpo_run2,20260425,timeline_255d67,easy,0.010000000000000004,0.0,0.05000000000000002,1,1,0,7,0,0,0,0.7142857142857143,39.0,heavycoderhh/counsel-env-qwen3-0.6b-grpo-run2,0.0
trained_sft_grpo_run2,20260426,knowledge_b28f8c,medium,0.42000000000000004,0.5,0.09999999999999998,2,1,1,7,1,1,0,0.8571428571428571,45.0,heavycoderhh/counsel-env-qwen3-0.6b-grpo-run2,0.0
trained_sft_grpo_run2,20260427,workplace_c98377,easy,0.8200000000000001,1.0,0.09999999999999998,1,1,1,7,1,1,0,0.8571428571428571,31.0,heavycoderhh/counsel-env-qwen3-0.6b-grpo-run2,0.0
trained_sft_grpo_run2,20260428,motive_66ff59,hard,0.2866666666666667,0.3333333333333333,0.09999999999999998,3,1,1,7,1,1,0,0.8571428571428571,31.0,heavycoderhh/counsel-env-qwen3-0.6b-grpo-run2,0.0
trained_sft_grpo_run2,20260429,timeline_19bb78,easy,0.010000000000000004,0.0,0.05000000000000002,1,1,0,7,0,0,0,0.7142857142857143,31.428571428571427,heavycoderhh/counsel-env-qwen3-0.6b-grpo-run2,0.0
trained_sft_grpo_run2,20260430,timeline_a97690,medium,0.010000000000000004,0.0,0.05000000000000002,2,1,0,7,0,0,0,0.7142857142857143,36.0,heavycoderhh/counsel-env-qwen3-0.6b-grpo-run2,0.0
trained_sft_grpo_run2,20260431,alibi_67ffcd,medium,0.42000000000000004,0.5,0.09999999999999998,2,1,1,7,1,1,0,0.8571428571428571,35.0,heavycoderhh/counsel-env-qwen3-0.6b-grpo-run2,0.0
trained_sft_grpo_run2,20260432,alibi_423bca,medium,0.42000000000000004,0.5,0.09999999999999998,2,1,1,7,1,1,0,0.8571428571428571,39.0,heavycoderhh/counsel-env-qwen3-0.6b-grpo-run2,0.0
trained_sft_grpo_run2,20260433,knowledge_960d07,medium,0.42000000000000004,0.5,0.09999999999999998,2,1,1,7,1,1,0,0.8571428571428571,39.0,heavycoderhh/counsel-env-qwen3-0.6b-grpo-run2,0.0
trained_sft_grpo_run2,20260434,alibi_e829c1,easy,0.8200000000000001,1.0,0.09999999999999998,1,1,1,7,1,1,0,0.8571428571428571,37.0,heavycoderhh/counsel-env-qwen3-0.6b-grpo-run2,0.0
trained_sft_grpo_run2,20260435,motive_85e25b,hard,0.2866666666666667,0.3333333333333333,0.09999999999999998,3,1,1,7,1,1,0,0.8571428571428571,32.0,heavycoderhh/counsel-env-qwen3-0.6b-grpo-run2,0.0
trained_sft_grpo_run2,20260436,knowledge_a599e3,medium,0.42000000000000004,0.5,0.09999999999999998,2,1,1,7,1,1,0,0.8571428571428571,39.0,heavycoderhh/counsel-env-qwen3-0.6b-grpo-run2,0.0
trained_sft_grpo_run2,20260437,motive_8bca20,easy,0.8200000000000001,1.0,0.09999999999999998,1,1,1,7,1,1,0,0.8571428571428571,30.0,heavycoderhh/counsel-env-qwen3-0.6b-grpo-run2,0.0
trained_sft_grpo_run2,20260438,corporate_6b1664,medium,0.42000000000000004,0.5,0.09999999999999998,2,1,1,7,1,1,0,0.8571428571428571,27.0,heavycoderhh/counsel-env-qwen3-0.6b-grpo-run2,0.0
trained_sft_grpo_run2,20260439,alibi_a6c582,easy,0.8200000000000001,1.0,0.09999999999999998,1,1,1,7,1,1,0,0.8571428571428571,38.0,heavycoderhh/counsel-env-qwen3-0.6b-grpo-run2,0.0
trained_sft_grpo_run2,20260440,workplace_835476,easy,0.8200000000000001,1.0,0.09999999999999998,1,1,1,7,1,1,0,0.8571428571428571,30.0,heavycoderhh/counsel-env-qwen3-0.6b-grpo-run2,0.0
trained_sft_grpo_run2,20260441,possession_a079c5,hard,0.2866666666666667,0.3333333333333333,0.09999999999999998,3,1,1,7,1,1,0,0.8571428571428571,47.0,heavycoderhh/counsel-env-qwen3-0.6b-grpo-run2,0.0
trained_sft_grpo_run2,20260442,possession_9cc45d,hard,0.2866666666666667,0.3333333333333333,0.09999999999999998,3,1,1,7,1,1,0,0.8571428571428571,43.0,heavycoderhh/counsel-env-qwen3-0.6b-grpo-run2,0.0
trained_sft_grpo_run2,20260443,possession_259aa5,easy,0.8200000000000001,1.0,0.09999999999999998,1,1,1,7,1,1,0,0.8571428571428571,42.0,heavycoderhh/counsel-env-qwen3-0.6b-grpo-run2,0.0
trained_sft_grpo_run2,20260444,corporate_76724c,medium,0.42000000000000004,0.5,0.09999999999999998,2,1,1,7,1,1,0,0.8571428571428571,27.0,heavycoderhh/counsel-env-qwen3-0.6b-grpo-run2,0.0
trained_sft_grpo_run2,20260445,timeline_767821,medium,0.010000000000000004,0.0,0.05000000000000002,2,1,0,7,0,0,0,0.7142857142857143,44.0,heavycoderhh/counsel-env-qwen3-0.6b-grpo-run2,0.0
trained_sft_grpo_run2,20260446,motive_c0d166,medium,0.42000000000000004,0.5,0.09999999999999998,2,1,1,7,1,1,0,0.8571428571428571,35.0,heavycoderhh/counsel-env-qwen3-0.6b-grpo-run2,0.0
trained_sft_grpo_run2,20260447,corporate_307934,hard,0.2866666666666667,0.3333333333333333,0.09999999999999998,3,1,1,7,1,1,0,0.8571428571428571,27.0,heavycoderhh/counsel-env-qwen3-0.6b-grpo-run2,0.0
trained_sft_grpo_run2,20260448,timeline_592816,hard,0.010000000000000004,0.0,0.05000000000000002,3,1,0,7,0,0,0,0.7142857142857143,39.0,heavycoderhh/counsel-env-qwen3-0.6b-grpo-run2,0.0
trained_sft_grpo_run2,20260449,knowledge_b26824,medium,0.42000000000000004,0.5,0.09999999999999998,2,1,1,7,1,1,0,0.8571428571428571,38.0,heavycoderhh/counsel-env-qwen3-0.6b-grpo-run2,0.0
trained_sft_grpo_run2,20260450,knowledge_697785,hard,0.2866666666666667,0.3333333333333333,0.09999999999999998,3,1,1,7,1,1,0,0.8571428571428571,43.0,heavycoderhh/counsel-env-qwen3-0.6b-grpo-run2,0.0
trained_sft_grpo_run2,20260451,timeline_81dafd,medium,0.010000000000000004,0.0,0.05000000000000002,2,1,0,7,0,0,0,0.7142857142857143,38.0,heavycoderhh/counsel-env-qwen3-0.6b-grpo-run2,0.0
trained_sft_grpo_run2,20260452,corporate_8eb7d7,medium,0.42000000000000004,0.5,0.09999999999999998,2,1,1,7,1,1,0,0.8571428571428571,27.0,heavycoderhh/counsel-env-qwen3-0.6b-grpo-run2,0.0
trained_sft_grpo_run2,20260453,possession_dbb5fe,medium,0.42000000000000004,0.5,0.09999999999999998,2,1,1,7,1,1,0,0.8571428571428571,45.0,heavycoderhh/counsel-env-qwen3-0.6b-grpo-run2,0.0
trained_sft_grpo_run2,20260454,alibi_a4666f,hard,0.2866666666666667,0.3333333333333333,0.09999999999999998,3,1,1,7,1,1,0,0.8571428571428571,39.0,heavycoderhh/counsel-env-qwen3-0.6b-grpo-run2,0.0
1 agent seed case_id difficulty reward primary_reward auxiliary_reward contradictions_total contradictions_triggered contradictions_surfaced questions_used evidence_presented evidence_timing_successes blind_evidence_count useless_questions_ratio avg_question_length model_repo invalid_tool_calls
2 random 20260425 timeline_255d67 easy 0.0 0.0 -0.4 1 0 0 6 2 0 2 1.0 5.0
3 random 20260426 knowledge_b28f8c medium 0.0 0.0 -0.4 2 0 0 6 2 0 2 1.0 5.0
4 random 20260427 workplace_c98377 easy 0.0 0.0 -0.4 1 0 0 3 5 0 5 1.0 5.0
5 random 20260428 motive_66ff59 hard 0.0 0.0 -0.4 3 0 0 7 1 0 1 1.0 5.0
6 random 20260429 timeline_19bb78 easy 0.0 0.0 -0.4 1 0 0 4 4 0 4 1.0 5.0
7 random 20260430 timeline_a97690 medium 0.0 0.0 -0.4 2 0 0 6 2 0 2 1.0 5.0
8 random 20260431 alibi_67ffcd medium 0.0 0.0 -0.4 2 0 0 6 2 0 2 1.0 5.0
9 random 20260432 alibi_423bca medium 0.0 0.0 -0.4 2 0 0 5 3 0 3 1.0 5.0
10 random 20260433 knowledge_960d07 medium 0.0 0.0 -0.4 2 0 0 8 0 0 0 1.0 5.0
11 random 20260434 alibi_e829c1 easy 0.0 0.0 -0.4 1 0 0 7 1 0 1 1.0 5.0
12 random 20260435 motive_85e25b hard 0.0 0.0 -0.4 3 0 0 5 3 0 3 1.0 5.0
13 random 20260436 knowledge_a599e3 medium 0.0 0.0 -0.4 2 0 0 4 4 0 4 1.0 5.0
14 random 20260437 motive_8bca20 easy 0.0 0.0 -0.4 1 0 0 6 2 0 2 1.0 5.0
15 random 20260438 corporate_6b1664 medium 0.0 0.0 -0.4 2 0 0 6 2 0 2 1.0 5.0
16 random 20260439 alibi_a6c582 easy 0.0 0.0 -0.4 1 0 0 8 0 0 0 1.0 5.0
17 random 20260440 workplace_835476 easy 0.0 0.0 -0.4 1 0 0 5 3 0 3 1.0 5.0
18 random 20260441 possession_a079c5 hard 0.0 0.0 -0.4 3 0 0 8 0 0 0 1.0 5.0
19 random 20260442 possession_9cc45d hard 0.0 0.0 -0.4 3 0 0 5 3 0 3 1.0 5.0
20 random 20260443 possession_259aa5 easy 0.0 0.0 -0.4 1 0 0 4 4 0 4 1.0 5.0
21 random 20260444 corporate_76724c medium 0.0 0.0 -0.4 2 0 0 5 3 0 3 1.0 5.0
22 random 20260445 timeline_767821 medium 0.0 0.0 -0.4 2 0 0 5 3 0 3 1.0 5.0
23 random 20260446 motive_c0d166 medium 0.0 0.0 -0.4 2 0 0 5 3 0 3 1.0 5.0
24 random 20260447 corporate_307934 hard 0.0 0.0 -0.4 3 0 0 7 1 0 1 1.0 5.0
25 random 20260448 timeline_592816 hard 0.0 0.0 -0.4 3 0 0 6 2 0 2 1.0 5.0
26 random 20260449 knowledge_b26824 medium 0.0 0.0 -0.4 2 0 0 5 3 0 3 1.0 5.0
27 random 20260450 knowledge_697785 hard 0.0 0.0 -0.4 3 0 0 7 1 0 1 1.0 5.0
28 random 20260451 timeline_81dafd medium 0.0 0.0 -0.4 2 0 0 4 4 0 4 1.0 5.0
29 random 20260452 corporate_8eb7d7 medium 0.0 0.0 -0.4 2 0 0 4 4 0 4 1.0 5.0
30 random 20260453 possession_dbb5fe medium 0.0 0.0 -0.4 2 0 0 5 3 0 3 1.0 5.0
31 random 20260454 alibi_a4666f hard 0.0 0.0 -0.4 3 0 0 8 0 0 0 1.0 5.0
32 keyword_spam 20260425 timeline_255d67 easy 0.030000000000000006 0.0 0.15000000000000002 1 1 0 5 0 0 0 0.6 4.2
33 keyword_spam 20260426 knowledge_b28f8c medium 0.12000000000000002 0.0 0.6000000000000001 2 2 0 5 0 0 0 0.4 4.2
34 keyword_spam 20260427 workplace_c98377 easy 0.0 0.0 -0.2 1 0 0 5 0 0 0 0.8 4.2
35 keyword_spam 20260428 motive_66ff59 hard 0.17000000000000004 0.0 0.8500000000000001 3 3 0 5 0 0 0 0.2 4.2
36 keyword_spam 20260429 timeline_19bb78 easy 0.030000000000000006 0.0 0.15000000000000002 1 1 0 5 0 0 0 0.6 4.2
37 keyword_spam 20260430 timeline_a97690 medium 0.13 0.0 0.65 2 2 0 5 0 0 0 0.2 4.2
38 keyword_spam 20260431 alibi_67ffcd medium 0.04000000000000001 0.0 0.20000000000000004 2 1 0 5 0 0 0 0.4 4.2
39 keyword_spam 20260432 alibi_423bca medium 0.04000000000000001 0.0 0.20000000000000004 2 1 0 5 0 0 0 0.4 4.2
40 keyword_spam 20260433 knowledge_960d07 medium 0.12000000000000002 0.0 0.6000000000000001 2 2 0 5 0 0 0 0.4 4.2
41 keyword_spam 20260434 alibi_e829c1 easy 0.04000000000000001 0.0 0.20000000000000004 1 1 0 5 0 0 0 0.4 4.2
42 keyword_spam 20260435 motive_85e25b hard 0.17000000000000004 0.0 0.8500000000000001 3 3 0 5 0 0 0 0.2 4.2
43 keyword_spam 20260436 knowledge_a599e3 medium 0.12000000000000002 0.0 0.6000000000000001 2 2 0 5 0 0 0 0.4 4.2
44 keyword_spam 20260437 motive_8bca20 easy 0.0 0.0 -0.15000000000000002 1 0 0 5 0 0 0 0.6 4.2
45 keyword_spam 20260438 corporate_6b1664 medium 0.020000000000000007 0.0 0.10000000000000003 2 1 0 5 0 0 0 0.8 4.2
46 keyword_spam 20260439 alibi_a6c582 easy 0.04000000000000001 0.0 0.20000000000000004 1 1 0 5 0 0 0 0.4 4.2
47 keyword_spam 20260440 workplace_835476 easy 0.0 0.0 -0.2 1 0 0 5 0 0 0 0.8 4.2
48 keyword_spam 20260441 possession_a079c5 hard 0.030000000000000006 0.0 0.15000000000000002 3 1 0 5 0 0 0 0.6 4.2
49 keyword_spam 20260442 possession_9cc45d hard 0.030000000000000006 0.0 0.15000000000000002 3 1 0 5 0 0 0 0.6 4.2
50 keyword_spam 20260443 possession_259aa5 easy 0.0 0.0 -0.2 1 0 0 5 0 0 0 0.8 4.2
51 keyword_spam 20260444 corporate_76724c medium 0.020000000000000007 0.0 0.10000000000000003 2 1 0 5 0 0 0 0.8 4.2
52 keyword_spam 20260445 timeline_767821 medium 0.13 0.0 0.65 2 2 0 5 0 0 0 0.2 4.2
53 keyword_spam 20260446 motive_c0d166 medium 0.13 0.0 0.65 2 2 0 5 0 0 0 0.2 4.2
54 keyword_spam 20260447 corporate_307934 hard 0.020000000000000007 0.0 0.10000000000000003 3 1 0 5 0 0 0 0.8 4.2
55 keyword_spam 20260448 timeline_592816 hard 0.19 0.0 0.95 3 3 0 5 0 0 0 0.2 4.2
56 keyword_spam 20260449 knowledge_b26824 medium 0.12000000000000002 0.0 0.6000000000000001 2 2 0 5 0 0 0 0.4 4.2
57 keyword_spam 20260450 knowledge_697785 hard 0.12000000000000002 0.0 0.6000000000000001 3 2 0 5 0 0 0 0.4 4.2
58 keyword_spam 20260451 timeline_81dafd medium 0.13 0.0 0.65 2 2 0 5 0 0 0 0.2 4.2
59 keyword_spam 20260452 corporate_8eb7d7 medium 0.020000000000000007 0.0 0.10000000000000003 2 1 0 5 0 0 0 0.8 4.2
60 keyword_spam 20260453 possession_dbb5fe medium 0.030000000000000006 0.0 0.15000000000000002 2 1 0 5 0 0 0 0.6 4.2
61 keyword_spam 20260454 alibi_a4666f hard 0.15000000000000002 0.0 0.75 3 2 0 5 0 0 0 0.2 4.2
62 present_all 20260425 timeline_255d67 easy 0.0 0.0 -0.25 1 0 0 0 5 0 5 0.0 0.0
63 present_all 20260426 knowledge_b28f8c medium 0.0 0.0 -0.2 2 0 0 0 4 0 4 0.0 0.0
64 present_all 20260427 workplace_c98377 easy 0.0 0.0 -0.25 1 0 0 0 5 0 5 0.0 0.0
65 present_all 20260428 motive_66ff59 hard 0.0 0.0 -0.35000000000000003 3 0 0 0 7 0 7 0.0 0.0
66 present_all 20260429 timeline_19bb78 easy 0.0 0.0 -0.25 1 0 0 0 5 0 5 0.0 0.0
67 present_all 20260430 timeline_a97690 medium 0.0 0.0 -0.25 2 0 0 0 5 0 5 0.0 0.0
68 present_all 20260431 alibi_67ffcd medium 0.0 0.0 -0.2 2 0 0 0 4 0 4 0.0 0.0
69 present_all 20260432 alibi_423bca medium 0.0 0.0 -0.2 2 0 0 0 4 0 4 0.0 0.0
70 present_all 20260433 knowledge_960d07 medium 0.0 0.0 -0.2 2 0 0 0 4 0 4 0.0 0.0
71 present_all 20260434 alibi_e829c1 easy 0.0 0.0 -0.2 1 0 0 0 4 0 4 0.0 0.0
72 present_all 20260435 motive_85e25b hard 0.0 0.0 -0.35000000000000003 3 0 0 0 7 0 7 0.0 0.0
73 present_all 20260436 knowledge_a599e3 medium 0.0 0.0 -0.2 2 0 0 0 4 0 4 0.0 0.0
74 present_all 20260437 motive_8bca20 easy 0.0 0.0 -0.25 1 0 0 0 5 0 5 0.0 0.0
75 present_all 20260438 corporate_6b1664 medium 0.0 0.0 -0.25 2 0 0 0 5 0 5 0.0 0.0
76 present_all 20260439 alibi_a6c582 easy 0.0 0.0 -0.2 1 0 0 0 4 0 4 0.0 0.0
77 present_all 20260440 workplace_835476 easy 0.0 0.0 -0.25 1 0 0 0 5 0 5 0.0 0.0
78 present_all 20260441 possession_a079c5 hard 0.0 0.0 -0.30000000000000004 3 0 0 0 6 0 6 0.0 0.0
79 present_all 20260442 possession_9cc45d hard 0.0 0.0 -0.30000000000000004 3 0 0 0 6 0 6 0.0 0.0
80 present_all 20260443 possession_259aa5 easy 0.0 0.0 -0.2 1 0 0 0 4 0 4 0.0 0.0
81 present_all 20260444 corporate_76724c medium 0.0 0.0 -0.25 2 0 0 0 5 0 5 0.0 0.0
82 present_all 20260445 timeline_767821 medium 0.0 0.0 -0.25 2 0 0 0 5 0 5 0.0 0.0
83 present_all 20260446 motive_c0d166 medium 0.0 0.0 -0.25 2 0 0 0 5 0 5 0.0 0.0
84 present_all 20260447 corporate_307934 hard 0.0 0.0 -0.35000000000000003 3 0 0 0 7 0 7 0.0 0.0
85 present_all 20260448 timeline_592816 hard 0.0 0.0 -0.35000000000000003 3 0 0 0 7 0 7 0.0 0.0
86 present_all 20260449 knowledge_b26824 medium 0.0 0.0 -0.2 2 0 0 0 4 0 4 0.0 0.0
87 present_all 20260450 knowledge_697785 hard 0.0 0.0 -0.30000000000000004 3 0 0 0 6 0 6 0.0 0.0
88 present_all 20260451 timeline_81dafd medium 0.0 0.0 -0.25 2 0 0 0 5 0 5 0.0 0.0
89 present_all 20260452 corporate_8eb7d7 medium 0.0 0.0 -0.25 2 0 0 0 5 0 5 0.0 0.0
90 present_all 20260453 possession_dbb5fe medium 0.0 0.0 -0.2 2 0 0 0 4 0 4 0.0 0.0
91 present_all 20260454 alibi_a4666f hard 0.0 0.0 -0.30000000000000004 3 0 0 0 6 0 6 0.0 0.0
92 scripted_oracle 20260425 timeline_255d67 easy 0.8800000000000001 1.0 0.4 1 1 1 1 1 1 0 0.0 1.0
93 scripted_oracle 20260426 knowledge_b28f8c medium 0.9600000000000001 1.0 0.8 2 2 2 2 2 2 0 0.0 1.5
94 scripted_oracle 20260427 workplace_c98377 easy 0.8800000000000001 1.0 0.4 1 1 1 1 1 1 0 0.0 1.0
95 scripted_oracle 20260428 motive_66ff59 hard 1.0 1.0 1.2000000000000002 3 3 3 3 3 3 0 0.0 1.0
96 scripted_oracle 20260429 timeline_19bb78 easy 0.8800000000000001 1.0 0.4 1 1 1 1 1 1 0 0.0 1.0
97 scripted_oracle 20260430 timeline_a97690 medium 0.49000000000000005 0.5 0.45 2 1 1 2 2 1 1 0.0 1.0
98 scripted_oracle 20260431 alibi_67ffcd medium 0.9600000000000001 1.0 0.8 2 2 2 2 2 2 0 0.0 3.0
99 scripted_oracle 20260432 alibi_423bca medium 0.9600000000000001 1.0 0.8 2 2 2 2 2 2 0 0.0 3.0
100 scripted_oracle 20260433 knowledge_960d07 medium 0.9600000000000001 1.0 0.8 2 2 2 2 2 2 0 0.0 1.5
101 scripted_oracle 20260434 alibi_e829c1 easy 0.8800000000000001 1.0 0.4 1 1 1 1 1 1 0 0.0 3.0
102 scripted_oracle 20260435 motive_85e25b hard 1.0 1.0 1.2000000000000002 3 3 3 3 3 3 0 0.0 1.0
103 scripted_oracle 20260436 knowledge_a599e3 medium 0.9600000000000001 1.0 0.8 2 2 2 2 2 2 0 0.0 1.5
104 scripted_oracle 20260437 motive_8bca20 easy 0.8800000000000001 1.0 0.4 1 1 1 1 1 1 0 0.0 1.0
105 scripted_oracle 20260438 corporate_6b1664 medium 0.9600000000000001 1.0 0.8 2 2 2 2 2 2 0 0.0 1.0
106 scripted_oracle 20260439 alibi_a6c582 easy 0.8800000000000001 1.0 0.4 1 1 1 1 1 1 0 0.0 3.0
107 scripted_oracle 20260440 workplace_835476 easy 0.8800000000000001 1.0 0.4 1 1 1 1 1 1 0 0.0 1.0
108 scripted_oracle 20260441 possession_a079c5 hard 1.0 1.0 1.2000000000000002 3 3 3 3 3 3 0 0.0 1.0
109 scripted_oracle 20260442 possession_9cc45d hard 1.0 1.0 1.2000000000000002 3 3 3 3 3 3 0 0.0 1.0
110 scripted_oracle 20260443 possession_259aa5 easy 0.8800000000000001 1.0 0.4 1 1 1 1 1 1 0 0.0 1.0
111 scripted_oracle 20260444 corporate_76724c medium 0.9600000000000001 1.0 0.8 2 2 2 2 2 2 0 0.0 1.0
112 scripted_oracle 20260445 timeline_767821 medium 0.49000000000000005 0.5 0.45 2 1 1 2 2 1 1 0.0 1.0
113 scripted_oracle 20260446 motive_c0d166 medium 0.9600000000000001 1.0 0.8 2 2 2 2 2 2 0 0.0 1.0
114 scripted_oracle 20260447 corporate_307934 hard 1.0 1.0 1.2000000000000002 3 3 3 3 3 3 0 0.0 1.0
115 scripted_oracle 20260448 timeline_592816 hard 1.0 1.0 1.2000000000000002 3 3 3 3 3 3 0 0.0 1.0
116 scripted_oracle 20260449 knowledge_b26824 medium 0.9600000000000001 1.0 0.8 2 2 2 2 2 2 0 0.0 1.5
117 scripted_oracle 20260450 knowledge_697785 hard 1.0 1.0 1.2000000000000002 3 3 3 3 3 3 0 0.0 1.6666666666666667
118 scripted_oracle 20260451 timeline_81dafd medium 0.49000000000000005 0.5 0.45 2 1 1 2 2 1 1 0.0 1.0
119 scripted_oracle 20260452 corporate_8eb7d7 medium 0.9600000000000001 1.0 0.8 2 2 2 2 2 2 0 0.0 1.0
120 scripted_oracle 20260453 possession_dbb5fe medium 0.9600000000000001 1.0 0.8 2 2 2 2 2 2 0 0.0 1.0
121 scripted_oracle 20260454 alibi_a4666f hard 1.0 1.0 1.2000000000000002 3 3 3 3 3 3 0 0.0 2.3333333333333335
122 trained_sft_grpo_run2 20260425 timeline_255d67 easy 0.010000000000000004 0.0 0.05000000000000002 1 1 0 7 0 0 0 0.7142857142857143 39.0 heavycoderhh/counsel-env-qwen3-0.6b-grpo-run2 0.0
123 trained_sft_grpo_run2 20260426 knowledge_b28f8c medium 0.42000000000000004 0.5 0.09999999999999998 2 1 1 7 1 1 0 0.8571428571428571 45.0 heavycoderhh/counsel-env-qwen3-0.6b-grpo-run2 0.0
124 trained_sft_grpo_run2 20260427 workplace_c98377 easy 0.8200000000000001 1.0 0.09999999999999998 1 1 1 7 1 1 0 0.8571428571428571 31.0 heavycoderhh/counsel-env-qwen3-0.6b-grpo-run2 0.0
125 trained_sft_grpo_run2 20260428 motive_66ff59 hard 0.2866666666666667 0.3333333333333333 0.09999999999999998 3 1 1 7 1 1 0 0.8571428571428571 31.0 heavycoderhh/counsel-env-qwen3-0.6b-grpo-run2 0.0
126 trained_sft_grpo_run2 20260429 timeline_19bb78 easy 0.010000000000000004 0.0 0.05000000000000002 1 1 0 7 0 0 0 0.7142857142857143 31.428571428571427 heavycoderhh/counsel-env-qwen3-0.6b-grpo-run2 0.0
127 trained_sft_grpo_run2 20260430 timeline_a97690 medium 0.010000000000000004 0.0 0.05000000000000002 2 1 0 7 0 0 0 0.7142857142857143 36.0 heavycoderhh/counsel-env-qwen3-0.6b-grpo-run2 0.0
128 trained_sft_grpo_run2 20260431 alibi_67ffcd medium 0.42000000000000004 0.5 0.09999999999999998 2 1 1 7 1 1 0 0.8571428571428571 35.0 heavycoderhh/counsel-env-qwen3-0.6b-grpo-run2 0.0
129 trained_sft_grpo_run2 20260432 alibi_423bca medium 0.42000000000000004 0.5 0.09999999999999998 2 1 1 7 1 1 0 0.8571428571428571 39.0 heavycoderhh/counsel-env-qwen3-0.6b-grpo-run2 0.0
130 trained_sft_grpo_run2 20260433 knowledge_960d07 medium 0.42000000000000004 0.5 0.09999999999999998 2 1 1 7 1 1 0 0.8571428571428571 39.0 heavycoderhh/counsel-env-qwen3-0.6b-grpo-run2 0.0
131 trained_sft_grpo_run2 20260434 alibi_e829c1 easy 0.8200000000000001 1.0 0.09999999999999998 1 1 1 7 1 1 0 0.8571428571428571 37.0 heavycoderhh/counsel-env-qwen3-0.6b-grpo-run2 0.0
132 trained_sft_grpo_run2 20260435 motive_85e25b hard 0.2866666666666667 0.3333333333333333 0.09999999999999998 3 1 1 7 1 1 0 0.8571428571428571 32.0 heavycoderhh/counsel-env-qwen3-0.6b-grpo-run2 0.0
133 trained_sft_grpo_run2 20260436 knowledge_a599e3 medium 0.42000000000000004 0.5 0.09999999999999998 2 1 1 7 1 1 0 0.8571428571428571 39.0 heavycoderhh/counsel-env-qwen3-0.6b-grpo-run2 0.0
134 trained_sft_grpo_run2 20260437 motive_8bca20 easy 0.8200000000000001 1.0 0.09999999999999998 1 1 1 7 1 1 0 0.8571428571428571 30.0 heavycoderhh/counsel-env-qwen3-0.6b-grpo-run2 0.0
135 trained_sft_grpo_run2 20260438 corporate_6b1664 medium 0.42000000000000004 0.5 0.09999999999999998 2 1 1 7 1 1 0 0.8571428571428571 27.0 heavycoderhh/counsel-env-qwen3-0.6b-grpo-run2 0.0
136 trained_sft_grpo_run2 20260439 alibi_a6c582 easy 0.8200000000000001 1.0 0.09999999999999998 1 1 1 7 1 1 0 0.8571428571428571 38.0 heavycoderhh/counsel-env-qwen3-0.6b-grpo-run2 0.0
137 trained_sft_grpo_run2 20260440 workplace_835476 easy 0.8200000000000001 1.0 0.09999999999999998 1 1 1 7 1 1 0 0.8571428571428571 30.0 heavycoderhh/counsel-env-qwen3-0.6b-grpo-run2 0.0
138 trained_sft_grpo_run2 20260441 possession_a079c5 hard 0.2866666666666667 0.3333333333333333 0.09999999999999998 3 1 1 7 1 1 0 0.8571428571428571 47.0 heavycoderhh/counsel-env-qwen3-0.6b-grpo-run2 0.0
139 trained_sft_grpo_run2 20260442 possession_9cc45d hard 0.2866666666666667 0.3333333333333333 0.09999999999999998 3 1 1 7 1 1 0 0.8571428571428571 43.0 heavycoderhh/counsel-env-qwen3-0.6b-grpo-run2 0.0
140 trained_sft_grpo_run2 20260443 possession_259aa5 easy 0.8200000000000001 1.0 0.09999999999999998 1 1 1 7 1 1 0 0.8571428571428571 42.0 heavycoderhh/counsel-env-qwen3-0.6b-grpo-run2 0.0
141 trained_sft_grpo_run2 20260444 corporate_76724c medium 0.42000000000000004 0.5 0.09999999999999998 2 1 1 7 1 1 0 0.8571428571428571 27.0 heavycoderhh/counsel-env-qwen3-0.6b-grpo-run2 0.0
142 trained_sft_grpo_run2 20260445 timeline_767821 medium 0.010000000000000004 0.0 0.05000000000000002 2 1 0 7 0 0 0 0.7142857142857143 44.0 heavycoderhh/counsel-env-qwen3-0.6b-grpo-run2 0.0
143 trained_sft_grpo_run2 20260446 motive_c0d166 medium 0.42000000000000004 0.5 0.09999999999999998 2 1 1 7 1 1 0 0.8571428571428571 35.0 heavycoderhh/counsel-env-qwen3-0.6b-grpo-run2 0.0
144 trained_sft_grpo_run2 20260447 corporate_307934 hard 0.2866666666666667 0.3333333333333333 0.09999999999999998 3 1 1 7 1 1 0 0.8571428571428571 27.0 heavycoderhh/counsel-env-qwen3-0.6b-grpo-run2 0.0
145 trained_sft_grpo_run2 20260448 timeline_592816 hard 0.010000000000000004 0.0 0.05000000000000002 3 1 0 7 0 0 0 0.7142857142857143 39.0 heavycoderhh/counsel-env-qwen3-0.6b-grpo-run2 0.0
146 trained_sft_grpo_run2 20260449 knowledge_b26824 medium 0.42000000000000004 0.5 0.09999999999999998 2 1 1 7 1 1 0 0.8571428571428571 38.0 heavycoderhh/counsel-env-qwen3-0.6b-grpo-run2 0.0
147 trained_sft_grpo_run2 20260450 knowledge_697785 hard 0.2866666666666667 0.3333333333333333 0.09999999999999998 3 1 1 7 1 1 0 0.8571428571428571 43.0 heavycoderhh/counsel-env-qwen3-0.6b-grpo-run2 0.0
148 trained_sft_grpo_run2 20260451 timeline_81dafd medium 0.010000000000000004 0.0 0.05000000000000002 2 1 0 7 0 0 0 0.7142857142857143 38.0 heavycoderhh/counsel-env-qwen3-0.6b-grpo-run2 0.0
149 trained_sft_grpo_run2 20260452 corporate_8eb7d7 medium 0.42000000000000004 0.5 0.09999999999999998 2 1 1 7 1 1 0 0.8571428571428571 27.0 heavycoderhh/counsel-env-qwen3-0.6b-grpo-run2 0.0
150 trained_sft_grpo_run2 20260453 possession_dbb5fe medium 0.42000000000000004 0.5 0.09999999999999998 2 1 1 7 1 1 0 0.8571428571428571 45.0 heavycoderhh/counsel-env-qwen3-0.6b-grpo-run2 0.0
151 trained_sft_grpo_run2 20260454 alibi_a4666f hard 0.2866666666666667 0.3333333333333333 0.09999999999999998 3 1 1 7 1 1 0 0.8571428571428571 39.0 heavycoderhh/counsel-env-qwen3-0.6b-grpo-run2 0.0

View File

@@ -0,0 +1,150 @@
{"agent": "random", "auxiliary_reward": -0.4, "avg_question_length": 5.0, "blind_evidence_count": 2, "case_id": "timeline_255d67", "contradictions_surfaced": 0, "contradictions_total": 1, "contradictions_triggered": 0, "difficulty": "easy", "evidence_presented": 2, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 6, "reward": 0.0, "seed": 20260425, "useless_questions_ratio": 1.0}
{"agent": "random", "auxiliary_reward": -0.4, "avg_question_length": 5.0, "blind_evidence_count": 2, "case_id": "knowledge_b28f8c", "contradictions_surfaced": 0, "contradictions_total": 2, "contradictions_triggered": 0, "difficulty": "medium", "evidence_presented": 2, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 6, "reward": 0.0, "seed": 20260426, "useless_questions_ratio": 1.0}
{"agent": "random", "auxiliary_reward": -0.4, "avg_question_length": 5.0, "blind_evidence_count": 5, "case_id": "workplace_c98377", "contradictions_surfaced": 0, "contradictions_total": 1, "contradictions_triggered": 0, "difficulty": "easy", "evidence_presented": 5, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 3, "reward": 0.0, "seed": 20260427, "useless_questions_ratio": 1.0}
{"agent": "random", "auxiliary_reward": -0.4, "avg_question_length": 5.0, "blind_evidence_count": 1, "case_id": "motive_66ff59", "contradictions_surfaced": 0, "contradictions_total": 3, "contradictions_triggered": 0, "difficulty": "hard", "evidence_presented": 1, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 7, "reward": 0.0, "seed": 20260428, "useless_questions_ratio": 1.0}
{"agent": "random", "auxiliary_reward": -0.4, "avg_question_length": 5.0, "blind_evidence_count": 4, "case_id": "timeline_19bb78", "contradictions_surfaced": 0, "contradictions_total": 1, "contradictions_triggered": 0, "difficulty": "easy", "evidence_presented": 4, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 4, "reward": 0.0, "seed": 20260429, "useless_questions_ratio": 1.0}
{"agent": "random", "auxiliary_reward": -0.4, "avg_question_length": 5.0, "blind_evidence_count": 2, "case_id": "timeline_a97690", "contradictions_surfaced": 0, "contradictions_total": 2, "contradictions_triggered": 0, "difficulty": "medium", "evidence_presented": 2, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 6, "reward": 0.0, "seed": 20260430, "useless_questions_ratio": 1.0}
{"agent": "random", "auxiliary_reward": -0.4, "avg_question_length": 5.0, "blind_evidence_count": 2, "case_id": "alibi_67ffcd", "contradictions_surfaced": 0, "contradictions_total": 2, "contradictions_triggered": 0, "difficulty": "medium", "evidence_presented": 2, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 6, "reward": 0.0, "seed": 20260431, "useless_questions_ratio": 1.0}
{"agent": "random", "auxiliary_reward": -0.4, "avg_question_length": 5.0, "blind_evidence_count": 3, "case_id": "alibi_423bca", "contradictions_surfaced": 0, "contradictions_total": 2, "contradictions_triggered": 0, "difficulty": "medium", "evidence_presented": 3, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 5, "reward": 0.0, "seed": 20260432, "useless_questions_ratio": 1.0}
{"agent": "random", "auxiliary_reward": -0.4, "avg_question_length": 5.0, "blind_evidence_count": 0, "case_id": "knowledge_960d07", "contradictions_surfaced": 0, "contradictions_total": 2, "contradictions_triggered": 0, "difficulty": "medium", "evidence_presented": 0, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 8, "reward": 0.0, "seed": 20260433, "useless_questions_ratio": 1.0}
{"agent": "random", "auxiliary_reward": -0.4, "avg_question_length": 5.0, "blind_evidence_count": 1, "case_id": "alibi_e829c1", "contradictions_surfaced": 0, "contradictions_total": 1, "contradictions_triggered": 0, "difficulty": "easy", "evidence_presented": 1, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 7, "reward": 0.0, "seed": 20260434, "useless_questions_ratio": 1.0}
{"agent": "random", "auxiliary_reward": -0.4, "avg_question_length": 5.0, "blind_evidence_count": 3, "case_id": "motive_85e25b", "contradictions_surfaced": 0, "contradictions_total": 3, "contradictions_triggered": 0, "difficulty": "hard", "evidence_presented": 3, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 5, "reward": 0.0, "seed": 20260435, "useless_questions_ratio": 1.0}
{"agent": "random", "auxiliary_reward": -0.4, "avg_question_length": 5.0, "blind_evidence_count": 4, "case_id": "knowledge_a599e3", "contradictions_surfaced": 0, "contradictions_total": 2, "contradictions_triggered": 0, "difficulty": "medium", "evidence_presented": 4, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 4, "reward": 0.0, "seed": 20260436, "useless_questions_ratio": 1.0}
{"agent": "random", "auxiliary_reward": -0.4, "avg_question_length": 5.0, "blind_evidence_count": 2, "case_id": "motive_8bca20", "contradictions_surfaced": 0, "contradictions_total": 1, "contradictions_triggered": 0, "difficulty": "easy", "evidence_presented": 2, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 6, "reward": 0.0, "seed": 20260437, "useless_questions_ratio": 1.0}
{"agent": "random", "auxiliary_reward": -0.4, "avg_question_length": 5.0, "blind_evidence_count": 2, "case_id": "corporate_6b1664", "contradictions_surfaced": 0, "contradictions_total": 2, "contradictions_triggered": 0, "difficulty": "medium", "evidence_presented": 2, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 6, "reward": 0.0, "seed": 20260438, "useless_questions_ratio": 1.0}
{"agent": "random", "auxiliary_reward": -0.4, "avg_question_length": 5.0, "blind_evidence_count": 0, "case_id": "alibi_a6c582", "contradictions_surfaced": 0, "contradictions_total": 1, "contradictions_triggered": 0, "difficulty": "easy", "evidence_presented": 0, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 8, "reward": 0.0, "seed": 20260439, "useless_questions_ratio": 1.0}
{"agent": "random", "auxiliary_reward": -0.4, "avg_question_length": 5.0, "blind_evidence_count": 3, "case_id": "workplace_835476", "contradictions_surfaced": 0, "contradictions_total": 1, "contradictions_triggered": 0, "difficulty": "easy", "evidence_presented": 3, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 5, "reward": 0.0, "seed": 20260440, "useless_questions_ratio": 1.0}
{"agent": "random", "auxiliary_reward": -0.4, "avg_question_length": 5.0, "blind_evidence_count": 0, "case_id": "possession_a079c5", "contradictions_surfaced": 0, "contradictions_total": 3, "contradictions_triggered": 0, "difficulty": "hard", "evidence_presented": 0, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 8, "reward": 0.0, "seed": 20260441, "useless_questions_ratio": 1.0}
{"agent": "random", "auxiliary_reward": -0.4, "avg_question_length": 5.0, "blind_evidence_count": 3, "case_id": "possession_9cc45d", "contradictions_surfaced": 0, "contradictions_total": 3, "contradictions_triggered": 0, "difficulty": "hard", "evidence_presented": 3, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 5, "reward": 0.0, "seed": 20260442, "useless_questions_ratio": 1.0}
{"agent": "random", "auxiliary_reward": -0.4, "avg_question_length": 5.0, "blind_evidence_count": 4, "case_id": "possession_259aa5", "contradictions_surfaced": 0, "contradictions_total": 1, "contradictions_triggered": 0, "difficulty": "easy", "evidence_presented": 4, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 4, "reward": 0.0, "seed": 20260443, "useless_questions_ratio": 1.0}
{"agent": "random", "auxiliary_reward": -0.4, "avg_question_length": 5.0, "blind_evidence_count": 3, "case_id": "corporate_76724c", "contradictions_surfaced": 0, "contradictions_total": 2, "contradictions_triggered": 0, "difficulty": "medium", "evidence_presented": 3, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 5, "reward": 0.0, "seed": 20260444, "useless_questions_ratio": 1.0}
{"agent": "random", "auxiliary_reward": -0.4, "avg_question_length": 5.0, "blind_evidence_count": 3, "case_id": "timeline_767821", "contradictions_surfaced": 0, "contradictions_total": 2, "contradictions_triggered": 0, "difficulty": "medium", "evidence_presented": 3, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 5, "reward": 0.0, "seed": 20260445, "useless_questions_ratio": 1.0}
{"agent": "random", "auxiliary_reward": -0.4, "avg_question_length": 5.0, "blind_evidence_count": 3, "case_id": "motive_c0d166", "contradictions_surfaced": 0, "contradictions_total": 2, "contradictions_triggered": 0, "difficulty": "medium", "evidence_presented": 3, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 5, "reward": 0.0, "seed": 20260446, "useless_questions_ratio": 1.0}
{"agent": "random", "auxiliary_reward": -0.4, "avg_question_length": 5.0, "blind_evidence_count": 1, "case_id": "corporate_307934", "contradictions_surfaced": 0, "contradictions_total": 3, "contradictions_triggered": 0, "difficulty": "hard", "evidence_presented": 1, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 7, "reward": 0.0, "seed": 20260447, "useless_questions_ratio": 1.0}
{"agent": "random", "auxiliary_reward": -0.4, "avg_question_length": 5.0, "blind_evidence_count": 2, "case_id": "timeline_592816", "contradictions_surfaced": 0, "contradictions_total": 3, "contradictions_triggered": 0, "difficulty": "hard", "evidence_presented": 2, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 6, "reward": 0.0, "seed": 20260448, "useless_questions_ratio": 1.0}
{"agent": "random", "auxiliary_reward": -0.4, "avg_question_length": 5.0, "blind_evidence_count": 3, "case_id": "knowledge_b26824", "contradictions_surfaced": 0, "contradictions_total": 2, "contradictions_triggered": 0, "difficulty": "medium", "evidence_presented": 3, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 5, "reward": 0.0, "seed": 20260449, "useless_questions_ratio": 1.0}
{"agent": "random", "auxiliary_reward": -0.4, "avg_question_length": 5.0, "blind_evidence_count": 1, "case_id": "knowledge_697785", "contradictions_surfaced": 0, "contradictions_total": 3, "contradictions_triggered": 0, "difficulty": "hard", "evidence_presented": 1, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 7, "reward": 0.0, "seed": 20260450, "useless_questions_ratio": 1.0}
{"agent": "random", "auxiliary_reward": -0.4, "avg_question_length": 5.0, "blind_evidence_count": 4, "case_id": "timeline_81dafd", "contradictions_surfaced": 0, "contradictions_total": 2, "contradictions_triggered": 0, "difficulty": "medium", "evidence_presented": 4, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 4, "reward": 0.0, "seed": 20260451, "useless_questions_ratio": 1.0}
{"agent": "random", "auxiliary_reward": -0.4, "avg_question_length": 5.0, "blind_evidence_count": 4, "case_id": "corporate_8eb7d7", "contradictions_surfaced": 0, "contradictions_total": 2, "contradictions_triggered": 0, "difficulty": "medium", "evidence_presented": 4, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 4, "reward": 0.0, "seed": 20260452, "useless_questions_ratio": 1.0}
{"agent": "random", "auxiliary_reward": -0.4, "avg_question_length": 5.0, "blind_evidence_count": 3, "case_id": "possession_dbb5fe", "contradictions_surfaced": 0, "contradictions_total": 2, "contradictions_triggered": 0, "difficulty": "medium", "evidence_presented": 3, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 5, "reward": 0.0, "seed": 20260453, "useless_questions_ratio": 1.0}
{"agent": "random", "auxiliary_reward": -0.4, "avg_question_length": 5.0, "blind_evidence_count": 0, "case_id": "alibi_a4666f", "contradictions_surfaced": 0, "contradictions_total": 3, "contradictions_triggered": 0, "difficulty": "hard", "evidence_presented": 0, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 8, "reward": 0.0, "seed": 20260454, "useless_questions_ratio": 1.0}
{"agent": "keyword_spam", "auxiliary_reward": 0.15000000000000002, "avg_question_length": 4.2, "blind_evidence_count": 0, "case_id": "timeline_255d67", "contradictions_surfaced": 0, "contradictions_total": 1, "contradictions_triggered": 1, "difficulty": "easy", "evidence_presented": 0, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 5, "reward": 0.030000000000000006, "seed": 20260425, "useless_questions_ratio": 0.6}
{"agent": "keyword_spam", "auxiliary_reward": 0.6000000000000001, "avg_question_length": 4.2, "blind_evidence_count": 0, "case_id": "knowledge_b28f8c", "contradictions_surfaced": 0, "contradictions_total": 2, "contradictions_triggered": 2, "difficulty": "medium", "evidence_presented": 0, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 5, "reward": 0.12000000000000002, "seed": 20260426, "useless_questions_ratio": 0.4}
{"agent": "keyword_spam", "auxiliary_reward": -0.2, "avg_question_length": 4.2, "blind_evidence_count": 0, "case_id": "workplace_c98377", "contradictions_surfaced": 0, "contradictions_total": 1, "contradictions_triggered": 0, "difficulty": "easy", "evidence_presented": 0, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 5, "reward": 0.0, "seed": 20260427, "useless_questions_ratio": 0.8}
{"agent": "keyword_spam", "auxiliary_reward": 0.8500000000000001, "avg_question_length": 4.2, "blind_evidence_count": 0, "case_id": "motive_66ff59", "contradictions_surfaced": 0, "contradictions_total": 3, "contradictions_triggered": 3, "difficulty": "hard", "evidence_presented": 0, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 5, "reward": 0.17000000000000004, "seed": 20260428, "useless_questions_ratio": 0.2}
{"agent": "keyword_spam", "auxiliary_reward": 0.15000000000000002, "avg_question_length": 4.2, "blind_evidence_count": 0, "case_id": "timeline_19bb78", "contradictions_surfaced": 0, "contradictions_total": 1, "contradictions_triggered": 1, "difficulty": "easy", "evidence_presented": 0, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 5, "reward": 0.030000000000000006, "seed": 20260429, "useless_questions_ratio": 0.6}
{"agent": "keyword_spam", "auxiliary_reward": 0.65, "avg_question_length": 4.2, "blind_evidence_count": 0, "case_id": "timeline_a97690", "contradictions_surfaced": 0, "contradictions_total": 2, "contradictions_triggered": 2, "difficulty": "medium", "evidence_presented": 0, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 5, "reward": 0.13, "seed": 20260430, "useless_questions_ratio": 0.2}
{"agent": "keyword_spam", "auxiliary_reward": 0.20000000000000004, "avg_question_length": 4.2, "blind_evidence_count": 0, "case_id": "alibi_67ffcd", "contradictions_surfaced": 0, "contradictions_total": 2, "contradictions_triggered": 1, "difficulty": "medium", "evidence_presented": 0, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 5, "reward": 0.04000000000000001, "seed": 20260431, "useless_questions_ratio": 0.4}
{"agent": "keyword_spam", "auxiliary_reward": 0.20000000000000004, "avg_question_length": 4.2, "blind_evidence_count": 0, "case_id": "alibi_423bca", "contradictions_surfaced": 0, "contradictions_total": 2, "contradictions_triggered": 1, "difficulty": "medium", "evidence_presented": 0, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 5, "reward": 0.04000000000000001, "seed": 20260432, "useless_questions_ratio": 0.4}
{"agent": "keyword_spam", "auxiliary_reward": 0.6000000000000001, "avg_question_length": 4.2, "blind_evidence_count": 0, "case_id": "knowledge_960d07", "contradictions_surfaced": 0, "contradictions_total": 2, "contradictions_triggered": 2, "difficulty": "medium", "evidence_presented": 0, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 5, "reward": 0.12000000000000002, "seed": 20260433, "useless_questions_ratio": 0.4}
{"agent": "keyword_spam", "auxiliary_reward": 0.20000000000000004, "avg_question_length": 4.2, "blind_evidence_count": 0, "case_id": "alibi_e829c1", "contradictions_surfaced": 0, "contradictions_total": 1, "contradictions_triggered": 1, "difficulty": "easy", "evidence_presented": 0, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 5, "reward": 0.04000000000000001, "seed": 20260434, "useless_questions_ratio": 0.4}
{"agent": "keyword_spam", "auxiliary_reward": 0.8500000000000001, "avg_question_length": 4.2, "blind_evidence_count": 0, "case_id": "motive_85e25b", "contradictions_surfaced": 0, "contradictions_total": 3, "contradictions_triggered": 3, "difficulty": "hard", "evidence_presented": 0, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 5, "reward": 0.17000000000000004, "seed": 20260435, "useless_questions_ratio": 0.2}
{"agent": "keyword_spam", "auxiliary_reward": 0.6000000000000001, "avg_question_length": 4.2, "blind_evidence_count": 0, "case_id": "knowledge_a599e3", "contradictions_surfaced": 0, "contradictions_total": 2, "contradictions_triggered": 2, "difficulty": "medium", "evidence_presented": 0, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 5, "reward": 0.12000000000000002, "seed": 20260436, "useless_questions_ratio": 0.4}
{"agent": "keyword_spam", "auxiliary_reward": -0.15000000000000002, "avg_question_length": 4.2, "blind_evidence_count": 0, "case_id": "motive_8bca20", "contradictions_surfaced": 0, "contradictions_total": 1, "contradictions_triggered": 0, "difficulty": "easy", "evidence_presented": 0, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 5, "reward": 0.0, "seed": 20260437, "useless_questions_ratio": 0.6}
{"agent": "keyword_spam", "auxiliary_reward": 0.10000000000000003, "avg_question_length": 4.2, "blind_evidence_count": 0, "case_id": "corporate_6b1664", "contradictions_surfaced": 0, "contradictions_total": 2, "contradictions_triggered": 1, "difficulty": "medium", "evidence_presented": 0, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 5, "reward": 0.020000000000000007, "seed": 20260438, "useless_questions_ratio": 0.8}
{"agent": "keyword_spam", "auxiliary_reward": 0.20000000000000004, "avg_question_length": 4.2, "blind_evidence_count": 0, "case_id": "alibi_a6c582", "contradictions_surfaced": 0, "contradictions_total": 1, "contradictions_triggered": 1, "difficulty": "easy", "evidence_presented": 0, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 5, "reward": 0.04000000000000001, "seed": 20260439, "useless_questions_ratio": 0.4}
{"agent": "keyword_spam", "auxiliary_reward": -0.2, "avg_question_length": 4.2, "blind_evidence_count": 0, "case_id": "workplace_835476", "contradictions_surfaced": 0, "contradictions_total": 1, "contradictions_triggered": 0, "difficulty": "easy", "evidence_presented": 0, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 5, "reward": 0.0, "seed": 20260440, "useless_questions_ratio": 0.8}
{"agent": "keyword_spam", "auxiliary_reward": 0.15000000000000002, "avg_question_length": 4.2, "blind_evidence_count": 0, "case_id": "possession_a079c5", "contradictions_surfaced": 0, "contradictions_total": 3, "contradictions_triggered": 1, "difficulty": "hard", "evidence_presented": 0, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 5, "reward": 0.030000000000000006, "seed": 20260441, "useless_questions_ratio": 0.6}
{"agent": "keyword_spam", "auxiliary_reward": 0.15000000000000002, "avg_question_length": 4.2, "blind_evidence_count": 0, "case_id": "possession_9cc45d", "contradictions_surfaced": 0, "contradictions_total": 3, "contradictions_triggered": 1, "difficulty": "hard", "evidence_presented": 0, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 5, "reward": 0.030000000000000006, "seed": 20260442, "useless_questions_ratio": 0.6}
{"agent": "keyword_spam", "auxiliary_reward": -0.2, "avg_question_length": 4.2, "blind_evidence_count": 0, "case_id": "possession_259aa5", "contradictions_surfaced": 0, "contradictions_total": 1, "contradictions_triggered": 0, "difficulty": "easy", "evidence_presented": 0, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 5, "reward": 0.0, "seed": 20260443, "useless_questions_ratio": 0.8}
{"agent": "keyword_spam", "auxiliary_reward": 0.10000000000000003, "avg_question_length": 4.2, "blind_evidence_count": 0, "case_id": "corporate_76724c", "contradictions_surfaced": 0, "contradictions_total": 2, "contradictions_triggered": 1, "difficulty": "medium", "evidence_presented": 0, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 5, "reward": 0.020000000000000007, "seed": 20260444, "useless_questions_ratio": 0.8}
{"agent": "keyword_spam", "auxiliary_reward": 0.65, "avg_question_length": 4.2, "blind_evidence_count": 0, "case_id": "timeline_767821", "contradictions_surfaced": 0, "contradictions_total": 2, "contradictions_triggered": 2, "difficulty": "medium", "evidence_presented": 0, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 5, "reward": 0.13, "seed": 20260445, "useless_questions_ratio": 0.2}
{"agent": "keyword_spam", "auxiliary_reward": 0.65, "avg_question_length": 4.2, "blind_evidence_count": 0, "case_id": "motive_c0d166", "contradictions_surfaced": 0, "contradictions_total": 2, "contradictions_triggered": 2, "difficulty": "medium", "evidence_presented": 0, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 5, "reward": 0.13, "seed": 20260446, "useless_questions_ratio": 0.2}
{"agent": "keyword_spam", "auxiliary_reward": 0.10000000000000003, "avg_question_length": 4.2, "blind_evidence_count": 0, "case_id": "corporate_307934", "contradictions_surfaced": 0, "contradictions_total": 3, "contradictions_triggered": 1, "difficulty": "hard", "evidence_presented": 0, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 5, "reward": 0.020000000000000007, "seed": 20260447, "useless_questions_ratio": 0.8}
{"agent": "keyword_spam", "auxiliary_reward": 0.95, "avg_question_length": 4.2, "blind_evidence_count": 0, "case_id": "timeline_592816", "contradictions_surfaced": 0, "contradictions_total": 3, "contradictions_triggered": 3, "difficulty": "hard", "evidence_presented": 0, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 5, "reward": 0.19, "seed": 20260448, "useless_questions_ratio": 0.2}
{"agent": "keyword_spam", "auxiliary_reward": 0.6000000000000001, "avg_question_length": 4.2, "blind_evidence_count": 0, "case_id": "knowledge_b26824", "contradictions_surfaced": 0, "contradictions_total": 2, "contradictions_triggered": 2, "difficulty": "medium", "evidence_presented": 0, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 5, "reward": 0.12000000000000002, "seed": 20260449, "useless_questions_ratio": 0.4}
{"agent": "keyword_spam", "auxiliary_reward": 0.6000000000000001, "avg_question_length": 4.2, "blind_evidence_count": 0, "case_id": "knowledge_697785", "contradictions_surfaced": 0, "contradictions_total": 3, "contradictions_triggered": 2, "difficulty": "hard", "evidence_presented": 0, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 5, "reward": 0.12000000000000002, "seed": 20260450, "useless_questions_ratio": 0.4}
{"agent": "keyword_spam", "auxiliary_reward": 0.65, "avg_question_length": 4.2, "blind_evidence_count": 0, "case_id": "timeline_81dafd", "contradictions_surfaced": 0, "contradictions_total": 2, "contradictions_triggered": 2, "difficulty": "medium", "evidence_presented": 0, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 5, "reward": 0.13, "seed": 20260451, "useless_questions_ratio": 0.2}
{"agent": "keyword_spam", "auxiliary_reward": 0.10000000000000003, "avg_question_length": 4.2, "blind_evidence_count": 0, "case_id": "corporate_8eb7d7", "contradictions_surfaced": 0, "contradictions_total": 2, "contradictions_triggered": 1, "difficulty": "medium", "evidence_presented": 0, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 5, "reward": 0.020000000000000007, "seed": 20260452, "useless_questions_ratio": 0.8}
{"agent": "keyword_spam", "auxiliary_reward": 0.15000000000000002, "avg_question_length": 4.2, "blind_evidence_count": 0, "case_id": "possession_dbb5fe", "contradictions_surfaced": 0, "contradictions_total": 2, "contradictions_triggered": 1, "difficulty": "medium", "evidence_presented": 0, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 5, "reward": 0.030000000000000006, "seed": 20260453, "useless_questions_ratio": 0.6}
{"agent": "keyword_spam", "auxiliary_reward": 0.75, "avg_question_length": 4.2, "blind_evidence_count": 0, "case_id": "alibi_a4666f", "contradictions_surfaced": 0, "contradictions_total": 3, "contradictions_triggered": 2, "difficulty": "hard", "evidence_presented": 0, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 5, "reward": 0.15000000000000002, "seed": 20260454, "useless_questions_ratio": 0.2}
{"agent": "present_all", "auxiliary_reward": -0.25, "avg_question_length": 0.0, "blind_evidence_count": 5, "case_id": "timeline_255d67", "contradictions_surfaced": 0, "contradictions_total": 1, "contradictions_triggered": 0, "difficulty": "easy", "evidence_presented": 5, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 0, "reward": 0.0, "seed": 20260425, "useless_questions_ratio": 0.0}
{"agent": "present_all", "auxiliary_reward": -0.2, "avg_question_length": 0.0, "blind_evidence_count": 4, "case_id": "knowledge_b28f8c", "contradictions_surfaced": 0, "contradictions_total": 2, "contradictions_triggered": 0, "difficulty": "medium", "evidence_presented": 4, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 0, "reward": 0.0, "seed": 20260426, "useless_questions_ratio": 0.0}
{"agent": "present_all", "auxiliary_reward": -0.25, "avg_question_length": 0.0, "blind_evidence_count": 5, "case_id": "workplace_c98377", "contradictions_surfaced": 0, "contradictions_total": 1, "contradictions_triggered": 0, "difficulty": "easy", "evidence_presented": 5, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 0, "reward": 0.0, "seed": 20260427, "useless_questions_ratio": 0.0}
{"agent": "present_all", "auxiliary_reward": -0.35000000000000003, "avg_question_length": 0.0, "blind_evidence_count": 7, "case_id": "motive_66ff59", "contradictions_surfaced": 0, "contradictions_total": 3, "contradictions_triggered": 0, "difficulty": "hard", "evidence_presented": 7, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 0, "reward": 0.0, "seed": 20260428, "useless_questions_ratio": 0.0}
{"agent": "present_all", "auxiliary_reward": -0.25, "avg_question_length": 0.0, "blind_evidence_count": 5, "case_id": "timeline_19bb78", "contradictions_surfaced": 0, "contradictions_total": 1, "contradictions_triggered": 0, "difficulty": "easy", "evidence_presented": 5, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 0, "reward": 0.0, "seed": 20260429, "useless_questions_ratio": 0.0}
{"agent": "present_all", "auxiliary_reward": -0.25, "avg_question_length": 0.0, "blind_evidence_count": 5, "case_id": "timeline_a97690", "contradictions_surfaced": 0, "contradictions_total": 2, "contradictions_triggered": 0, "difficulty": "medium", "evidence_presented": 5, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 0, "reward": 0.0, "seed": 20260430, "useless_questions_ratio": 0.0}
{"agent": "present_all", "auxiliary_reward": -0.2, "avg_question_length": 0.0, "blind_evidence_count": 4, "case_id": "alibi_67ffcd", "contradictions_surfaced": 0, "contradictions_total": 2, "contradictions_triggered": 0, "difficulty": "medium", "evidence_presented": 4, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 0, "reward": 0.0, "seed": 20260431, "useless_questions_ratio": 0.0}
{"agent": "present_all", "auxiliary_reward": -0.2, "avg_question_length": 0.0, "blind_evidence_count": 4, "case_id": "alibi_423bca", "contradictions_surfaced": 0, "contradictions_total": 2, "contradictions_triggered": 0, "difficulty": "medium", "evidence_presented": 4, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 0, "reward": 0.0, "seed": 20260432, "useless_questions_ratio": 0.0}
{"agent": "present_all", "auxiliary_reward": -0.2, "avg_question_length": 0.0, "blind_evidence_count": 4, "case_id": "knowledge_960d07", "contradictions_surfaced": 0, "contradictions_total": 2, "contradictions_triggered": 0, "difficulty": "medium", "evidence_presented": 4, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 0, "reward": 0.0, "seed": 20260433, "useless_questions_ratio": 0.0}
{"agent": "present_all", "auxiliary_reward": -0.2, "avg_question_length": 0.0, "blind_evidence_count": 4, "case_id": "alibi_e829c1", "contradictions_surfaced": 0, "contradictions_total": 1, "contradictions_triggered": 0, "difficulty": "easy", "evidence_presented": 4, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 0, "reward": 0.0, "seed": 20260434, "useless_questions_ratio": 0.0}
{"agent": "present_all", "auxiliary_reward": -0.35000000000000003, "avg_question_length": 0.0, "blind_evidence_count": 7, "case_id": "motive_85e25b", "contradictions_surfaced": 0, "contradictions_total": 3, "contradictions_triggered": 0, "difficulty": "hard", "evidence_presented": 7, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 0, "reward": 0.0, "seed": 20260435, "useless_questions_ratio": 0.0}
{"agent": "present_all", "auxiliary_reward": -0.2, "avg_question_length": 0.0, "blind_evidence_count": 4, "case_id": "knowledge_a599e3", "contradictions_surfaced": 0, "contradictions_total": 2, "contradictions_triggered": 0, "difficulty": "medium", "evidence_presented": 4, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 0, "reward": 0.0, "seed": 20260436, "useless_questions_ratio": 0.0}
{"agent": "present_all", "auxiliary_reward": -0.25, "avg_question_length": 0.0, "blind_evidence_count": 5, "case_id": "motive_8bca20", "contradictions_surfaced": 0, "contradictions_total": 1, "contradictions_triggered": 0, "difficulty": "easy", "evidence_presented": 5, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 0, "reward": 0.0, "seed": 20260437, "useless_questions_ratio": 0.0}
{"agent": "present_all", "auxiliary_reward": -0.25, "avg_question_length": 0.0, "blind_evidence_count": 5, "case_id": "corporate_6b1664", "contradictions_surfaced": 0, "contradictions_total": 2, "contradictions_triggered": 0, "difficulty": "medium", "evidence_presented": 5, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 0, "reward": 0.0, "seed": 20260438, "useless_questions_ratio": 0.0}
{"agent": "present_all", "auxiliary_reward": -0.2, "avg_question_length": 0.0, "blind_evidence_count": 4, "case_id": "alibi_a6c582", "contradictions_surfaced": 0, "contradictions_total": 1, "contradictions_triggered": 0, "difficulty": "easy", "evidence_presented": 4, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 0, "reward": 0.0, "seed": 20260439, "useless_questions_ratio": 0.0}
{"agent": "present_all", "auxiliary_reward": -0.25, "avg_question_length": 0.0, "blind_evidence_count": 5, "case_id": "workplace_835476", "contradictions_surfaced": 0, "contradictions_total": 1, "contradictions_triggered": 0, "difficulty": "easy", "evidence_presented": 5, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 0, "reward": 0.0, "seed": 20260440, "useless_questions_ratio": 0.0}
{"agent": "present_all", "auxiliary_reward": -0.30000000000000004, "avg_question_length": 0.0, "blind_evidence_count": 6, "case_id": "possession_a079c5", "contradictions_surfaced": 0, "contradictions_total": 3, "contradictions_triggered": 0, "difficulty": "hard", "evidence_presented": 6, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 0, "reward": 0.0, "seed": 20260441, "useless_questions_ratio": 0.0}
{"agent": "present_all", "auxiliary_reward": -0.30000000000000004, "avg_question_length": 0.0, "blind_evidence_count": 6, "case_id": "possession_9cc45d", "contradictions_surfaced": 0, "contradictions_total": 3, "contradictions_triggered": 0, "difficulty": "hard", "evidence_presented": 6, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 0, "reward": 0.0, "seed": 20260442, "useless_questions_ratio": 0.0}
{"agent": "present_all", "auxiliary_reward": -0.2, "avg_question_length": 0.0, "blind_evidence_count": 4, "case_id": "possession_259aa5", "contradictions_surfaced": 0, "contradictions_total": 1, "contradictions_triggered": 0, "difficulty": "easy", "evidence_presented": 4, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 0, "reward": 0.0, "seed": 20260443, "useless_questions_ratio": 0.0}
{"agent": "present_all", "auxiliary_reward": -0.25, "avg_question_length": 0.0, "blind_evidence_count": 5, "case_id": "corporate_76724c", "contradictions_surfaced": 0, "contradictions_total": 2, "contradictions_triggered": 0, "difficulty": "medium", "evidence_presented": 5, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 0, "reward": 0.0, "seed": 20260444, "useless_questions_ratio": 0.0}
{"agent": "present_all", "auxiliary_reward": -0.25, "avg_question_length": 0.0, "blind_evidence_count": 5, "case_id": "timeline_767821", "contradictions_surfaced": 0, "contradictions_total": 2, "contradictions_triggered": 0, "difficulty": "medium", "evidence_presented": 5, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 0, "reward": 0.0, "seed": 20260445, "useless_questions_ratio": 0.0}
{"agent": "present_all", "auxiliary_reward": -0.25, "avg_question_length": 0.0, "blind_evidence_count": 5, "case_id": "motive_c0d166", "contradictions_surfaced": 0, "contradictions_total": 2, "contradictions_triggered": 0, "difficulty": "medium", "evidence_presented": 5, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 0, "reward": 0.0, "seed": 20260446, "useless_questions_ratio": 0.0}
{"agent": "present_all", "auxiliary_reward": -0.35000000000000003, "avg_question_length": 0.0, "blind_evidence_count": 7, "case_id": "corporate_307934", "contradictions_surfaced": 0, "contradictions_total": 3, "contradictions_triggered": 0, "difficulty": "hard", "evidence_presented": 7, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 0, "reward": 0.0, "seed": 20260447, "useless_questions_ratio": 0.0}
{"agent": "present_all", "auxiliary_reward": -0.35000000000000003, "avg_question_length": 0.0, "blind_evidence_count": 7, "case_id": "timeline_592816", "contradictions_surfaced": 0, "contradictions_total": 3, "contradictions_triggered": 0, "difficulty": "hard", "evidence_presented": 7, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 0, "reward": 0.0, "seed": 20260448, "useless_questions_ratio": 0.0}
{"agent": "present_all", "auxiliary_reward": -0.2, "avg_question_length": 0.0, "blind_evidence_count": 4, "case_id": "knowledge_b26824", "contradictions_surfaced": 0, "contradictions_total": 2, "contradictions_triggered": 0, "difficulty": "medium", "evidence_presented": 4, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 0, "reward": 0.0, "seed": 20260449, "useless_questions_ratio": 0.0}
{"agent": "present_all", "auxiliary_reward": -0.30000000000000004, "avg_question_length": 0.0, "blind_evidence_count": 6, "case_id": "knowledge_697785", "contradictions_surfaced": 0, "contradictions_total": 3, "contradictions_triggered": 0, "difficulty": "hard", "evidence_presented": 6, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 0, "reward": 0.0, "seed": 20260450, "useless_questions_ratio": 0.0}
{"agent": "present_all", "auxiliary_reward": -0.25, "avg_question_length": 0.0, "blind_evidence_count": 5, "case_id": "timeline_81dafd", "contradictions_surfaced": 0, "contradictions_total": 2, "contradictions_triggered": 0, "difficulty": "medium", "evidence_presented": 5, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 0, "reward": 0.0, "seed": 20260451, "useless_questions_ratio": 0.0}
{"agent": "present_all", "auxiliary_reward": -0.25, "avg_question_length": 0.0, "blind_evidence_count": 5, "case_id": "corporate_8eb7d7", "contradictions_surfaced": 0, "contradictions_total": 2, "contradictions_triggered": 0, "difficulty": "medium", "evidence_presented": 5, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 0, "reward": 0.0, "seed": 20260452, "useless_questions_ratio": 0.0}
{"agent": "present_all", "auxiliary_reward": -0.2, "avg_question_length": 0.0, "blind_evidence_count": 4, "case_id": "possession_dbb5fe", "contradictions_surfaced": 0, "contradictions_total": 2, "contradictions_triggered": 0, "difficulty": "medium", "evidence_presented": 4, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 0, "reward": 0.0, "seed": 20260453, "useless_questions_ratio": 0.0}
{"agent": "present_all", "auxiliary_reward": -0.30000000000000004, "avg_question_length": 0.0, "blind_evidence_count": 6, "case_id": "alibi_a4666f", "contradictions_surfaced": 0, "contradictions_total": 3, "contradictions_triggered": 0, "difficulty": "hard", "evidence_presented": 6, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 0, "reward": 0.0, "seed": 20260454, "useless_questions_ratio": 0.0}
{"agent": "scripted_oracle", "auxiliary_reward": 0.4, "avg_question_length": 1.0, "blind_evidence_count": 0, "case_id": "timeline_255d67", "contradictions_surfaced": 1, "contradictions_total": 1, "contradictions_triggered": 1, "difficulty": "easy", "evidence_presented": 1, "evidence_timing_successes": 1, "primary_reward": 1.0, "questions_used": 1, "reward": 0.8800000000000001, "seed": 20260425, "useless_questions_ratio": 0.0}
{"agent": "scripted_oracle", "auxiliary_reward": 0.8, "avg_question_length": 1.5, "blind_evidence_count": 0, "case_id": "knowledge_b28f8c", "contradictions_surfaced": 2, "contradictions_total": 2, "contradictions_triggered": 2, "difficulty": "medium", "evidence_presented": 2, "evidence_timing_successes": 2, "primary_reward": 1.0, "questions_used": 2, "reward": 0.9600000000000001, "seed": 20260426, "useless_questions_ratio": 0.0}
{"agent": "scripted_oracle", "auxiliary_reward": 0.4, "avg_question_length": 1.0, "blind_evidence_count": 0, "case_id": "workplace_c98377", "contradictions_surfaced": 1, "contradictions_total": 1, "contradictions_triggered": 1, "difficulty": "easy", "evidence_presented": 1, "evidence_timing_successes": 1, "primary_reward": 1.0, "questions_used": 1, "reward": 0.8800000000000001, "seed": 20260427, "useless_questions_ratio": 0.0}
{"agent": "scripted_oracle", "auxiliary_reward": 1.2000000000000002, "avg_question_length": 1.0, "blind_evidence_count": 0, "case_id": "motive_66ff59", "contradictions_surfaced": 3, "contradictions_total": 3, "contradictions_triggered": 3, "difficulty": "hard", "evidence_presented": 3, "evidence_timing_successes": 3, "primary_reward": 1.0, "questions_used": 3, "reward": 1.0, "seed": 20260428, "useless_questions_ratio": 0.0}
{"agent": "scripted_oracle", "auxiliary_reward": 0.4, "avg_question_length": 1.0, "blind_evidence_count": 0, "case_id": "timeline_19bb78", "contradictions_surfaced": 1, "contradictions_total": 1, "contradictions_triggered": 1, "difficulty": "easy", "evidence_presented": 1, "evidence_timing_successes": 1, "primary_reward": 1.0, "questions_used": 1, "reward": 0.8800000000000001, "seed": 20260429, "useless_questions_ratio": 0.0}
{"agent": "scripted_oracle", "auxiliary_reward": 0.45, "avg_question_length": 1.0, "blind_evidence_count": 1, "case_id": "timeline_a97690", "contradictions_surfaced": 1, "contradictions_total": 2, "contradictions_triggered": 1, "difficulty": "medium", "evidence_presented": 2, "evidence_timing_successes": 1, "primary_reward": 0.5, "questions_used": 2, "reward": 0.49000000000000005, "seed": 20260430, "useless_questions_ratio": 0.0}
{"agent": "scripted_oracle", "auxiliary_reward": 0.8, "avg_question_length": 3.0, "blind_evidence_count": 0, "case_id": "alibi_67ffcd", "contradictions_surfaced": 2, "contradictions_total": 2, "contradictions_triggered": 2, "difficulty": "medium", "evidence_presented": 2, "evidence_timing_successes": 2, "primary_reward": 1.0, "questions_used": 2, "reward": 0.9600000000000001, "seed": 20260431, "useless_questions_ratio": 0.0}
{"agent": "scripted_oracle", "auxiliary_reward": 0.8, "avg_question_length": 3.0, "blind_evidence_count": 0, "case_id": "alibi_423bca", "contradictions_surfaced": 2, "contradictions_total": 2, "contradictions_triggered": 2, "difficulty": "medium", "evidence_presented": 2, "evidence_timing_successes": 2, "primary_reward": 1.0, "questions_used": 2, "reward": 0.9600000000000001, "seed": 20260432, "useless_questions_ratio": 0.0}
{"agent": "scripted_oracle", "auxiliary_reward": 0.8, "avg_question_length": 1.5, "blind_evidence_count": 0, "case_id": "knowledge_960d07", "contradictions_surfaced": 2, "contradictions_total": 2, "contradictions_triggered": 2, "difficulty": "medium", "evidence_presented": 2, "evidence_timing_successes": 2, "primary_reward": 1.0, "questions_used": 2, "reward": 0.9600000000000001, "seed": 20260433, "useless_questions_ratio": 0.0}
{"agent": "scripted_oracle", "auxiliary_reward": 0.4, "avg_question_length": 3.0, "blind_evidence_count": 0, "case_id": "alibi_e829c1", "contradictions_surfaced": 1, "contradictions_total": 1, "contradictions_triggered": 1, "difficulty": "easy", "evidence_presented": 1, "evidence_timing_successes": 1, "primary_reward": 1.0, "questions_used": 1, "reward": 0.8800000000000001, "seed": 20260434, "useless_questions_ratio": 0.0}
{"agent": "scripted_oracle", "auxiliary_reward": 1.2000000000000002, "avg_question_length": 1.0, "blind_evidence_count": 0, "case_id": "motive_85e25b", "contradictions_surfaced": 3, "contradictions_total": 3, "contradictions_triggered": 3, "difficulty": "hard", "evidence_presented": 3, "evidence_timing_successes": 3, "primary_reward": 1.0, "questions_used": 3, "reward": 1.0, "seed": 20260435, "useless_questions_ratio": 0.0}
{"agent": "scripted_oracle", "auxiliary_reward": 0.8, "avg_question_length": 1.5, "blind_evidence_count": 0, "case_id": "knowledge_a599e3", "contradictions_surfaced": 2, "contradictions_total": 2, "contradictions_triggered": 2, "difficulty": "medium", "evidence_presented": 2, "evidence_timing_successes": 2, "primary_reward": 1.0, "questions_used": 2, "reward": 0.9600000000000001, "seed": 20260436, "useless_questions_ratio": 0.0}
{"agent": "scripted_oracle", "auxiliary_reward": 0.4, "avg_question_length": 1.0, "blind_evidence_count": 0, "case_id": "motive_8bca20", "contradictions_surfaced": 1, "contradictions_total": 1, "contradictions_triggered": 1, "difficulty": "easy", "evidence_presented": 1, "evidence_timing_successes": 1, "primary_reward": 1.0, "questions_used": 1, "reward": 0.8800000000000001, "seed": 20260437, "useless_questions_ratio": 0.0}
{"agent": "scripted_oracle", "auxiliary_reward": 0.8, "avg_question_length": 1.0, "blind_evidence_count": 0, "case_id": "corporate_6b1664", "contradictions_surfaced": 2, "contradictions_total": 2, "contradictions_triggered": 2, "difficulty": "medium", "evidence_presented": 2, "evidence_timing_successes": 2, "primary_reward": 1.0, "questions_used": 2, "reward": 0.9600000000000001, "seed": 20260438, "useless_questions_ratio": 0.0}
{"agent": "scripted_oracle", "auxiliary_reward": 0.4, "avg_question_length": 3.0, "blind_evidence_count": 0, "case_id": "alibi_a6c582", "contradictions_surfaced": 1, "contradictions_total": 1, "contradictions_triggered": 1, "difficulty": "easy", "evidence_presented": 1, "evidence_timing_successes": 1, "primary_reward": 1.0, "questions_used": 1, "reward": 0.8800000000000001, "seed": 20260439, "useless_questions_ratio": 0.0}
{"agent": "scripted_oracle", "auxiliary_reward": 0.4, "avg_question_length": 1.0, "blind_evidence_count": 0, "case_id": "workplace_835476", "contradictions_surfaced": 1, "contradictions_total": 1, "contradictions_triggered": 1, "difficulty": "easy", "evidence_presented": 1, "evidence_timing_successes": 1, "primary_reward": 1.0, "questions_used": 1, "reward": 0.8800000000000001, "seed": 20260440, "useless_questions_ratio": 0.0}
{"agent": "scripted_oracle", "auxiliary_reward": 1.2000000000000002, "avg_question_length": 1.0, "blind_evidence_count": 0, "case_id": "possession_a079c5", "contradictions_surfaced": 3, "contradictions_total": 3, "contradictions_triggered": 3, "difficulty": "hard", "evidence_presented": 3, "evidence_timing_successes": 3, "primary_reward": 1.0, "questions_used": 3, "reward": 1.0, "seed": 20260441, "useless_questions_ratio": 0.0}
{"agent": "scripted_oracle", "auxiliary_reward": 1.2000000000000002, "avg_question_length": 1.0, "blind_evidence_count": 0, "case_id": "possession_9cc45d", "contradictions_surfaced": 3, "contradictions_total": 3, "contradictions_triggered": 3, "difficulty": "hard", "evidence_presented": 3, "evidence_timing_successes": 3, "primary_reward": 1.0, "questions_used": 3, "reward": 1.0, "seed": 20260442, "useless_questions_ratio": 0.0}
{"agent": "scripted_oracle", "auxiliary_reward": 0.4, "avg_question_length": 1.0, "blind_evidence_count": 0, "case_id": "possession_259aa5", "contradictions_surfaced": 1, "contradictions_total": 1, "contradictions_triggered": 1, "difficulty": "easy", "evidence_presented": 1, "evidence_timing_successes": 1, "primary_reward": 1.0, "questions_used": 1, "reward": 0.8800000000000001, "seed": 20260443, "useless_questions_ratio": 0.0}
{"agent": "scripted_oracle", "auxiliary_reward": 0.8, "avg_question_length": 1.0, "blind_evidence_count": 0, "case_id": "corporate_76724c", "contradictions_surfaced": 2, "contradictions_total": 2, "contradictions_triggered": 2, "difficulty": "medium", "evidence_presented": 2, "evidence_timing_successes": 2, "primary_reward": 1.0, "questions_used": 2, "reward": 0.9600000000000001, "seed": 20260444, "useless_questions_ratio": 0.0}
{"agent": "scripted_oracle", "auxiliary_reward": 0.45, "avg_question_length": 1.0, "blind_evidence_count": 1, "case_id": "timeline_767821", "contradictions_surfaced": 1, "contradictions_total": 2, "contradictions_triggered": 1, "difficulty": "medium", "evidence_presented": 2, "evidence_timing_successes": 1, "primary_reward": 0.5, "questions_used": 2, "reward": 0.49000000000000005, "seed": 20260445, "useless_questions_ratio": 0.0}
{"agent": "scripted_oracle", "auxiliary_reward": 0.8, "avg_question_length": 1.0, "blind_evidence_count": 0, "case_id": "motive_c0d166", "contradictions_surfaced": 2, "contradictions_total": 2, "contradictions_triggered": 2, "difficulty": "medium", "evidence_presented": 2, "evidence_timing_successes": 2, "primary_reward": 1.0, "questions_used": 2, "reward": 0.9600000000000001, "seed": 20260446, "useless_questions_ratio": 0.0}
{"agent": "scripted_oracle", "auxiliary_reward": 1.2000000000000002, "avg_question_length": 1.0, "blind_evidence_count": 0, "case_id": "corporate_307934", "contradictions_surfaced": 3, "contradictions_total": 3, "contradictions_triggered": 3, "difficulty": "hard", "evidence_presented": 3, "evidence_timing_successes": 3, "primary_reward": 1.0, "questions_used": 3, "reward": 1.0, "seed": 20260447, "useless_questions_ratio": 0.0}
{"agent": "scripted_oracle", "auxiliary_reward": 1.2000000000000002, "avg_question_length": 1.0, "blind_evidence_count": 0, "case_id": "timeline_592816", "contradictions_surfaced": 3, "contradictions_total": 3, "contradictions_triggered": 3, "difficulty": "hard", "evidence_presented": 3, "evidence_timing_successes": 3, "primary_reward": 1.0, "questions_used": 3, "reward": 1.0, "seed": 20260448, "useless_questions_ratio": 0.0}
{"agent": "scripted_oracle", "auxiliary_reward": 0.8, "avg_question_length": 1.5, "blind_evidence_count": 0, "case_id": "knowledge_b26824", "contradictions_surfaced": 2, "contradictions_total": 2, "contradictions_triggered": 2, "difficulty": "medium", "evidence_presented": 2, "evidence_timing_successes": 2, "primary_reward": 1.0, "questions_used": 2, "reward": 0.9600000000000001, "seed": 20260449, "useless_questions_ratio": 0.0}
{"agent": "scripted_oracle", "auxiliary_reward": 1.2000000000000002, "avg_question_length": 1.6666666666666667, "blind_evidence_count": 0, "case_id": "knowledge_697785", "contradictions_surfaced": 3, "contradictions_total": 3, "contradictions_triggered": 3, "difficulty": "hard", "evidence_presented": 3, "evidence_timing_successes": 3, "primary_reward": 1.0, "questions_used": 3, "reward": 1.0, "seed": 20260450, "useless_questions_ratio": 0.0}
{"agent": "scripted_oracle", "auxiliary_reward": 0.45, "avg_question_length": 1.0, "blind_evidence_count": 1, "case_id": "timeline_81dafd", "contradictions_surfaced": 1, "contradictions_total": 2, "contradictions_triggered": 1, "difficulty": "medium", "evidence_presented": 2, "evidence_timing_successes": 1, "primary_reward": 0.5, "questions_used": 2, "reward": 0.49000000000000005, "seed": 20260451, "useless_questions_ratio": 0.0}
{"agent": "scripted_oracle", "auxiliary_reward": 0.8, "avg_question_length": 1.0, "blind_evidence_count": 0, "case_id": "corporate_8eb7d7", "contradictions_surfaced": 2, "contradictions_total": 2, "contradictions_triggered": 2, "difficulty": "medium", "evidence_presented": 2, "evidence_timing_successes": 2, "primary_reward": 1.0, "questions_used": 2, "reward": 0.9600000000000001, "seed": 20260452, "useless_questions_ratio": 0.0}
{"agent": "scripted_oracle", "auxiliary_reward": 0.8, "avg_question_length": 1.0, "blind_evidence_count": 0, "case_id": "possession_dbb5fe", "contradictions_surfaced": 2, "contradictions_total": 2, "contradictions_triggered": 2, "difficulty": "medium", "evidence_presented": 2, "evidence_timing_successes": 2, "primary_reward": 1.0, "questions_used": 2, "reward": 0.9600000000000001, "seed": 20260453, "useless_questions_ratio": 0.0}
{"agent": "scripted_oracle", "auxiliary_reward": 1.2000000000000002, "avg_question_length": 2.3333333333333335, "blind_evidence_count": 0, "case_id": "alibi_a4666f", "contradictions_surfaced": 3, "contradictions_total": 3, "contradictions_triggered": 3, "difficulty": "hard", "evidence_presented": 3, "evidence_timing_successes": 3, "primary_reward": 1.0, "questions_used": 3, "reward": 1.0, "seed": 20260454, "useless_questions_ratio": 0.0}
{"agent": "trained_sft_grpo_run2", "auxiliary_reward": 0.05000000000000002, "avg_question_length": 39.0, "blind_evidence_count": 0, "case_id": "timeline_255d67", "contradictions_surfaced": 0, "contradictions_total": 1, "contradictions_triggered": 1, "difficulty": "easy", "evidence_presented": 0, "evidence_timing_successes": 0, "invalid_tool_calls": 0, "model_repo": "heavycoderhh/counsel-env-qwen3-0.6b-grpo-run2", "primary_reward": 0.0, "questions_used": 7, "reward": 0.010000000000000004, "seed": 20260425, "useless_questions_ratio": 0.7142857142857143}
{"agent": "trained_sft_grpo_run2", "auxiliary_reward": 0.09999999999999998, "avg_question_length": 45.0, "blind_evidence_count": 0, "case_id": "knowledge_b28f8c", "contradictions_surfaced": 1, "contradictions_total": 2, "contradictions_triggered": 1, "difficulty": "medium", "evidence_presented": 1, "evidence_timing_successes": 1, "invalid_tool_calls": 0, "model_repo": "heavycoderhh/counsel-env-qwen3-0.6b-grpo-run2", "primary_reward": 0.5, "questions_used": 7, "reward": 0.42000000000000004, "seed": 20260426, "useless_questions_ratio": 0.8571428571428571}
{"agent": "trained_sft_grpo_run2", "auxiliary_reward": 0.09999999999999998, "avg_question_length": 31.0, "blind_evidence_count": 0, "case_id": "workplace_c98377", "contradictions_surfaced": 1, "contradictions_total": 1, "contradictions_triggered": 1, "difficulty": "easy", "evidence_presented": 1, "evidence_timing_successes": 1, "invalid_tool_calls": 0, "model_repo": "heavycoderhh/counsel-env-qwen3-0.6b-grpo-run2", "primary_reward": 1.0, "questions_used": 7, "reward": 0.8200000000000001, "seed": 20260427, "useless_questions_ratio": 0.8571428571428571}
{"agent": "trained_sft_grpo_run2", "auxiliary_reward": 0.09999999999999998, "avg_question_length": 31.0, "blind_evidence_count": 0, "case_id": "motive_66ff59", "contradictions_surfaced": 1, "contradictions_total": 3, "contradictions_triggered": 1, "difficulty": "hard", "evidence_presented": 1, "evidence_timing_successes": 1, "invalid_tool_calls": 0, "model_repo": "heavycoderhh/counsel-env-qwen3-0.6b-grpo-run2", "primary_reward": 0.3333333333333333, "questions_used": 7, "reward": 0.2866666666666667, "seed": 20260428, "useless_questions_ratio": 0.8571428571428571}
{"agent": "trained_sft_grpo_run2", "auxiliary_reward": 0.05000000000000002, "avg_question_length": 31.428571428571427, "blind_evidence_count": 0, "case_id": "timeline_19bb78", "contradictions_surfaced": 0, "contradictions_total": 1, "contradictions_triggered": 1, "difficulty": "easy", "evidence_presented": 0, "evidence_timing_successes": 0, "invalid_tool_calls": 0, "model_repo": "heavycoderhh/counsel-env-qwen3-0.6b-grpo-run2", "primary_reward": 0.0, "questions_used": 7, "reward": 0.010000000000000004, "seed": 20260429, "useless_questions_ratio": 0.7142857142857143}
{"agent": "trained_sft_grpo_run2", "auxiliary_reward": 0.05000000000000002, "avg_question_length": 36.0, "blind_evidence_count": 0, "case_id": "timeline_a97690", "contradictions_surfaced": 0, "contradictions_total": 2, "contradictions_triggered": 1, "difficulty": "medium", "evidence_presented": 0, "evidence_timing_successes": 0, "invalid_tool_calls": 0, "model_repo": "heavycoderhh/counsel-env-qwen3-0.6b-grpo-run2", "primary_reward": 0.0, "questions_used": 7, "reward": 0.010000000000000004, "seed": 20260430, "useless_questions_ratio": 0.7142857142857143}
{"agent": "trained_sft_grpo_run2", "auxiliary_reward": 0.09999999999999998, "avg_question_length": 35.0, "blind_evidence_count": 0, "case_id": "alibi_67ffcd", "contradictions_surfaced": 1, "contradictions_total": 2, "contradictions_triggered": 1, "difficulty": "medium", "evidence_presented": 1, "evidence_timing_successes": 1, "invalid_tool_calls": 0, "model_repo": "heavycoderhh/counsel-env-qwen3-0.6b-grpo-run2", "primary_reward": 0.5, "questions_used": 7, "reward": 0.42000000000000004, "seed": 20260431, "useless_questions_ratio": 0.8571428571428571}
{"agent": "trained_sft_grpo_run2", "auxiliary_reward": 0.09999999999999998, "avg_question_length": 39.0, "blind_evidence_count": 0, "case_id": "alibi_423bca", "contradictions_surfaced": 1, "contradictions_total": 2, "contradictions_triggered": 1, "difficulty": "medium", "evidence_presented": 1, "evidence_timing_successes": 1, "invalid_tool_calls": 0, "model_repo": "heavycoderhh/counsel-env-qwen3-0.6b-grpo-run2", "primary_reward": 0.5, "questions_used": 7, "reward": 0.42000000000000004, "seed": 20260432, "useless_questions_ratio": 0.8571428571428571}
{"agent": "trained_sft_grpo_run2", "auxiliary_reward": 0.09999999999999998, "avg_question_length": 39.0, "blind_evidence_count": 0, "case_id": "knowledge_960d07", "contradictions_surfaced": 1, "contradictions_total": 2, "contradictions_triggered": 1, "difficulty": "medium", "evidence_presented": 1, "evidence_timing_successes": 1, "invalid_tool_calls": 0, "model_repo": "heavycoderhh/counsel-env-qwen3-0.6b-grpo-run2", "primary_reward": 0.5, "questions_used": 7, "reward": 0.42000000000000004, "seed": 20260433, "useless_questions_ratio": 0.8571428571428571}
{"agent": "trained_sft_grpo_run2", "auxiliary_reward": 0.09999999999999998, "avg_question_length": 37.0, "blind_evidence_count": 0, "case_id": "alibi_e829c1", "contradictions_surfaced": 1, "contradictions_total": 1, "contradictions_triggered": 1, "difficulty": "easy", "evidence_presented": 1, "evidence_timing_successes": 1, "invalid_tool_calls": 0, "model_repo": "heavycoderhh/counsel-env-qwen3-0.6b-grpo-run2", "primary_reward": 1.0, "questions_used": 7, "reward": 0.8200000000000001, "seed": 20260434, "useless_questions_ratio": 0.8571428571428571}
{"agent": "trained_sft_grpo_run2", "auxiliary_reward": 0.09999999999999998, "avg_question_length": 32.0, "blind_evidence_count": 0, "case_id": "motive_85e25b", "contradictions_surfaced": 1, "contradictions_total": 3, "contradictions_triggered": 1, "difficulty": "hard", "evidence_presented": 1, "evidence_timing_successes": 1, "invalid_tool_calls": 0, "model_repo": "heavycoderhh/counsel-env-qwen3-0.6b-grpo-run2", "primary_reward": 0.3333333333333333, "questions_used": 7, "reward": 0.2866666666666667, "seed": 20260435, "useless_questions_ratio": 0.8571428571428571}
{"agent": "trained_sft_grpo_run2", "auxiliary_reward": 0.09999999999999998, "avg_question_length": 39.0, "blind_evidence_count": 0, "case_id": "knowledge_a599e3", "contradictions_surfaced": 1, "contradictions_total": 2, "contradictions_triggered": 1, "difficulty": "medium", "evidence_presented": 1, "evidence_timing_successes": 1, "invalid_tool_calls": 0, "model_repo": "heavycoderhh/counsel-env-qwen3-0.6b-grpo-run2", "primary_reward": 0.5, "questions_used": 7, "reward": 0.42000000000000004, "seed": 20260436, "useless_questions_ratio": 0.8571428571428571}
{"agent": "trained_sft_grpo_run2", "auxiliary_reward": 0.09999999999999998, "avg_question_length": 30.0, "blind_evidence_count": 0, "case_id": "motive_8bca20", "contradictions_surfaced": 1, "contradictions_total": 1, "contradictions_triggered": 1, "difficulty": "easy", "evidence_presented": 1, "evidence_timing_successes": 1, "invalid_tool_calls": 0, "model_repo": "heavycoderhh/counsel-env-qwen3-0.6b-grpo-run2", "primary_reward": 1.0, "questions_used": 7, "reward": 0.8200000000000001, "seed": 20260437, "useless_questions_ratio": 0.8571428571428571}
{"agent": "trained_sft_grpo_run2", "auxiliary_reward": 0.09999999999999998, "avg_question_length": 27.0, "blind_evidence_count": 0, "case_id": "corporate_6b1664", "contradictions_surfaced": 1, "contradictions_total": 2, "contradictions_triggered": 1, "difficulty": "medium", "evidence_presented": 1, "evidence_timing_successes": 1, "invalid_tool_calls": 0, "model_repo": "heavycoderhh/counsel-env-qwen3-0.6b-grpo-run2", "primary_reward": 0.5, "questions_used": 7, "reward": 0.42000000000000004, "seed": 20260438, "useless_questions_ratio": 0.8571428571428571}
{"agent": "trained_sft_grpo_run2", "auxiliary_reward": 0.09999999999999998, "avg_question_length": 38.0, "blind_evidence_count": 0, "case_id": "alibi_a6c582", "contradictions_surfaced": 1, "contradictions_total": 1, "contradictions_triggered": 1, "difficulty": "easy", "evidence_presented": 1, "evidence_timing_successes": 1, "invalid_tool_calls": 0, "model_repo": "heavycoderhh/counsel-env-qwen3-0.6b-grpo-run2", "primary_reward": 1.0, "questions_used": 7, "reward": 0.8200000000000001, "seed": 20260439, "useless_questions_ratio": 0.8571428571428571}
{"agent": "trained_sft_grpo_run2", "auxiliary_reward": 0.09999999999999998, "avg_question_length": 30.0, "blind_evidence_count": 0, "case_id": "workplace_835476", "contradictions_surfaced": 1, "contradictions_total": 1, "contradictions_triggered": 1, "difficulty": "easy", "evidence_presented": 1, "evidence_timing_successes": 1, "invalid_tool_calls": 0, "model_repo": "heavycoderhh/counsel-env-qwen3-0.6b-grpo-run2", "primary_reward": 1.0, "questions_used": 7, "reward": 0.8200000000000001, "seed": 20260440, "useless_questions_ratio": 0.8571428571428571}
{"agent": "trained_sft_grpo_run2", "auxiliary_reward": 0.09999999999999998, "avg_question_length": 47.0, "blind_evidence_count": 0, "case_id": "possession_a079c5", "contradictions_surfaced": 1, "contradictions_total": 3, "contradictions_triggered": 1, "difficulty": "hard", "evidence_presented": 1, "evidence_timing_successes": 1, "invalid_tool_calls": 0, "model_repo": "heavycoderhh/counsel-env-qwen3-0.6b-grpo-run2", "primary_reward": 0.3333333333333333, "questions_used": 7, "reward": 0.2866666666666667, "seed": 20260441, "useless_questions_ratio": 0.8571428571428571}
{"agent": "trained_sft_grpo_run2", "auxiliary_reward": 0.09999999999999998, "avg_question_length": 43.0, "blind_evidence_count": 0, "case_id": "possession_9cc45d", "contradictions_surfaced": 1, "contradictions_total": 3, "contradictions_triggered": 1, "difficulty": "hard", "evidence_presented": 1, "evidence_timing_successes": 1, "invalid_tool_calls": 0, "model_repo": "heavycoderhh/counsel-env-qwen3-0.6b-grpo-run2", "primary_reward": 0.3333333333333333, "questions_used": 7, "reward": 0.2866666666666667, "seed": 20260442, "useless_questions_ratio": 0.8571428571428571}
{"agent": "trained_sft_grpo_run2", "auxiliary_reward": 0.09999999999999998, "avg_question_length": 42.0, "blind_evidence_count": 0, "case_id": "possession_259aa5", "contradictions_surfaced": 1, "contradictions_total": 1, "contradictions_triggered": 1, "difficulty": "easy", "evidence_presented": 1, "evidence_timing_successes": 1, "invalid_tool_calls": 0, "model_repo": "heavycoderhh/counsel-env-qwen3-0.6b-grpo-run2", "primary_reward": 1.0, "questions_used": 7, "reward": 0.8200000000000001, "seed": 20260443, "useless_questions_ratio": 0.8571428571428571}
{"agent": "trained_sft_grpo_run2", "auxiliary_reward": 0.09999999999999998, "avg_question_length": 27.0, "blind_evidence_count": 0, "case_id": "corporate_76724c", "contradictions_surfaced": 1, "contradictions_total": 2, "contradictions_triggered": 1, "difficulty": "medium", "evidence_presented": 1, "evidence_timing_successes": 1, "invalid_tool_calls": 0, "model_repo": "heavycoderhh/counsel-env-qwen3-0.6b-grpo-run2", "primary_reward": 0.5, "questions_used": 7, "reward": 0.42000000000000004, "seed": 20260444, "useless_questions_ratio": 0.8571428571428571}
{"agent": "trained_sft_grpo_run2", "auxiliary_reward": 0.05000000000000002, "avg_question_length": 44.0, "blind_evidence_count": 0, "case_id": "timeline_767821", "contradictions_surfaced": 0, "contradictions_total": 2, "contradictions_triggered": 1, "difficulty": "medium", "evidence_presented": 0, "evidence_timing_successes": 0, "invalid_tool_calls": 0, "model_repo": "heavycoderhh/counsel-env-qwen3-0.6b-grpo-run2", "primary_reward": 0.0, "questions_used": 7, "reward": 0.010000000000000004, "seed": 20260445, "useless_questions_ratio": 0.7142857142857143}
{"agent": "trained_sft_grpo_run2", "auxiliary_reward": 0.09999999999999998, "avg_question_length": 35.0, "blind_evidence_count": 0, "case_id": "motive_c0d166", "contradictions_surfaced": 1, "contradictions_total": 2, "contradictions_triggered": 1, "difficulty": "medium", "evidence_presented": 1, "evidence_timing_successes": 1, "invalid_tool_calls": 0, "model_repo": "heavycoderhh/counsel-env-qwen3-0.6b-grpo-run2", "primary_reward": 0.5, "questions_used": 7, "reward": 0.42000000000000004, "seed": 20260446, "useless_questions_ratio": 0.8571428571428571}
{"agent": "trained_sft_grpo_run2", "auxiliary_reward": 0.09999999999999998, "avg_question_length": 27.0, "blind_evidence_count": 0, "case_id": "corporate_307934", "contradictions_surfaced": 1, "contradictions_total": 3, "contradictions_triggered": 1, "difficulty": "hard", "evidence_presented": 1, "evidence_timing_successes": 1, "invalid_tool_calls": 0, "model_repo": "heavycoderhh/counsel-env-qwen3-0.6b-grpo-run2", "primary_reward": 0.3333333333333333, "questions_used": 7, "reward": 0.2866666666666667, "seed": 20260447, "useless_questions_ratio": 0.8571428571428571}
{"agent": "trained_sft_grpo_run2", "auxiliary_reward": 0.05000000000000002, "avg_question_length": 39.0, "blind_evidence_count": 0, "case_id": "timeline_592816", "contradictions_surfaced": 0, "contradictions_total": 3, "contradictions_triggered": 1, "difficulty": "hard", "evidence_presented": 0, "evidence_timing_successes": 0, "invalid_tool_calls": 0, "model_repo": "heavycoderhh/counsel-env-qwen3-0.6b-grpo-run2", "primary_reward": 0.0, "questions_used": 7, "reward": 0.010000000000000004, "seed": 20260448, "useless_questions_ratio": 0.7142857142857143}
{"agent": "trained_sft_grpo_run2", "auxiliary_reward": 0.09999999999999998, "avg_question_length": 38.0, "blind_evidence_count": 0, "case_id": "knowledge_b26824", "contradictions_surfaced": 1, "contradictions_total": 2, "contradictions_triggered": 1, "difficulty": "medium", "evidence_presented": 1, "evidence_timing_successes": 1, "invalid_tool_calls": 0, "model_repo": "heavycoderhh/counsel-env-qwen3-0.6b-grpo-run2", "primary_reward": 0.5, "questions_used": 7, "reward": 0.42000000000000004, "seed": 20260449, "useless_questions_ratio": 0.8571428571428571}
{"agent": "trained_sft_grpo_run2", "auxiliary_reward": 0.09999999999999998, "avg_question_length": 43.0, "blind_evidence_count": 0, "case_id": "knowledge_697785", "contradictions_surfaced": 1, "contradictions_total": 3, "contradictions_triggered": 1, "difficulty": "hard", "evidence_presented": 1, "evidence_timing_successes": 1, "invalid_tool_calls": 0, "model_repo": "heavycoderhh/counsel-env-qwen3-0.6b-grpo-run2", "primary_reward": 0.3333333333333333, "questions_used": 7, "reward": 0.2866666666666667, "seed": 20260450, "useless_questions_ratio": 0.8571428571428571}
{"agent": "trained_sft_grpo_run2", "auxiliary_reward": 0.05000000000000002, "avg_question_length": 38.0, "blind_evidence_count": 0, "case_id": "timeline_81dafd", "contradictions_surfaced": 0, "contradictions_total": 2, "contradictions_triggered": 1, "difficulty": "medium", "evidence_presented": 0, "evidence_timing_successes": 0, "invalid_tool_calls": 0, "model_repo": "heavycoderhh/counsel-env-qwen3-0.6b-grpo-run2", "primary_reward": 0.0, "questions_used": 7, "reward": 0.010000000000000004, "seed": 20260451, "useless_questions_ratio": 0.7142857142857143}
{"agent": "trained_sft_grpo_run2", "auxiliary_reward": 0.09999999999999998, "avg_question_length": 27.0, "blind_evidence_count": 0, "case_id": "corporate_8eb7d7", "contradictions_surfaced": 1, "contradictions_total": 2, "contradictions_triggered": 1, "difficulty": "medium", "evidence_presented": 1, "evidence_timing_successes": 1, "invalid_tool_calls": 0, "model_repo": "heavycoderhh/counsel-env-qwen3-0.6b-grpo-run2", "primary_reward": 0.5, "questions_used": 7, "reward": 0.42000000000000004, "seed": 20260452, "useless_questions_ratio": 0.8571428571428571}
{"agent": "trained_sft_grpo_run2", "auxiliary_reward": 0.09999999999999998, "avg_question_length": 45.0, "blind_evidence_count": 0, "case_id": "possession_dbb5fe", "contradictions_surfaced": 1, "contradictions_total": 2, "contradictions_triggered": 1, "difficulty": "medium", "evidence_presented": 1, "evidence_timing_successes": 1, "invalid_tool_calls": 0, "model_repo": "heavycoderhh/counsel-env-qwen3-0.6b-grpo-run2", "primary_reward": 0.5, "questions_used": 7, "reward": 0.42000000000000004, "seed": 20260453, "useless_questions_ratio": 0.8571428571428571}
{"agent": "trained_sft_grpo_run2", "auxiliary_reward": 0.09999999999999998, "avg_question_length": 39.0, "blind_evidence_count": 0, "case_id": "alibi_a4666f", "contradictions_surfaced": 1, "contradictions_total": 3, "contradictions_triggered": 1, "difficulty": "hard", "evidence_presented": 1, "evidence_timing_successes": 1, "invalid_tool_calls": 0, "model_repo": "heavycoderhh/counsel-env-qwen3-0.6b-grpo-run2", "primary_reward": 0.3333333333333333, "questions_used": 7, "reward": 0.2866666666666667, "seed": 20260454, "useless_questions_ratio": 0.8571428571428571}

View File

@@ -0,0 +1,52 @@
[
{
"agent": "keyword_spam",
"avg_evidence_timing": 0,
"avg_primary_reward": 0.0,
"avg_reward": 0.07300000000000001,
"avg_surface_rate": 0.0,
"avg_trigger_rate": 0.6777777777777778,
"avg_useless_ratio": 0.48000000000000004,
"episodes": 30
},
{
"agent": "present_all",
"avg_evidence_timing": 0,
"avg_primary_reward": 0.0,
"avg_reward": 0.0,
"avg_surface_rate": 0.0,
"avg_trigger_rate": 0.0,
"avg_useless_ratio": 0.0,
"episodes": 30
},
{
"agent": "random",
"avg_evidence_timing": 0,
"avg_primary_reward": 0.0,
"avg_reward": 0.0,
"avg_surface_rate": 0.0,
"avg_trigger_rate": 0.0,
"avg_useless_ratio": 1.0,
"episodes": 30
},
{
"agent": "scripted_oracle",
"avg_evidence_timing": 1.9,
"avg_primary_reward": 0.95,
"avg_reward": 0.9023333333333334,
"avg_surface_rate": 0.95,
"avg_trigger_rate": 0.95,
"avg_useless_ratio": 0.0,
"episodes": 30
},
{
"agent": "trained_sft_grpo_run2",
"avg_evidence_timing": 0.8,
"avg_primary_reward": 0.4611111111111111,
"avg_reward": 0.3868888888888889,
"avg_surface_rate": 0.4611111111111111,
"avg_trigger_rate": 0.5888888888888889,
"avg_useless_ratio": 0.8285714285714285,
"episodes": 30
}
]

View File

@@ -0,0 +1,137 @@
# Agent: trained_sft_grpo_run2
## timeline_255d67 (easy)
Esme Dubois is accused of assault at the public library on Walnut Street on May 28th. The victim claims the assault occurred at 11:45 PM. The defendant claims they were elsewhere at that time.
### Transcript
- Step 1 `ask_question`: Esme Dubois is accused of assault at the public library on Walnut Street on May 28th. The victim claims the assault occurred at 11:45 PM on the victim's location. The defendant claims the victim was elsewhere at that time.
- Witness: The assault happened at 11:00 PM, not 11:45 PM.
- Triggered: c1_time_of_assault; Surfaced: none
- Step 2 `present_evidence`: victor_statement
- Witness: Invalid exhibit ID.
- Triggered: c1_time_of_assault; Surfaced: none
- Step 3 `ask_question`: Esme Dubois is accused of theft at the public library on Walnut Street on May 28th. The victim claims the theft occurred at 11:45 PM on the victim's location. The defendant claims the victim was elsewhere at that time.
- Witness: The assault happened at 11:00 PM, not 11:45 PM.
- Triggered: c1_time_of_assault; Surfaced: none
- Step 4 `ask_question`: Esme Dubois is accused of theft at the public library on Walnut Street on May 28th. The victim claims the theft occurred at 11:45 PM on the victim's location. The defendant claims the victim was elsewhere at that time.
- Witness: You've already asked that question.
- Triggered: c1_time_of_assault; Surfaced: none
- Step 5 `ask_question`: Esme Dubois is accused of theft at the public library on Walnut Street on May 28th. The victim claims the theft occurred at 11:45 PM on the victim's location. The defendant claims the victim was elsewhere at that time.
- Witness: You've already asked that question.
- Triggered: c1_time_of_assault; Surfaced: none
- Step 6 `ask_question`: Esme Dubois is accused of theft at the public library on Walnut Street on May 28th. The victim claims the theft occurred at 11:45 PM on the victim's location. The defendant claims the victim was elsewhere at that time.
- Witness: You've already asked that question.
- Triggered: c1_time_of_assault; Surfaced: none
- Step 7 `ask_question`: Esme Dubois is accused of theft at the public library on Walnut Street on May 28th. The victim claims the theft occurred at 11:45 PM on the victim's location. The defendant claims the victim was elsewhere at that time.
- Witness: You've already asked that question.
- Triggered: c1_time_of_assault; Surfaced: none
- Step 8 `ask_question`: Esme Dubois is accused of theft at the public library on Walnut Street on May 28th. The victim claims the theft occurred at 11:45 PM on the victim's location. The defendant claims the victim was elsewhere at that time.
- Witness: You've already asked that question.
- Triggered: c1_time_of_assault; Surfaced: none
- Step 9 `rest_case`: rest_case
- Witness: [Counsel rests.]
- Triggered: c1_time_of_assault; Surfaced: none
### Reward
```json
{"primary_reward": 0.0, "auxiliary_reward_raw": 0.05000000000000002, "total_reward": 0.010000000000000004, "contradictions_total": 1.0, "contradictions_triggered": 1.0, "contradictions_surfaced": 0.0, "keyword_questions": 2.0, "evidence_timing_successes": 0.0, "blind_evidence_count": 0.0, "duplicate_questions": 5.0, "irrelevant_questions": 0.0, "inadmissible_actions": 1.0, "useless_questions_ratio": 0.7142857142857143, "avg_question_length": 39.0}
```
---
# Agent: trained_sft_grpo_run2
## knowledge_b28f8c (medium)
Priya Sharma is charged with embezzlement from the diner at the corner of 9th and Broad. The state alleges Priya Sharma systematically diverted funds in coordination with the manager, Kavita Iyer, between December 5th and the present. The defendant denies knowing Kavita Iyer.
### Transcript
- Step 1 `ask_question`: CASE BRIEF:
Priya Sharma is charged with embezzlement from the diner at the corner of 9th and Broad. The state alleges Priya Sharma systematically diverted funds in coordination with the manager, Kavita Iyer, between December 5th and the present. The defendant denies knowing Kavita Iyer.
- Witness: I have never met Kavita Iyer in my life.
- Triggered: c1_know_victim; Surfaced: none
- Step 2 `present_evidence`: email_chain
- Witness: [Witness stammers] I... I'm not sure what to say.
- Triggered: c1_know_victim; Surfaced: c1_know_victim
- Step 3 `ask_question`: CASE BRIEF:
Priya Sharma is charged with embezzlement from the diner at the corner of 9th and Broad. The state alleges Priya Sharma systematically diverted funds in coordination with the manager, Kavita Iyer, between December 5th and the present. The defendant denies knowing Kavita Iyer.
- Witness: You've already asked that question.
- Triggered: c1_know_victim; Surfaced: c1_know_victim
- Step 4 `ask_question`: CASE BRIEF:
Priya Sharma is charged with embezzlement from the diner at the corner of 9th and Broad. The state alleges Priya Sharma systematically diverted funds in coordination with the manager, Kavita Iyer, between December 5th and the present. The defendant denies knowing Kavita Iyer.
- Witness: You've already asked that question.
- Triggered: c1_know_victim; Surfaced: c1_know_victim
- Step 5 `ask_question`: CASE BRIEF:
Priya Sharma is charged with embezzlement from the diner at the corner of 9th and Broad. The state alleges Priya Sharma systematically diverted funds in coordination with the manager, Kavita Iyer, between December 5th and the present. The defendant denies knowing Kavita Iyer.
- Witness: You've already asked that question.
- Triggered: c1_know_victim; Surfaced: c1_know_victim
- Step 6 `ask_question`: CASE BRIEF:
Priya Sharma is charged with embezzlement from the diner at the corner of 9th and Broad. The state alleges Priya Sharma systematically diverted funds in coordination with the manager, Kavita Iyer, between December 5th and the present. The defendant denies knowing Kavita Iyer.
- Witness: You've already asked that question.
- Triggered: c1_know_victim; Surfaced: c1_know_victim
- Step 7 `ask_question`: CASE BRIEF:
Priya Sharma is charged with embezzlement from the diner at the corner of 9th and Broad. The state alleges Priya Sharma systematically diverted funds in coordination with the manager, Kavita Iyer, between December 5th and the present. The defendant denies knowing Kavita Iyer.
- Witness: You've already asked that question.
- Triggered: c1_know_victim; Surfaced: c1_know_victim
- Step 8 `ask_question`: CASE BRIEF:
Priya Sharma is charged with embezzlement from the diner at the corner of 9th and Broad. The state alleges Priya Sharma systematically diverted funds in coordination with the manager, Kavita Iyer, between December 5th and the present. The defendant denies knowing Kavita Iyer.
- Witness: You've already asked that question.
- Triggered: c1_know_victim; Surfaced: c1_know_victim
- Step 9 `rest_case`: rest_case
- Witness: [Counsel rests.]
- Triggered: c1_know_victim; Surfaced: c1_know_victim
### Reward
```json
{"primary_reward": 0.5, "auxiliary_reward_raw": 0.09999999999999998, "total_reward": 0.42000000000000004, "contradictions_total": 2.0, "contradictions_triggered": 1.0, "contradictions_surfaced": 1.0, "keyword_questions": 1.0, "evidence_timing_successes": 1.0, "blind_evidence_count": 0.0, "duplicate_questions": 6.0, "irrelevant_questions": 0.0, "inadmissible_actions": 0.0, "useless_questions_ratio": 0.8571428571428571, "avg_question_length": 45.0}
```
---
# Agent: trained_sft_grpo_run2
## workplace_c98377 (easy)
Priya Sharma is questioned in an HR investigation after Ravi Joshi reported retaliation at the HR office on March 14th. The manager denies sending threatening messages or being nearby.
### Transcript
- Step 1 `ask_question`: CASE BRIEF:
Priya Sharma is questioned in an HR investigation after Ravi Joshi reported retaliation at the HR office on March 14th. The manager denies sending threatening messages or being nearby.
- Witness: I never sent Ravi Joshi any threatening message.
- Triggered: c1_message_denial; Surfaced: none
- Step 2 `present_evidence`: message_export
- Witness: [Witness stammers] I... I'm not sure what to say.
- Triggered: c1_message_denial; Surfaced: c1_message_denial
- Step 3 `ask_question`: CASE BRIEF:
Priya Sharma is questioned in an HR investigation after Ravi Joshi reported retaliation at the HR office on March 14th. The manager denies sending threatening messages or being nearby.
- Witness: You've already asked that question.
- Triggered: c1_message_denial; Surfaced: c1_message_denial
- Step 4 `ask_question`: CASE BRIEF:
Priya Sharma is questioned in an HR investigation after Ravi Joshi reported retaliation at the HR office on March 14th. The manager denies sending threatening messages or being nearby.
- Witness: You've already asked that question.
- Triggered: c1_message_denial; Surfaced: c1_message_denial
- Step 5 `ask_question`: CASE BRIEF:
Priya Sharma is questioned in an HR investigation after Ravi Joshi reported retaliation at the HR office on March 14th. The manager denies sending threatening messages or being nearby.
- Witness: You've already asked that question.
- Triggered: c1_message_denial; Surfaced: c1_message_denial
- Step 6 `ask_question`: CASE BRIEF:
Priya Sharma is questioned in an HR investigation after Ravi Joshi reported retaliation at the HR office on March 14th. The manager denies sending threatening messages or being nearby.
- Witness: You've already asked that question.
- Triggered: c1_message_denial; Surfaced: c1_message_denial
- Step 7 `ask_question`: CASE BRIEF:
Priya Sharma is questioned in an HR investigation after Ravi Joshi reported retaliation at the HR office on March 14th. The manager denies sending threatening messages or being nearby.
- Witness: You've already asked that question.
- Triggered: c1_message_denial; Surfaced: c1_message_denial
- Step 8 `ask_question`: CASE BRIEF:
Priya Sharma is questioned in an HR investigation after Ravi Joshi reported retaliation at the HR office on March 14th. The manager denies sending threatening messages or being nearby.
- Witness: You've already asked that question.
- Triggered: c1_message_denial; Surfaced: c1_message_denial
- Step 9 `rest_case`: rest_case
- Witness: [Counsel rests.]
- Triggered: c1_message_denial; Surfaced: c1_message_denial
### Reward
```json
{"primary_reward": 1.0, "auxiliary_reward_raw": 0.09999999999999998, "total_reward": 0.8200000000000001, "contradictions_total": 1.0, "contradictions_triggered": 1.0, "contradictions_surfaced": 1.0, "keyword_questions": 1.0, "evidence_timing_successes": 1.0, "blind_evidence_count": 0.0, "duplicate_questions": 6.0, "irrelevant_questions": 0.0, "inadmissible_actions": 0.0, "useless_questions_ratio": 0.8571428571428571, "avg_question_length": 31.0}
```