初始化项目,由ModelHub XC社区提供模型
Model: heavycoderhh/counsel-env-qwen3-0.6b-grpo Source: Original Platform
This commit is contained in:
151
eval/trained_eval_rows.csv
Normal file
151
eval/trained_eval_rows.csv
Normal file
@@ -0,0 +1,151 @@
|
||||
agent,seed,case_id,difficulty,reward,primary_reward,auxiliary_reward,contradictions_total,contradictions_triggered,contradictions_surfaced,questions_used,evidence_presented,evidence_timing_successes,blind_evidence_count,useless_questions_ratio,avg_question_length,model_repo,invalid_tool_calls
|
||||
random,20260425,timeline_255d67,easy,0.0,0.0,-0.4,1,0,0,6,2,0,2,1.0,5.0,,
|
||||
random,20260426,knowledge_b28f8c,medium,0.0,0.0,-0.4,2,0,0,6,2,0,2,1.0,5.0,,
|
||||
random,20260427,workplace_c98377,easy,0.0,0.0,-0.4,1,0,0,3,5,0,5,1.0,5.0,,
|
||||
random,20260428,motive_66ff59,hard,0.0,0.0,-0.4,3,0,0,7,1,0,1,1.0,5.0,,
|
||||
random,20260429,timeline_19bb78,easy,0.0,0.0,-0.4,1,0,0,4,4,0,4,1.0,5.0,,
|
||||
random,20260430,timeline_a97690,medium,0.0,0.0,-0.4,2,0,0,6,2,0,2,1.0,5.0,,
|
||||
random,20260431,alibi_67ffcd,medium,0.0,0.0,-0.4,2,0,0,6,2,0,2,1.0,5.0,,
|
||||
random,20260432,alibi_423bca,medium,0.0,0.0,-0.4,2,0,0,5,3,0,3,1.0,5.0,,
|
||||
random,20260433,knowledge_960d07,medium,0.0,0.0,-0.4,2,0,0,8,0,0,0,1.0,5.0,,
|
||||
random,20260434,alibi_e829c1,easy,0.0,0.0,-0.4,1,0,0,7,1,0,1,1.0,5.0,,
|
||||
random,20260435,motive_85e25b,hard,0.0,0.0,-0.4,3,0,0,5,3,0,3,1.0,5.0,,
|
||||
random,20260436,knowledge_a599e3,medium,0.0,0.0,-0.4,2,0,0,4,4,0,4,1.0,5.0,,
|
||||
random,20260437,motive_8bca20,easy,0.0,0.0,-0.4,1,0,0,6,2,0,2,1.0,5.0,,
|
||||
random,20260438,corporate_6b1664,medium,0.0,0.0,-0.4,2,0,0,6,2,0,2,1.0,5.0,,
|
||||
random,20260439,alibi_a6c582,easy,0.0,0.0,-0.4,1,0,0,8,0,0,0,1.0,5.0,,
|
||||
random,20260440,workplace_835476,easy,0.0,0.0,-0.4,1,0,0,5,3,0,3,1.0,5.0,,
|
||||
random,20260441,possession_a079c5,hard,0.0,0.0,-0.4,3,0,0,8,0,0,0,1.0,5.0,,
|
||||
random,20260442,possession_9cc45d,hard,0.0,0.0,-0.4,3,0,0,5,3,0,3,1.0,5.0,,
|
||||
random,20260443,possession_259aa5,easy,0.0,0.0,-0.4,1,0,0,4,4,0,4,1.0,5.0,,
|
||||
random,20260444,corporate_76724c,medium,0.0,0.0,-0.4,2,0,0,5,3,0,3,1.0,5.0,,
|
||||
random,20260445,timeline_767821,medium,0.0,0.0,-0.4,2,0,0,5,3,0,3,1.0,5.0,,
|
||||
random,20260446,motive_c0d166,medium,0.0,0.0,-0.4,2,0,0,5,3,0,3,1.0,5.0,,
|
||||
random,20260447,corporate_307934,hard,0.0,0.0,-0.4,3,0,0,7,1,0,1,1.0,5.0,,
|
||||
random,20260448,timeline_592816,hard,0.0,0.0,-0.4,3,0,0,6,2,0,2,1.0,5.0,,
|
||||
random,20260449,knowledge_b26824,medium,0.0,0.0,-0.4,2,0,0,5,3,0,3,1.0,5.0,,
|
||||
random,20260450,knowledge_697785,hard,0.0,0.0,-0.4,3,0,0,7,1,0,1,1.0,5.0,,
|
||||
random,20260451,timeline_81dafd,medium,0.0,0.0,-0.4,2,0,0,4,4,0,4,1.0,5.0,,
|
||||
random,20260452,corporate_8eb7d7,medium,0.0,0.0,-0.4,2,0,0,4,4,0,4,1.0,5.0,,
|
||||
random,20260453,possession_dbb5fe,medium,0.0,0.0,-0.4,2,0,0,5,3,0,3,1.0,5.0,,
|
||||
random,20260454,alibi_a4666f,hard,0.0,0.0,-0.4,3,0,0,8,0,0,0,1.0,5.0,,
|
||||
keyword_spam,20260425,timeline_255d67,easy,0.030000000000000006,0.0,0.15000000000000002,1,1,0,5,0,0,0,0.6,4.2,,
|
||||
keyword_spam,20260426,knowledge_b28f8c,medium,0.12000000000000002,0.0,0.6000000000000001,2,2,0,5,0,0,0,0.4,4.2,,
|
||||
keyword_spam,20260427,workplace_c98377,easy,0.0,0.0,-0.2,1,0,0,5,0,0,0,0.8,4.2,,
|
||||
keyword_spam,20260428,motive_66ff59,hard,0.17000000000000004,0.0,0.8500000000000001,3,3,0,5,0,0,0,0.2,4.2,,
|
||||
keyword_spam,20260429,timeline_19bb78,easy,0.030000000000000006,0.0,0.15000000000000002,1,1,0,5,0,0,0,0.6,4.2,,
|
||||
keyword_spam,20260430,timeline_a97690,medium,0.13,0.0,0.65,2,2,0,5,0,0,0,0.2,4.2,,
|
||||
keyword_spam,20260431,alibi_67ffcd,medium,0.04000000000000001,0.0,0.20000000000000004,2,1,0,5,0,0,0,0.4,4.2,,
|
||||
keyword_spam,20260432,alibi_423bca,medium,0.04000000000000001,0.0,0.20000000000000004,2,1,0,5,0,0,0,0.4,4.2,,
|
||||
keyword_spam,20260433,knowledge_960d07,medium,0.12000000000000002,0.0,0.6000000000000001,2,2,0,5,0,0,0,0.4,4.2,,
|
||||
keyword_spam,20260434,alibi_e829c1,easy,0.04000000000000001,0.0,0.20000000000000004,1,1,0,5,0,0,0,0.4,4.2,,
|
||||
keyword_spam,20260435,motive_85e25b,hard,0.17000000000000004,0.0,0.8500000000000001,3,3,0,5,0,0,0,0.2,4.2,,
|
||||
keyword_spam,20260436,knowledge_a599e3,medium,0.12000000000000002,0.0,0.6000000000000001,2,2,0,5,0,0,0,0.4,4.2,,
|
||||
keyword_spam,20260437,motive_8bca20,easy,0.0,0.0,-0.15000000000000002,1,0,0,5,0,0,0,0.6,4.2,,
|
||||
keyword_spam,20260438,corporate_6b1664,medium,0.020000000000000007,0.0,0.10000000000000003,2,1,0,5,0,0,0,0.8,4.2,,
|
||||
keyword_spam,20260439,alibi_a6c582,easy,0.04000000000000001,0.0,0.20000000000000004,1,1,0,5,0,0,0,0.4,4.2,,
|
||||
keyword_spam,20260440,workplace_835476,easy,0.0,0.0,-0.2,1,0,0,5,0,0,0,0.8,4.2,,
|
||||
keyword_spam,20260441,possession_a079c5,hard,0.030000000000000006,0.0,0.15000000000000002,3,1,0,5,0,0,0,0.6,4.2,,
|
||||
keyword_spam,20260442,possession_9cc45d,hard,0.030000000000000006,0.0,0.15000000000000002,3,1,0,5,0,0,0,0.6,4.2,,
|
||||
keyword_spam,20260443,possession_259aa5,easy,0.0,0.0,-0.2,1,0,0,5,0,0,0,0.8,4.2,,
|
||||
keyword_spam,20260444,corporate_76724c,medium,0.020000000000000007,0.0,0.10000000000000003,2,1,0,5,0,0,0,0.8,4.2,,
|
||||
keyword_spam,20260445,timeline_767821,medium,0.13,0.0,0.65,2,2,0,5,0,0,0,0.2,4.2,,
|
||||
keyword_spam,20260446,motive_c0d166,medium,0.13,0.0,0.65,2,2,0,5,0,0,0,0.2,4.2,,
|
||||
keyword_spam,20260447,corporate_307934,hard,0.020000000000000007,0.0,0.10000000000000003,3,1,0,5,0,0,0,0.8,4.2,,
|
||||
keyword_spam,20260448,timeline_592816,hard,0.19,0.0,0.95,3,3,0,5,0,0,0,0.2,4.2,,
|
||||
keyword_spam,20260449,knowledge_b26824,medium,0.12000000000000002,0.0,0.6000000000000001,2,2,0,5,0,0,0,0.4,4.2,,
|
||||
keyword_spam,20260450,knowledge_697785,hard,0.12000000000000002,0.0,0.6000000000000001,3,2,0,5,0,0,0,0.4,4.2,,
|
||||
keyword_spam,20260451,timeline_81dafd,medium,0.13,0.0,0.65,2,2,0,5,0,0,0,0.2,4.2,,
|
||||
keyword_spam,20260452,corporate_8eb7d7,medium,0.020000000000000007,0.0,0.10000000000000003,2,1,0,5,0,0,0,0.8,4.2,,
|
||||
keyword_spam,20260453,possession_dbb5fe,medium,0.030000000000000006,0.0,0.15000000000000002,2,1,0,5,0,0,0,0.6,4.2,,
|
||||
keyword_spam,20260454,alibi_a4666f,hard,0.15000000000000002,0.0,0.75,3,2,0,5,0,0,0,0.2,4.2,,
|
||||
present_all,20260425,timeline_255d67,easy,0.0,0.0,-0.25,1,0,0,0,5,0,5,0.0,0.0,,
|
||||
present_all,20260426,knowledge_b28f8c,medium,0.0,0.0,-0.2,2,0,0,0,4,0,4,0.0,0.0,,
|
||||
present_all,20260427,workplace_c98377,easy,0.0,0.0,-0.25,1,0,0,0,5,0,5,0.0,0.0,,
|
||||
present_all,20260428,motive_66ff59,hard,0.0,0.0,-0.35000000000000003,3,0,0,0,7,0,7,0.0,0.0,,
|
||||
present_all,20260429,timeline_19bb78,easy,0.0,0.0,-0.25,1,0,0,0,5,0,5,0.0,0.0,,
|
||||
present_all,20260430,timeline_a97690,medium,0.0,0.0,-0.25,2,0,0,0,5,0,5,0.0,0.0,,
|
||||
present_all,20260431,alibi_67ffcd,medium,0.0,0.0,-0.2,2,0,0,0,4,0,4,0.0,0.0,,
|
||||
present_all,20260432,alibi_423bca,medium,0.0,0.0,-0.2,2,0,0,0,4,0,4,0.0,0.0,,
|
||||
present_all,20260433,knowledge_960d07,medium,0.0,0.0,-0.2,2,0,0,0,4,0,4,0.0,0.0,,
|
||||
present_all,20260434,alibi_e829c1,easy,0.0,0.0,-0.2,1,0,0,0,4,0,4,0.0,0.0,,
|
||||
present_all,20260435,motive_85e25b,hard,0.0,0.0,-0.35000000000000003,3,0,0,0,7,0,7,0.0,0.0,,
|
||||
present_all,20260436,knowledge_a599e3,medium,0.0,0.0,-0.2,2,0,0,0,4,0,4,0.0,0.0,,
|
||||
present_all,20260437,motive_8bca20,easy,0.0,0.0,-0.25,1,0,0,0,5,0,5,0.0,0.0,,
|
||||
present_all,20260438,corporate_6b1664,medium,0.0,0.0,-0.25,2,0,0,0,5,0,5,0.0,0.0,,
|
||||
present_all,20260439,alibi_a6c582,easy,0.0,0.0,-0.2,1,0,0,0,4,0,4,0.0,0.0,,
|
||||
present_all,20260440,workplace_835476,easy,0.0,0.0,-0.25,1,0,0,0,5,0,5,0.0,0.0,,
|
||||
present_all,20260441,possession_a079c5,hard,0.0,0.0,-0.30000000000000004,3,0,0,0,6,0,6,0.0,0.0,,
|
||||
present_all,20260442,possession_9cc45d,hard,0.0,0.0,-0.30000000000000004,3,0,0,0,6,0,6,0.0,0.0,,
|
||||
present_all,20260443,possession_259aa5,easy,0.0,0.0,-0.2,1,0,0,0,4,0,4,0.0,0.0,,
|
||||
present_all,20260444,corporate_76724c,medium,0.0,0.0,-0.25,2,0,0,0,5,0,5,0.0,0.0,,
|
||||
present_all,20260445,timeline_767821,medium,0.0,0.0,-0.25,2,0,0,0,5,0,5,0.0,0.0,,
|
||||
present_all,20260446,motive_c0d166,medium,0.0,0.0,-0.25,2,0,0,0,5,0,5,0.0,0.0,,
|
||||
present_all,20260447,corporate_307934,hard,0.0,0.0,-0.35000000000000003,3,0,0,0,7,0,7,0.0,0.0,,
|
||||
present_all,20260448,timeline_592816,hard,0.0,0.0,-0.35000000000000003,3,0,0,0,7,0,7,0.0,0.0,,
|
||||
present_all,20260449,knowledge_b26824,medium,0.0,0.0,-0.2,2,0,0,0,4,0,4,0.0,0.0,,
|
||||
present_all,20260450,knowledge_697785,hard,0.0,0.0,-0.30000000000000004,3,0,0,0,6,0,6,0.0,0.0,,
|
||||
present_all,20260451,timeline_81dafd,medium,0.0,0.0,-0.25,2,0,0,0,5,0,5,0.0,0.0,,
|
||||
present_all,20260452,corporate_8eb7d7,medium,0.0,0.0,-0.25,2,0,0,0,5,0,5,0.0,0.0,,
|
||||
present_all,20260453,possession_dbb5fe,medium,0.0,0.0,-0.2,2,0,0,0,4,0,4,0.0,0.0,,
|
||||
present_all,20260454,alibi_a4666f,hard,0.0,0.0,-0.30000000000000004,3,0,0,0,6,0,6,0.0,0.0,,
|
||||
scripted_oracle,20260425,timeline_255d67,easy,0.8800000000000001,1.0,0.4,1,1,1,1,1,1,0,0.0,1.0,,
|
||||
scripted_oracle,20260426,knowledge_b28f8c,medium,0.9600000000000001,1.0,0.8,2,2,2,2,2,2,0,0.0,1.5,,
|
||||
scripted_oracle,20260427,workplace_c98377,easy,0.8800000000000001,1.0,0.4,1,1,1,1,1,1,0,0.0,1.0,,
|
||||
scripted_oracle,20260428,motive_66ff59,hard,1.0,1.0,1.2000000000000002,3,3,3,3,3,3,0,0.0,1.0,,
|
||||
scripted_oracle,20260429,timeline_19bb78,easy,0.8800000000000001,1.0,0.4,1,1,1,1,1,1,0,0.0,1.0,,
|
||||
scripted_oracle,20260430,timeline_a97690,medium,0.49000000000000005,0.5,0.45,2,1,1,2,2,1,1,0.0,1.0,,
|
||||
scripted_oracle,20260431,alibi_67ffcd,medium,0.9600000000000001,1.0,0.8,2,2,2,2,2,2,0,0.0,3.0,,
|
||||
scripted_oracle,20260432,alibi_423bca,medium,0.9600000000000001,1.0,0.8,2,2,2,2,2,2,0,0.0,3.0,,
|
||||
scripted_oracle,20260433,knowledge_960d07,medium,0.9600000000000001,1.0,0.8,2,2,2,2,2,2,0,0.0,1.5,,
|
||||
scripted_oracle,20260434,alibi_e829c1,easy,0.8800000000000001,1.0,0.4,1,1,1,1,1,1,0,0.0,3.0,,
|
||||
scripted_oracle,20260435,motive_85e25b,hard,1.0,1.0,1.2000000000000002,3,3,3,3,3,3,0,0.0,1.0,,
|
||||
scripted_oracle,20260436,knowledge_a599e3,medium,0.9600000000000001,1.0,0.8,2,2,2,2,2,2,0,0.0,1.5,,
|
||||
scripted_oracle,20260437,motive_8bca20,easy,0.8800000000000001,1.0,0.4,1,1,1,1,1,1,0,0.0,1.0,,
|
||||
scripted_oracle,20260438,corporate_6b1664,medium,0.9600000000000001,1.0,0.8,2,2,2,2,2,2,0,0.0,1.0,,
|
||||
scripted_oracle,20260439,alibi_a6c582,easy,0.8800000000000001,1.0,0.4,1,1,1,1,1,1,0,0.0,3.0,,
|
||||
scripted_oracle,20260440,workplace_835476,easy,0.8800000000000001,1.0,0.4,1,1,1,1,1,1,0,0.0,1.0,,
|
||||
scripted_oracle,20260441,possession_a079c5,hard,1.0,1.0,1.2000000000000002,3,3,3,3,3,3,0,0.0,1.0,,
|
||||
scripted_oracle,20260442,possession_9cc45d,hard,1.0,1.0,1.2000000000000002,3,3,3,3,3,3,0,0.0,1.0,,
|
||||
scripted_oracle,20260443,possession_259aa5,easy,0.8800000000000001,1.0,0.4,1,1,1,1,1,1,0,0.0,1.0,,
|
||||
scripted_oracle,20260444,corporate_76724c,medium,0.9600000000000001,1.0,0.8,2,2,2,2,2,2,0,0.0,1.0,,
|
||||
scripted_oracle,20260445,timeline_767821,medium,0.49000000000000005,0.5,0.45,2,1,1,2,2,1,1,0.0,1.0,,
|
||||
scripted_oracle,20260446,motive_c0d166,medium,0.9600000000000001,1.0,0.8,2,2,2,2,2,2,0,0.0,1.0,,
|
||||
scripted_oracle,20260447,corporate_307934,hard,1.0,1.0,1.2000000000000002,3,3,3,3,3,3,0,0.0,1.0,,
|
||||
scripted_oracle,20260448,timeline_592816,hard,1.0,1.0,1.2000000000000002,3,3,3,3,3,3,0,0.0,1.0,,
|
||||
scripted_oracle,20260449,knowledge_b26824,medium,0.9600000000000001,1.0,0.8,2,2,2,2,2,2,0,0.0,1.5,,
|
||||
scripted_oracle,20260450,knowledge_697785,hard,1.0,1.0,1.2000000000000002,3,3,3,3,3,3,0,0.0,1.6666666666666667,,
|
||||
scripted_oracle,20260451,timeline_81dafd,medium,0.49000000000000005,0.5,0.45,2,1,1,2,2,1,1,0.0,1.0,,
|
||||
scripted_oracle,20260452,corporate_8eb7d7,medium,0.9600000000000001,1.0,0.8,2,2,2,2,2,2,0,0.0,1.0,,
|
||||
scripted_oracle,20260453,possession_dbb5fe,medium,0.9600000000000001,1.0,0.8,2,2,2,2,2,2,0,0.0,1.0,,
|
||||
scripted_oracle,20260454,alibi_a4666f,hard,1.0,1.0,1.2000000000000002,3,3,3,3,3,3,0,0.0,2.3333333333333335,,
|
||||
trained_grpo_200,20260425,timeline_255d67,easy,0.0,0.0,0.0,1,0,0,0,0,0,0,0.0,0.0,heavycoderhh/counsel-env-qwen3-0.6b-grpo,1.0
|
||||
trained_grpo_200,20260426,knowledge_b28f8c,medium,0.06000000000000001,0.0,0.30000000000000004,2,1,0,1,0,0,0,0.0,43.0,heavycoderhh/counsel-env-qwen3-0.6b-grpo,0.0
|
||||
trained_grpo_200,20260427,workplace_c98377,easy,0.06000000000000001,0.0,0.30000000000000004,1,1,0,1,0,0,0,0.0,29.0,heavycoderhh/counsel-env-qwen3-0.6b-grpo,0.0
|
||||
trained_grpo_200,20260428,motive_66ff59,hard,0.06000000000000001,0.0,0.30000000000000004,3,1,0,1,0,0,0,0.0,33.0,heavycoderhh/counsel-env-qwen3-0.6b-grpo,0.0
|
||||
trained_grpo_200,20260429,timeline_19bb78,easy,0.06000000000000001,0.0,0.30000000000000004,1,1,0,1,0,0,0,0.0,35.0,heavycoderhh/counsel-env-qwen3-0.6b-grpo,0.0
|
||||
trained_grpo_200,20260430,timeline_a97690,medium,0.06000000000000001,0.0,0.30000000000000004,2,1,0,1,0,0,0,0.0,36.0,heavycoderhh/counsel-env-qwen3-0.6b-grpo,0.0
|
||||
trained_grpo_200,20260431,alibi_67ffcd,medium,0.06000000000000001,0.0,0.30000000000000004,2,1,0,1,0,0,0,0.0,38.0,heavycoderhh/counsel-env-qwen3-0.6b-grpo,0.0
|
||||
trained_grpo_200,20260432,alibi_423bca,medium,0.06000000000000001,0.0,0.30000000000000004,2,1,0,1,0,0,0,0.0,39.0,heavycoderhh/counsel-env-qwen3-0.6b-grpo,0.0
|
||||
trained_grpo_200,20260433,knowledge_960d07,medium,0.06000000000000001,0.0,0.30000000000000004,2,1,0,1,0,0,0,0.0,41.0,heavycoderhh/counsel-env-qwen3-0.6b-grpo,0.0
|
||||
trained_grpo_200,20260434,alibi_e829c1,easy,0.06000000000000001,0.0,0.30000000000000004,1,1,0,1,0,0,0,0.0,39.0,heavycoderhh/counsel-env-qwen3-0.6b-grpo,0.0
|
||||
trained_grpo_200,20260435,motive_85e25b,hard,0.06000000000000001,0.0,0.30000000000000004,3,1,0,1,0,0,0,0.0,34.0,heavycoderhh/counsel-env-qwen3-0.6b-grpo,0.0
|
||||
trained_grpo_200,20260436,knowledge_a599e3,medium,0.06000000000000001,0.0,0.30000000000000004,2,1,0,1,0,0,0,0.0,41.0,heavycoderhh/counsel-env-qwen3-0.6b-grpo,0.0
|
||||
trained_grpo_200,20260437,motive_8bca20,easy,0.0,0.0,0.0,1,0,0,0,0,0,0,0.0,0.0,heavycoderhh/counsel-env-qwen3-0.6b-grpo,1.0
|
||||
trained_grpo_200,20260438,corporate_6b1664,medium,0.06000000000000001,0.0,0.30000000000000004,2,1,0,1,0,0,0,0.0,27.0,heavycoderhh/counsel-env-qwen3-0.6b-grpo,0.0
|
||||
trained_grpo_200,20260439,alibi_a6c582,easy,0.06000000000000001,0.0,0.30000000000000004,1,1,0,1,0,0,0,0.0,38.0,heavycoderhh/counsel-env-qwen3-0.6b-grpo,0.0
|
||||
trained_grpo_200,20260440,workplace_835476,easy,0.0,0.0,0.0,1,0,0,0,0,0,0,0.0,0.0,heavycoderhh/counsel-env-qwen3-0.6b-grpo,1.0
|
||||
trained_grpo_200,20260441,possession_a079c5,hard,0.06000000000000001,0.0,0.30000000000000004,3,1,0,1,0,0,0,0.0,49.0,heavycoderhh/counsel-env-qwen3-0.6b-grpo,0.0
|
||||
trained_grpo_200,20260442,possession_9cc45d,hard,0.0,0.0,0.0,3,0,0,0,0,0,0,0.0,0.0,heavycoderhh/counsel-env-qwen3-0.6b-grpo,1.0
|
||||
trained_grpo_200,20260443,possession_259aa5,easy,0.06000000000000001,0.0,0.30000000000000004,1,1,0,1,0,0,0,0.0,44.0,heavycoderhh/counsel-env-qwen3-0.6b-grpo,0.0
|
||||
trained_grpo_200,20260444,corporate_76724c,medium,0.06000000000000001,0.0,0.30000000000000004,2,1,0,1,0,0,0,0.0,29.0,heavycoderhh/counsel-env-qwen3-0.6b-grpo,0.0
|
||||
trained_grpo_200,20260445,timeline_767821,medium,0.06000000000000001,0.0,0.30000000000000004,2,1,0,1,0,0,0,0.0,37.0,heavycoderhh/counsel-env-qwen3-0.6b-grpo,0.0
|
||||
trained_grpo_200,20260446,motive_c0d166,medium,0.06000000000000001,0.0,0.30000000000000004,2,1,0,1,0,0,0,0.0,34.0,heavycoderhh/counsel-env-qwen3-0.6b-grpo,0.0
|
||||
trained_grpo_200,20260447,corporate_307934,hard,0.06000000000000001,0.0,0.30000000000000004,3,1,0,1,0,0,0,0.0,27.0,heavycoderhh/counsel-env-qwen3-0.6b-grpo,0.0
|
||||
trained_grpo_200,20260448,timeline_592816,hard,0.06000000000000001,0.0,0.30000000000000004,3,1,0,1,0,0,0,0.0,35.0,heavycoderhh/counsel-env-qwen3-0.6b-grpo,0.0
|
||||
trained_grpo_200,20260449,knowledge_b26824,medium,0.06000000000000001,0.0,0.30000000000000004,2,1,0,1,0,0,0,0.0,40.0,heavycoderhh/counsel-env-qwen3-0.6b-grpo,0.0
|
||||
trained_grpo_200,20260450,knowledge_697785,hard,0.06000000000000001,0.0,0.30000000000000004,3,1,0,1,0,0,0,0.0,41.0,heavycoderhh/counsel-env-qwen3-0.6b-grpo,0.0
|
||||
trained_grpo_200,20260451,timeline_81dafd,medium,0.06000000000000001,0.0,0.30000000000000004,2,1,0,1,0,0,0,0.0,34.0,heavycoderhh/counsel-env-qwen3-0.6b-grpo,0.0
|
||||
trained_grpo_200,20260452,corporate_8eb7d7,medium,0.0,0.0,0.0,2,0,0,0,0,0,0,0.0,0.0,heavycoderhh/counsel-env-qwen3-0.6b-grpo,1.0
|
||||
trained_grpo_200,20260453,possession_dbb5fe,medium,0.06000000000000001,0.0,0.30000000000000004,2,1,0,1,0,0,0,0.0,47.0,heavycoderhh/counsel-env-qwen3-0.6b-grpo,0.0
|
||||
trained_grpo_200,20260454,alibi_a4666f,hard,0.06000000000000001,0.0,0.30000000000000004,3,1,0,1,0,0,0,0.0,37.0,heavycoderhh/counsel-env-qwen3-0.6b-grpo,0.0
|
||||
|
150
eval/trained_eval_rows.jsonl
Normal file
150
eval/trained_eval_rows.jsonl
Normal file
@@ -0,0 +1,150 @@
|
||||
{"agent": "random", "auxiliary_reward": -0.4, "avg_question_length": 5.0, "blind_evidence_count": 2, "case_id": "timeline_255d67", "contradictions_surfaced": 0, "contradictions_total": 1, "contradictions_triggered": 0, "difficulty": "easy", "evidence_presented": 2, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 6, "reward": 0.0, "seed": 20260425, "useless_questions_ratio": 1.0}
|
||||
{"agent": "random", "auxiliary_reward": -0.4, "avg_question_length": 5.0, "blind_evidence_count": 2, "case_id": "knowledge_b28f8c", "contradictions_surfaced": 0, "contradictions_total": 2, "contradictions_triggered": 0, "difficulty": "medium", "evidence_presented": 2, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 6, "reward": 0.0, "seed": 20260426, "useless_questions_ratio": 1.0}
|
||||
{"agent": "random", "auxiliary_reward": -0.4, "avg_question_length": 5.0, "blind_evidence_count": 5, "case_id": "workplace_c98377", "contradictions_surfaced": 0, "contradictions_total": 1, "contradictions_triggered": 0, "difficulty": "easy", "evidence_presented": 5, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 3, "reward": 0.0, "seed": 20260427, "useless_questions_ratio": 1.0}
|
||||
{"agent": "random", "auxiliary_reward": -0.4, "avg_question_length": 5.0, "blind_evidence_count": 1, "case_id": "motive_66ff59", "contradictions_surfaced": 0, "contradictions_total": 3, "contradictions_triggered": 0, "difficulty": "hard", "evidence_presented": 1, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 7, "reward": 0.0, "seed": 20260428, "useless_questions_ratio": 1.0}
|
||||
{"agent": "random", "auxiliary_reward": -0.4, "avg_question_length": 5.0, "blind_evidence_count": 4, "case_id": "timeline_19bb78", "contradictions_surfaced": 0, "contradictions_total": 1, "contradictions_triggered": 0, "difficulty": "easy", "evidence_presented": 4, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 4, "reward": 0.0, "seed": 20260429, "useless_questions_ratio": 1.0}
|
||||
{"agent": "random", "auxiliary_reward": -0.4, "avg_question_length": 5.0, "blind_evidence_count": 2, "case_id": "timeline_a97690", "contradictions_surfaced": 0, "contradictions_total": 2, "contradictions_triggered": 0, "difficulty": "medium", "evidence_presented": 2, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 6, "reward": 0.0, "seed": 20260430, "useless_questions_ratio": 1.0}
|
||||
{"agent": "random", "auxiliary_reward": -0.4, "avg_question_length": 5.0, "blind_evidence_count": 2, "case_id": "alibi_67ffcd", "contradictions_surfaced": 0, "contradictions_total": 2, "contradictions_triggered": 0, "difficulty": "medium", "evidence_presented": 2, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 6, "reward": 0.0, "seed": 20260431, "useless_questions_ratio": 1.0}
|
||||
{"agent": "random", "auxiliary_reward": -0.4, "avg_question_length": 5.0, "blind_evidence_count": 3, "case_id": "alibi_423bca", "contradictions_surfaced": 0, "contradictions_total": 2, "contradictions_triggered": 0, "difficulty": "medium", "evidence_presented": 3, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 5, "reward": 0.0, "seed": 20260432, "useless_questions_ratio": 1.0}
|
||||
{"agent": "random", "auxiliary_reward": -0.4, "avg_question_length": 5.0, "blind_evidence_count": 0, "case_id": "knowledge_960d07", "contradictions_surfaced": 0, "contradictions_total": 2, "contradictions_triggered": 0, "difficulty": "medium", "evidence_presented": 0, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 8, "reward": 0.0, "seed": 20260433, "useless_questions_ratio": 1.0}
|
||||
{"agent": "random", "auxiliary_reward": -0.4, "avg_question_length": 5.0, "blind_evidence_count": 1, "case_id": "alibi_e829c1", "contradictions_surfaced": 0, "contradictions_total": 1, "contradictions_triggered": 0, "difficulty": "easy", "evidence_presented": 1, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 7, "reward": 0.0, "seed": 20260434, "useless_questions_ratio": 1.0}
|
||||
{"agent": "random", "auxiliary_reward": -0.4, "avg_question_length": 5.0, "blind_evidence_count": 3, "case_id": "motive_85e25b", "contradictions_surfaced": 0, "contradictions_total": 3, "contradictions_triggered": 0, "difficulty": "hard", "evidence_presented": 3, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 5, "reward": 0.0, "seed": 20260435, "useless_questions_ratio": 1.0}
|
||||
{"agent": "random", "auxiliary_reward": -0.4, "avg_question_length": 5.0, "blind_evidence_count": 4, "case_id": "knowledge_a599e3", "contradictions_surfaced": 0, "contradictions_total": 2, "contradictions_triggered": 0, "difficulty": "medium", "evidence_presented": 4, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 4, "reward": 0.0, "seed": 20260436, "useless_questions_ratio": 1.0}
|
||||
{"agent": "random", "auxiliary_reward": -0.4, "avg_question_length": 5.0, "blind_evidence_count": 2, "case_id": "motive_8bca20", "contradictions_surfaced": 0, "contradictions_total": 1, "contradictions_triggered": 0, "difficulty": "easy", "evidence_presented": 2, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 6, "reward": 0.0, "seed": 20260437, "useless_questions_ratio": 1.0}
|
||||
{"agent": "random", "auxiliary_reward": -0.4, "avg_question_length": 5.0, "blind_evidence_count": 2, "case_id": "corporate_6b1664", "contradictions_surfaced": 0, "contradictions_total": 2, "contradictions_triggered": 0, "difficulty": "medium", "evidence_presented": 2, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 6, "reward": 0.0, "seed": 20260438, "useless_questions_ratio": 1.0}
|
||||
{"agent": "random", "auxiliary_reward": -0.4, "avg_question_length": 5.0, "blind_evidence_count": 0, "case_id": "alibi_a6c582", "contradictions_surfaced": 0, "contradictions_total": 1, "contradictions_triggered": 0, "difficulty": "easy", "evidence_presented": 0, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 8, "reward": 0.0, "seed": 20260439, "useless_questions_ratio": 1.0}
|
||||
{"agent": "random", "auxiliary_reward": -0.4, "avg_question_length": 5.0, "blind_evidence_count": 3, "case_id": "workplace_835476", "contradictions_surfaced": 0, "contradictions_total": 1, "contradictions_triggered": 0, "difficulty": "easy", "evidence_presented": 3, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 5, "reward": 0.0, "seed": 20260440, "useless_questions_ratio": 1.0}
|
||||
{"agent": "random", "auxiliary_reward": -0.4, "avg_question_length": 5.0, "blind_evidence_count": 0, "case_id": "possession_a079c5", "contradictions_surfaced": 0, "contradictions_total": 3, "contradictions_triggered": 0, "difficulty": "hard", "evidence_presented": 0, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 8, "reward": 0.0, "seed": 20260441, "useless_questions_ratio": 1.0}
|
||||
{"agent": "random", "auxiliary_reward": -0.4, "avg_question_length": 5.0, "blind_evidence_count": 3, "case_id": "possession_9cc45d", "contradictions_surfaced": 0, "contradictions_total": 3, "contradictions_triggered": 0, "difficulty": "hard", "evidence_presented": 3, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 5, "reward": 0.0, "seed": 20260442, "useless_questions_ratio": 1.0}
|
||||
{"agent": "random", "auxiliary_reward": -0.4, "avg_question_length": 5.0, "blind_evidence_count": 4, "case_id": "possession_259aa5", "contradictions_surfaced": 0, "contradictions_total": 1, "contradictions_triggered": 0, "difficulty": "easy", "evidence_presented": 4, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 4, "reward": 0.0, "seed": 20260443, "useless_questions_ratio": 1.0}
|
||||
{"agent": "random", "auxiliary_reward": -0.4, "avg_question_length": 5.0, "blind_evidence_count": 3, "case_id": "corporate_76724c", "contradictions_surfaced": 0, "contradictions_total": 2, "contradictions_triggered": 0, "difficulty": "medium", "evidence_presented": 3, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 5, "reward": 0.0, "seed": 20260444, "useless_questions_ratio": 1.0}
|
||||
{"agent": "random", "auxiliary_reward": -0.4, "avg_question_length": 5.0, "blind_evidence_count": 3, "case_id": "timeline_767821", "contradictions_surfaced": 0, "contradictions_total": 2, "contradictions_triggered": 0, "difficulty": "medium", "evidence_presented": 3, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 5, "reward": 0.0, "seed": 20260445, "useless_questions_ratio": 1.0}
|
||||
{"agent": "random", "auxiliary_reward": -0.4, "avg_question_length": 5.0, "blind_evidence_count": 3, "case_id": "motive_c0d166", "contradictions_surfaced": 0, "contradictions_total": 2, "contradictions_triggered": 0, "difficulty": "medium", "evidence_presented": 3, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 5, "reward": 0.0, "seed": 20260446, "useless_questions_ratio": 1.0}
|
||||
{"agent": "random", "auxiliary_reward": -0.4, "avg_question_length": 5.0, "blind_evidence_count": 1, "case_id": "corporate_307934", "contradictions_surfaced": 0, "contradictions_total": 3, "contradictions_triggered": 0, "difficulty": "hard", "evidence_presented": 1, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 7, "reward": 0.0, "seed": 20260447, "useless_questions_ratio": 1.0}
|
||||
{"agent": "random", "auxiliary_reward": -0.4, "avg_question_length": 5.0, "blind_evidence_count": 2, "case_id": "timeline_592816", "contradictions_surfaced": 0, "contradictions_total": 3, "contradictions_triggered": 0, "difficulty": "hard", "evidence_presented": 2, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 6, "reward": 0.0, "seed": 20260448, "useless_questions_ratio": 1.0}
|
||||
{"agent": "random", "auxiliary_reward": -0.4, "avg_question_length": 5.0, "blind_evidence_count": 3, "case_id": "knowledge_b26824", "contradictions_surfaced": 0, "contradictions_total": 2, "contradictions_triggered": 0, "difficulty": "medium", "evidence_presented": 3, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 5, "reward": 0.0, "seed": 20260449, "useless_questions_ratio": 1.0}
|
||||
{"agent": "random", "auxiliary_reward": -0.4, "avg_question_length": 5.0, "blind_evidence_count": 1, "case_id": "knowledge_697785", "contradictions_surfaced": 0, "contradictions_total": 3, "contradictions_triggered": 0, "difficulty": "hard", "evidence_presented": 1, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 7, "reward": 0.0, "seed": 20260450, "useless_questions_ratio": 1.0}
|
||||
{"agent": "random", "auxiliary_reward": -0.4, "avg_question_length": 5.0, "blind_evidence_count": 4, "case_id": "timeline_81dafd", "contradictions_surfaced": 0, "contradictions_total": 2, "contradictions_triggered": 0, "difficulty": "medium", "evidence_presented": 4, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 4, "reward": 0.0, "seed": 20260451, "useless_questions_ratio": 1.0}
|
||||
{"agent": "random", "auxiliary_reward": -0.4, "avg_question_length": 5.0, "blind_evidence_count": 4, "case_id": "corporate_8eb7d7", "contradictions_surfaced": 0, "contradictions_total": 2, "contradictions_triggered": 0, "difficulty": "medium", "evidence_presented": 4, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 4, "reward": 0.0, "seed": 20260452, "useless_questions_ratio": 1.0}
|
||||
{"agent": "random", "auxiliary_reward": -0.4, "avg_question_length": 5.0, "blind_evidence_count": 3, "case_id": "possession_dbb5fe", "contradictions_surfaced": 0, "contradictions_total": 2, "contradictions_triggered": 0, "difficulty": "medium", "evidence_presented": 3, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 5, "reward": 0.0, "seed": 20260453, "useless_questions_ratio": 1.0}
|
||||
{"agent": "random", "auxiliary_reward": -0.4, "avg_question_length": 5.0, "blind_evidence_count": 0, "case_id": "alibi_a4666f", "contradictions_surfaced": 0, "contradictions_total": 3, "contradictions_triggered": 0, "difficulty": "hard", "evidence_presented": 0, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 8, "reward": 0.0, "seed": 20260454, "useless_questions_ratio": 1.0}
|
||||
{"agent": "keyword_spam", "auxiliary_reward": 0.15000000000000002, "avg_question_length": 4.2, "blind_evidence_count": 0, "case_id": "timeline_255d67", "contradictions_surfaced": 0, "contradictions_total": 1, "contradictions_triggered": 1, "difficulty": "easy", "evidence_presented": 0, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 5, "reward": 0.030000000000000006, "seed": 20260425, "useless_questions_ratio": 0.6}
|
||||
{"agent": "keyword_spam", "auxiliary_reward": 0.6000000000000001, "avg_question_length": 4.2, "blind_evidence_count": 0, "case_id": "knowledge_b28f8c", "contradictions_surfaced": 0, "contradictions_total": 2, "contradictions_triggered": 2, "difficulty": "medium", "evidence_presented": 0, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 5, "reward": 0.12000000000000002, "seed": 20260426, "useless_questions_ratio": 0.4}
|
||||
{"agent": "keyword_spam", "auxiliary_reward": -0.2, "avg_question_length": 4.2, "blind_evidence_count": 0, "case_id": "workplace_c98377", "contradictions_surfaced": 0, "contradictions_total": 1, "contradictions_triggered": 0, "difficulty": "easy", "evidence_presented": 0, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 5, "reward": 0.0, "seed": 20260427, "useless_questions_ratio": 0.8}
|
||||
{"agent": "keyword_spam", "auxiliary_reward": 0.8500000000000001, "avg_question_length": 4.2, "blind_evidence_count": 0, "case_id": "motive_66ff59", "contradictions_surfaced": 0, "contradictions_total": 3, "contradictions_triggered": 3, "difficulty": "hard", "evidence_presented": 0, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 5, "reward": 0.17000000000000004, "seed": 20260428, "useless_questions_ratio": 0.2}
|
||||
{"agent": "keyword_spam", "auxiliary_reward": 0.15000000000000002, "avg_question_length": 4.2, "blind_evidence_count": 0, "case_id": "timeline_19bb78", "contradictions_surfaced": 0, "contradictions_total": 1, "contradictions_triggered": 1, "difficulty": "easy", "evidence_presented": 0, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 5, "reward": 0.030000000000000006, "seed": 20260429, "useless_questions_ratio": 0.6}
|
||||
{"agent": "keyword_spam", "auxiliary_reward": 0.65, "avg_question_length": 4.2, "blind_evidence_count": 0, "case_id": "timeline_a97690", "contradictions_surfaced": 0, "contradictions_total": 2, "contradictions_triggered": 2, "difficulty": "medium", "evidence_presented": 0, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 5, "reward": 0.13, "seed": 20260430, "useless_questions_ratio": 0.2}
|
||||
{"agent": "keyword_spam", "auxiliary_reward": 0.20000000000000004, "avg_question_length": 4.2, "blind_evidence_count": 0, "case_id": "alibi_67ffcd", "contradictions_surfaced": 0, "contradictions_total": 2, "contradictions_triggered": 1, "difficulty": "medium", "evidence_presented": 0, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 5, "reward": 0.04000000000000001, "seed": 20260431, "useless_questions_ratio": 0.4}
|
||||
{"agent": "keyword_spam", "auxiliary_reward": 0.20000000000000004, "avg_question_length": 4.2, "blind_evidence_count": 0, "case_id": "alibi_423bca", "contradictions_surfaced": 0, "contradictions_total": 2, "contradictions_triggered": 1, "difficulty": "medium", "evidence_presented": 0, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 5, "reward": 0.04000000000000001, "seed": 20260432, "useless_questions_ratio": 0.4}
|
||||
{"agent": "keyword_spam", "auxiliary_reward": 0.6000000000000001, "avg_question_length": 4.2, "blind_evidence_count": 0, "case_id": "knowledge_960d07", "contradictions_surfaced": 0, "contradictions_total": 2, "contradictions_triggered": 2, "difficulty": "medium", "evidence_presented": 0, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 5, "reward": 0.12000000000000002, "seed": 20260433, "useless_questions_ratio": 0.4}
|
||||
{"agent": "keyword_spam", "auxiliary_reward": 0.20000000000000004, "avg_question_length": 4.2, "blind_evidence_count": 0, "case_id": "alibi_e829c1", "contradictions_surfaced": 0, "contradictions_total": 1, "contradictions_triggered": 1, "difficulty": "easy", "evidence_presented": 0, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 5, "reward": 0.04000000000000001, "seed": 20260434, "useless_questions_ratio": 0.4}
|
||||
{"agent": "keyword_spam", "auxiliary_reward": 0.8500000000000001, "avg_question_length": 4.2, "blind_evidence_count": 0, "case_id": "motive_85e25b", "contradictions_surfaced": 0, "contradictions_total": 3, "contradictions_triggered": 3, "difficulty": "hard", "evidence_presented": 0, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 5, "reward": 0.17000000000000004, "seed": 20260435, "useless_questions_ratio": 0.2}
|
||||
{"agent": "keyword_spam", "auxiliary_reward": 0.6000000000000001, "avg_question_length": 4.2, "blind_evidence_count": 0, "case_id": "knowledge_a599e3", "contradictions_surfaced": 0, "contradictions_total": 2, "contradictions_triggered": 2, "difficulty": "medium", "evidence_presented": 0, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 5, "reward": 0.12000000000000002, "seed": 20260436, "useless_questions_ratio": 0.4}
|
||||
{"agent": "keyword_spam", "auxiliary_reward": -0.15000000000000002, "avg_question_length": 4.2, "blind_evidence_count": 0, "case_id": "motive_8bca20", "contradictions_surfaced": 0, "contradictions_total": 1, "contradictions_triggered": 0, "difficulty": "easy", "evidence_presented": 0, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 5, "reward": 0.0, "seed": 20260437, "useless_questions_ratio": 0.6}
|
||||
{"agent": "keyword_spam", "auxiliary_reward": 0.10000000000000003, "avg_question_length": 4.2, "blind_evidence_count": 0, "case_id": "corporate_6b1664", "contradictions_surfaced": 0, "contradictions_total": 2, "contradictions_triggered": 1, "difficulty": "medium", "evidence_presented": 0, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 5, "reward": 0.020000000000000007, "seed": 20260438, "useless_questions_ratio": 0.8}
|
||||
{"agent": "keyword_spam", "auxiliary_reward": 0.20000000000000004, "avg_question_length": 4.2, "blind_evidence_count": 0, "case_id": "alibi_a6c582", "contradictions_surfaced": 0, "contradictions_total": 1, "contradictions_triggered": 1, "difficulty": "easy", "evidence_presented": 0, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 5, "reward": 0.04000000000000001, "seed": 20260439, "useless_questions_ratio": 0.4}
|
||||
{"agent": "keyword_spam", "auxiliary_reward": -0.2, "avg_question_length": 4.2, "blind_evidence_count": 0, "case_id": "workplace_835476", "contradictions_surfaced": 0, "contradictions_total": 1, "contradictions_triggered": 0, "difficulty": "easy", "evidence_presented": 0, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 5, "reward": 0.0, "seed": 20260440, "useless_questions_ratio": 0.8}
|
||||
{"agent": "keyword_spam", "auxiliary_reward": 0.15000000000000002, "avg_question_length": 4.2, "blind_evidence_count": 0, "case_id": "possession_a079c5", "contradictions_surfaced": 0, "contradictions_total": 3, "contradictions_triggered": 1, "difficulty": "hard", "evidence_presented": 0, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 5, "reward": 0.030000000000000006, "seed": 20260441, "useless_questions_ratio": 0.6}
|
||||
{"agent": "keyword_spam", "auxiliary_reward": 0.15000000000000002, "avg_question_length": 4.2, "blind_evidence_count": 0, "case_id": "possession_9cc45d", "contradictions_surfaced": 0, "contradictions_total": 3, "contradictions_triggered": 1, "difficulty": "hard", "evidence_presented": 0, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 5, "reward": 0.030000000000000006, "seed": 20260442, "useless_questions_ratio": 0.6}
|
||||
{"agent": "keyword_spam", "auxiliary_reward": -0.2, "avg_question_length": 4.2, "blind_evidence_count": 0, "case_id": "possession_259aa5", "contradictions_surfaced": 0, "contradictions_total": 1, "contradictions_triggered": 0, "difficulty": "easy", "evidence_presented": 0, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 5, "reward": 0.0, "seed": 20260443, "useless_questions_ratio": 0.8}
|
||||
{"agent": "keyword_spam", "auxiliary_reward": 0.10000000000000003, "avg_question_length": 4.2, "blind_evidence_count": 0, "case_id": "corporate_76724c", "contradictions_surfaced": 0, "contradictions_total": 2, "contradictions_triggered": 1, "difficulty": "medium", "evidence_presented": 0, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 5, "reward": 0.020000000000000007, "seed": 20260444, "useless_questions_ratio": 0.8}
|
||||
{"agent": "keyword_spam", "auxiliary_reward": 0.65, "avg_question_length": 4.2, "blind_evidence_count": 0, "case_id": "timeline_767821", "contradictions_surfaced": 0, "contradictions_total": 2, "contradictions_triggered": 2, "difficulty": "medium", "evidence_presented": 0, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 5, "reward": 0.13, "seed": 20260445, "useless_questions_ratio": 0.2}
|
||||
{"agent": "keyword_spam", "auxiliary_reward": 0.65, "avg_question_length": 4.2, "blind_evidence_count": 0, "case_id": "motive_c0d166", "contradictions_surfaced": 0, "contradictions_total": 2, "contradictions_triggered": 2, "difficulty": "medium", "evidence_presented": 0, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 5, "reward": 0.13, "seed": 20260446, "useless_questions_ratio": 0.2}
|
||||
{"agent": "keyword_spam", "auxiliary_reward": 0.10000000000000003, "avg_question_length": 4.2, "blind_evidence_count": 0, "case_id": "corporate_307934", "contradictions_surfaced": 0, "contradictions_total": 3, "contradictions_triggered": 1, "difficulty": "hard", "evidence_presented": 0, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 5, "reward": 0.020000000000000007, "seed": 20260447, "useless_questions_ratio": 0.8}
|
||||
{"agent": "keyword_spam", "auxiliary_reward": 0.95, "avg_question_length": 4.2, "blind_evidence_count": 0, "case_id": "timeline_592816", "contradictions_surfaced": 0, "contradictions_total": 3, "contradictions_triggered": 3, "difficulty": "hard", "evidence_presented": 0, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 5, "reward": 0.19, "seed": 20260448, "useless_questions_ratio": 0.2}
|
||||
{"agent": "keyword_spam", "auxiliary_reward": 0.6000000000000001, "avg_question_length": 4.2, "blind_evidence_count": 0, "case_id": "knowledge_b26824", "contradictions_surfaced": 0, "contradictions_total": 2, "contradictions_triggered": 2, "difficulty": "medium", "evidence_presented": 0, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 5, "reward": 0.12000000000000002, "seed": 20260449, "useless_questions_ratio": 0.4}
|
||||
{"agent": "keyword_spam", "auxiliary_reward": 0.6000000000000001, "avg_question_length": 4.2, "blind_evidence_count": 0, "case_id": "knowledge_697785", "contradictions_surfaced": 0, "contradictions_total": 3, "contradictions_triggered": 2, "difficulty": "hard", "evidence_presented": 0, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 5, "reward": 0.12000000000000002, "seed": 20260450, "useless_questions_ratio": 0.4}
|
||||
{"agent": "keyword_spam", "auxiliary_reward": 0.65, "avg_question_length": 4.2, "blind_evidence_count": 0, "case_id": "timeline_81dafd", "contradictions_surfaced": 0, "contradictions_total": 2, "contradictions_triggered": 2, "difficulty": "medium", "evidence_presented": 0, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 5, "reward": 0.13, "seed": 20260451, "useless_questions_ratio": 0.2}
|
||||
{"agent": "keyword_spam", "auxiliary_reward": 0.10000000000000003, "avg_question_length": 4.2, "blind_evidence_count": 0, "case_id": "corporate_8eb7d7", "contradictions_surfaced": 0, "contradictions_total": 2, "contradictions_triggered": 1, "difficulty": "medium", "evidence_presented": 0, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 5, "reward": 0.020000000000000007, "seed": 20260452, "useless_questions_ratio": 0.8}
|
||||
{"agent": "keyword_spam", "auxiliary_reward": 0.15000000000000002, "avg_question_length": 4.2, "blind_evidence_count": 0, "case_id": "possession_dbb5fe", "contradictions_surfaced": 0, "contradictions_total": 2, "contradictions_triggered": 1, "difficulty": "medium", "evidence_presented": 0, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 5, "reward": 0.030000000000000006, "seed": 20260453, "useless_questions_ratio": 0.6}
|
||||
{"agent": "keyword_spam", "auxiliary_reward": 0.75, "avg_question_length": 4.2, "blind_evidence_count": 0, "case_id": "alibi_a4666f", "contradictions_surfaced": 0, "contradictions_total": 3, "contradictions_triggered": 2, "difficulty": "hard", "evidence_presented": 0, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 5, "reward": 0.15000000000000002, "seed": 20260454, "useless_questions_ratio": 0.2}
|
||||
{"agent": "present_all", "auxiliary_reward": -0.25, "avg_question_length": 0.0, "blind_evidence_count": 5, "case_id": "timeline_255d67", "contradictions_surfaced": 0, "contradictions_total": 1, "contradictions_triggered": 0, "difficulty": "easy", "evidence_presented": 5, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 0, "reward": 0.0, "seed": 20260425, "useless_questions_ratio": 0.0}
|
||||
{"agent": "present_all", "auxiliary_reward": -0.2, "avg_question_length": 0.0, "blind_evidence_count": 4, "case_id": "knowledge_b28f8c", "contradictions_surfaced": 0, "contradictions_total": 2, "contradictions_triggered": 0, "difficulty": "medium", "evidence_presented": 4, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 0, "reward": 0.0, "seed": 20260426, "useless_questions_ratio": 0.0}
|
||||
{"agent": "present_all", "auxiliary_reward": -0.25, "avg_question_length": 0.0, "blind_evidence_count": 5, "case_id": "workplace_c98377", "contradictions_surfaced": 0, "contradictions_total": 1, "contradictions_triggered": 0, "difficulty": "easy", "evidence_presented": 5, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 0, "reward": 0.0, "seed": 20260427, "useless_questions_ratio": 0.0}
|
||||
{"agent": "present_all", "auxiliary_reward": -0.35000000000000003, "avg_question_length": 0.0, "blind_evidence_count": 7, "case_id": "motive_66ff59", "contradictions_surfaced": 0, "contradictions_total": 3, "contradictions_triggered": 0, "difficulty": "hard", "evidence_presented": 7, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 0, "reward": 0.0, "seed": 20260428, "useless_questions_ratio": 0.0}
|
||||
{"agent": "present_all", "auxiliary_reward": -0.25, "avg_question_length": 0.0, "blind_evidence_count": 5, "case_id": "timeline_19bb78", "contradictions_surfaced": 0, "contradictions_total": 1, "contradictions_triggered": 0, "difficulty": "easy", "evidence_presented": 5, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 0, "reward": 0.0, "seed": 20260429, "useless_questions_ratio": 0.0}
|
||||
{"agent": "present_all", "auxiliary_reward": -0.25, "avg_question_length": 0.0, "blind_evidence_count": 5, "case_id": "timeline_a97690", "contradictions_surfaced": 0, "contradictions_total": 2, "contradictions_triggered": 0, "difficulty": "medium", "evidence_presented": 5, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 0, "reward": 0.0, "seed": 20260430, "useless_questions_ratio": 0.0}
|
||||
{"agent": "present_all", "auxiliary_reward": -0.2, "avg_question_length": 0.0, "blind_evidence_count": 4, "case_id": "alibi_67ffcd", "contradictions_surfaced": 0, "contradictions_total": 2, "contradictions_triggered": 0, "difficulty": "medium", "evidence_presented": 4, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 0, "reward": 0.0, "seed": 20260431, "useless_questions_ratio": 0.0}
|
||||
{"agent": "present_all", "auxiliary_reward": -0.2, "avg_question_length": 0.0, "blind_evidence_count": 4, "case_id": "alibi_423bca", "contradictions_surfaced": 0, "contradictions_total": 2, "contradictions_triggered": 0, "difficulty": "medium", "evidence_presented": 4, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 0, "reward": 0.0, "seed": 20260432, "useless_questions_ratio": 0.0}
|
||||
{"agent": "present_all", "auxiliary_reward": -0.2, "avg_question_length": 0.0, "blind_evidence_count": 4, "case_id": "knowledge_960d07", "contradictions_surfaced": 0, "contradictions_total": 2, "contradictions_triggered": 0, "difficulty": "medium", "evidence_presented": 4, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 0, "reward": 0.0, "seed": 20260433, "useless_questions_ratio": 0.0}
|
||||
{"agent": "present_all", "auxiliary_reward": -0.2, "avg_question_length": 0.0, "blind_evidence_count": 4, "case_id": "alibi_e829c1", "contradictions_surfaced": 0, "contradictions_total": 1, "contradictions_triggered": 0, "difficulty": "easy", "evidence_presented": 4, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 0, "reward": 0.0, "seed": 20260434, "useless_questions_ratio": 0.0}
|
||||
{"agent": "present_all", "auxiliary_reward": -0.35000000000000003, "avg_question_length": 0.0, "blind_evidence_count": 7, "case_id": "motive_85e25b", "contradictions_surfaced": 0, "contradictions_total": 3, "contradictions_triggered": 0, "difficulty": "hard", "evidence_presented": 7, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 0, "reward": 0.0, "seed": 20260435, "useless_questions_ratio": 0.0}
|
||||
{"agent": "present_all", "auxiliary_reward": -0.2, "avg_question_length": 0.0, "blind_evidence_count": 4, "case_id": "knowledge_a599e3", "contradictions_surfaced": 0, "contradictions_total": 2, "contradictions_triggered": 0, "difficulty": "medium", "evidence_presented": 4, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 0, "reward": 0.0, "seed": 20260436, "useless_questions_ratio": 0.0}
|
||||
{"agent": "present_all", "auxiliary_reward": -0.25, "avg_question_length": 0.0, "blind_evidence_count": 5, "case_id": "motive_8bca20", "contradictions_surfaced": 0, "contradictions_total": 1, "contradictions_triggered": 0, "difficulty": "easy", "evidence_presented": 5, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 0, "reward": 0.0, "seed": 20260437, "useless_questions_ratio": 0.0}
|
||||
{"agent": "present_all", "auxiliary_reward": -0.25, "avg_question_length": 0.0, "blind_evidence_count": 5, "case_id": "corporate_6b1664", "contradictions_surfaced": 0, "contradictions_total": 2, "contradictions_triggered": 0, "difficulty": "medium", "evidence_presented": 5, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 0, "reward": 0.0, "seed": 20260438, "useless_questions_ratio": 0.0}
|
||||
{"agent": "present_all", "auxiliary_reward": -0.2, "avg_question_length": 0.0, "blind_evidence_count": 4, "case_id": "alibi_a6c582", "contradictions_surfaced": 0, "contradictions_total": 1, "contradictions_triggered": 0, "difficulty": "easy", "evidence_presented": 4, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 0, "reward": 0.0, "seed": 20260439, "useless_questions_ratio": 0.0}
|
||||
{"agent": "present_all", "auxiliary_reward": -0.25, "avg_question_length": 0.0, "blind_evidence_count": 5, "case_id": "workplace_835476", "contradictions_surfaced": 0, "contradictions_total": 1, "contradictions_triggered": 0, "difficulty": "easy", "evidence_presented": 5, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 0, "reward": 0.0, "seed": 20260440, "useless_questions_ratio": 0.0}
|
||||
{"agent": "present_all", "auxiliary_reward": -0.30000000000000004, "avg_question_length": 0.0, "blind_evidence_count": 6, "case_id": "possession_a079c5", "contradictions_surfaced": 0, "contradictions_total": 3, "contradictions_triggered": 0, "difficulty": "hard", "evidence_presented": 6, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 0, "reward": 0.0, "seed": 20260441, "useless_questions_ratio": 0.0}
|
||||
{"agent": "present_all", "auxiliary_reward": -0.30000000000000004, "avg_question_length": 0.0, "blind_evidence_count": 6, "case_id": "possession_9cc45d", "contradictions_surfaced": 0, "contradictions_total": 3, "contradictions_triggered": 0, "difficulty": "hard", "evidence_presented": 6, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 0, "reward": 0.0, "seed": 20260442, "useless_questions_ratio": 0.0}
|
||||
{"agent": "present_all", "auxiliary_reward": -0.2, "avg_question_length": 0.0, "blind_evidence_count": 4, "case_id": "possession_259aa5", "contradictions_surfaced": 0, "contradictions_total": 1, "contradictions_triggered": 0, "difficulty": "easy", "evidence_presented": 4, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 0, "reward": 0.0, "seed": 20260443, "useless_questions_ratio": 0.0}
|
||||
{"agent": "present_all", "auxiliary_reward": -0.25, "avg_question_length": 0.0, "blind_evidence_count": 5, "case_id": "corporate_76724c", "contradictions_surfaced": 0, "contradictions_total": 2, "contradictions_triggered": 0, "difficulty": "medium", "evidence_presented": 5, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 0, "reward": 0.0, "seed": 20260444, "useless_questions_ratio": 0.0}
|
||||
{"agent": "present_all", "auxiliary_reward": -0.25, "avg_question_length": 0.0, "blind_evidence_count": 5, "case_id": "timeline_767821", "contradictions_surfaced": 0, "contradictions_total": 2, "contradictions_triggered": 0, "difficulty": "medium", "evidence_presented": 5, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 0, "reward": 0.0, "seed": 20260445, "useless_questions_ratio": 0.0}
|
||||
{"agent": "present_all", "auxiliary_reward": -0.25, "avg_question_length": 0.0, "blind_evidence_count": 5, "case_id": "motive_c0d166", "contradictions_surfaced": 0, "contradictions_total": 2, "contradictions_triggered": 0, "difficulty": "medium", "evidence_presented": 5, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 0, "reward": 0.0, "seed": 20260446, "useless_questions_ratio": 0.0}
|
||||
{"agent": "present_all", "auxiliary_reward": -0.35000000000000003, "avg_question_length": 0.0, "blind_evidence_count": 7, "case_id": "corporate_307934", "contradictions_surfaced": 0, "contradictions_total": 3, "contradictions_triggered": 0, "difficulty": "hard", "evidence_presented": 7, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 0, "reward": 0.0, "seed": 20260447, "useless_questions_ratio": 0.0}
|
||||
{"agent": "present_all", "auxiliary_reward": -0.35000000000000003, "avg_question_length": 0.0, "blind_evidence_count": 7, "case_id": "timeline_592816", "contradictions_surfaced": 0, "contradictions_total": 3, "contradictions_triggered": 0, "difficulty": "hard", "evidence_presented": 7, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 0, "reward": 0.0, "seed": 20260448, "useless_questions_ratio": 0.0}
|
||||
{"agent": "present_all", "auxiliary_reward": -0.2, "avg_question_length": 0.0, "blind_evidence_count": 4, "case_id": "knowledge_b26824", "contradictions_surfaced": 0, "contradictions_total": 2, "contradictions_triggered": 0, "difficulty": "medium", "evidence_presented": 4, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 0, "reward": 0.0, "seed": 20260449, "useless_questions_ratio": 0.0}
|
||||
{"agent": "present_all", "auxiliary_reward": -0.30000000000000004, "avg_question_length": 0.0, "blind_evidence_count": 6, "case_id": "knowledge_697785", "contradictions_surfaced": 0, "contradictions_total": 3, "contradictions_triggered": 0, "difficulty": "hard", "evidence_presented": 6, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 0, "reward": 0.0, "seed": 20260450, "useless_questions_ratio": 0.0}
|
||||
{"agent": "present_all", "auxiliary_reward": -0.25, "avg_question_length": 0.0, "blind_evidence_count": 5, "case_id": "timeline_81dafd", "contradictions_surfaced": 0, "contradictions_total": 2, "contradictions_triggered": 0, "difficulty": "medium", "evidence_presented": 5, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 0, "reward": 0.0, "seed": 20260451, "useless_questions_ratio": 0.0}
|
||||
{"agent": "present_all", "auxiliary_reward": -0.25, "avg_question_length": 0.0, "blind_evidence_count": 5, "case_id": "corporate_8eb7d7", "contradictions_surfaced": 0, "contradictions_total": 2, "contradictions_triggered": 0, "difficulty": "medium", "evidence_presented": 5, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 0, "reward": 0.0, "seed": 20260452, "useless_questions_ratio": 0.0}
|
||||
{"agent": "present_all", "auxiliary_reward": -0.2, "avg_question_length": 0.0, "blind_evidence_count": 4, "case_id": "possession_dbb5fe", "contradictions_surfaced": 0, "contradictions_total": 2, "contradictions_triggered": 0, "difficulty": "medium", "evidence_presented": 4, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 0, "reward": 0.0, "seed": 20260453, "useless_questions_ratio": 0.0}
|
||||
{"agent": "present_all", "auxiliary_reward": -0.30000000000000004, "avg_question_length": 0.0, "blind_evidence_count": 6, "case_id": "alibi_a4666f", "contradictions_surfaced": 0, "contradictions_total": 3, "contradictions_triggered": 0, "difficulty": "hard", "evidence_presented": 6, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 0, "reward": 0.0, "seed": 20260454, "useless_questions_ratio": 0.0}
|
||||
{"agent": "scripted_oracle", "auxiliary_reward": 0.4, "avg_question_length": 1.0, "blind_evidence_count": 0, "case_id": "timeline_255d67", "contradictions_surfaced": 1, "contradictions_total": 1, "contradictions_triggered": 1, "difficulty": "easy", "evidence_presented": 1, "evidence_timing_successes": 1, "primary_reward": 1.0, "questions_used": 1, "reward": 0.8800000000000001, "seed": 20260425, "useless_questions_ratio": 0.0}
|
||||
{"agent": "scripted_oracle", "auxiliary_reward": 0.8, "avg_question_length": 1.5, "blind_evidence_count": 0, "case_id": "knowledge_b28f8c", "contradictions_surfaced": 2, "contradictions_total": 2, "contradictions_triggered": 2, "difficulty": "medium", "evidence_presented": 2, "evidence_timing_successes": 2, "primary_reward": 1.0, "questions_used": 2, "reward": 0.9600000000000001, "seed": 20260426, "useless_questions_ratio": 0.0}
|
||||
{"agent": "scripted_oracle", "auxiliary_reward": 0.4, "avg_question_length": 1.0, "blind_evidence_count": 0, "case_id": "workplace_c98377", "contradictions_surfaced": 1, "contradictions_total": 1, "contradictions_triggered": 1, "difficulty": "easy", "evidence_presented": 1, "evidence_timing_successes": 1, "primary_reward": 1.0, "questions_used": 1, "reward": 0.8800000000000001, "seed": 20260427, "useless_questions_ratio": 0.0}
|
||||
{"agent": "scripted_oracle", "auxiliary_reward": 1.2000000000000002, "avg_question_length": 1.0, "blind_evidence_count": 0, "case_id": "motive_66ff59", "contradictions_surfaced": 3, "contradictions_total": 3, "contradictions_triggered": 3, "difficulty": "hard", "evidence_presented": 3, "evidence_timing_successes": 3, "primary_reward": 1.0, "questions_used": 3, "reward": 1.0, "seed": 20260428, "useless_questions_ratio": 0.0}
|
||||
{"agent": "scripted_oracle", "auxiliary_reward": 0.4, "avg_question_length": 1.0, "blind_evidence_count": 0, "case_id": "timeline_19bb78", "contradictions_surfaced": 1, "contradictions_total": 1, "contradictions_triggered": 1, "difficulty": "easy", "evidence_presented": 1, "evidence_timing_successes": 1, "primary_reward": 1.0, "questions_used": 1, "reward": 0.8800000000000001, "seed": 20260429, "useless_questions_ratio": 0.0}
|
||||
{"agent": "scripted_oracle", "auxiliary_reward": 0.45, "avg_question_length": 1.0, "blind_evidence_count": 1, "case_id": "timeline_a97690", "contradictions_surfaced": 1, "contradictions_total": 2, "contradictions_triggered": 1, "difficulty": "medium", "evidence_presented": 2, "evidence_timing_successes": 1, "primary_reward": 0.5, "questions_used": 2, "reward": 0.49000000000000005, "seed": 20260430, "useless_questions_ratio": 0.0}
|
||||
{"agent": "scripted_oracle", "auxiliary_reward": 0.8, "avg_question_length": 3.0, "blind_evidence_count": 0, "case_id": "alibi_67ffcd", "contradictions_surfaced": 2, "contradictions_total": 2, "contradictions_triggered": 2, "difficulty": "medium", "evidence_presented": 2, "evidence_timing_successes": 2, "primary_reward": 1.0, "questions_used": 2, "reward": 0.9600000000000001, "seed": 20260431, "useless_questions_ratio": 0.0}
|
||||
{"agent": "scripted_oracle", "auxiliary_reward": 0.8, "avg_question_length": 3.0, "blind_evidence_count": 0, "case_id": "alibi_423bca", "contradictions_surfaced": 2, "contradictions_total": 2, "contradictions_triggered": 2, "difficulty": "medium", "evidence_presented": 2, "evidence_timing_successes": 2, "primary_reward": 1.0, "questions_used": 2, "reward": 0.9600000000000001, "seed": 20260432, "useless_questions_ratio": 0.0}
|
||||
{"agent": "scripted_oracle", "auxiliary_reward": 0.8, "avg_question_length": 1.5, "blind_evidence_count": 0, "case_id": "knowledge_960d07", "contradictions_surfaced": 2, "contradictions_total": 2, "contradictions_triggered": 2, "difficulty": "medium", "evidence_presented": 2, "evidence_timing_successes": 2, "primary_reward": 1.0, "questions_used": 2, "reward": 0.9600000000000001, "seed": 20260433, "useless_questions_ratio": 0.0}
|
||||
{"agent": "scripted_oracle", "auxiliary_reward": 0.4, "avg_question_length": 3.0, "blind_evidence_count": 0, "case_id": "alibi_e829c1", "contradictions_surfaced": 1, "contradictions_total": 1, "contradictions_triggered": 1, "difficulty": "easy", "evidence_presented": 1, "evidence_timing_successes": 1, "primary_reward": 1.0, "questions_used": 1, "reward": 0.8800000000000001, "seed": 20260434, "useless_questions_ratio": 0.0}
|
||||
{"agent": "scripted_oracle", "auxiliary_reward": 1.2000000000000002, "avg_question_length": 1.0, "blind_evidence_count": 0, "case_id": "motive_85e25b", "contradictions_surfaced": 3, "contradictions_total": 3, "contradictions_triggered": 3, "difficulty": "hard", "evidence_presented": 3, "evidence_timing_successes": 3, "primary_reward": 1.0, "questions_used": 3, "reward": 1.0, "seed": 20260435, "useless_questions_ratio": 0.0}
|
||||
{"agent": "scripted_oracle", "auxiliary_reward": 0.8, "avg_question_length": 1.5, "blind_evidence_count": 0, "case_id": "knowledge_a599e3", "contradictions_surfaced": 2, "contradictions_total": 2, "contradictions_triggered": 2, "difficulty": "medium", "evidence_presented": 2, "evidence_timing_successes": 2, "primary_reward": 1.0, "questions_used": 2, "reward": 0.9600000000000001, "seed": 20260436, "useless_questions_ratio": 0.0}
|
||||
{"agent": "scripted_oracle", "auxiliary_reward": 0.4, "avg_question_length": 1.0, "blind_evidence_count": 0, "case_id": "motive_8bca20", "contradictions_surfaced": 1, "contradictions_total": 1, "contradictions_triggered": 1, "difficulty": "easy", "evidence_presented": 1, "evidence_timing_successes": 1, "primary_reward": 1.0, "questions_used": 1, "reward": 0.8800000000000001, "seed": 20260437, "useless_questions_ratio": 0.0}
|
||||
{"agent": "scripted_oracle", "auxiliary_reward": 0.8, "avg_question_length": 1.0, "blind_evidence_count": 0, "case_id": "corporate_6b1664", "contradictions_surfaced": 2, "contradictions_total": 2, "contradictions_triggered": 2, "difficulty": "medium", "evidence_presented": 2, "evidence_timing_successes": 2, "primary_reward": 1.0, "questions_used": 2, "reward": 0.9600000000000001, "seed": 20260438, "useless_questions_ratio": 0.0}
|
||||
{"agent": "scripted_oracle", "auxiliary_reward": 0.4, "avg_question_length": 3.0, "blind_evidence_count": 0, "case_id": "alibi_a6c582", "contradictions_surfaced": 1, "contradictions_total": 1, "contradictions_triggered": 1, "difficulty": "easy", "evidence_presented": 1, "evidence_timing_successes": 1, "primary_reward": 1.0, "questions_used": 1, "reward": 0.8800000000000001, "seed": 20260439, "useless_questions_ratio": 0.0}
|
||||
{"agent": "scripted_oracle", "auxiliary_reward": 0.4, "avg_question_length": 1.0, "blind_evidence_count": 0, "case_id": "workplace_835476", "contradictions_surfaced": 1, "contradictions_total": 1, "contradictions_triggered": 1, "difficulty": "easy", "evidence_presented": 1, "evidence_timing_successes": 1, "primary_reward": 1.0, "questions_used": 1, "reward": 0.8800000000000001, "seed": 20260440, "useless_questions_ratio": 0.0}
|
||||
{"agent": "scripted_oracle", "auxiliary_reward": 1.2000000000000002, "avg_question_length": 1.0, "blind_evidence_count": 0, "case_id": "possession_a079c5", "contradictions_surfaced": 3, "contradictions_total": 3, "contradictions_triggered": 3, "difficulty": "hard", "evidence_presented": 3, "evidence_timing_successes": 3, "primary_reward": 1.0, "questions_used": 3, "reward": 1.0, "seed": 20260441, "useless_questions_ratio": 0.0}
|
||||
{"agent": "scripted_oracle", "auxiliary_reward": 1.2000000000000002, "avg_question_length": 1.0, "blind_evidence_count": 0, "case_id": "possession_9cc45d", "contradictions_surfaced": 3, "contradictions_total": 3, "contradictions_triggered": 3, "difficulty": "hard", "evidence_presented": 3, "evidence_timing_successes": 3, "primary_reward": 1.0, "questions_used": 3, "reward": 1.0, "seed": 20260442, "useless_questions_ratio": 0.0}
|
||||
{"agent": "scripted_oracle", "auxiliary_reward": 0.4, "avg_question_length": 1.0, "blind_evidence_count": 0, "case_id": "possession_259aa5", "contradictions_surfaced": 1, "contradictions_total": 1, "contradictions_triggered": 1, "difficulty": "easy", "evidence_presented": 1, "evidence_timing_successes": 1, "primary_reward": 1.0, "questions_used": 1, "reward": 0.8800000000000001, "seed": 20260443, "useless_questions_ratio": 0.0}
|
||||
{"agent": "scripted_oracle", "auxiliary_reward": 0.8, "avg_question_length": 1.0, "blind_evidence_count": 0, "case_id": "corporate_76724c", "contradictions_surfaced": 2, "contradictions_total": 2, "contradictions_triggered": 2, "difficulty": "medium", "evidence_presented": 2, "evidence_timing_successes": 2, "primary_reward": 1.0, "questions_used": 2, "reward": 0.9600000000000001, "seed": 20260444, "useless_questions_ratio": 0.0}
|
||||
{"agent": "scripted_oracle", "auxiliary_reward": 0.45, "avg_question_length": 1.0, "blind_evidence_count": 1, "case_id": "timeline_767821", "contradictions_surfaced": 1, "contradictions_total": 2, "contradictions_triggered": 1, "difficulty": "medium", "evidence_presented": 2, "evidence_timing_successes": 1, "primary_reward": 0.5, "questions_used": 2, "reward": 0.49000000000000005, "seed": 20260445, "useless_questions_ratio": 0.0}
|
||||
{"agent": "scripted_oracle", "auxiliary_reward": 0.8, "avg_question_length": 1.0, "blind_evidence_count": 0, "case_id": "motive_c0d166", "contradictions_surfaced": 2, "contradictions_total": 2, "contradictions_triggered": 2, "difficulty": "medium", "evidence_presented": 2, "evidence_timing_successes": 2, "primary_reward": 1.0, "questions_used": 2, "reward": 0.9600000000000001, "seed": 20260446, "useless_questions_ratio": 0.0}
|
||||
{"agent": "scripted_oracle", "auxiliary_reward": 1.2000000000000002, "avg_question_length": 1.0, "blind_evidence_count": 0, "case_id": "corporate_307934", "contradictions_surfaced": 3, "contradictions_total": 3, "contradictions_triggered": 3, "difficulty": "hard", "evidence_presented": 3, "evidence_timing_successes": 3, "primary_reward": 1.0, "questions_used": 3, "reward": 1.0, "seed": 20260447, "useless_questions_ratio": 0.0}
|
||||
{"agent": "scripted_oracle", "auxiliary_reward": 1.2000000000000002, "avg_question_length": 1.0, "blind_evidence_count": 0, "case_id": "timeline_592816", "contradictions_surfaced": 3, "contradictions_total": 3, "contradictions_triggered": 3, "difficulty": "hard", "evidence_presented": 3, "evidence_timing_successes": 3, "primary_reward": 1.0, "questions_used": 3, "reward": 1.0, "seed": 20260448, "useless_questions_ratio": 0.0}
|
||||
{"agent": "scripted_oracle", "auxiliary_reward": 0.8, "avg_question_length": 1.5, "blind_evidence_count": 0, "case_id": "knowledge_b26824", "contradictions_surfaced": 2, "contradictions_total": 2, "contradictions_triggered": 2, "difficulty": "medium", "evidence_presented": 2, "evidence_timing_successes": 2, "primary_reward": 1.0, "questions_used": 2, "reward": 0.9600000000000001, "seed": 20260449, "useless_questions_ratio": 0.0}
|
||||
{"agent": "scripted_oracle", "auxiliary_reward": 1.2000000000000002, "avg_question_length": 1.6666666666666667, "blind_evidence_count": 0, "case_id": "knowledge_697785", "contradictions_surfaced": 3, "contradictions_total": 3, "contradictions_triggered": 3, "difficulty": "hard", "evidence_presented": 3, "evidence_timing_successes": 3, "primary_reward": 1.0, "questions_used": 3, "reward": 1.0, "seed": 20260450, "useless_questions_ratio": 0.0}
|
||||
{"agent": "scripted_oracle", "auxiliary_reward": 0.45, "avg_question_length": 1.0, "blind_evidence_count": 1, "case_id": "timeline_81dafd", "contradictions_surfaced": 1, "contradictions_total": 2, "contradictions_triggered": 1, "difficulty": "medium", "evidence_presented": 2, "evidence_timing_successes": 1, "primary_reward": 0.5, "questions_used": 2, "reward": 0.49000000000000005, "seed": 20260451, "useless_questions_ratio": 0.0}
|
||||
{"agent": "scripted_oracle", "auxiliary_reward": 0.8, "avg_question_length": 1.0, "blind_evidence_count": 0, "case_id": "corporate_8eb7d7", "contradictions_surfaced": 2, "contradictions_total": 2, "contradictions_triggered": 2, "difficulty": "medium", "evidence_presented": 2, "evidence_timing_successes": 2, "primary_reward": 1.0, "questions_used": 2, "reward": 0.9600000000000001, "seed": 20260452, "useless_questions_ratio": 0.0}
|
||||
{"agent": "scripted_oracle", "auxiliary_reward": 0.8, "avg_question_length": 1.0, "blind_evidence_count": 0, "case_id": "possession_dbb5fe", "contradictions_surfaced": 2, "contradictions_total": 2, "contradictions_triggered": 2, "difficulty": "medium", "evidence_presented": 2, "evidence_timing_successes": 2, "primary_reward": 1.0, "questions_used": 2, "reward": 0.9600000000000001, "seed": 20260453, "useless_questions_ratio": 0.0}
|
||||
{"agent": "scripted_oracle", "auxiliary_reward": 1.2000000000000002, "avg_question_length": 2.3333333333333335, "blind_evidence_count": 0, "case_id": "alibi_a4666f", "contradictions_surfaced": 3, "contradictions_total": 3, "contradictions_triggered": 3, "difficulty": "hard", "evidence_presented": 3, "evidence_timing_successes": 3, "primary_reward": 1.0, "questions_used": 3, "reward": 1.0, "seed": 20260454, "useless_questions_ratio": 0.0}
|
||||
{"agent": "trained_grpo_200", "auxiliary_reward": 0.0, "avg_question_length": 0.0, "blind_evidence_count": 0, "case_id": "timeline_255d67", "contradictions_surfaced": 0, "contradictions_total": 1, "contradictions_triggered": 0, "difficulty": "easy", "evidence_presented": 0, "evidence_timing_successes": 0, "invalid_tool_calls": 1, "model_repo": "heavycoderhh/counsel-env-qwen3-0.6b-grpo", "primary_reward": 0.0, "questions_used": 0, "reward": 0.0, "seed": 20260425, "useless_questions_ratio": 0.0}
|
||||
{"agent": "trained_grpo_200", "auxiliary_reward": 0.30000000000000004, "avg_question_length": 43.0, "blind_evidence_count": 0, "case_id": "knowledge_b28f8c", "contradictions_surfaced": 0, "contradictions_total": 2, "contradictions_triggered": 1, "difficulty": "medium", "evidence_presented": 0, "evidence_timing_successes": 0, "invalid_tool_calls": 0, "model_repo": "heavycoderhh/counsel-env-qwen3-0.6b-grpo", "primary_reward": 0.0, "questions_used": 1, "reward": 0.06000000000000001, "seed": 20260426, "useless_questions_ratio": 0.0}
|
||||
{"agent": "trained_grpo_200", "auxiliary_reward": 0.30000000000000004, "avg_question_length": 29.0, "blind_evidence_count": 0, "case_id": "workplace_c98377", "contradictions_surfaced": 0, "contradictions_total": 1, "contradictions_triggered": 1, "difficulty": "easy", "evidence_presented": 0, "evidence_timing_successes": 0, "invalid_tool_calls": 0, "model_repo": "heavycoderhh/counsel-env-qwen3-0.6b-grpo", "primary_reward": 0.0, "questions_used": 1, "reward": 0.06000000000000001, "seed": 20260427, "useless_questions_ratio": 0.0}
|
||||
{"agent": "trained_grpo_200", "auxiliary_reward": 0.30000000000000004, "avg_question_length": 33.0, "blind_evidence_count": 0, "case_id": "motive_66ff59", "contradictions_surfaced": 0, "contradictions_total": 3, "contradictions_triggered": 1, "difficulty": "hard", "evidence_presented": 0, "evidence_timing_successes": 0, "invalid_tool_calls": 0, "model_repo": "heavycoderhh/counsel-env-qwen3-0.6b-grpo", "primary_reward": 0.0, "questions_used": 1, "reward": 0.06000000000000001, "seed": 20260428, "useless_questions_ratio": 0.0}
|
||||
{"agent": "trained_grpo_200", "auxiliary_reward": 0.30000000000000004, "avg_question_length": 35.0, "blind_evidence_count": 0, "case_id": "timeline_19bb78", "contradictions_surfaced": 0, "contradictions_total": 1, "contradictions_triggered": 1, "difficulty": "easy", "evidence_presented": 0, "evidence_timing_successes": 0, "invalid_tool_calls": 0, "model_repo": "heavycoderhh/counsel-env-qwen3-0.6b-grpo", "primary_reward": 0.0, "questions_used": 1, "reward": 0.06000000000000001, "seed": 20260429, "useless_questions_ratio": 0.0}
|
||||
{"agent": "trained_grpo_200", "auxiliary_reward": 0.30000000000000004, "avg_question_length": 36.0, "blind_evidence_count": 0, "case_id": "timeline_a97690", "contradictions_surfaced": 0, "contradictions_total": 2, "contradictions_triggered": 1, "difficulty": "medium", "evidence_presented": 0, "evidence_timing_successes": 0, "invalid_tool_calls": 0, "model_repo": "heavycoderhh/counsel-env-qwen3-0.6b-grpo", "primary_reward": 0.0, "questions_used": 1, "reward": 0.06000000000000001, "seed": 20260430, "useless_questions_ratio": 0.0}
|
||||
{"agent": "trained_grpo_200", "auxiliary_reward": 0.30000000000000004, "avg_question_length": 38.0, "blind_evidence_count": 0, "case_id": "alibi_67ffcd", "contradictions_surfaced": 0, "contradictions_total": 2, "contradictions_triggered": 1, "difficulty": "medium", "evidence_presented": 0, "evidence_timing_successes": 0, "invalid_tool_calls": 0, "model_repo": "heavycoderhh/counsel-env-qwen3-0.6b-grpo", "primary_reward": 0.0, "questions_used": 1, "reward": 0.06000000000000001, "seed": 20260431, "useless_questions_ratio": 0.0}
|
||||
{"agent": "trained_grpo_200", "auxiliary_reward": 0.30000000000000004, "avg_question_length": 39.0, "blind_evidence_count": 0, "case_id": "alibi_423bca", "contradictions_surfaced": 0, "contradictions_total": 2, "contradictions_triggered": 1, "difficulty": "medium", "evidence_presented": 0, "evidence_timing_successes": 0, "invalid_tool_calls": 0, "model_repo": "heavycoderhh/counsel-env-qwen3-0.6b-grpo", "primary_reward": 0.0, "questions_used": 1, "reward": 0.06000000000000001, "seed": 20260432, "useless_questions_ratio": 0.0}
|
||||
{"agent": "trained_grpo_200", "auxiliary_reward": 0.30000000000000004, "avg_question_length": 41.0, "blind_evidence_count": 0, "case_id": "knowledge_960d07", "contradictions_surfaced": 0, "contradictions_total": 2, "contradictions_triggered": 1, "difficulty": "medium", "evidence_presented": 0, "evidence_timing_successes": 0, "invalid_tool_calls": 0, "model_repo": "heavycoderhh/counsel-env-qwen3-0.6b-grpo", "primary_reward": 0.0, "questions_used": 1, "reward": 0.06000000000000001, "seed": 20260433, "useless_questions_ratio": 0.0}
|
||||
{"agent": "trained_grpo_200", "auxiliary_reward": 0.30000000000000004, "avg_question_length": 39.0, "blind_evidence_count": 0, "case_id": "alibi_e829c1", "contradictions_surfaced": 0, "contradictions_total": 1, "contradictions_triggered": 1, "difficulty": "easy", "evidence_presented": 0, "evidence_timing_successes": 0, "invalid_tool_calls": 0, "model_repo": "heavycoderhh/counsel-env-qwen3-0.6b-grpo", "primary_reward": 0.0, "questions_used": 1, "reward": 0.06000000000000001, "seed": 20260434, "useless_questions_ratio": 0.0}
|
||||
{"agent": "trained_grpo_200", "auxiliary_reward": 0.30000000000000004, "avg_question_length": 34.0, "blind_evidence_count": 0, "case_id": "motive_85e25b", "contradictions_surfaced": 0, "contradictions_total": 3, "contradictions_triggered": 1, "difficulty": "hard", "evidence_presented": 0, "evidence_timing_successes": 0, "invalid_tool_calls": 0, "model_repo": "heavycoderhh/counsel-env-qwen3-0.6b-grpo", "primary_reward": 0.0, "questions_used": 1, "reward": 0.06000000000000001, "seed": 20260435, "useless_questions_ratio": 0.0}
|
||||
{"agent": "trained_grpo_200", "auxiliary_reward": 0.30000000000000004, "avg_question_length": 41.0, "blind_evidence_count": 0, "case_id": "knowledge_a599e3", "contradictions_surfaced": 0, "contradictions_total": 2, "contradictions_triggered": 1, "difficulty": "medium", "evidence_presented": 0, "evidence_timing_successes": 0, "invalid_tool_calls": 0, "model_repo": "heavycoderhh/counsel-env-qwen3-0.6b-grpo", "primary_reward": 0.0, "questions_used": 1, "reward": 0.06000000000000001, "seed": 20260436, "useless_questions_ratio": 0.0}
|
||||
{"agent": "trained_grpo_200", "auxiliary_reward": 0.0, "avg_question_length": 0.0, "blind_evidence_count": 0, "case_id": "motive_8bca20", "contradictions_surfaced": 0, "contradictions_total": 1, "contradictions_triggered": 0, "difficulty": "easy", "evidence_presented": 0, "evidence_timing_successes": 0, "invalid_tool_calls": 1, "model_repo": "heavycoderhh/counsel-env-qwen3-0.6b-grpo", "primary_reward": 0.0, "questions_used": 0, "reward": 0.0, "seed": 20260437, "useless_questions_ratio": 0.0}
|
||||
{"agent": "trained_grpo_200", "auxiliary_reward": 0.30000000000000004, "avg_question_length": 27.0, "blind_evidence_count": 0, "case_id": "corporate_6b1664", "contradictions_surfaced": 0, "contradictions_total": 2, "contradictions_triggered": 1, "difficulty": "medium", "evidence_presented": 0, "evidence_timing_successes": 0, "invalid_tool_calls": 0, "model_repo": "heavycoderhh/counsel-env-qwen3-0.6b-grpo", "primary_reward": 0.0, "questions_used": 1, "reward": 0.06000000000000001, "seed": 20260438, "useless_questions_ratio": 0.0}
|
||||
{"agent": "trained_grpo_200", "auxiliary_reward": 0.30000000000000004, "avg_question_length": 38.0, "blind_evidence_count": 0, "case_id": "alibi_a6c582", "contradictions_surfaced": 0, "contradictions_total": 1, "contradictions_triggered": 1, "difficulty": "easy", "evidence_presented": 0, "evidence_timing_successes": 0, "invalid_tool_calls": 0, "model_repo": "heavycoderhh/counsel-env-qwen3-0.6b-grpo", "primary_reward": 0.0, "questions_used": 1, "reward": 0.06000000000000001, "seed": 20260439, "useless_questions_ratio": 0.0}
|
||||
{"agent": "trained_grpo_200", "auxiliary_reward": 0.0, "avg_question_length": 0.0, "blind_evidence_count": 0, "case_id": "workplace_835476", "contradictions_surfaced": 0, "contradictions_total": 1, "contradictions_triggered": 0, "difficulty": "easy", "evidence_presented": 0, "evidence_timing_successes": 0, "invalid_tool_calls": 1, "model_repo": "heavycoderhh/counsel-env-qwen3-0.6b-grpo", "primary_reward": 0.0, "questions_used": 0, "reward": 0.0, "seed": 20260440, "useless_questions_ratio": 0.0}
|
||||
{"agent": "trained_grpo_200", "auxiliary_reward": 0.30000000000000004, "avg_question_length": 49.0, "blind_evidence_count": 0, "case_id": "possession_a079c5", "contradictions_surfaced": 0, "contradictions_total": 3, "contradictions_triggered": 1, "difficulty": "hard", "evidence_presented": 0, "evidence_timing_successes": 0, "invalid_tool_calls": 0, "model_repo": "heavycoderhh/counsel-env-qwen3-0.6b-grpo", "primary_reward": 0.0, "questions_used": 1, "reward": 0.06000000000000001, "seed": 20260441, "useless_questions_ratio": 0.0}
|
||||
{"agent": "trained_grpo_200", "auxiliary_reward": 0.0, "avg_question_length": 0.0, "blind_evidence_count": 0, "case_id": "possession_9cc45d", "contradictions_surfaced": 0, "contradictions_total": 3, "contradictions_triggered": 0, "difficulty": "hard", "evidence_presented": 0, "evidence_timing_successes": 0, "invalid_tool_calls": 1, "model_repo": "heavycoderhh/counsel-env-qwen3-0.6b-grpo", "primary_reward": 0.0, "questions_used": 0, "reward": 0.0, "seed": 20260442, "useless_questions_ratio": 0.0}
|
||||
{"agent": "trained_grpo_200", "auxiliary_reward": 0.30000000000000004, "avg_question_length": 44.0, "blind_evidence_count": 0, "case_id": "possession_259aa5", "contradictions_surfaced": 0, "contradictions_total": 1, "contradictions_triggered": 1, "difficulty": "easy", "evidence_presented": 0, "evidence_timing_successes": 0, "invalid_tool_calls": 0, "model_repo": "heavycoderhh/counsel-env-qwen3-0.6b-grpo", "primary_reward": 0.0, "questions_used": 1, "reward": 0.06000000000000001, "seed": 20260443, "useless_questions_ratio": 0.0}
|
||||
{"agent": "trained_grpo_200", "auxiliary_reward": 0.30000000000000004, "avg_question_length": 29.0, "blind_evidence_count": 0, "case_id": "corporate_76724c", "contradictions_surfaced": 0, "contradictions_total": 2, "contradictions_triggered": 1, "difficulty": "medium", "evidence_presented": 0, "evidence_timing_successes": 0, "invalid_tool_calls": 0, "model_repo": "heavycoderhh/counsel-env-qwen3-0.6b-grpo", "primary_reward": 0.0, "questions_used": 1, "reward": 0.06000000000000001, "seed": 20260444, "useless_questions_ratio": 0.0}
|
||||
{"agent": "trained_grpo_200", "auxiliary_reward": 0.30000000000000004, "avg_question_length": 37.0, "blind_evidence_count": 0, "case_id": "timeline_767821", "contradictions_surfaced": 0, "contradictions_total": 2, "contradictions_triggered": 1, "difficulty": "medium", "evidence_presented": 0, "evidence_timing_successes": 0, "invalid_tool_calls": 0, "model_repo": "heavycoderhh/counsel-env-qwen3-0.6b-grpo", "primary_reward": 0.0, "questions_used": 1, "reward": 0.06000000000000001, "seed": 20260445, "useless_questions_ratio": 0.0}
|
||||
{"agent": "trained_grpo_200", "auxiliary_reward": 0.30000000000000004, "avg_question_length": 34.0, "blind_evidence_count": 0, "case_id": "motive_c0d166", "contradictions_surfaced": 0, "contradictions_total": 2, "contradictions_triggered": 1, "difficulty": "medium", "evidence_presented": 0, "evidence_timing_successes": 0, "invalid_tool_calls": 0, "model_repo": "heavycoderhh/counsel-env-qwen3-0.6b-grpo", "primary_reward": 0.0, "questions_used": 1, "reward": 0.06000000000000001, "seed": 20260446, "useless_questions_ratio": 0.0}
|
||||
{"agent": "trained_grpo_200", "auxiliary_reward": 0.30000000000000004, "avg_question_length": 27.0, "blind_evidence_count": 0, "case_id": "corporate_307934", "contradictions_surfaced": 0, "contradictions_total": 3, "contradictions_triggered": 1, "difficulty": "hard", "evidence_presented": 0, "evidence_timing_successes": 0, "invalid_tool_calls": 0, "model_repo": "heavycoderhh/counsel-env-qwen3-0.6b-grpo", "primary_reward": 0.0, "questions_used": 1, "reward": 0.06000000000000001, "seed": 20260447, "useless_questions_ratio": 0.0}
|
||||
{"agent": "trained_grpo_200", "auxiliary_reward": 0.30000000000000004, "avg_question_length": 35.0, "blind_evidence_count": 0, "case_id": "timeline_592816", "contradictions_surfaced": 0, "contradictions_total": 3, "contradictions_triggered": 1, "difficulty": "hard", "evidence_presented": 0, "evidence_timing_successes": 0, "invalid_tool_calls": 0, "model_repo": "heavycoderhh/counsel-env-qwen3-0.6b-grpo", "primary_reward": 0.0, "questions_used": 1, "reward": 0.06000000000000001, "seed": 20260448, "useless_questions_ratio": 0.0}
|
||||
{"agent": "trained_grpo_200", "auxiliary_reward": 0.30000000000000004, "avg_question_length": 40.0, "blind_evidence_count": 0, "case_id": "knowledge_b26824", "contradictions_surfaced": 0, "contradictions_total": 2, "contradictions_triggered": 1, "difficulty": "medium", "evidence_presented": 0, "evidence_timing_successes": 0, "invalid_tool_calls": 0, "model_repo": "heavycoderhh/counsel-env-qwen3-0.6b-grpo", "primary_reward": 0.0, "questions_used": 1, "reward": 0.06000000000000001, "seed": 20260449, "useless_questions_ratio": 0.0}
|
||||
{"agent": "trained_grpo_200", "auxiliary_reward": 0.30000000000000004, "avg_question_length": 41.0, "blind_evidence_count": 0, "case_id": "knowledge_697785", "contradictions_surfaced": 0, "contradictions_total": 3, "contradictions_triggered": 1, "difficulty": "hard", "evidence_presented": 0, "evidence_timing_successes": 0, "invalid_tool_calls": 0, "model_repo": "heavycoderhh/counsel-env-qwen3-0.6b-grpo", "primary_reward": 0.0, "questions_used": 1, "reward": 0.06000000000000001, "seed": 20260450, "useless_questions_ratio": 0.0}
|
||||
{"agent": "trained_grpo_200", "auxiliary_reward": 0.30000000000000004, "avg_question_length": 34.0, "blind_evidence_count": 0, "case_id": "timeline_81dafd", "contradictions_surfaced": 0, "contradictions_total": 2, "contradictions_triggered": 1, "difficulty": "medium", "evidence_presented": 0, "evidence_timing_successes": 0, "invalid_tool_calls": 0, "model_repo": "heavycoderhh/counsel-env-qwen3-0.6b-grpo", "primary_reward": 0.0, "questions_used": 1, "reward": 0.06000000000000001, "seed": 20260451, "useless_questions_ratio": 0.0}
|
||||
{"agent": "trained_grpo_200", "auxiliary_reward": 0.0, "avg_question_length": 0.0, "blind_evidence_count": 0, "case_id": "corporate_8eb7d7", "contradictions_surfaced": 0, "contradictions_total": 2, "contradictions_triggered": 0, "difficulty": "medium", "evidence_presented": 0, "evidence_timing_successes": 0, "invalid_tool_calls": 1, "model_repo": "heavycoderhh/counsel-env-qwen3-0.6b-grpo", "primary_reward": 0.0, "questions_used": 0, "reward": 0.0, "seed": 20260452, "useless_questions_ratio": 0.0}
|
||||
{"agent": "trained_grpo_200", "auxiliary_reward": 0.30000000000000004, "avg_question_length": 47.0, "blind_evidence_count": 0, "case_id": "possession_dbb5fe", "contradictions_surfaced": 0, "contradictions_total": 2, "contradictions_triggered": 1, "difficulty": "medium", "evidence_presented": 0, "evidence_timing_successes": 0, "invalid_tool_calls": 0, "model_repo": "heavycoderhh/counsel-env-qwen3-0.6b-grpo", "primary_reward": 0.0, "questions_used": 1, "reward": 0.06000000000000001, "seed": 20260453, "useless_questions_ratio": 0.0}
|
||||
{"agent": "trained_grpo_200", "auxiliary_reward": 0.30000000000000004, "avg_question_length": 37.0, "blind_evidence_count": 0, "case_id": "alibi_a4666f", "contradictions_surfaced": 0, "contradictions_total": 3, "contradictions_triggered": 1, "difficulty": "hard", "evidence_presented": 0, "evidence_timing_successes": 0, "invalid_tool_calls": 0, "model_repo": "heavycoderhh/counsel-env-qwen3-0.6b-grpo", "primary_reward": 0.0, "questions_used": 1, "reward": 0.06000000000000001, "seed": 20260454, "useless_questions_ratio": 0.0}
|
||||
52
eval/trained_eval_summary.json
Normal file
52
eval/trained_eval_summary.json
Normal file
@@ -0,0 +1,52 @@
|
||||
[
|
||||
{
|
||||
"agent": "keyword_spam",
|
||||
"avg_evidence_timing": 0,
|
||||
"avg_primary_reward": 0.0,
|
||||
"avg_reward": 0.07300000000000001,
|
||||
"avg_surface_rate": 0.0,
|
||||
"avg_trigger_rate": 0.6777777777777778,
|
||||
"avg_useless_ratio": 0.48000000000000004,
|
||||
"episodes": 30
|
||||
},
|
||||
{
|
||||
"agent": "present_all",
|
||||
"avg_evidence_timing": 0,
|
||||
"avg_primary_reward": 0.0,
|
||||
"avg_reward": 0.0,
|
||||
"avg_surface_rate": 0.0,
|
||||
"avg_trigger_rate": 0.0,
|
||||
"avg_useless_ratio": 0.0,
|
||||
"episodes": 30
|
||||
},
|
||||
{
|
||||
"agent": "random",
|
||||
"avg_evidence_timing": 0,
|
||||
"avg_primary_reward": 0.0,
|
||||
"avg_reward": 0.0,
|
||||
"avg_surface_rate": 0.0,
|
||||
"avg_trigger_rate": 0.0,
|
||||
"avg_useless_ratio": 1.0,
|
||||
"episodes": 30
|
||||
},
|
||||
{
|
||||
"agent": "scripted_oracle",
|
||||
"avg_evidence_timing": 1.9,
|
||||
"avg_primary_reward": 0.95,
|
||||
"avg_reward": 0.9023333333333334,
|
||||
"avg_surface_rate": 0.95,
|
||||
"avg_trigger_rate": 0.95,
|
||||
"avg_useless_ratio": 0.0,
|
||||
"episodes": 30
|
||||
},
|
||||
{
|
||||
"agent": "trained_grpo_200",
|
||||
"avg_evidence_timing": 0,
|
||||
"avg_primary_reward": 0.0,
|
||||
"avg_reward": 0.05000000000000001,
|
||||
"avg_surface_rate": 0.0,
|
||||
"avg_trigger_rate": 0.4611111111111111,
|
||||
"avg_useless_ratio": 0.0,
|
||||
"episodes": 30
|
||||
}
|
||||
]
|
||||
57
eval/trained_eval_transcripts.md
Normal file
57
eval/trained_eval_transcripts.md
Normal file
@@ -0,0 +1,57 @@
|
||||
# Agent: trained_grpo_200
|
||||
|
||||
## timeline_255d67 (easy)
|
||||
|
||||
Esme Dubois is accused of assault at the public library on Walnut Street on May 28th. The victim claims the assault occurred at 11:45 PM. The defendant claims they were elsewhere at that time.
|
||||
|
||||
### Transcript
|
||||
- Step 1 `rest_case`: rest_case
|
||||
- Witness: [Counsel rests.]
|
||||
- Triggered: none; Surfaced: none
|
||||
|
||||
### Reward
|
||||
```json
|
||||
{"primary_reward": 0.0, "auxiliary_reward_raw": 0.0, "total_reward": 0.0, "contradictions_total": 1.0, "contradictions_triggered": 0.0, "contradictions_surfaced": 0.0, "keyword_questions": 0.0, "evidence_timing_successes": 0.0, "blind_evidence_count": 0.0, "duplicate_questions": 0.0, "irrelevant_questions": 0.0, "inadmissible_actions": 0.0, "useless_questions_ratio": 0.0, "avg_question_length": 0.0}
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
# Agent: trained_grpo_200
|
||||
|
||||
## knowledge_b28f8c (medium)
|
||||
|
||||
Priya Sharma is charged with embezzlement from the diner at the corner of 9th and Broad. The state alleges Priya Sharma systematically diverted funds in coordination with the manager, Kavita Iyer, between December 5th and the present. The defendant denies knowing Kavita Iyer.
|
||||
|
||||
### Transcript
|
||||
- Step 1 `ask_question`: Priya Sharma is charged with embezzlement from the diner at the corner of 9th and Broad. The state alleges Priya Sharma systematically diverted funds in coordination with the manager, Kavita Iyer, between December 5th and the present. The defendant denies knowing Kavita Iyer.
|
||||
- Witness: I have never met Kavita Iyer in my life.
|
||||
- Triggered: c1_know_victim; Surfaced: none
|
||||
- Step 2 `rest_case`: rest_case
|
||||
- Witness: [Counsel rests.]
|
||||
- Triggered: c1_know_victim; Surfaced: none
|
||||
|
||||
### Reward
|
||||
```json
|
||||
{"primary_reward": 0.0, "auxiliary_reward_raw": 0.30000000000000004, "total_reward": 0.06000000000000001, "contradictions_total": 2.0, "contradictions_triggered": 1.0, "contradictions_surfaced": 0.0, "keyword_questions": 1.0, "evidence_timing_successes": 0.0, "blind_evidence_count": 0.0, "duplicate_questions": 0.0, "irrelevant_questions": 0.0, "inadmissible_actions": 0.0, "useless_questions_ratio": 0.0, "avg_question_length": 43.0}
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
# Agent: trained_grpo_200
|
||||
|
||||
## workplace_c98377 (easy)
|
||||
|
||||
Priya Sharma is questioned in an HR investigation after Ravi Joshi reported retaliation at the HR office on March 14th. The manager denies sending threatening messages or being nearby.
|
||||
|
||||
### Transcript
|
||||
- Step 1 `ask_question`: Priya Sharma is questioned in an HR investigation after Ravi Joshi reported retaliation at the HR office on March 14th. The manager denies sending threatening messages or being nearby.
|
||||
- Witness: I never sent Ravi Joshi any threatening message.
|
||||
- Triggered: c1_message_denial; Surfaced: none
|
||||
- Step 2 `rest_case`: rest_case
|
||||
- Witness: [Counsel rests.]
|
||||
- Triggered: c1_message_denial; Surfaced: none
|
||||
|
||||
### Reward
|
||||
```json
|
||||
{"primary_reward": 0.0, "auxiliary_reward_raw": 0.30000000000000004, "total_reward": 0.06000000000000001, "contradictions_total": 1.0, "contradictions_triggered": 1.0, "contradictions_surfaced": 0.0, "keyword_questions": 1.0, "evidence_timing_successes": 0.0, "blind_evidence_count": 0.0, "duplicate_questions": 0.0, "irrelevant_questions": 0.0, "inadmissible_actions": 0.0, "useless_questions_ratio": 0.0, "avg_question_length": 29.0}
|
||||
```
|
||||
Reference in New Issue
Block a user