[ { "loss": 0.0014, "grad_norm": 50.69855880737305, "learning_rate": 0.0, "num_tokens": 2451.0, "completions/mean_length": 12.75, "completions/min_length": 9.0, "completions/max_length": 20.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 12.75, "completions/min_terminated_length": 9.0, "completions/max_terminated_length": 20.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.5, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.028150001540780067, "rewards/env_reward/std": 0.07374775409698486, "rewards/belief_accuracy/mean": 0.24216286838054657, "rewards/belief_accuracy/std": 0.1988486498594284, "reward": 0.7203128337860107, "reward_std": 0.14423373341560364, "frac_reward_zero_std": 0.0, "completion_length": 20.0, "kl": 0.036144825629889965, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.00125, "step": 1 }, { "loss": 0.0001, "grad_norm": 4.570699691772461, "learning_rate": 2.5e-06, "num_tokens": 4891.0, "completions/mean_length": 10.0, "completions/min_length": 10.0, "completions/max_length": 10.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 10.0, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.5, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": -0.28017497062683105, "rewards/env_reward/std": 0.6506500244140625, "rewards/belief_accuracy/mean": 0.10680588334798813, "rewards/belief_accuracy/std": 0.10040652006864548, "reward": 0.2766309082508087, "reward_std": 0.5998044610023499, "frac_reward_zero_std": 0.0, "completion_length": 10.0, "kl": 0.0023617089027538896, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.0025, "step": 2 }, { "loss": 0.0016, "grad_norm": 137.97557067871094, "learning_rate": 5e-06, "num_tokens": 7350.0, "completions/mean_length": 14.75, "completions/min_length": 10.0, "completions/max_length": 21.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 14.75, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 21.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.5, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.41967499256134033, "rewards/env_reward/std": 0.29905566573143005, "rewards/belief_accuracy/mean": 0.3351996839046478, "rewards/belief_accuracy/std": 0.11216925829648972, "reward": 1.2048746347427368, "reward_std": 0.21250373125076294, "frac_reward_zero_std": 0.0, "completion_length": 21.0, "kl": 0.039296648057643324, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.00375, "step": 3 }, { "loss": 0.0023, "grad_norm": 31.324565887451172, "learning_rate": 7.5e-06, "num_tokens": 9803.0, "completions/mean_length": 13.25, "completions/min_length": 12.0, "completions/max_length": 17.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 13.25, "completions/min_terminated_length": 12.0, "completions/max_terminated_length": 17.0, "rewards/format_valid/mean": 0.25, "rewards/format_valid/std": 1.5, "rewards/action_legal/mean": 0.125, "rewards/action_legal/std": 0.75, "rewards/env_reward/mean": -0.7559999823570251, "rewards/env_reward/std": 1.496000051498413, "rewards/belief_accuracy/mean": 0.05693966522812843, "rewards/belief_accuracy/std": 0.20072676241397858, "reward": -0.5865602493286133, "reward_std": 2.3446295261383057, "frac_reward_zero_std": 0.0, "completion_length": 17.0, "kl": 0.058574457885697484, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.005, "step": 4 }, { "loss": 0.0026, "grad_norm": 89.02346801757812, "learning_rate": 1e-05, "num_tokens": 12236.0, "completions/mean_length": 8.25, "completions/min_length": 6.0, "completions/max_length": 10.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 8.25, "completions/min_terminated_length": 6.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": -0.5, "rewards/format_valid/std": 1.7320507764816284, "rewards/action_legal/mean": -0.25, "rewards/action_legal/std": 0.8660253882408142, "rewards/env_reward/mean": -1.356624960899353, "rewards/env_reward/std": 1.9143962860107422, "rewards/belief_accuracy/mean": 0.015800803899765015, "rewards/belief_accuracy/std": 0.26350587606430054, "reward": -1.5658241510391235, "reward_std": 2.9457480907440186, "frac_reward_zero_std": 0.0, "completion_length": 10.0, "kl": 0.0644997438066639, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.00625, "step": 5 }, { "loss": 0.0022, "grad_norm": 106.08165740966797, "learning_rate": 1.25e-05, "num_tokens": 14679.0, "completions/mean_length": 10.75, "completions/min_length": 9.0, "completions/max_length": 12.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 10.75, "completions/min_terminated_length": 9.0, "completions/max_terminated_length": 12.0, "rewards/format_valid/mean": 0.25, "rewards/format_valid/std": 1.5, "rewards/action_legal/mean": 0.125, "rewards/action_legal/std": 0.75, "rewards/env_reward/mean": -0.7939000129699707, "rewards/env_reward/std": 1.4925646781921387, "rewards/belief_accuracy/mean": 0.16155476868152618, "rewards/belief_accuracy/std": 0.25382331013679504, "reward": -0.519845187664032, "reward_std": 2.4074840545654297, "frac_reward_zero_std": 0.0, "completion_length": 12.0, "kl": 0.053889136761426926, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.0075, "step": 6 }, { "loss": 0.0002, "grad_norm": 3.7754714488983154, "learning_rate": 1.5e-05, "num_tokens": 17117.0, "completions/mean_length": 9.5, "completions/min_length": 9.0, "completions/max_length": 10.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 9.5, "completions/min_terminated_length": 9.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.5, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.2715499997138977, "rewards/env_reward/std": 0.10536643117666245, "rewards/belief_accuracy/mean": 0.08886384963989258, "rewards/belief_accuracy/std": 0.13198231160640717, "reward": 0.8104138374328613, "reward_std": 0.08244366198778152, "frac_reward_zero_std": 0.0, "completion_length": 10.0, "kl": 0.006218573806108907, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.00875, "step": 7 }, { "loss": 0.0002, "grad_norm": 9.92833137512207, "learning_rate": 1.75e-05, "num_tokens": 19558.0, "completions/mean_length": 10.25, "completions/min_length": 10.0, "completions/max_length": 11.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 10.25, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 11.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.5, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.2057500183582306, "rewards/env_reward/std": 1.0211000442504883, "rewards/belief_accuracy/mean": 0.1838883012533188, "rewards/belief_accuracy/std": 0.13176093995571136, "reward": 0.8396382927894592, "reward_std": 0.9627154469490051, "frac_reward_zero_std": 0.0, "completion_length": 11.0, "kl": 0.005196493031689897, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.01, "step": 8 }, { "loss": 0.0003, "grad_norm": 6.379985809326172, "learning_rate": 2e-05, "num_tokens": 22017.0, "completions/mean_length": 14.75, "completions/min_length": 10.0, "completions/max_length": 20.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 14.75, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 20.0, "rewards/format_valid/mean": 0.25, "rewards/format_valid/std": 1.5, "rewards/action_legal/mean": 0.125, "rewards/action_legal/std": 0.75, "rewards/env_reward/mean": -0.9052000045776367, "rewards/env_reward/std": 1.3965555429458618, "rewards/belief_accuracy/mean": 0.11377047002315521, "rewards/belief_accuracy/std": 0.26322486996650696, "reward": -0.6789295077323914, "reward_std": 2.286468744277954, "frac_reward_zero_std": 0.0, "completion_length": 20.0, "kl": 0.008396895253099501, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.01125, "step": 9 }, { "loss": 0.0003, "grad_norm": 8.207470893859863, "learning_rate": 2.25e-05, "num_tokens": 23876.0, "completions/mean_length": 9.75, "completions/min_length": 9.0, "completions/max_length": 10.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 9.75, "completions/min_terminated_length": 9.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.5, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": -0.20945000648498535, "rewards/env_reward/std": 0.026100002229213715, "rewards/belief_accuracy/mean": 0.3318161964416504, "rewards/belief_accuracy/std": 0.13663487136363983, "reward": 0.5723661780357361, "reward_std": 0.11954429745674133, "frac_reward_zero_std": 0.0, "completion_length": 10.0, "kl": 0.008475569004076533, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.0125, "step": 10 }, { "loss": 0.0, "grad_norm": 9.754448890686035, "learning_rate": 2.5e-05, "num_tokens": 26313.0, "completions/mean_length": 9.25, "completions/min_length": 9.0, "completions/max_length": 10.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 9.25, "completions/min_terminated_length": 9.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.5, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.20990000665187836, "rewards/env_reward/std": 0.1363999992609024, "rewards/belief_accuracy/mean": 0.21230295300483704, "rewards/belief_accuracy/std": 0.07961412519216537, "reward": 0.87220299243927, "reward_std": 0.18807174265384674, "frac_reward_zero_std": 0.0, "completion_length": 10.0, "kl": 0.0007791817042743787, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.01375, "step": 11 }, { "loss": 0.0016, "grad_norm": 21.532686233520508, "learning_rate": 2.7500000000000004e-05, "num_tokens": 28385.0, "completions/mean_length": 12.0, "completions/min_length": 10.0, "completions/max_length": 14.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 12.0, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 14.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.5, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.19249999523162842, "rewards/env_reward/std": 0.16419841349124908, "rewards/belief_accuracy/mean": 0.2661348283290863, "rewards/belief_accuracy/std": 0.16707739233970642, "reward": 0.9086348414421082, "reward_std": 0.2632542550563812, "frac_reward_zero_std": 0.0, "completion_length": 14.0, "kl": 0.03962589804723393, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.015, "step": 12 }, { "loss": 0.0038, "grad_norm": 6.38393497467041, "learning_rate": 3e-05, "num_tokens": 30846.0, "completions/mean_length": 15.25, "completions/min_length": 9.0, "completions/max_length": 21.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 15.25, "completions/min_terminated_length": 9.0, "completions/max_terminated_length": 21.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.5, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": -0.06422500312328339, "rewards/env_reward/std": 0.4030914902687073, "rewards/belief_accuracy/mean": 0.32079488039016724, "rewards/belief_accuracy/std": 0.32852134108543396, "reward": 0.7065698504447937, "reward_std": 0.5689817667007446, "frac_reward_zero_std": 0.0, "completion_length": 21.0, "kl": 0.09559060208266601, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.01625, "step": 13 }, { "loss": 0.0024, "grad_norm": 61.281898498535156, "learning_rate": 3.2500000000000004e-05, "num_tokens": 33308.0, "completions/mean_length": 15.5, "completions/min_length": 12.0, "completions/max_length": 20.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 15.5, "completions/min_terminated_length": 12.0, "completions/max_terminated_length": 20.0, "rewards/format_valid/mean": -0.5, "rewards/format_valid/std": 1.7320507764816284, "rewards/action_legal/mean": -0.25, "rewards/action_legal/std": 0.8660253882408142, "rewards/env_reward/mean": -2.1558499336242676, "rewards/env_reward/std": 0.9747405052185059, "rewards/belief_accuracy/mean": 0.0016010589897632599, "rewards/belief_accuracy/std": 0.2347448617219925, "reward": -2.379248857498169, "reward_std": 1.9871822595596313, "frac_reward_zero_std": 0.0, "completion_length": 20.0, "kl": 0.061171281384304166, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.0175, "step": 14 }, { "loss": 0.0049, "grad_norm": 11.862137794494629, "learning_rate": 3.5e-05, "num_tokens": 35033.0, "completions/mean_length": 9.25, "completions/min_length": 9.0, "completions/max_length": 10.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 9.25, "completions/min_terminated_length": 9.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.5, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": -0.11425000429153442, "rewards/env_reward/std": 0.29398712515830994, "rewards/belief_accuracy/mean": 0.16708904504776, "rewards/belief_accuracy/std": 0.07635380327701569, "reward": 0.5028390884399414, "reward_std": 0.36507099866867065, "frac_reward_zero_std": 0.0, "completion_length": 10.0, "kl": 0.12338710279436782, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.01875, "step": 15 }, { "loss": 0.0095, "grad_norm": 10.376911163330078, "learning_rate": 3.7500000000000003e-05, "num_tokens": 37493.0, "completions/mean_length": 15.0, "completions/min_length": 11.0, "completions/max_length": 19.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 15.0, "completions/min_terminated_length": 11.0, "completions/max_terminated_length": 19.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.5, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": -0.30535000562667847, "rewards/env_reward/std": 0.29750001430511475, "rewards/belief_accuracy/mean": 0.34598004817962646, "rewards/belief_accuracy/std": 0.10989798605442047, "reward": 0.49063006043434143, "reward_std": 0.3236295282840729, "frac_reward_zero_std": 0.0, "completion_length": 19.0, "kl": 0.2386501464061439, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.02, "step": 16 }, { "loss": 0.0021, "grad_norm": 5.434854507446289, "learning_rate": 4e-05, "num_tokens": 39931.0, "completions/mean_length": 9.5, "completions/min_length": 9.0, "completions/max_length": 10.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 9.5, "completions/min_terminated_length": 9.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 0.25, "rewards/format_valid/std": 1.5, "rewards/action_legal/mean": 0.125, "rewards/action_legal/std": 0.75, "rewards/env_reward/mean": -0.694350004196167, "rewards/env_reward/std": 1.5409032106399536, "rewards/belief_accuracy/mean": 0.029872901737689972, "rewards/belief_accuracy/std": 0.17124304175376892, "reward": -0.5519770383834839, "reward_std": 2.3723113536834717, "frac_reward_zero_std": 0.0, "completion_length": 10.0, "kl": 0.052817441552178934, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.02125, "step": 17 }, { "loss": 0.0003, "grad_norm": 5.025228977203369, "learning_rate": 4.25e-05, "num_tokens": 42371.0, "completions/mean_length": 10.0, "completions/min_length": 10.0, "completions/max_length": 10.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 10.0, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.5, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.5460000038146973, "rewards/env_reward/std": 0.2443346381187439, "rewards/belief_accuracy/mean": 0.3628334403038025, "rewards/belief_accuracy/std": 0.06860831379890442, "reward": 1.3588334321975708, "reward_std": 0.29131758213043213, "frac_reward_zero_std": 0.0, "completion_length": 10.0, "kl": 0.007609898108057678, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.0225, "step": 18 }, { "loss": 0.0071, "grad_norm": 12.366896629333496, "learning_rate": 4.5e-05, "num_tokens": 44812.0, "completions/mean_length": 10.25, "completions/min_length": 9.0, "completions/max_length": 11.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 10.25, "completions/min_terminated_length": 9.0, "completions/max_terminated_length": 11.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.5, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": -0.27274996042251587, "rewards/env_reward/std": 1.072299599647522, "rewards/belief_accuracy/mean": 0.18611717224121094, "rewards/belief_accuracy/std": 0.1490413397550583, "reward": 0.3633672595024109, "reward_std": 1.0452975034713745, "frac_reward_zero_std": 0.0, "completion_length": 11.0, "kl": 0.17784105247119442, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.02375, "step": 19 }, { "loss": 0.0241, "grad_norm": 7.6515116691589355, "learning_rate": 4.75e-05, "num_tokens": 47269.0, "completions/mean_length": 14.25, "completions/min_length": 10.0, "completions/max_length": 19.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 14.25, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 19.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.5, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": -0.08722500503063202, "rewards/env_reward/std": 0.626167356967926, "rewards/belief_accuracy/mean": 0.35667186975479126, "rewards/belief_accuracy/std": 0.14748485386371613, "reward": 0.7194468975067139, "reward_std": 0.6191092133522034, "frac_reward_zero_std": 0.0, "completion_length": 19.0, "kl": 0.6034408716950566, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.025, "step": 20 }, { "loss": 0.0133, "grad_norm": 17.213253021240234, "learning_rate": 5e-05, "num_tokens": 49727.0, "completions/mean_length": 14.5, "completions/min_length": 10.0, "completions/max_length": 20.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 14.5, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 20.0, "rewards/format_valid/mean": -0.5, "rewards/format_valid/std": 1.7320507764816284, "rewards/action_legal/mean": -0.25, "rewards/action_legal/std": 0.8660253882408142, "rewards/env_reward/mean": -1.7017250061035156, "rewards/env_reward/std": 1.6725139617919922, "rewards/belief_accuracy/mean": 0.07274026423692703, "rewards/belief_accuracy/std": 0.3264457583427429, "reward": -1.8539845943450928, "reward_std": 2.6750690937042236, "frac_reward_zero_std": 0.0, "completion_length": 20.0, "kl": 0.33139822795055807, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.02625, "step": 21 }, { "loss": 0.0269, "grad_norm": 5.38259220123291, "learning_rate": 4.972222222222223e-05, "num_tokens": 52170.0, "completions/mean_length": 10.75, "completions/min_length": 10.0, "completions/max_length": 12.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 10.75, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 12.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.5, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": -0.08584998548030853, "rewards/env_reward/std": 0.7486812472343445, "rewards/belief_accuracy/mean": 0.23837246000766754, "rewards/belief_accuracy/std": 0.15508003532886505, "reward": 0.6025224924087524, "reward_std": 0.7518510222434998, "frac_reward_zero_std": 0.0, "completion_length": 12.0, "kl": 0.6713134994497523, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.0275, "step": 22 }, { "loss": 0.018, "grad_norm": 5.713683605194092, "learning_rate": 4.9444444444444446e-05, "num_tokens": 54618.0, "completions/mean_length": 12.0, "completions/min_length": 9.0, "completions/max_length": 19.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 12.0, "completions/min_terminated_length": 9.0, "completions/max_terminated_length": 19.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.5, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": -0.17819999158382416, "rewards/env_reward/std": 0.41936323046684265, "rewards/belief_accuracy/mean": 0.34318745136260986, "rewards/belief_accuracy/std": 0.13306953012943268, "reward": 0.6149874925613403, "reward_std": 0.34224066138267517, "frac_reward_zero_std": 0.0, "completion_length": 19.0, "kl": 0.4507274613715708, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.02875, "step": 23 }, { "loss": 0.022, "grad_norm": 233.1907501220703, "learning_rate": 4.9166666666666665e-05, "num_tokens": 57077.0, "completions/mean_length": 14.75, "completions/min_length": 11.0, "completions/max_length": 21.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 14.75, "completions/min_terminated_length": 11.0, "completions/max_terminated_length": 21.0, "rewards/format_valid/mean": -1.25, "rewards/format_valid/std": 1.5, "rewards/action_legal/mean": -0.625, "rewards/action_legal/std": 0.75, "rewards/env_reward/mean": -2.308774948120117, "rewards/env_reward/std": 1.382449984550476, "rewards/belief_accuracy/mean": -0.052610140293836594, "rewards/belief_accuracy/std": 0.2947797477245331, "reward": -2.9238851070404053, "reward_std": 2.352229595184326, "frac_reward_zero_std": 0.0, "completion_length": 21.0, "kl": 0.5501463627442718, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.03, "step": 24 }, { "loss": 0.027, "grad_norm": 7.147825241088867, "learning_rate": 4.888888888888889e-05, "num_tokens": 59515.0, "completions/mean_length": 9.5, "completions/min_length": 9.0, "completions/max_length": 10.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 9.5, "completions/min_terminated_length": 9.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.5, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": -0.2957499921321869, "rewards/env_reward/std": 0.6496629118919373, "rewards/belief_accuracy/mean": 0.33073002099990845, "rewards/belief_accuracy/std": 0.0814066082239151, "reward": 0.484980046749115, "reward_std": 0.7028239965438843, "frac_reward_zero_std": 0.0, "completion_length": 10.0, "kl": 0.6739767706021667, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.03125, "step": 25 }, { "loss": 0.0318, "grad_norm": 5.604952812194824, "learning_rate": 4.8611111111111115e-05, "num_tokens": 61952.0, "completions/mean_length": 9.25, "completions/min_length": 9.0, "completions/max_length": 10.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 9.25, "completions/min_terminated_length": 9.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.5, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": -0.08219999819993973, "rewards/env_reward/std": 0.0, "rewards/belief_accuracy/mean": 0.23919677734375, "rewards/belief_accuracy/std": 0.09254845231771469, "reward": 0.6069967746734619, "reward_std": 0.0925484374165535, "frac_reward_zero_std": 0.0, "completion_length": 10.0, "kl": 0.7961917221546173, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.0325, "step": 26 }, { "loss": 0.0256, "grad_norm": 8.258967399597168, "learning_rate": 4.8333333333333334e-05, "num_tokens": 64408.0, "completions/mean_length": 14.0, "completions/min_length": 10.0, "completions/max_length": 20.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 14.0, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 20.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.5, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.013175025582313538, "rewards/env_reward/std": 0.7235533595085144, "rewards/belief_accuracy/mean": 0.3164691925048828, "rewards/belief_accuracy/std": 0.18967638909816742, "reward": 0.7796441912651062, "reward_std": 0.7857587933540344, "frac_reward_zero_std": 0.0, "completion_length": 20.0, "kl": 0.6395482331572566, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.03375, "step": 27 }, { "loss": 0.0298, "grad_norm": 5.232230186462402, "learning_rate": 4.805555555555556e-05, "num_tokens": 66706.0, "completions/mean_length": 9.5, "completions/min_length": 9.0, "completions/max_length": 10.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 9.5, "completions/min_terminated_length": 9.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.5, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.43720000982284546, "rewards/env_reward/std": 0.28588902950286865, "rewards/belief_accuracy/mean": 0.2731848359107971, "rewards/belief_accuracy/std": 0.09957370162010193, "reward": 1.1603848934173584, "reward_std": 0.23410998284816742, "frac_reward_zero_std": 0.0, "completion_length": 10.0, "kl": 0.7461551874876022, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.035, "step": 28 }, { "loss": 0.0112, "grad_norm": 5.738325595855713, "learning_rate": 4.7777777777777784e-05, "num_tokens": 69146.0, "completions/mean_length": 10.0, "completions/min_length": 10.0, "completions/max_length": 10.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 10.0, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.5, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.3465000092983246, "rewards/env_reward/std": 0.2543998062610626, "rewards/belief_accuracy/mean": 0.01810377836227417, "rewards/belief_accuracy/std": 0.052378278225660324, "reward": 0.8146038055419922, "reward_std": 0.2780461311340332, "frac_reward_zero_std": 0.0, "completion_length": 10.0, "kl": 0.28069217689335346, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.03625, "step": 29 }, { "loss": 0.0143, "grad_norm": 9.730031967163086, "learning_rate": 4.75e-05, "num_tokens": 71592.0, "completions/mean_length": 11.5, "completions/min_length": 10.0, "completions/max_length": 12.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 11.5, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 12.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.5, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.18525001406669617, "rewards/env_reward/std": 0.8912999629974365, "rewards/belief_accuracy/mean": 0.3416762351989746, "rewards/belief_accuracy/std": 0.08965588361024857, "reward": 0.9769262671470642, "reward_std": 0.8466672897338867, "frac_reward_zero_std": 0.0, "completion_length": 12.0, "kl": 0.35769763961434364, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.0375, "step": 30 }, { "loss": 0.0104, "grad_norm": 5.303464412689209, "learning_rate": 4.722222222222222e-05, "num_tokens": 74031.0, "completions/mean_length": 9.75, "completions/min_length": 9.0, "completions/max_length": 10.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 9.75, "completions/min_terminated_length": 9.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.5, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.299049973487854, "rewards/env_reward/std": 0.04009999334812164, "rewards/belief_accuracy/mean": 0.21054524183273315, "rewards/belief_accuracy/std": 0.09055177122354507, "reward": 0.9595953226089478, "reward_std": 0.08771299570798874, "frac_reward_zero_std": 0.0, "completion_length": 10.0, "kl": 0.2592179449275136, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.03875, "step": 31 }, { "loss": 0.0194, "grad_norm": 11.62190055847168, "learning_rate": 4.6944444444444446e-05, "num_tokens": 76469.0, "completions/mean_length": 9.5, "completions/min_length": 9.0, "completions/max_length": 10.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 9.5, "completions/min_terminated_length": 9.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.5, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.46445000171661377, "rewards/env_reward/std": 0.5925008654594421, "rewards/belief_accuracy/mean": 0.05113711208105087, "rewards/belief_accuracy/std": 0.0761847198009491, "reward": 0.9655871391296387, "reward_std": 0.5988060832023621, "frac_reward_zero_std": 0.0, "completion_length": 10.0, "kl": 0.48394265957176685, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.04, "step": 32 }, { "loss": 0.0115, "grad_norm": 4.997323036193848, "learning_rate": 4.666666666666667e-05, "num_tokens": 78908.0, "completions/mean_length": 9.75, "completions/min_length": 9.0, "completions/max_length": 10.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 9.75, "completions/min_terminated_length": 9.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.5, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.1738000065088272, "rewards/env_reward/std": 0.15648801624774933, "rewards/belief_accuracy/mean": 0.23964327573776245, "rewards/belief_accuracy/std": 0.16614758968353271, "reward": 0.8634432554244995, "reward_std": 0.2403155416250229, "frac_reward_zero_std": 0.0, "completion_length": 10.0, "kl": 0.28875103616155684, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.04125, "step": 33 }, { "loss": 0.026, "grad_norm": 7.761037826538086, "learning_rate": 4.638888888888889e-05, "num_tokens": 80634.0, "completions/mean_length": 9.5, "completions/min_length": 9.0, "completions/max_length": 10.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 9.5, "completions/min_terminated_length": 9.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.5, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.6396250128746033, "rewards/env_reward/std": 0.5599880218505859, "rewards/belief_accuracy/mean": 0.18821844458580017, "rewards/belief_accuracy/std": 0.017153441905975342, "reward": 1.2778434753417969, "reward_std": 0.5499616265296936, "frac_reward_zero_std": 0.0, "completion_length": 10.0, "kl": 0.6490175630897284, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.0425, "step": 34 }, { "loss": 0.0338, "grad_norm": 6.997340202331543, "learning_rate": 4.6111111111111115e-05, "num_tokens": 83071.0, "completions/mean_length": 9.25, "completions/min_length": 9.0, "completions/max_length": 10.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 9.25, "completions/min_terminated_length": 9.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.5, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.4887250065803528, "rewards/env_reward/std": 0.24987412989139557, "rewards/belief_accuracy/mean": 0.09596741199493408, "rewards/belief_accuracy/std": 0.11219175159931183, "reward": 1.034692406654358, "reward_std": 0.3406420350074768, "frac_reward_zero_std": 0.0, "completion_length": 10.0, "kl": 0.8445844687521458, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.04375, "step": 35 }, { "loss": 0.0318, "grad_norm": 4.503076553344727, "learning_rate": 4.5833333333333334e-05, "num_tokens": 85527.0, "completions/mean_length": 14.0, "completions/min_length": 11.0, "completions/max_length": 21.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 14.0, "completions/min_terminated_length": 11.0, "completions/max_terminated_length": 21.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.5, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": -0.6953500509262085, "rewards/env_reward/std": 0.7904999852180481, "rewards/belief_accuracy/mean": 0.27033090591430664, "rewards/belief_accuracy/std": 0.22744685411453247, "reward": 0.024980902671813965, "reward_std": 0.5933173894882202, "frac_reward_zero_std": 0.0, "completion_length": 21.0, "kl": 0.7940250784158707, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.045, "step": 36 }, { "loss": 0.0309, "grad_norm": 2.819326877593994, "learning_rate": 4.555555555555556e-05, "num_tokens": 87957.0, "completions/mean_length": 9.5, "completions/min_length": 9.0, "completions/max_length": 10.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 9.5, "completions/min_terminated_length": 9.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.5, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.23907500505447388, "rewards/env_reward/std": 0.026238946244120598, "rewards/belief_accuracy/mean": 0.28547099232673645, "rewards/belief_accuracy/std": 0.0, "reward": 0.9745460152626038, "reward_std": 0.026238948106765747, "frac_reward_zero_std": 0.0, "completion_length": 10.0, "kl": 0.7724389061331749, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.04625, "step": 37 }, { "loss": 0.0035, "grad_norm": 3.8090322017669678, "learning_rate": 4.527777777777778e-05, "num_tokens": 90397.0, "completions/mean_length": 10.0, "completions/min_length": 10.0, "completions/max_length": 10.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 10.0, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.5, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.03530000150203705, "rewards/env_reward/std": 0.0, "rewards/belief_accuracy/mean": 0.336172491312027, "rewards/belief_accuracy/std": 0.10255803912878036, "reward": 0.8214725255966187, "reward_std": 0.10255800932645798, "frac_reward_zero_std": 0.0, "completion_length": 10.0, "kl": 0.08815138379577547, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.0475, "step": 38 }, { "loss": 0.0228, "grad_norm": 5.734916687011719, "learning_rate": 4.5e-05, "num_tokens": 92640.0, "completions/mean_length": 9.75, "completions/min_length": 9.0, "completions/max_length": 10.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 9.75, "completions/min_terminated_length": 9.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.5, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.0450499951839447, "rewards/env_reward/std": 0.1559000015258789, "rewards/belief_accuracy/mean": 0.31915175914764404, "rewards/belief_accuracy/std": 0.09154129028320312, "reward": 0.8142017722129822, "reward_std": 0.11311425268650055, "frac_reward_zero_std": 0.0, "completion_length": 10.0, "kl": 0.5696801505982876, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.04875, "step": 39 }, { "loss": 0.0337, "grad_norm": 90.21479034423828, "learning_rate": 4.472222222222223e-05, "num_tokens": 95096.0, "completions/mean_length": 14.0, "completions/min_length": 11.0, "completions/max_length": 21.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 14.0, "completions/min_terminated_length": 11.0, "completions/max_terminated_length": 21.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.5, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": -0.25302499532699585, "rewards/env_reward/std": 0.014749996364116669, "rewards/belief_accuracy/mean": 0.3270469307899475, "rewards/belief_accuracy/std": 0.11481481790542603, "reward": 0.5240219831466675, "reward_std": 0.11077450960874557, "frac_reward_zero_std": 0.0, "completion_length": 21.0, "kl": 0.8432229831814766, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.05, "step": 40 }, { "loss": 0.0248, "grad_norm": 6.006437301635742, "learning_rate": 4.4444444444444447e-05, "num_tokens": 97206.0, "completions/mean_length": 9.5, "completions/min_length": 9.0, "completions/max_length": 10.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 9.5, "completions/min_terminated_length": 9.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.5, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.753250002861023, "rewards/env_reward/std": 0.12153223156929016, "rewards/belief_accuracy/mean": 0.2870370149612427, "rewards/belief_accuracy/std": 0.11331170797348022, "reward": 1.490286946296692, "reward_std": 0.1738055795431137, "frac_reward_zero_std": 0.0, "completion_length": 10.0, "kl": 0.6199327223002911, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.05125, "step": 41 }, { "loss": 0.0388, "grad_norm": 3.4820449352264404, "learning_rate": 4.4166666666666665e-05, "num_tokens": 99661.0, "completions/mean_length": 13.75, "completions/min_length": 10.0, "completions/max_length": 19.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 13.75, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 19.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.5, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.09109999984502792, "rewards/env_reward/std": 0.4848000109195709, "rewards/belief_accuracy/mean": 0.1046341061592102, "rewards/belief_accuracy/std": 0.3321184515953064, "reward": 0.6457340717315674, "reward_std": 0.43696078658103943, "frac_reward_zero_std": 0.0, "completion_length": 19.0, "kl": 0.9688376598060131, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.0525, "step": 42 }, { "loss": 0.0204, "grad_norm": 8.73534870147705, "learning_rate": 4.388888888888889e-05, "num_tokens": 102100.0, "completions/mean_length": 9.75, "completions/min_length": 9.0, "completions/max_length": 10.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 9.75, "completions/min_terminated_length": 9.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.5, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.5957750082015991, "rewards/env_reward/std": 0.364950031042099, "rewards/belief_accuracy/mean": 0.3190256357192993, "rewards/belief_accuracy/std": 0.10691672563552856, "reward": 1.3648006916046143, "reward_std": 0.39195773005485535, "frac_reward_zero_std": 0.0, "completion_length": 10.0, "kl": 0.5111057488247752, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.05375, "step": 43 }, { "loss": 0.0233, "grad_norm": 9.246126174926758, "learning_rate": 4.3611111111111116e-05, "num_tokens": 104545.0, "completions/mean_length": 11.25, "completions/min_length": 11.0, "completions/max_length": 12.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 11.25, "completions/min_terminated_length": 11.0, "completions/max_terminated_length": 12.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.5, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.02084999904036522, "rewards/env_reward/std": 0.25270000100135803, "rewards/belief_accuracy/mean": -0.029667485505342484, "rewards/belief_accuracy/std": 0.035023655742406845, "reward": 0.44118252396583557, "reward_std": 0.2176763415336609, "frac_reward_zero_std": 0.0, "completion_length": 12.0, "kl": 0.5820450782775879, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.055, "step": 44 }, { "loss": 0.0268, "grad_norm": 9.163407325744629, "learning_rate": 4.3333333333333334e-05, "num_tokens": 106982.0, "completions/mean_length": 9.25, "completions/min_length": 9.0, "completions/max_length": 10.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 9.25, "completions/min_terminated_length": 9.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.5, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": -0.15120001137256622, "rewards/env_reward/std": 0.33800002932548523, "rewards/belief_accuracy/mean": 0.2724537253379822, "rewards/belief_accuracy/std": 0.03546027094125748, "reward": 0.571253776550293, "reward_std": 0.33365941047668457, "frac_reward_zero_std": 0.0, "completion_length": 10.0, "kl": 0.6694209240376949, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.05625, "step": 45 }, { "loss": 0.0449, "grad_norm": 11.127840042114258, "learning_rate": 4.305555555555556e-05, "num_tokens": 109433.0, "completions/mean_length": 12.75, "completions/min_length": 8.0, "completions/max_length": 19.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 12.75, "completions/min_terminated_length": 8.0, "completions/max_terminated_length": 19.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.5, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.23845000565052032, "rewards/env_reward/std": 0.7578877210617065, "rewards/belief_accuracy/mean": 0.3392890393733978, "rewards/belief_accuracy/std": 0.12074092775583267, "reward": 1.0277390480041504, "reward_std": 0.8693292737007141, "frac_reward_zero_std": 0.0, "completion_length": 19.0, "kl": 1.121412256732583, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.0575, "step": 46 }, { "loss": 0.027, "grad_norm": 11.611600875854492, "learning_rate": 4.277777777777778e-05, "num_tokens": 111875.0, "completions/mean_length": 10.5, "completions/min_length": 10.0, "completions/max_length": 11.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 10.5, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 11.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.5, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.8055999875068665, "rewards/env_reward/std": 0.8989343643188477, "rewards/belief_accuracy/mean": 0.17277316749095917, "rewards/belief_accuracy/std": 0.20537562668323517, "reward": 1.4283732175827026, "reward_std": 0.8459944128990173, "frac_reward_zero_std": 0.0, "completion_length": 11.0, "kl": 0.673852413892746, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.05875, "step": 47 }, { "loss": 0.0149, "grad_norm": 5.983867168426514, "learning_rate": 4.25e-05, "num_tokens": 114347.0, "completions/mean_length": 18.0, "completions/min_length": 12.0, "completions/max_length": 20.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 18.0, "completions/min_terminated_length": 12.0, "completions/max_terminated_length": 20.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.5, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": -0.19099999964237213, "rewards/env_reward/std": 0.4099873900413513, "rewards/belief_accuracy/mean": 0.1646607667207718, "rewards/belief_accuracy/std": 0.1431625336408615, "reward": 0.4236607849597931, "reward_std": 0.28222158551216125, "frac_reward_zero_std": 0.0, "completion_length": 20.0, "kl": 0.37127236742526293, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.06, "step": 48 }, { "loss": 0.0932, "grad_norm": 12.0399808883667, "learning_rate": 4.222222222222222e-05, "num_tokens": 116785.0, "completions/mean_length": 9.5, "completions/min_length": 9.0, "completions/max_length": 10.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 9.5, "completions/min_terminated_length": 9.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.5, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": -0.09814999997615814, "rewards/env_reward/std": 0.13192453980445862, "rewards/belief_accuracy/mean": 0.09809152781963348, "rewards/belief_accuracy/std": 0.055555559694767, "reward": 0.4499415159225464, "reward_std": 0.15268102288246155, "frac_reward_zero_std": 0.0, "completion_length": 10.0, "kl": 2.329849496483803, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.06125, "step": 49 }, { "loss": 0.0054, "grad_norm": 5.024685382843018, "learning_rate": 4.194444444444445e-05, "num_tokens": 119221.0, "completions/mean_length": 9.0, "completions/min_length": 9.0, "completions/max_length": 9.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 9.0, "completions/min_terminated_length": 9.0, "completions/max_terminated_length": 9.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.5, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": -0.23090000450611115, "rewards/env_reward/std": 0.0, "rewards/belief_accuracy/mean": 0.14893732964992523, "rewards/belief_accuracy/std": 0.0555555559694767, "reward": 0.3680373430252075, "reward_std": 0.0555555485188961, "frac_reward_zero_std": 0.0, "completion_length": 9.0, "kl": 0.13551567122340202, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.0625, "step": 50 }, { "loss": 0.026, "grad_norm": 7.126793384552002, "learning_rate": 4.166666666666667e-05, "num_tokens": 121659.0, "completions/mean_length": 9.5, "completions/min_length": 9.0, "completions/max_length": 10.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 9.5, "completions/min_terminated_length": 9.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.5, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.013075001537799835, "rewards/env_reward/std": 0.06034718081355095, "rewards/belief_accuracy/mean": 0.3297470211982727, "rewards/belief_accuracy/std": 0.0708894208073616, "reward": 0.792822003364563, "reward_std": 0.08787091821432114, "frac_reward_zero_std": 0.0, "completion_length": 10.0, "kl": 0.6491426480934024, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.06375, "step": 51 }, { "loss": 0.0217, "grad_norm": 12.94462776184082, "learning_rate": 4.138888888888889e-05, "num_tokens": 124097.0, "completions/mean_length": 9.5, "completions/min_length": 8.0, "completions/max_length": 10.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 9.5, "completions/min_terminated_length": 8.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 0.25, "rewards/format_valid/std": 1.5, "rewards/action_legal/mean": 0.125, "rewards/action_legal/std": 0.75, "rewards/env_reward/mean": -0.3197999894618988, "rewards/env_reward/std": 1.7868000268936157, "rewards/belief_accuracy/mean": 0.11957336962223053, "rewards/belief_accuracy/std": 0.2547529339790344, "reward": -0.08772659301757812, "reward_std": 2.6784932613372803, "frac_reward_zero_std": 0.0, "completion_length": 10.0, "kl": 0.5436751991510391, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.065, "step": 52 }, { "loss": 0.056, "grad_norm": 12.370966911315918, "learning_rate": 4.111111111111111e-05, "num_tokens": 126534.0, "completions/mean_length": 9.25, "completions/min_length": 9.0, "completions/max_length": 10.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 9.25, "completions/min_terminated_length": 9.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.5, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.2619749903678894, "rewards/env_reward/std": 0.17835001647472382, "rewards/belief_accuracy/mean": 0.19994337856769562, "rewards/belief_accuracy/std": 0.052378278225660324, "reward": 0.9119184017181396, "reward_std": 0.18588221073150635, "frac_reward_zero_std": 0.0, "completion_length": 10.0, "kl": 1.4008168280124664, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.06625, "step": 53 }, { "loss": 0.0071, "grad_norm": 4.3133320808410645, "learning_rate": 4.0833333333333334e-05, "num_tokens": 128642.0, "completions/mean_length": 9.0, "completions/min_length": 9.0, "completions/max_length": 9.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 9.0, "completions/min_terminated_length": 9.0, "completions/max_terminated_length": 9.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.5, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.31644999980926514, "rewards/env_reward/std": 0.17150001227855682, "rewards/belief_accuracy/mean": 0.3312014043331146, "rewards/belief_accuracy/std": 0.038512155413627625, "reward": 1.097651481628418, "reward_std": 0.16205953061580658, "frac_reward_zero_std": 0.0, "completion_length": 9.0, "kl": 0.1781236482784152, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.0675, "step": 54 }, { "loss": 0.0357, "grad_norm": 34.0543098449707, "learning_rate": 4.055555555555556e-05, "num_tokens": 131109.0, "completions/mean_length": 16.75, "completions/min_length": 12.0, "completions/max_length": 20.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 16.75, "completions/min_terminated_length": 12.0, "completions/max_terminated_length": 20.0, "rewards/format_valid/mean": 0.25, "rewards/format_valid/std": 1.5, "rewards/action_legal/mean": 0.125, "rewards/action_legal/std": 0.75, "rewards/env_reward/mean": -0.9136250019073486, "rewards/env_reward/std": 1.4743117094039917, "rewards/belief_accuracy/mean": 0.2600095570087433, "rewards/belief_accuracy/std": 0.3070025146007538, "reward": -0.541115403175354, "reward_std": 2.423954486846924, "frac_reward_zero_std": 0.0, "completion_length": 20.0, "kl": 0.8935712603852153, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.06875, "step": 55 }, { "loss": 0.0153, "grad_norm": 8.462577819824219, "learning_rate": 4.027777777777778e-05, "num_tokens": 133548.0, "completions/mean_length": 9.75, "completions/min_length": 9.0, "completions/max_length": 10.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 9.75, "completions/min_terminated_length": 9.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.5, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.055775001645088196, "rewards/env_reward/std": 0.4801793694496155, "rewards/belief_accuracy/mean": 0.3351301848888397, "rewards/belief_accuracy/std": 0.03546025604009628, "reward": 0.8409051895141602, "reward_std": 0.5089951157569885, "frac_reward_zero_std": 0.0, "completion_length": 10.0, "kl": 0.3837334793061018, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.07, "step": 56 }, { "loss": 0.0161, "grad_norm": 6.602283477783203, "learning_rate": 4e-05, "num_tokens": 135987.0, "completions/mean_length": 9.75, "completions/min_length": 9.0, "completions/max_length": 10.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 9.75, "completions/min_terminated_length": 9.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.5, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.2533249855041504, "rewards/env_reward/std": 0.19902949035167694, "rewards/belief_accuracy/mean": 0.3198787569999695, "rewards/belief_accuracy/std": 0.06048122048377991, "reward": 1.0232038497924805, "reward_std": 0.17945493757724762, "frac_reward_zero_std": 0.0, "completion_length": 10.0, "kl": 0.4018232896924019, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.07125, "step": 57 }, { "loss": 0.0606, "grad_norm": 11.349449157714844, "learning_rate": 3.972222222222222e-05, "num_tokens": 138443.0, "completions/mean_length": 14.0, "completions/min_length": 13.0, "completions/max_length": 16.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 14.0, "completions/min_terminated_length": 13.0, "completions/max_terminated_length": 16.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.5, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": -0.17509999871253967, "rewards/env_reward/std": 0.0, "rewards/belief_accuracy/mean": 0.4381297826766968, "rewards/belief_accuracy/std": 0.01180770993232727, "reward": 0.7130297422409058, "reward_std": 0.011807739734649658, "frac_reward_zero_std": 0.0, "completion_length": 16.0, "kl": 1.514005783945322, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.0725, "step": 58 }, { "loss": 0.0119, "grad_norm": 3.675447702407837, "learning_rate": 3.944444444444445e-05, "num_tokens": 140371.0, "completions/mean_length": 9.0, "completions/min_length": 9.0, "completions/max_length": 9.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 9.0, "completions/min_terminated_length": 9.0, "completions/max_terminated_length": 9.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.5, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.058400001376867294, "rewards/env_reward/std": 0.0, "rewards/belief_accuracy/mean": 0.2348596453666687, "rewards/belief_accuracy/std": 0.08212428539991379, "reward": 0.7432596683502197, "reward_std": 0.08212430030107498, "frac_reward_zero_std": 0.0, "completion_length": 9.0, "kl": 0.2982510030269623, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.07375, "step": 59 }, { "loss": 0.0312, "grad_norm": 9.70500659942627, "learning_rate": 3.9166666666666665e-05, "num_tokens": 142813.0, "completions/mean_length": 10.5, "completions/min_length": 10.0, "completions/max_length": 11.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 10.5, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 11.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.5, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": -0.012999998405575752, "rewards/env_reward/std": 0.165887251496315, "rewards/belief_accuracy/mean": 0.3499753177165985, "rewards/belief_accuracy/std": 0.02043679915368557, "reward": 0.7869753837585449, "reward_std": 0.1802431344985962, "frac_reward_zero_std": 0.0, "completion_length": 11.0, "kl": 0.7789931371808052, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.075, "step": 60 }, { "loss": 0.0199, "grad_norm": 9.680730819702148, "learning_rate": 3.888888888888889e-05, "num_tokens": 145285.0, "completions/mean_length": 18.0, "completions/min_length": 15.0, "completions/max_length": 19.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 18.0, "completions/min_terminated_length": 15.0, "completions/max_terminated_length": 19.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.5, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.44760000705718994, "rewards/env_reward/std": 0.31296268105506897, "rewards/belief_accuracy/mean": 0.48169824481010437, "rewards/belief_accuracy/std": 0.02147861011326313, "reward": 1.379298210144043, "reward_std": 0.29271385073661804, "frac_reward_zero_std": 0.0, "completion_length": 19.0, "kl": 0.49687472730875015, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.07625, "step": 61 }, { "loss": 0.0277, "grad_norm": 9.184836387634277, "learning_rate": 3.8611111111111116e-05, "num_tokens": 147583.0, "completions/mean_length": 9.5, "completions/min_length": 9.0, "completions/max_length": 10.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 9.5, "completions/min_terminated_length": 9.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.5, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.1648000031709671, "rewards/env_reward/std": 0.29756635427474976, "rewards/belief_accuracy/mean": 0.3408702611923218, "rewards/belief_accuracy/std": 0.10047954320907593, "reward": 0.9556702375411987, "reward_std": 0.3963484764099121, "frac_reward_zero_std": 0.0, "completion_length": 10.0, "kl": 0.6918958639726043, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.0775, "step": 62 }, { "loss": 0.0278, "grad_norm": 9.352582931518555, "learning_rate": 3.8333333333333334e-05, "num_tokens": 150037.0, "completions/mean_length": 13.5, "completions/min_length": 9.0, "completions/max_length": 16.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 13.5, "completions/min_terminated_length": 9.0, "completions/max_terminated_length": 16.0, "rewards/format_valid/mean": 0.25, "rewards/format_valid/std": 1.5, "rewards/action_legal/mean": 0.125, "rewards/action_legal/std": 0.75, "rewards/env_reward/mean": -0.4378499984741211, "rewards/env_reward/std": 1.753225564956665, "rewards/belief_accuracy/mean": 0.22078636288642883, "rewards/belief_accuracy/std": 0.28245842456817627, "reward": -0.10456359386444092, "reward_std": 2.6881396770477295, "frac_reward_zero_std": 0.0, "completion_length": 16.0, "kl": 0.6942682042717934, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.07875, "step": 63 }, { "loss": 0.0056, "grad_norm": 5.730816841125488, "learning_rate": 3.805555555555555e-05, "num_tokens": 152477.0, "completions/mean_length": 10.0, "completions/min_length": 10.0, "completions/max_length": 10.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 10.0, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.5, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.03889999911189079, "rewards/env_reward/std": 0.0, "rewards/belief_accuracy/mean": 0.3603585958480835, "rewards/belief_accuracy/std": 0.02581331506371498, "reward": 0.8492586016654968, "reward_std": 0.025813313201069832, "frac_reward_zero_std": 0.0, "completion_length": 10.0, "kl": 0.1406126618385315, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.08, "step": 64 }, { "loss": 0.0318, "grad_norm": 12.032550811767578, "learning_rate": 3.777777777777778e-05, "num_tokens": 154916.0, "completions/mean_length": 9.75, "completions/min_length": 9.0, "completions/max_length": 10.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 9.75, "completions/min_terminated_length": 9.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 0.25, "rewards/format_valid/std": 1.5, "rewards/action_legal/mean": 0.125, "rewards/action_legal/std": 0.75, "rewards/env_reward/mean": -0.625, "rewards/env_reward/std": 1.5835769176483154, "rewards/belief_accuracy/mean": 0.1766444593667984, "rewards/belief_accuracy/std": 0.251135915517807, "reward": -0.33585548400878906, "reward_std": 2.5095832347869873, "frac_reward_zero_std": 0.0, "completion_length": 10.0, "kl": 0.7948775328695774, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.08125, "step": 65 }, { "loss": 0.0085, "grad_norm": 7.913240432739258, "learning_rate": 3.7500000000000003e-05, "num_tokens": 157354.0, "completions/mean_length": 9.5, "completions/min_length": 9.0, "completions/max_length": 10.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 9.5, "completions/min_terminated_length": 9.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.5, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.79830002784729, "rewards/env_reward/std": 0.4454835057258606, "rewards/belief_accuracy/mean": 0.38048359751701355, "rewards/belief_accuracy/std": 0.046978533267974854, "reward": 1.6287837028503418, "reward_std": 0.42615801095962524, "frac_reward_zero_std": 0.0, "completion_length": 10.0, "kl": 0.2136247158050537, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.0825, "step": 66 }, { "loss": 0.0042, "grad_norm": 1.553076148033142, "learning_rate": 3.722222222222222e-05, "num_tokens": 159830.0, "completions/mean_length": 19.0, "completions/min_length": 19.0, "completions/max_length": 19.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 19.0, "completions/min_terminated_length": 19.0, "completions/max_terminated_length": 19.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.5, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.3972000181674957, "rewards/env_reward/std": 0.3020000159740448, "rewards/belief_accuracy/mean": 0.4757356643676758, "rewards/belief_accuracy/std": 0.043959349393844604, "reward": 1.322935700416565, "reward_std": 0.2903204560279846, "frac_reward_zero_std": 0.0, "completion_length": 19.0, "kl": 0.10467652417719364, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.08375, "step": 67 }, { "loss": 0.0652, "grad_norm": 6.090060710906982, "learning_rate": 3.694444444444445e-05, "num_tokens": 162269.0, "completions/mean_length": 9.75, "completions/min_length": 9.0, "completions/max_length": 10.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 9.75, "completions/min_terminated_length": 9.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.5, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.018775001168251038, "rewards/env_reward/std": 0.4861546754837036, "rewards/belief_accuracy/mean": 0.31664538383483887, "rewards/belief_accuracy/std": 0.0875483974814415, "reward": 0.7854204177856445, "reward_std": 0.4746541976928711, "frac_reward_zero_std": 0.0, "completion_length": 10.0, "kl": 1.6307630129158497, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.085, "step": 68 }, { "loss": 0.0096, "grad_norm": 3.485288381576538, "learning_rate": 3.6666666666666666e-05, "num_tokens": 164709.0, "completions/mean_length": 10.0, "completions/min_length": 10.0, "completions/max_length": 10.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 10.0, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.5, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.5713000297546387, "rewards/env_reward/std": 0.0, "rewards/belief_accuracy/mean": 0.36959928274154663, "rewards/belief_accuracy/std": 0.02012895792722702, "reward": 1.390899419784546, "reward_std": 0.020128881558775902, "frac_reward_zero_std": 0.0, "completion_length": 10.0, "kl": 0.23944057151675224, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.08625, "step": 69 }, { "loss": 0.0732, "grad_norm": 16.18677520751953, "learning_rate": 3.638888888888889e-05, "num_tokens": 167148.0, "completions/mean_length": 9.75, "completions/min_length": 9.0, "completions/max_length": 10.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 9.75, "completions/min_terminated_length": 9.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.5, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.16820000112056732, "rewards/env_reward/std": 0.28401991724967957, "rewards/belief_accuracy/mean": 0.3244222104549408, "rewards/belief_accuracy/std": 0.043074190616607666, "reward": 0.942622184753418, "reward_std": 0.25368720293045044, "frac_reward_zero_std": 0.0, "completion_length": 10.0, "kl": 1.8300514370203018, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.0875, "step": 70 }, { "loss": 0.0468, "grad_norm": 7.618915557861328, "learning_rate": 3.611111111111111e-05, "num_tokens": 169586.0, "completions/mean_length": 9.5, "completions/min_length": 9.0, "completions/max_length": 10.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 9.5, "completions/min_terminated_length": 9.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.5, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.016399994492530823, "rewards/env_reward/std": 0.3711565434932709, "rewards/belief_accuracy/mean": 0.3871665596961975, "rewards/belief_accuracy/std": 0.05502847209572792, "reward": 0.8535665273666382, "reward_std": 0.3577015995979309, "frac_reward_zero_std": 0.0, "completion_length": 10.0, "kl": 1.169862363487482, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.08875, "step": 71 }, { "loss": 0.0272, "grad_norm": 17.216642379760742, "learning_rate": 3.5833333333333335e-05, "num_tokens": 171696.0, "completions/mean_length": 9.5, "completions/min_length": 9.0, "completions/max_length": 10.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 9.5, "completions/min_terminated_length": 9.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.5, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.3703000247478485, "rewards/env_reward/std": 0.4340519607067108, "rewards/belief_accuracy/mean": 0.20865774154663086, "rewards/belief_accuracy/std": 0.03286440670490265, "reward": 1.028957724571228, "reward_std": 0.41150763630867004, "frac_reward_zero_std": 0.0, "completion_length": 10.0, "kl": 0.6808963306248188, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.09, "step": 72 }, { "loss": 0.008, "grad_norm": 14.145647048950195, "learning_rate": 3.555555555555556e-05, "num_tokens": 174176.0, "completions/mean_length": 20.0, "completions/min_length": 19.0, "completions/max_length": 21.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 20.0, "completions/min_terminated_length": 19.0, "completions/max_terminated_length": 21.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.5, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": -1.2345499992370605, "rewards/env_reward/std": 1.2276198863983154, "rewards/belief_accuracy/mean": 0.31225794553756714, "rewards/belief_accuracy/std": 0.21262100338935852, "reward": -0.4722920060157776, "reward_std": 1.0149987936019897, "frac_reward_zero_std": 0.0, "completion_length": 21.0, "kl": 0.20024005696177483, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.09125, "step": 73 }, { "loss": 0.0093, "grad_norm": 8.049619674682617, "learning_rate": 3.527777777777778e-05, "num_tokens": 176616.0, "completions/mean_length": 10.0, "completions/min_length": 10.0, "completions/max_length": 10.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 10.0, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.5, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.3262999951839447, "rewards/env_reward/std": 0.0, "rewards/belief_accuracy/mean": 0.2448773980140686, "rewards/belief_accuracy/std": 0.14699672162532806, "reward": 1.0211774110794067, "reward_std": 0.14699670672416687, "frac_reward_zero_std": 0.0, "completion_length": 10.0, "kl": 0.2335779219865799, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.0925, "step": 74 }, { "loss": 0.0063, "grad_norm": 3.8269565105438232, "learning_rate": 3.5e-05, "num_tokens": 179093.0, "completions/mean_length": 19.25, "completions/min_length": 19.0, "completions/max_length": 20.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 19.25, "completions/min_terminated_length": 19.0, "completions/max_terminated_length": 20.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.5, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": -0.314300000667572, "rewards/env_reward/std": 0.020400002598762512, "rewards/belief_accuracy/mean": 0.1281227469444275, "rewards/belief_accuracy/std": 0.0, "reward": 0.2638227641582489, "reward_std": 0.020399997010827065, "frac_reward_zero_std": 0.0, "completion_length": 20.0, "kl": 0.15778795257210732, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.09375, "step": 75 }, { "loss": 0.0156, "grad_norm": 7.611948490142822, "learning_rate": 3.472222222222222e-05, "num_tokens": 181568.0, "completions/mean_length": 18.75, "completions/min_length": 17.0, "completions/max_length": 20.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 18.75, "completions/min_terminated_length": 17.0, "completions/max_terminated_length": 20.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.5, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.3963499963283539, "rewards/env_reward/std": 0.6052363514900208, "rewards/belief_accuracy/mean": 0.165622740983963, "rewards/belief_accuracy/std": 0.07500001043081284, "reward": 1.0119727849960327, "reward_std": 0.6514222621917725, "frac_reward_zero_std": 0.0, "completion_length": 20.0, "kl": 0.389703668653965, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.095, "step": 76 }, { "loss": 0.0065, "grad_norm": 0.13189589977264404, "learning_rate": 3.444444444444445e-05, "num_tokens": 184008.0, "completions/mean_length": 10.0, "completions/min_length": 10.0, "completions/max_length": 10.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 10.0, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.5, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.708899974822998, "rewards/env_reward/std": 0.0, "rewards/belief_accuracy/mean": 0.4184635281562805, "rewards/belief_accuracy/std": 0.0, "reward": 1.5773634910583496, "reward_std": 0.0, "frac_reward_zero_std": 1.0, "completion_length": 10.0, "kl": 0.16265837848186493, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.09625, "step": 77 }, { "loss": 0.0114, "grad_norm": 6.107391834259033, "learning_rate": 3.4166666666666666e-05, "num_tokens": 186448.0, "completions/mean_length": 10.0, "completions/min_length": 10.0, "completions/max_length": 10.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 10.0, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.5, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.38842499256134033, "rewards/env_reward/std": 0.06755000352859497, "rewards/belief_accuracy/mean": 0.24584877490997314, "rewards/belief_accuracy/std": 0.06684443354606628, "reward": 1.0842738151550293, "reward_std": 0.12935107946395874, "frac_reward_zero_std": 0.0, "completion_length": 10.0, "kl": 0.2844863813370466, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.0975, "step": 78 }, { "loss": 0.0256, "grad_norm": 8.824180603027344, "learning_rate": 3.388888888888889e-05, "num_tokens": 188887.0, "completions/mean_length": 9.75, "completions/min_length": 9.0, "completions/max_length": 10.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 9.75, "completions/min_terminated_length": 9.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.5, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.4518749713897705, "rewards/env_reward/std": 0.4644499719142914, "rewards/belief_accuracy/mean": 0.19068412482738495, "rewards/belief_accuracy/std": 0.03546026349067688, "reward": 1.0925590991973877, "reward_std": 0.483914315700531, "frac_reward_zero_std": 0.0, "completion_length": 10.0, "kl": 0.6403497904539108, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.09875, "step": 79 }, { "loss": 0.0113, "grad_norm": 3.3538818359375, "learning_rate": 3.3611111111111116e-05, "num_tokens": 191327.0, "completions/mean_length": 10.0, "completions/min_length": 10.0, "completions/max_length": 10.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 10.0, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.5, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.18469999730587006, "rewards/env_reward/std": 0.0, "rewards/belief_accuracy/mean": 0.3592345714569092, "rewards/belief_accuracy/std": 0.060644667595624924, "reward": 0.9939346313476562, "reward_std": 0.06064464524388313, "frac_reward_zero_std": 0.0, "completion_length": 10.0, "kl": 0.2827349714934826, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.1, "step": 80 }, { "loss": 0.0897, "grad_norm": 23.832109451293945, "learning_rate": 3.3333333333333335e-05, "num_tokens": 193803.0, "completions/mean_length": 19.0, "completions/min_length": 19.0, "completions/max_length": 19.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 19.0, "completions/min_terminated_length": 19.0, "completions/max_terminated_length": 19.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.5, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.1657000035047531, "rewards/env_reward/std": 0.0, "rewards/belief_accuracy/mean": 0.25543123483657837, "rewards/belief_accuracy/std": 0.09817582368850708, "reward": 0.8711313009262085, "reward_std": 0.09817580133676529, "frac_reward_zero_std": 0.0, "completion_length": 19.0, "kl": 2.242331273853779, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.10125, "step": 81 }, { "loss": 0.014, "grad_norm": 3.796640634536743, "learning_rate": 3.3055555555555553e-05, "num_tokens": 196243.0, "completions/mean_length": 10.0, "completions/min_length": 10.0, "completions/max_length": 10.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 10.0, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.5, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.1678999960422516, "rewards/env_reward/std": 0.0, "rewards/belief_accuracy/mean": 0.3403550088405609, "rewards/belief_accuracy/std": 0.03546027094125748, "reward": 0.9582550525665283, "reward_std": 0.03546029329299927, "frac_reward_zero_std": 0.0, "completion_length": 10.0, "kl": 0.35036180168390274, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.1025, "step": 82 }, { "loss": 0.0288, "grad_norm": 10.267955780029297, "learning_rate": 3.277777777777778e-05, "num_tokens": 198702.0, "completions/mean_length": 14.75, "completions/min_length": 12.0, "completions/max_length": 19.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 14.75, "completions/min_terminated_length": 12.0, "completions/max_terminated_length": 19.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.5, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.32109999656677246, "rewards/env_reward/std": 0.0, "rewards/belief_accuracy/mean": 0.2837349474430084, "rewards/belief_accuracy/std": 0.16117514669895172, "reward": 1.0548349618911743, "reward_std": 0.16117516160011292, "frac_reward_zero_std": 0.0, "completion_length": 19.0, "kl": 0.7201052233576775, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.10375, "step": 83 }, { "loss": 0.0066, "grad_norm": 4.409769535064697, "learning_rate": 3.2500000000000004e-05, "num_tokens": 201178.0, "completions/mean_length": 19.0, "completions/min_length": 19.0, "completions/max_length": 19.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 19.0, "completions/min_terminated_length": 19.0, "completions/max_terminated_length": 19.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.5, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.11659999936819077, "rewards/env_reward/std": 0.0, "rewards/belief_accuracy/mean": 0.2675629258155823, "rewards/belief_accuracy/std": 0.024750780314207077, "reward": 0.8341628909111023, "reward_std": 0.02475076913833618, "frac_reward_zero_std": 0.0, "completion_length": 19.0, "kl": 0.16481813788414001, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.105, "step": 84 }, { "loss": 0.01, "grad_norm": 2.341743230819702, "learning_rate": 3.222222222222223e-05, "num_tokens": 203618.0, "completions/mean_length": 10.0, "completions/min_length": 10.0, "completions/max_length": 10.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 10.0, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.5, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.03889999911189079, "rewards/env_reward/std": 0.0, "rewards/belief_accuracy/mean": 0.43518519401550293, "rewards/belief_accuracy/std": 0.029672183096408844, "reward": 0.9240851402282715, "reward_std": 0.02967216819524765, "frac_reward_zero_std": 0.0, "completion_length": 10.0, "kl": 0.24941369146108627, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.10625, "step": 85 }, { "loss": 0.0133, "grad_norm": 4.166858196258545, "learning_rate": 3.194444444444444e-05, "num_tokens": 206058.0, "completions/mean_length": 10.0, "completions/min_length": 10.0, "completions/max_length": 10.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 10.0, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.5, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.19509999454021454, "rewards/env_reward/std": 0.0, "rewards/belief_accuracy/mean": 0.3840411305427551, "rewards/belief_accuracy/std": 0.037037044763565063, "reward": 1.0291411876678467, "reward_std": 0.037037014961242676, "frac_reward_zero_std": 0.0, "completion_length": 10.0, "kl": 0.33342748135328293, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.1075, "step": 86 }, { "loss": 0.0072, "grad_norm": 10.752022743225098, "learning_rate": 3.1666666666666666e-05, "num_tokens": 208525.0, "completions/mean_length": 16.75, "completions/min_length": 10.0, "completions/max_length": 19.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 16.75, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 19.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.5, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 1.1086000204086304, "rewards/env_reward/std": 0.0, "rewards/belief_accuracy/mean": 0.26387926936149597, "rewards/belief_accuracy/std": 0.16067418456077576, "reward": 1.822479248046875, "reward_std": 0.16067416965961456, "frac_reward_zero_std": 0.0, "completion_length": 19.0, "kl": 0.17901013372465968, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.10875, "step": 87 }, { "loss": 0.0128, "grad_norm": 4.2524333000183105, "learning_rate": 3.138888888888889e-05, "num_tokens": 210965.0, "completions/mean_length": 10.0, "completions/min_length": 10.0, "completions/max_length": 10.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 10.0, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.5, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.6363000273704529, "rewards/env_reward/std": 0.0, "rewards/belief_accuracy/mean": 0.23227064311504364, "rewards/belief_accuracy/std": 0.04781460389494896, "reward": 1.3185707330703735, "reward_std": 0.04781458154320717, "frac_reward_zero_std": 0.0, "completion_length": 10.0, "kl": 0.32108578458428383, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.11, "step": 88 }, { "loss": 0.0201, "grad_norm": 3.387347936630249, "learning_rate": 3.111111111111111e-05, "num_tokens": 213029.0, "completions/mean_length": 10.0, "completions/min_length": 10.0, "completions/max_length": 10.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 10.0, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.5, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.29019999504089355, "rewards/env_reward/std": 0.0, "rewards/belief_accuracy/mean": 0.23877549171447754, "rewards/belief_accuracy/std": 0.08584360778331757, "reward": 0.9789755344390869, "reward_std": 0.08584359288215637, "frac_reward_zero_std": 0.0, "completion_length": 10.0, "kl": 0.5034720227122307, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.11125, "step": 89 }, { "loss": 0.0105, "grad_norm": 0.03340158984065056, "learning_rate": 3.0833333333333335e-05, "num_tokens": 215469.0, "completions/mean_length": 10.0, "completions/min_length": 10.0, "completions/max_length": 10.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 10.0, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.5, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.5357999801635742, "rewards/env_reward/std": 0.0, "rewards/belief_accuracy/mean": 0.3786620497703552, "rewards/belief_accuracy/std": 0.0, "reward": 1.36446213722229, "reward_std": 0.0, "frac_reward_zero_std": 1.0, "completion_length": 10.0, "kl": 0.26204439997673035, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.1125, "step": 90 }, { "loss": 0.0099, "grad_norm": 2.6622090339660645, "learning_rate": 3.055555555555556e-05, "num_tokens": 217909.0, "completions/mean_length": 10.0, "completions/min_length": 10.0, "completions/max_length": 10.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 10.0, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.5, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.6814000010490417, "rewards/env_reward/std": 0.0, "rewards/belief_accuracy/mean": 0.22560039162635803, "rewards/belief_accuracy/std": 0.03703703731298447, "reward": 1.3570003509521484, "reward_std": 0.037037014961242676, "frac_reward_zero_std": 0.0, "completion_length": 10.0, "kl": 0.24875067919492722, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.11375, "step": 91 }, { "loss": 0.0531, "grad_norm": 4.632673263549805, "learning_rate": 3.0277777777777776e-05, "num_tokens": 220348.0, "completions/mean_length": 9.75, "completions/min_length": 9.0, "completions/max_length": 10.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 9.75, "completions/min_terminated_length": 9.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.5, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.030675001442432404, "rewards/env_reward/std": 0.03785000368952751, "rewards/belief_accuracy/mean": 0.4066362977027893, "rewards/belief_accuracy/std": 0.03470852971076965, "reward": 0.8873113393783569, "reward_std": 0.0031414825934916735, "frac_reward_zero_std": 0.0, "completion_length": 10.0, "kl": 1.328475397080183, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.115, "step": 92 }, { "loss": 0.0128, "grad_norm": 4.490170001983643, "learning_rate": 3e-05, "num_tokens": 222788.0, "completions/mean_length": 10.0, "completions/min_length": 10.0, "completions/max_length": 10.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 10.0, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.5, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.10649999976158142, "rewards/env_reward/std": 0.0, "rewards/belief_accuracy/mean": 0.4476425051689148, "rewards/belief_accuracy/std": 0.022659141570329666, "reward": 1.0041425228118896, "reward_std": 0.02265910431742668, "frac_reward_zero_std": 0.0, "completion_length": 10.0, "kl": 0.32097524777054787, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.11625, "step": 93 }, { "loss": 0.0113, "grad_norm": 4.904633045196533, "learning_rate": 2.9722222222222223e-05, "num_tokens": 224648.0, "completions/mean_length": 10.0, "completions/min_length": 10.0, "completions/max_length": 10.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 10.0, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.5, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.467074990272522, "rewards/env_reward/std": 0.5573499798774719, "rewards/belief_accuracy/mean": 0.442878395318985, "rewards/belief_accuracy/std": 0.018518522381782532, "reward": 1.3599534034729004, "reward_std": 0.5514536499977112, "frac_reward_zero_std": 0.0, "completion_length": 10.0, "kl": 0.28252891823649406, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.1175, "step": 94 }, { "loss": 0.0106, "grad_norm": 3.4134881496429443, "learning_rate": 2.9444444444444448e-05, "num_tokens": 227127.0, "completions/mean_length": 19.75, "completions/min_length": 19.0, "completions/max_length": 20.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 19.75, "completions/min_terminated_length": 19.0, "completions/max_terminated_length": 20.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.5, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.16647499799728394, "rewards/env_reward/std": 0.2721500098705292, "rewards/belief_accuracy/mean": 0.34791865944862366, "rewards/belief_accuracy/std": 0.017089322209358215, "reward": 0.9643936157226562, "reward_std": 0.25506067276000977, "frac_reward_zero_std": 0.0, "completion_length": 20.0, "kl": 0.26468512788414955, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.11875, "step": 95 }, { "loss": 0.0126, "grad_norm": 0.03512604907155037, "learning_rate": 2.916666666666667e-05, "num_tokens": 229567.0, "completions/mean_length": 10.0, "completions/min_length": 10.0, "completions/max_length": 10.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 10.0, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.5, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.37619999051094055, "rewards/env_reward/std": 0.0, "rewards/belief_accuracy/mean": 0.42342302203178406, "rewards/belief_accuracy/std": 0.0, "reward": 1.2496230602264404, "reward_std": 0.0, "frac_reward_zero_std": 1.0, "completion_length": 10.0, "kl": 0.3145686089992523, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.12, "step": 96 }, { "loss": 0.0097, "grad_norm": 3.007344961166382, "learning_rate": 2.8888888888888888e-05, "num_tokens": 232007.0, "completions/mean_length": 10.0, "completions/min_length": 10.0, "completions/max_length": 10.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 10.0, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.5, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.04450000077486038, "rewards/env_reward/std": 0.0, "rewards/belief_accuracy/mean": 0.4202476143836975, "rewards/belief_accuracy/std": 0.013101109303534031, "reward": 0.9147475957870483, "reward_std": 0.013101109303534031, "frac_reward_zero_std": 0.0, "completion_length": 10.0, "kl": 0.2418195605278015, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.12125, "step": 97 }, { "loss": 0.0109, "grad_norm": 0.023203933611512184, "learning_rate": 2.861111111111111e-05, "num_tokens": 234447.0, "completions/mean_length": 10.0, "completions/min_length": 10.0, "completions/max_length": 10.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 10.0, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.5, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.6881999969482422, "rewards/env_reward/std": 0.0, "rewards/belief_accuracy/mean": 0.3073524236679077, "rewards/belief_accuracy/std": 0.0, "reward": 1.4455524682998657, "reward_std": 0.0, "frac_reward_zero_std": 1.0, "completion_length": 10.0, "kl": 0.27170511707663536, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.1225, "step": 98 }, { "loss": 0.004, "grad_norm": 0.017642363905906677, "learning_rate": 2.8333333333333335e-05, "num_tokens": 236923.0, "completions/mean_length": 19.0, "completions/min_length": 19.0, "completions/max_length": 19.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 19.0, "completions/min_terminated_length": 19.0, "completions/max_terminated_length": 19.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.5, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.7996000051498413, "rewards/env_reward/std": 0.0, "rewards/belief_accuracy/mean": 0.433454692363739, "rewards/belief_accuracy/std": 0.0, "reward": 1.683054804801941, "reward_std": 0.0, "frac_reward_zero_std": 1.0, "completion_length": 19.0, "kl": 0.09895136207342148, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.12375, "step": 99 }, { "loss": 0.0167, "grad_norm": 0.03634629398584366, "learning_rate": 2.8055555555555557e-05, "num_tokens": 239363.0, "completions/mean_length": 10.0, "completions/min_length": 10.0, "completions/max_length": 10.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 10.0, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.5, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.021700000390410423, "rewards/env_reward/std": 0.0, "rewards/belief_accuracy/mean": 0.43159350752830505, "rewards/belief_accuracy/std": 0.0, "reward": 0.9032934904098511, "reward_std": 0.0, "frac_reward_zero_std": 1.0, "completion_length": 10.0, "kl": 0.417863130569458, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.125, "step": 100 }, { "loss": 0.0746, "grad_norm": 168.017333984375, "learning_rate": 2.777777777777778e-05, "num_tokens": 241829.0, "completions/mean_length": 16.5, "completions/min_length": 10.0, "completions/max_length": 19.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 16.5, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 19.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.5, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.3156999945640564, "rewards/env_reward/std": 0.0, "rewards/belief_accuracy/mean": 0.2862425148487091, "rewards/belief_accuracy/std": 0.14819316565990448, "reward": 1.0519425868988037, "reward_std": 0.14819319546222687, "frac_reward_zero_std": 0.0, "completion_length": 19.0, "kl": 1.8653298392891884, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.12625, "step": 101 }, { "loss": 0.0201, "grad_norm": 3.334230661392212, "learning_rate": 2.7500000000000004e-05, "num_tokens": 244269.0, "completions/mean_length": 10.0, "completions/min_length": 10.0, "completions/max_length": 10.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 10.0, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.5, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.08259999752044678, "rewards/env_reward/std": 0.0, "rewards/belief_accuracy/mean": 0.2945099174976349, "rewards/belief_accuracy/std": 0.034212201833724976, "reward": 0.8271099328994751, "reward_std": 0.03421219810843468, "frac_reward_zero_std": 0.0, "completion_length": 10.0, "kl": 0.5032302476465702, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.1275, "step": 102 }, { "loss": 0.0131, "grad_norm": 4.814326286315918, "learning_rate": 2.7222222222222223e-05, "num_tokens": 246709.0, "completions/mean_length": 10.0, "completions/min_length": 10.0, "completions/max_length": 10.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 10.0, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.5, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.3188999891281128, "rewards/env_reward/std": 0.0, "rewards/belief_accuracy/mean": 0.35364869236946106, "rewards/belief_accuracy/std": 0.018518522381782532, "reward": 1.1225488185882568, "reward_std": 0.01851852796971798, "frac_reward_zero_std": 0.0, "completion_length": 10.0, "kl": 0.3283202312886715, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.12875, "step": 103 }, { "loss": 0.0082, "grad_norm": 7.209584712982178, "learning_rate": 2.6944444444444445e-05, "num_tokens": 249167.0, "completions/mean_length": 14.5, "completions/min_length": 10.0, "completions/max_length": 19.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 14.5, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 19.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.5, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.37380000948905945, "rewards/env_reward/std": 0.0, "rewards/belief_accuracy/mean": 0.46567827463150024, "rewards/belief_accuracy/std": 0.008995355106890202, "reward": 1.2894783020019531, "reward_std": 0.008995347656309605, "frac_reward_zero_std": 0.0, "completion_length": 19.0, "kl": 0.2050139382481575, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.13, "step": 104 }, { "loss": 0.0286, "grad_norm": 0.03365113213658333, "learning_rate": 2.6666666666666667e-05, "num_tokens": 251607.0, "completions/mean_length": 10.0, "completions/min_length": 10.0, "completions/max_length": 10.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 10.0, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.5, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.41749998927116394, "rewards/env_reward/std": 0.0, "rewards/belief_accuracy/mean": 0.16369564831256866, "rewards/belief_accuracy/std": 0.0, "reward": 1.0311956405639648, "reward_std": 0.0, "frac_reward_zero_std": 1.0, "completion_length": 10.0, "kl": 0.7158948183059692, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.13125, "step": 105 }, { "loss": 0.0196, "grad_norm": 0.0734815001487732, "learning_rate": 2.6388888888888892e-05, "num_tokens": 254047.0, "completions/mean_length": 10.0, "completions/min_length": 10.0, "completions/max_length": 10.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 10.0, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.5, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.025800000876188278, "rewards/env_reward/std": 0.0, "rewards/belief_accuracy/mean": 0.42342302203178406, "rewards/belief_accuracy/std": 0.0, "reward": 0.8992230296134949, "reward_std": 0.0, "frac_reward_zero_std": 1.0, "completion_length": 10.0, "kl": 0.490933895111084, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.1325, "step": 106 }, { "loss": 0.0328, "grad_norm": 13.114912033081055, "learning_rate": 2.6111111111111114e-05, "num_tokens": 256488.0, "completions/mean_length": 10.25, "completions/min_length": 10.0, "completions/max_length": 11.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 10.25, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 11.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.5, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.26030001044273376, "rewards/env_reward/std": 0.0, "rewards/belief_accuracy/mean": 0.35887354612350464, "rewards/belief_accuracy/std": 0.018518522381782532, "reward": 1.0691735744476318, "reward_std": 0.018518486991524696, "frac_reward_zero_std": 0.0, "completion_length": 11.0, "kl": 0.8195095211267471, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.13375, "step": 107 }, { "loss": 0.0128, "grad_norm": 9.382083892822266, "learning_rate": 2.5833333333333336e-05, "num_tokens": 258950.0, "completions/mean_length": 15.5, "completions/min_length": 11.0, "completions/max_length": 19.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 15.5, "completions/min_terminated_length": 11.0, "completions/max_terminated_length": 19.0, "rewards/format_valid/mean": 0.25, "rewards/format_valid/std": 1.5, "rewards/action_legal/mean": 0.125, "rewards/action_legal/std": 0.75, "rewards/env_reward/mean": -0.9452250003814697, "rewards/env_reward/std": 1.3756073713302612, "rewards/belief_accuracy/mean": 0.14408157765865326, "rewards/belief_accuracy/std": 0.24749916791915894, "reward": -0.6886433959007263, "reward_std": 2.2765731811523438, "frac_reward_zero_std": 0.0, "completion_length": 19.0, "kl": 0.31994709372520447, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.135, "step": 108 }, { "loss": 0.0109, "grad_norm": 49.812625885009766, "learning_rate": 2.5555555555555554e-05, "num_tokens": 261418.0, "completions/mean_length": 17.0, "completions/min_length": 10.0, "completions/max_length": 20.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 17.0, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 20.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.5, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.6625750064849854, "rewards/env_reward/std": 0.3948500156402588, "rewards/belief_accuracy/mean": 0.4087354242801666, "rewards/belief_accuracy/std": 0.049438536167144775, "reward": 1.521310567855835, "reward_std": 0.3812306821346283, "frac_reward_zero_std": 0.0, "completion_length": 20.0, "kl": 0.27156141633167863, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.13625, "step": 109 }, { "loss": 0.0072, "grad_norm": 2.5267906188964844, "learning_rate": 2.527777777777778e-05, "num_tokens": 263894.0, "completions/mean_length": 19.0, "completions/min_length": 19.0, "completions/max_length": 19.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 19.0, "completions/min_terminated_length": 19.0, "completions/max_terminated_length": 19.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.5, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.13199999928474426, "rewards/env_reward/std": 0.0, "rewards/belief_accuracy/mean": 0.449174702167511, "rewards/belief_accuracy/std": 0.01414213702082634, "reward": 1.0311747789382935, "reward_std": 0.014142122119665146, "frac_reward_zero_std": 0.0, "completion_length": 19.0, "kl": 0.17876873165369034, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.1375, "step": 110 }, { "loss": 0.0016, "grad_norm": 0.0028416195418685675, "learning_rate": 2.5e-05, "num_tokens": 266370.0, "completions/mean_length": 19.0, "completions/min_length": 19.0, "completions/max_length": 19.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 19.0, "completions/min_terminated_length": 19.0, "completions/max_terminated_length": 19.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.5, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.5741999745368958, "rewards/env_reward/std": 0.0, "rewards/belief_accuracy/mean": 0.433454692363739, "rewards/belief_accuracy/std": 0.0, "reward": 1.4576547145843506, "reward_std": 0.0, "frac_reward_zero_std": 1.0, "completion_length": 19.0, "kl": 0.04003394767642021, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.13875, "step": 111 }, { "loss": 1.0107, "grad_norm": 1877.4913330078125, "learning_rate": 2.4722222222222223e-05, "num_tokens": 268837.0, "completions/mean_length": 16.75, "completions/min_length": 10.0, "completions/max_length": 19.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 16.75, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 19.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.5, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.20774999260902405, "rewards/env_reward/std": 0.14230000972747803, "rewards/belief_accuracy/mean": 0.31272566318511963, "rewards/belief_accuracy/std": 0.15558846294879913, "reward": 0.9704756736755371, "reward_std": 0.2818722426891327, "frac_reward_zero_std": 0.0, "completion_length": 19.0, "kl": 25.26655474305153, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.14, "step": 112 }, { "loss": 0.0307, "grad_norm": 10.757664680480957, "learning_rate": 2.4444444444444445e-05, "num_tokens": 271271.0, "completions/mean_length": 10.5, "completions/min_length": 10.0, "completions/max_length": 12.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 10.5, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 12.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.5, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.10530000180006027, "rewards/env_reward/std": 0.0, "rewards/belief_accuracy/mean": 0.3786514401435852, "rewards/belief_accuracy/std": 0.15512926876544952, "reward": 0.9339514970779419, "reward_std": 0.15512926876544952, "frac_reward_zero_std": 0.0, "completion_length": 12.0, "kl": 0.7676222324371338, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.14125, "step": 113 }, { "loss": 0.0135, "grad_norm": 0.01638762652873993, "learning_rate": 2.4166666666666667e-05, "num_tokens": 273711.0, "completions/mean_length": 10.0, "completions/min_length": 10.0, "completions/max_length": 10.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 10.0, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.5, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.5629000067710876, "rewards/env_reward/std": 0.0, "rewards/belief_accuracy/mean": 0.20073269307613373, "rewards/belief_accuracy/std": 0.0, "reward": 1.2136327028274536, "reward_std": 0.0, "frac_reward_zero_std": 1.0, "completion_length": 10.0, "kl": 0.33665189146995544, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.1425, "step": 114 }, { "loss": 0.0143, "grad_norm": 4.445044040679932, "learning_rate": 2.3888888888888892e-05, "num_tokens": 276151.0, "completions/mean_length": 10.0, "completions/min_length": 10.0, "completions/max_length": 10.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 10.0, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.5, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.5357999801635742, "rewards/env_reward/std": 0.0, "rewards/belief_accuracy/mean": 0.4536323547363281, "rewards/belief_accuracy/std": 0.015065711922943592, "reward": 1.4394323825836182, "reward_std": 0.015065747313201427, "frac_reward_zero_std": 0.0, "completion_length": 10.0, "kl": 0.3578999266028404, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.14375, "step": 115 }, { "loss": 0.021, "grad_norm": 0.026247208938002586, "learning_rate": 2.361111111111111e-05, "num_tokens": 278591.0, "completions/mean_length": 10.0, "completions/min_length": 10.0, "completions/max_length": 10.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 10.0, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.5, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.41600000858306885, "rewards/env_reward/std": 0.0, "rewards/belief_accuracy/mean": 0.20073269307613373, "rewards/belief_accuracy/std": 0.0, "reward": 1.06673264503479, "reward_std": 0.0, "frac_reward_zero_std": 1.0, "completion_length": 10.0, "kl": 0.5255059599876404, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.145, "step": 116 }, { "loss": 0.0191, "grad_norm": 4.0344319343566895, "learning_rate": 2.3333333333333336e-05, "num_tokens": 281045.0, "completions/mean_length": 13.5, "completions/min_length": 10.0, "completions/max_length": 18.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 13.5, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 18.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.5, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.8622000217437744, "rewards/env_reward/std": 0.0, "rewards/belief_accuracy/mean": 0.4052424132823944, "rewards/belief_accuracy/std": 0.06313848495483398, "reward": 1.7174423933029175, "reward_std": 0.06313853710889816, "frac_reward_zero_std": 0.0, "completion_length": 18.0, "kl": 0.4771764427423477, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.14625, "step": 117 }, { "loss": 0.0093, "grad_norm": 76.42330169677734, "learning_rate": 2.3055555555555558e-05, "num_tokens": 283512.0, "completions/mean_length": 16.75, "completions/min_length": 10.0, "completions/max_length": 19.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 16.75, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 19.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.5, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.5303999781608582, "rewards/env_reward/std": 0.0, "rewards/belief_accuracy/mean": 0.4252280294895172, "rewards/belief_accuracy/std": 0.06796582788228989, "reward": 1.405627965927124, "reward_std": 0.06796585023403168, "frac_reward_zero_std": 0.0, "completion_length": 19.0, "kl": 0.23224885016679764, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.1475, "step": 118 }, { "loss": 0.0105, "grad_norm": 0.012090546078979969, "learning_rate": 2.277777777777778e-05, "num_tokens": 285952.0, "completions/mean_length": 10.0, "completions/min_length": 10.0, "completions/max_length": 10.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 10.0, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.5, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.40709999203681946, "rewards/env_reward/std": 0.0, "rewards/belief_accuracy/mean": 0.4255111813545227, "rewards/belief_accuracy/std": 0.0, "reward": 1.2826112508773804, "reward_std": 0.0, "frac_reward_zero_std": 1.0, "completion_length": 10.0, "kl": 0.26135221123695374, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.14875, "step": 119 }, { "loss": 0.021, "grad_norm": 5.484664440155029, "learning_rate": 2.25e-05, "num_tokens": 288392.0, "completions/mean_length": 10.0, "completions/min_length": 10.0, "completions/max_length": 10.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 10.0, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.5, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.0406000018119812, "rewards/env_reward/std": 0.0, "rewards/belief_accuracy/mean": 0.40826135873794556, "rewards/belief_accuracy/std": 0.01560924481600523, "reward": 0.8988614082336426, "reward_std": 0.015609226189553738, "frac_reward_zero_std": 0.0, "completion_length": 10.0, "kl": 0.5242489501833916, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.15, "step": 120 }, { "loss": 0.0171, "grad_norm": 5.3713908195495605, "learning_rate": 2.2222222222222223e-05, "num_tokens": 290832.0, "completions/mean_length": 10.0, "completions/min_length": 10.0, "completions/max_length": 10.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 10.0, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.5, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.8973000049591064, "rewards/env_reward/std": 0.0, "rewards/belief_accuracy/mean": 0.21689815819263458, "rewards/belief_accuracy/std": 0.03546026349067688, "reward": 1.5641981363296509, "reward_std": 0.035460252314805984, "frac_reward_zero_std": 0.0, "completion_length": 10.0, "kl": 0.4266308322548866, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.15125, "step": 121 }, { "loss": 0.0185, "grad_norm": 75.12261199951172, "learning_rate": 2.1944444444444445e-05, "num_tokens": 293290.0, "completions/mean_length": 14.5, "completions/min_length": 10.0, "completions/max_length": 19.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 14.5, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 19.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.5, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": -0.4083000123500824, "rewards/env_reward/std": 0.0, "rewards/belief_accuracy/mean": 0.26912587881088257, "rewards/belief_accuracy/std": 0.05725647136569023, "reward": 0.3108258843421936, "reward_std": 0.05725647509098053, "frac_reward_zero_std": 0.0, "completion_length": 19.0, "kl": 0.46135450154542923, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.1525, "step": 122 }, { "loss": 0.0181, "grad_norm": 3.971485137939453, "learning_rate": 2.1666666666666667e-05, "num_tokens": 295731.0, "completions/mean_length": 10.25, "completions/min_length": 10.0, "completions/max_length": 11.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 10.25, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 11.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.5, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.43160000443458557, "rewards/env_reward/std": 0.22380001842975616, "rewards/belief_accuracy/mean": 0.3166116774082184, "rewards/belief_accuracy/std": 0.018518507480621338, "reward": 1.198211669921875, "reward_std": 0.2306346893310547, "frac_reward_zero_std": 0.0, "completion_length": 11.0, "kl": 0.45348528772592545, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.15375, "step": 123 }, { "loss": 0.0196, "grad_norm": 25.937440872192383, "learning_rate": 2.138888888888889e-05, "num_tokens": 298201.0, "completions/mean_length": 17.5, "completions/min_length": 13.0, "completions/max_length": 19.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 17.5, "completions/min_terminated_length": 13.0, "completions/max_terminated_length": 19.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.5, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.3862000107765198, "rewards/env_reward/std": 0.0, "rewards/belief_accuracy/mean": 0.3432316482067108, "rewards/belief_accuracy/std": 0.13312995433807373, "reward": 1.179431676864624, "reward_std": 0.13312996923923492, "frac_reward_zero_std": 0.0, "completion_length": 19.0, "kl": 0.490961529314518, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.155, "step": 124 }, { "loss": 0.0171, "grad_norm": 4.3738932609558105, "learning_rate": 2.111111111111111e-05, "num_tokens": 300641.0, "completions/mean_length": 10.0, "completions/min_length": 10.0, "completions/max_length": 10.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 10.0, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.5, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.263700008392334, "rewards/env_reward/std": 0.3020000159740448, "rewards/belief_accuracy/mean": 0.3073524236679077, "rewards/belief_accuracy/std": 0.0, "reward": 1.021052360534668, "reward_std": 0.3019999861717224, "frac_reward_zero_std": 0.0, "completion_length": 10.0, "kl": 0.4266118183732033, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.15625, "step": 125 }, { "loss": 0.0177, "grad_norm": 2.936905860900879, "learning_rate": 2.0833333333333336e-05, "num_tokens": 303081.0, "completions/mean_length": 10.0, "completions/min_length": 10.0, "completions/max_length": 10.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 10.0, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.5, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.1843000054359436, "rewards/env_reward/std": 0.0, "rewards/belief_accuracy/mean": 0.34071606397628784, "rewards/belief_accuracy/std": 0.0325876921415329, "reward": 0.9750161170959473, "reward_std": 0.0325876884162426, "frac_reward_zero_std": 0.0, "completion_length": 10.0, "kl": 0.4430236220359802, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.1575, "step": 126 }, { "loss": 0.0118, "grad_norm": 0.03275100514292717, "learning_rate": 2.0555555555555555e-05, "num_tokens": 305521.0, "completions/mean_length": 10.0, "completions/min_length": 10.0, "completions/max_length": 10.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 10.0, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.5, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.022199999541044235, "rewards/env_reward/std": 0.0, "rewards/belief_accuracy/mean": 0.4255111813545227, "rewards/belief_accuracy/std": 0.0, "reward": 0.8977112174034119, "reward_std": 0.0, "frac_reward_zero_std": 1.0, "completion_length": 10.0, "kl": 0.2940594553947449, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.15875, "step": 127 }, { "loss": 0.0133, "grad_norm": 0.03054218925535679, "learning_rate": 2.027777777777778e-05, "num_tokens": 307961.0, "completions/mean_length": 10.0, "completions/min_length": 10.0, "completions/max_length": 10.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 10.0, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.5, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.18289999663829803, "rewards/env_reward/std": 0.0, "rewards/belief_accuracy/mean": 0.3570099174976349, "rewards/belief_accuracy/std": 0.0, "reward": 0.9899099469184875, "reward_std": 0.0, "frac_reward_zero_std": 1.0, "completion_length": 10.0, "kl": 0.3323657214641571, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.16, "step": 128 }, { "loss": 0.0098, "grad_norm": 2.7979254722595215, "learning_rate": 2e-05, "num_tokens": 310401.0, "completions/mean_length": 10.0, "completions/min_length": 10.0, "completions/max_length": 10.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 10.0, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.5, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.23109999299049377, "rewards/env_reward/std": 0.0, "rewards/belief_accuracy/mean": 0.32913801074028015, "rewards/belief_accuracy/std": 0.018518522381782532, "reward": 1.0102381706237793, "reward_std": 0.01851852796971798, "frac_reward_zero_std": 0.0, "completion_length": 10.0, "kl": 0.2442447803914547, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.16125, "step": 129 }, { "loss": 0.0157, "grad_norm": 81.0110092163086, "learning_rate": 1.9722222222222224e-05, "num_tokens": 312870.0, "completions/mean_length": 17.25, "completions/min_length": 10.0, "completions/max_length": 20.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 17.25, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 20.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.5, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.702049970626831, "rewards/env_reward/std": 0.4657484292984009, "rewards/belief_accuracy/mean": 0.42725393176078796, "rewards/belief_accuracy/std": 0.012401506304740906, "reward": 1.5793039798736572, "reward_std": 0.4587002098560333, "frac_reward_zero_std": 0.0, "completion_length": 20.0, "kl": 0.3928138017654419, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.1625, "step": 130 }, { "loss": 0.0217, "grad_norm": 0.06868572533130646, "learning_rate": 1.9444444444444445e-05, "num_tokens": 315310.0, "completions/mean_length": 10.0, "completions/min_length": 10.0, "completions/max_length": 10.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 10.0, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.5, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.031199999153614044, "rewards/env_reward/std": 0.0, "rewards/belief_accuracy/mean": 0.4255111813545227, "rewards/belief_accuracy/std": 0.0, "reward": 0.906711220741272, "reward_std": 0.0, "frac_reward_zero_std": 1.0, "completion_length": 10.0, "kl": 0.5432239770889282, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.16375, "step": 131 }, { "loss": 0.0192, "grad_norm": 0.019419007003307343, "learning_rate": 1.9166666666666667e-05, "num_tokens": 317750.0, "completions/mean_length": 10.0, "completions/min_length": 10.0, "completions/max_length": 10.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 10.0, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.5, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.5879999995231628, "rewards/env_reward/std": 0.0, "rewards/belief_accuracy/mean": 0.20073269307613373, "rewards/belief_accuracy/std": 0.0, "reward": 1.2387326955795288, "reward_std": 0.0, "frac_reward_zero_std": 1.0, "completion_length": 10.0, "kl": 0.48082277178764343, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.165, "step": 132 }, { "loss": 0.0611, "grad_norm": 88.00366973876953, "learning_rate": 1.888888888888889e-05, "num_tokens": 320204.0, "completions/mean_length": 13.5, "completions/min_length": 10.0, "completions/max_length": 16.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 13.5, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 16.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.5, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 1.4107749462127686, "rewards/env_reward/std": 0.745449960231781, "rewards/belief_accuracy/mean": 0.37157177925109863, "rewards/belief_accuracy/std": 0.1919439733028412, "reward": 2.232346773147583, "reward_std": 0.7095935940742493, "frac_reward_zero_std": 0.0, "completion_length": 16.0, "kl": 1.527749940752983, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.16625, "step": 133 }, { "loss": 0.0172, "grad_norm": 0.016352592036128044, "learning_rate": 1.861111111111111e-05, "num_tokens": 322644.0, "completions/mean_length": 10.0, "completions/min_length": 10.0, "completions/max_length": 10.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 10.0, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.5, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.7775999903678894, "rewards/env_reward/std": 0.0, "rewards/belief_accuracy/mean": 0.28115594387054443, "rewards/belief_accuracy/std": 0.0, "reward": 1.5087559223175049, "reward_std": 0.0, "frac_reward_zero_std": 1.0, "completion_length": 10.0, "kl": 0.43096375465393066, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.1675, "step": 134 }, { "loss": 0.0164, "grad_norm": 0.042022716253995895, "learning_rate": 1.8333333333333333e-05, "num_tokens": 325084.0, "completions/mean_length": 10.0, "completions/min_length": 10.0, "completions/max_length": 10.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 10.0, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.5, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 1.0886000394821167, "rewards/env_reward/std": 0.0, "rewards/belief_accuracy/mean": 0.24467593431472778, "rewards/belief_accuracy/std": 0.0, "reward": 1.783276081085205, "reward_std": 0.0, "frac_reward_zero_std": 1.0, "completion_length": 10.0, "kl": 0.4099663197994232, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.16875, "step": 135 }, { "loss": 0.0156, "grad_norm": 4.926192760467529, "learning_rate": 1.8055555555555555e-05, "num_tokens": 327551.0, "completions/mean_length": 16.75, "completions/min_length": 10.0, "completions/max_length": 19.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 16.75, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 19.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.5, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.8907999992370605, "rewards/env_reward/std": 0.0, "rewards/belief_accuracy/mean": 0.47779929637908936, "rewards/belief_accuracy/std": 0.004498769994825125, "reward": 1.8185992240905762, "reward_std": 0.004498743452131748, "frac_reward_zero_std": 0.0, "completion_length": 19.0, "kl": 0.3908762261271477, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.17, "step": 136 }, { "loss": 0.0089, "grad_norm": 2.6473772525787354, "learning_rate": 1.777777777777778e-05, "num_tokens": 330027.0, "completions/mean_length": 19.0, "completions/min_length": 19.0, "completions/max_length": 19.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 19.0, "completions/min_terminated_length": 19.0, "completions/max_terminated_length": 19.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.5, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 1.1360000371932983, "rewards/env_reward/std": 0.0, "rewards/belief_accuracy/mean": 0.47632455825805664, "rewards/belief_accuracy/std": 0.014299660921096802, "reward": 2.0623245239257812, "reward_std": 0.014299631118774414, "frac_reward_zero_std": 0.0, "completion_length": 19.0, "kl": 0.22215565294027328, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.17125, "step": 137 }, { "loss": 0.0209, "grad_norm": 2.4564273357391357, "learning_rate": 1.75e-05, "num_tokens": 332275.0, "completions/mean_length": 10.0, "completions/min_length": 10.0, "completions/max_length": 10.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 10.0, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.5, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": -0.01940000057220459, "rewards/env_reward/std": 0.0, "rewards/belief_accuracy/mean": 0.4006469249725342, "rewards/belief_accuracy/std": 0.04355309531092644, "reward": 0.8312469720840454, "reward_std": 0.04355309158563614, "frac_reward_zero_std": 0.0, "completion_length": 10.0, "kl": 0.5230707004666328, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.1725, "step": 138 }, { "loss": 0.0045, "grad_norm": 3.3987679481506348, "learning_rate": 1.7222222222222224e-05, "num_tokens": 334751.0, "completions/mean_length": 19.0, "completions/min_length": 19.0, "completions/max_length": 19.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 19.0, "completions/min_terminated_length": 19.0, "completions/max_terminated_length": 19.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.5, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": -0.12020000070333481, "rewards/env_reward/std": 0.0, "rewards/belief_accuracy/mean": 0.3405015766620636, "rewards/belief_accuracy/std": 0.18090307712554932, "reward": 0.670301616191864, "reward_std": 0.1809031218290329, "frac_reward_zero_std": 0.0, "completion_length": 19.0, "kl": 0.11173266544938087, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.17375, "step": 139 }, { "loss": 0.0128, "grad_norm": 2.6056313514709473, "learning_rate": 1.6944444444444446e-05, "num_tokens": 337191.0, "completions/mean_length": 10.0, "completions/min_length": 10.0, "completions/max_length": 10.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 10.0, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.5, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.7328000068664551, "rewards/env_reward/std": 0.0, "rewards/belief_accuracy/mean": 0.2904151976108551, "rewards/belief_accuracy/std": 0.018518522381782532, "reward": 1.473215103149414, "reward_std": 0.018518567085266113, "frac_reward_zero_std": 0.0, "completion_length": 10.0, "kl": 0.3194083571434021, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.175, "step": 140 }, { "loss": 0.0157, "grad_norm": 0.011740055866539478, "learning_rate": 1.6666666666666667e-05, "num_tokens": 339631.0, "completions/mean_length": 10.0, "completions/min_length": 10.0, "completions/max_length": 10.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 10.0, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.5, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.04919999837875366, "rewards/env_reward/std": 0.0, "rewards/belief_accuracy/mean": 0.42342302203178406, "rewards/belief_accuracy/std": 0.0, "reward": 0.9226230382919312, "reward_std": 0.0, "frac_reward_zero_std": 1.0, "completion_length": 10.0, "kl": 0.3923957645893097, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.17625, "step": 141 }, { "loss": 0.0166, "grad_norm": 2.64264178276062, "learning_rate": 1.638888888888889e-05, "num_tokens": 342071.0, "completions/mean_length": 10.0, "completions/min_length": 10.0, "completions/max_length": 10.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 10.0, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.5, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.09309999644756317, "rewards/env_reward/std": 0.0, "rewards/belief_accuracy/mean": 0.380022794008255, "rewards/belief_accuracy/std": 0.018518507480621338, "reward": 0.9231228828430176, "reward_std": 0.018518507480621338, "frac_reward_zero_std": 0.0, "completion_length": 10.0, "kl": 0.41386187076568604, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.1775, "step": 142 }, { "loss": 0.0127, "grad_norm": 1.626895785331726, "learning_rate": 1.6111111111111115e-05, "num_tokens": 344511.0, "completions/mean_length": 10.0, "completions/min_length": 10.0, "completions/max_length": 10.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 10.0, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.5, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.4514000117778778, "rewards/env_reward/std": 0.0, "rewards/belief_accuracy/mean": 0.4132220447063446, "rewards/belief_accuracy/std": 0.018518522381782532, "reward": 1.314621925354004, "reward_std": 0.01851852796971798, "frac_reward_zero_std": 0.0, "completion_length": 10.0, "kl": 0.317696675658226, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.17875, "step": 143 }, { "loss": 0.0041, "grad_norm": 3.746852159500122, "learning_rate": 1.5833333333333333e-05, "num_tokens": 346989.0, "completions/mean_length": 19.5, "completions/min_length": 19.0, "completions/max_length": 20.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 19.5, "completions/min_terminated_length": 19.0, "completions/max_terminated_length": 20.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.5, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.33124998211860657, "rewards/env_reward/std": 0.23365363478660583, "rewards/belief_accuracy/mean": 0.4730015695095062, "rewards/belief_accuracy/std": 0.04833333194255829, "reward": 1.2542515993118286, "reward_std": 0.20949891209602356, "frac_reward_zero_std": 0.0, "completion_length": 20.0, "kl": 0.10339224711060524, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.18, "step": 144 }, { "loss": 0.0087, "grad_norm": 3.7674238681793213, "learning_rate": 1.5555555555555555e-05, "num_tokens": 349425.0, "completions/mean_length": 10.0, "completions/min_length": 10.0, "completions/max_length": 10.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 10.0, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.5, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": -0.04360000044107437, "rewards/env_reward/std": 0.0, "rewards/belief_accuracy/mean": 0.46380698680877686, "rewards/belief_accuracy/std": 0.03703702986240387, "reward": 0.8702070116996765, "reward_std": 0.037037014961242676, "frac_reward_zero_std": 0.0, "completion_length": 10.0, "kl": 0.21675339713692665, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.18125, "step": 145 }, { "loss": 0.0203, "grad_norm": 0.0504443384706974, "learning_rate": 1.527777777777778e-05, "num_tokens": 351357.0, "completions/mean_length": 10.0, "completions/min_length": 10.0, "completions/max_length": 10.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 10.0, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.5, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.15620000660419464, "rewards/env_reward/std": 0.0, "rewards/belief_accuracy/mean": 0.4255111813545227, "rewards/belief_accuracy/std": 0.0, "reward": 1.031711220741272, "reward_std": 0.0, "frac_reward_zero_std": 1.0, "completion_length": 10.0, "kl": 0.5085489749908447, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.1825, "step": 146 }, { "loss": 0.0193, "grad_norm": 0.02717801183462143, "learning_rate": 1.5e-05, "num_tokens": 353797.0, "completions/mean_length": 10.0, "completions/min_length": 10.0, "completions/max_length": 10.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 10.0, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.5, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.529699981212616, "rewards/env_reward/std": 0.0, "rewards/belief_accuracy/mean": 0.28115594387054443, "rewards/belief_accuracy/std": 0.0, "reward": 1.2608559131622314, "reward_std": 0.0, "frac_reward_zero_std": 1.0, "completion_length": 10.0, "kl": 0.48331472277641296, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.18375, "step": 147 }, { "loss": 0.016, "grad_norm": 8.648240089416504, "learning_rate": 1.4722222222222224e-05, "num_tokens": 356264.0, "completions/mean_length": 16.75, "completions/min_length": 13.0, "completions/max_length": 19.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 16.75, "completions/min_terminated_length": 13.0, "completions/max_terminated_length": 19.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.5, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 1.0004750490188599, "rewards/env_reward/std": 0.5624499917030334, "rewards/belief_accuracy/mean": 0.2962958812713623, "rewards/belief_accuracy/std": 0.19808298349380493, "reward": 1.746770977973938, "reward_std": 0.5813379287719727, "frac_reward_zero_std": 0.0, "completion_length": 19.0, "kl": 0.40032833255827427, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.185, "step": 148 }, { "loss": 0.0115, "grad_norm": 4.142242908477783, "learning_rate": 1.4444444444444444e-05, "num_tokens": 358736.0, "completions/mean_length": 18.0, "completions/min_length": 16.0, "completions/max_length": 19.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 18.0, "completions/min_terminated_length": 16.0, "completions/max_terminated_length": 19.0, "rewards/format_valid/mean": 0.25, "rewards/format_valid/std": 1.5, "rewards/action_legal/mean": 0.125, "rewards/action_legal/std": 0.75, "rewards/env_reward/mean": 0.012525022029876709, "rewards/env_reward/std": 2.008349895477295, "rewards/belief_accuracy/mean": 0.31194350123405457, "rewards/belief_accuracy/std": 0.34197041392326355, "reward": 0.43696850538253784, "reward_std": 3.024721622467041, "frac_reward_zero_std": 0.0, "completion_length": 19.0, "kl": 0.28697329107671976, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.18625, "step": 149 }, { "loss": 0.0224, "grad_norm": 4.6960530281066895, "learning_rate": 1.4166666666666668e-05, "num_tokens": 361176.0, "completions/mean_length": 10.0, "completions/min_length": 10.0, "completions/max_length": 10.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 10.0, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.5, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.4521999955177307, "rewards/env_reward/std": 0.0, "rewards/belief_accuracy/mean": 0.4280441105365753, "rewards/belief_accuracy/std": 0.011125624179840088, "reward": 1.3302440643310547, "reward_std": 0.011125604622066021, "frac_reward_zero_std": 0.0, "completion_length": 10.0, "kl": 0.5598425641655922, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.1875, "step": 150 }, { "loss": 0.0129, "grad_norm": 0.022711308673024178, "learning_rate": 1.388888888888889e-05, "num_tokens": 363616.0, "completions/mean_length": 10.0, "completions/min_length": 10.0, "completions/max_length": 10.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 10.0, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.5, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.725600004196167, "rewards/env_reward/std": 0.0, "rewards/belief_accuracy/mean": 0.28115594387054443, "rewards/belief_accuracy/std": 0.0, "reward": 1.4567558765411377, "reward_std": 0.0, "frac_reward_zero_std": 1.0, "completion_length": 10.0, "kl": 0.3234124183654785, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.18875, "step": 151 }, { "loss": 0.0334, "grad_norm": 87.73027038574219, "learning_rate": 1.3611111111111111e-05, "num_tokens": 366048.0, "completions/mean_length": 8.0, "completions/min_length": 6.0, "completions/max_length": 10.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 8.0, "completions/min_terminated_length": 6.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": -0.5, "rewards/format_valid/std": 1.7320507764816284, "rewards/action_legal/mean": -0.25, "rewards/action_legal/std": 0.8660253882408142, "rewards/env_reward/mean": -1.32260000705719, "rewards/env_reward/std": 1.9368946552276611, "rewards/belief_accuracy/mean": 0.044990174472332, "rewards/belief_accuracy/std": 0.28329408168792725, "reward": -1.5026097297668457, "reward_std": 2.999246120452881, "frac_reward_zero_std": 0.0, "completion_length": 10.0, "kl": 0.8339565396308899, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.19, "step": 152 }, { "loss": 0.0187, "grad_norm": 2.2898941040039062, "learning_rate": 1.3333333333333333e-05, "num_tokens": 368488.0, "completions/mean_length": 10.0, "completions/min_length": 10.0, "completions/max_length": 10.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 10.0, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.5, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.06650000065565109, "rewards/env_reward/std": 0.0, "rewards/belief_accuracy/mean": 0.4141637682914734, "rewards/belief_accuracy/std": 0.018518522381782532, "reward": 0.9306638240814209, "reward_std": 0.018518507480621338, "frac_reward_zero_std": 0.0, "completion_length": 10.0, "kl": 0.4663423076272011, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.19125, "step": 153 }, { "loss": 0.0166, "grad_norm": 0.02326902747154236, "learning_rate": 1.3055555555555557e-05, "num_tokens": 370420.0, "completions/mean_length": 10.0, "completions/min_length": 10.0, "completions/max_length": 10.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 10.0, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.5, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.6473000049591064, "rewards/env_reward/std": 0.0, "rewards/belief_accuracy/mean": 0.43159350752830505, "rewards/belief_accuracy/std": 0.0, "reward": 1.5288934707641602, "reward_std": 0.0, "frac_reward_zero_std": 1.0, "completion_length": 10.0, "kl": 0.4144258499145508, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.1925, "step": 154 }, { "loss": 0.0167, "grad_norm": 35.10660171508789, "learning_rate": 1.2777777777777777e-05, "num_tokens": 372880.0, "completions/mean_length": 15.0, "completions/min_length": 11.0, "completions/max_length": 19.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 15.0, "completions/min_terminated_length": 11.0, "completions/max_terminated_length": 19.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.5, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.9100000262260437, "rewards/env_reward/std": 0.664068341255188, "rewards/belief_accuracy/mean": 0.2437037080526352, "rewards/belief_accuracy/std": 0.29311609268188477, "reward": 1.6037037372589111, "reward_std": 0.9571843147277832, "frac_reward_zero_std": 0.0, "completion_length": 19.0, "kl": 0.4181790351867676, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.19375, "step": 155 }, { "loss": 0.0024, "grad_norm": 0.00849384069442749, "learning_rate": 1.25e-05, "num_tokens": 375356.0, "completions/mean_length": 19.0, "completions/min_length": 19.0, "completions/max_length": 19.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 19.0, "completions/min_terminated_length": 19.0, "completions/max_terminated_length": 19.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.5, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.7039999961853027, "rewards/env_reward/std": 0.0, "rewards/belief_accuracy/mean": 0.49777403473854065, "rewards/belief_accuracy/std": 0.0, "reward": 1.6517739295959473, "reward_std": 0.0, "frac_reward_zero_std": 1.0, "completion_length": 19.0, "kl": 0.05927176773548126, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.195, "step": 156 }, { "loss": 0.0154, "grad_norm": 0.01303598191589117, "learning_rate": 1.2222222222222222e-05, "num_tokens": 377796.0, "completions/mean_length": 10.0, "completions/min_length": 10.0, "completions/max_length": 10.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 10.0, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.5, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.47749999165534973, "rewards/env_reward/std": 0.0, "rewards/belief_accuracy/mean": 0.43159350752830505, "rewards/belief_accuracy/std": 0.0, "reward": 1.3590935468673706, "reward_std": 0.0, "frac_reward_zero_std": 1.0, "completion_length": 10.0, "kl": 0.38526836037635803, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.19625, "step": 157 }, { "loss": 0.0168, "grad_norm": 0.026443013921380043, "learning_rate": 1.1944444444444446e-05, "num_tokens": 380236.0, "completions/mean_length": 10.0, "completions/min_length": 10.0, "completions/max_length": 10.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 10.0, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.5, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.04520000144839287, "rewards/env_reward/std": 0.0, "rewards/belief_accuracy/mean": 0.4255111813545227, "rewards/belief_accuracy/std": 0.0, "reward": 0.9207112193107605, "reward_std": 0.0, "frac_reward_zero_std": 1.0, "completion_length": 10.0, "kl": 0.4196889400482178, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.1975, "step": 158 }, { "loss": 0.0162, "grad_norm": 2.1190378665924072, "learning_rate": 1.1666666666666668e-05, "num_tokens": 382676.0, "completions/mean_length": 10.0, "completions/min_length": 10.0, "completions/max_length": 10.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 10.0, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.5, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.6687999963760376, "rewards/env_reward/std": 0.0, "rewards/belief_accuracy/mean": 0.2904151976108551, "rewards/belief_accuracy/std": 0.018518522381782532, "reward": 1.4092152118682861, "reward_std": 0.018518486991524696, "frac_reward_zero_std": 0.0, "completion_length": 10.0, "kl": 0.40614624321460724, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.19875, "step": 159 }, { "loss": 0.0065, "grad_norm": 0.06362934410572052, "learning_rate": 1.138888888888889e-05, "num_tokens": 385152.0, "completions/mean_length": 19.0, "completions/min_length": 19.0, "completions/max_length": 19.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 19.0, "completions/min_terminated_length": 19.0, "completions/max_terminated_length": 19.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.5, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.6581000089645386, "rewards/env_reward/std": 0.0, "rewards/belief_accuracy/mean": 0.4977153241634369, "rewards/belief_accuracy/std": 0.0, "reward": 1.6058154106140137, "reward_std": 0.0, "frac_reward_zero_std": 1.0, "completion_length": 19.0, "kl": 0.16141445934772491, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.2, "step": 160 }, { "loss": 0.0133, "grad_norm": 0.02474510669708252, "learning_rate": 1.1111111111111112e-05, "num_tokens": 387592.0, "completions/mean_length": 10.0, "completions/min_length": 10.0, "completions/max_length": 10.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 10.0, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.5, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.03180000185966492, "rewards/env_reward/std": 0.0, "rewards/belief_accuracy/mean": 0.43159350752830505, "rewards/belief_accuracy/std": 0.0, "reward": 0.913393497467041, "reward_std": 0.0, "frac_reward_zero_std": 1.0, "completion_length": 10.0, "kl": 0.33240804076194763, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.20125, "step": 161 }, { "loss": 0.0145, "grad_norm": 0.015563507564365864, "learning_rate": 1.0833333333333334e-05, "num_tokens": 390032.0, "completions/mean_length": 10.0, "completions/min_length": 10.0, "completions/max_length": 10.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 10.0, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.5, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.5896000266075134, "rewards/env_reward/std": 0.0, "rewards/belief_accuracy/mean": 0.25078916549682617, "rewards/belief_accuracy/std": 0.0, "reward": 1.2903891801834106, "reward_std": 0.0, "frac_reward_zero_std": 1.0, "completion_length": 10.0, "kl": 0.36354348063468933, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.2025, "step": 162 }, { "loss": 0.0145, "grad_norm": 2.4728994369506836, "learning_rate": 1.0555555555555555e-05, "num_tokens": 392472.0, "completions/mean_length": 10.0, "completions/min_length": 10.0, "completions/max_length": 10.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 10.0, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.5, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.39559999108314514, "rewards/env_reward/std": 0.0, "rewards/belief_accuracy/mean": 0.4132220447063446, "rewards/belief_accuracy/std": 0.018518522381782532, "reward": 1.258821964263916, "reward_std": 0.01851852796971798, "frac_reward_zero_std": 0.0, "completion_length": 10.0, "kl": 0.3627878651022911, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.20375, "step": 163 }, { "loss": 0.0114, "grad_norm": 4.23980188369751, "learning_rate": 1.0277777777777777e-05, "num_tokens": 394912.0, "completions/mean_length": 10.0, "completions/min_length": 10.0, "completions/max_length": 10.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 10.0, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.5, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.14090000092983246, "rewards/env_reward/std": 0.0, "rewards/belief_accuracy/mean": 0.41977328062057495, "rewards/belief_accuracy/std": 0.011475772596895695, "reward": 1.0106734037399292, "reward_std": 0.011475821025669575, "frac_reward_zero_std": 0.0, "completion_length": 10.0, "kl": 0.2861982248723507, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.205, "step": 164 }, { "loss": 0.0158, "grad_norm": 0.021868258714675903, "learning_rate": 1e-05, "num_tokens": 397352.0, "completions/mean_length": 10.0, "completions/min_length": 10.0, "completions/max_length": 10.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 10.0, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.5, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.16369999945163727, "rewards/env_reward/std": 0.0, "rewards/belief_accuracy/mean": 0.3023443818092346, "rewards/belief_accuracy/std": 0.0, "reward": 0.9160443544387817, "reward_std": 0.0, "frac_reward_zero_std": 1.0, "completion_length": 10.0, "kl": 0.39394283294677734, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.20625, "step": 165 }, { "loss": 0.0253, "grad_norm": 3.604858636856079, "learning_rate": 9.722222222222223e-06, "num_tokens": 399080.0, "completions/mean_length": 10.0, "completions/min_length": 10.0, "completions/max_length": 10.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 10.0, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.5, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.1818999946117401, "rewards/env_reward/std": 0.0, "rewards/belief_accuracy/mean": 0.35074275732040405, "rewards/belief_accuracy/std": 0.034780099987983704, "reward": 0.9826427698135376, "reward_std": 0.03478008508682251, "frac_reward_zero_std": 0.0, "completion_length": 10.0, "kl": 0.6312557309865952, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.2075, "step": 166 }, { "loss": 0.0496, "grad_norm": 5.561725616455078, "learning_rate": 9.444444444444445e-06, "num_tokens": 400808.0, "completions/mean_length": 10.0, "completions/min_length": 10.0, "completions/max_length": 10.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 10.0, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.5, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.2611500024795532, "rewards/env_reward/std": 0.013105858117341995, "rewards/belief_accuracy/mean": 0.3198787569999695, "rewards/belief_accuracy/std": 0.03024062141776085, "reward": 1.0310287475585938, "reward_std": 0.03295842558145523, "frac_reward_zero_std": 0.0, "completion_length": 10.0, "kl": 1.2403500601649284, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.20875, "step": 167 }, { "loss": 0.0123, "grad_norm": 5.700604438781738, "learning_rate": 9.166666666666666e-06, "num_tokens": 403281.0, "completions/mean_length": 18.25, "completions/min_length": 16.0, "completions/max_length": 19.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 18.25, "completions/min_terminated_length": 16.0, "completions/max_terminated_length": 19.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.5, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.125900000333786, "rewards/env_reward/std": 0.0, "rewards/belief_accuracy/mean": 0.4922264814376831, "rewards/belief_accuracy/std": 0.008333340287208557, "reward": 1.0681264400482178, "reward_std": 0.008333325386047363, "frac_reward_zero_std": 0.0, "completion_length": 19.0, "kl": 0.3083411678671837, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.21, "step": 168 }, { "loss": 0.0181, "grad_norm": 0.016459409147500992, "learning_rate": 8.88888888888889e-06, "num_tokens": 405721.0, "completions/mean_length": 10.0, "completions/min_length": 10.0, "completions/max_length": 10.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 10.0, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.5, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.5353000164031982, "rewards/env_reward/std": 0.0, "rewards/belief_accuracy/mean": 0.4364381432533264, "rewards/belief_accuracy/std": 0.0, "reward": 1.4217381477355957, "reward_std": 0.0, "frac_reward_zero_std": 1.0, "completion_length": 10.0, "kl": 0.4519377648830414, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.21125, "step": 169 }, { "loss": 0.018, "grad_norm": 6.101553916931152, "learning_rate": 8.611111111111112e-06, "num_tokens": 408199.0, "completions/mean_length": 19.5, "completions/min_length": 19.0, "completions/max_length": 20.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 19.5, "completions/min_terminated_length": 19.0, "completions/max_terminated_length": 20.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.5, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.484250009059906, "rewards/env_reward/std": 0.46667224168777466, "rewards/belief_accuracy/mean": 0.4664399027824402, "rewards/belief_accuracy/std": 0.03808803856372833, "reward": 1.400689959526062, "reward_std": 0.5047602653503418, "frac_reward_zero_std": 0.0, "completion_length": 20.0, "kl": 0.4495365619659424, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.2125, "step": 170 }, { "loss": 0.0143, "grad_norm": 0.00956246443092823, "learning_rate": 8.333333333333334e-06, "num_tokens": 410639.0, "completions/mean_length": 10.0, "completions/min_length": 10.0, "completions/max_length": 10.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 10.0, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.5, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.19290000200271606, "rewards/env_reward/std": 0.0, "rewards/belief_accuracy/mean": 0.4364381432533264, "rewards/belief_accuracy/std": 0.0, "reward": 1.0793381929397583, "reward_std": 0.0, "frac_reward_zero_std": 1.0, "completion_length": 10.0, "kl": 0.3575763404369354, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.21375, "step": 171 }, { "loss": 0.0181, "grad_norm": 3.3567819595336914, "learning_rate": 8.055555555555557e-06, "num_tokens": 413079.0, "completions/mean_length": 10.0, "completions/min_length": 10.0, "completions/max_length": 10.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 10.0, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.5, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.7131999731063843, "rewards/env_reward/std": 0.0, "rewards/belief_accuracy/mean": 0.2415299117565155, "rewards/belief_accuracy/std": 0.018518514931201935, "reward": 1.4047298431396484, "reward_std": 0.01851852796971798, "frac_reward_zero_std": 0.0, "completion_length": 10.0, "kl": 0.45299722999334335, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.215, "step": 172 }, { "loss": 0.0133, "grad_norm": 7.245166778564453, "learning_rate": 7.777777777777777e-06, "num_tokens": 415530.0, "completions/mean_length": 13.75, "completions/min_length": 10.0, "completions/max_length": 19.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 13.75, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 19.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.5, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": -0.1923999935388565, "rewards/env_reward/std": 0.0, "rewards/belief_accuracy/mean": 0.45499998331069946, "rewards/belief_accuracy/std": 0.04879846051335335, "reward": 0.7125999927520752, "reward_std": 0.04879846051335335, "frac_reward_zero_std": 0.0, "completion_length": 19.0, "kl": 0.3327266201376915, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.21625, "step": 173 }, { "loss": 0.0152, "grad_norm": 0.04452874884009361, "learning_rate": 7.5e-06, "num_tokens": 417970.0, "completions/mean_length": 10.0, "completions/min_length": 10.0, "completions/max_length": 10.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 10.0, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.5, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.1501999944448471, "rewards/env_reward/std": 0.0, "rewards/belief_accuracy/mean": 0.3198787569999695, "rewards/belief_accuracy/std": 0.0, "reward": 0.9200787544250488, "reward_std": 0.0, "frac_reward_zero_std": 1.0, "completion_length": 10.0, "kl": 0.37988075613975525, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.2175, "step": 174 }, { "loss": 0.0176, "grad_norm": 4.498856544494629, "learning_rate": 7.222222222222222e-06, "num_tokens": 420034.0, "completions/mean_length": 10.0, "completions/min_length": 10.0, "completions/max_length": 10.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 10.0, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.5, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.38989999890327454, "rewards/env_reward/std": 0.0, "rewards/belief_accuracy/mean": 0.4527832865715027, "rewards/belief_accuracy/std": 0.012822061777114868, "reward": 1.2926833629608154, "reward_std": 0.01282203197479248, "frac_reward_zero_std": 0.0, "completion_length": 10.0, "kl": 0.4394312873482704, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.21875, "step": 175 }, { "loss": 0.0156, "grad_norm": 0.023712527006864548, "learning_rate": 6.944444444444445e-06, "num_tokens": 422474.0, "completions/mean_length": 10.0, "completions/min_length": 10.0, "completions/max_length": 10.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 10.0, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.5, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.6894000172615051, "rewards/env_reward/std": 0.0, "rewards/belief_accuracy/mean": 0.28115594387054443, "rewards/belief_accuracy/std": 0.0, "reward": 1.4205559492111206, "reward_std": 0.0, "frac_reward_zero_std": 1.0, "completion_length": 10.0, "kl": 0.3908577561378479, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.22, "step": 176 }, { "loss": 0.0121, "grad_norm": 1.7105811834335327, "learning_rate": 6.666666666666667e-06, "num_tokens": 424914.0, "completions/mean_length": 10.0, "completions/min_length": 10.0, "completions/max_length": 10.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 10.0, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.5, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.16120000183582306, "rewards/env_reward/std": 0.0, "rewards/belief_accuracy/mean": 0.35887354612350464, "rewards/belief_accuracy/std": 0.018518522381782532, "reward": 0.9700735807418823, "reward_std": 0.018518507480621338, "frac_reward_zero_std": 0.0, "completion_length": 10.0, "kl": 0.3014954552054405, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.22125, "step": 177 }, { "loss": 0.0044, "grad_norm": 0.1105613261461258, "learning_rate": 6.3888888888888885e-06, "num_tokens": 427390.0, "completions/mean_length": 19.0, "completions/min_length": 19.0, "completions/max_length": 19.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 19.0, "completions/min_terminated_length": 19.0, "completions/max_terminated_length": 19.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.5, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 1.885699987411499, "rewards/env_reward/std": 0.0, "rewards/belief_accuracy/mean": 0.49926915764808655, "rewards/belief_accuracy/std": 0.0, "reward": 2.8349690437316895, "reward_std": 0.0, "frac_reward_zero_std": 1.0, "completion_length": 19.0, "kl": 0.10958900302648544, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.2225, "step": 178 }, { "loss": 0.019, "grad_norm": 2.3042142391204834, "learning_rate": 6.111111111111111e-06, "num_tokens": 429830.0, "completions/mean_length": 10.0, "completions/min_length": 10.0, "completions/max_length": 10.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 10.0, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.5, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.45100000500679016, "rewards/env_reward/std": 0.0, "rewards/belief_accuracy/mean": 0.4132220447063446, "rewards/belief_accuracy/std": 0.018518522381782532, "reward": 1.3142220973968506, "reward_std": 0.018518486991524696, "frac_reward_zero_std": 0.0, "completion_length": 10.0, "kl": 0.4740464314818382, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.22375, "step": 179 }, { "loss": 0.0875, "grad_norm": 6.963607311248779, "learning_rate": 5.833333333333334e-06, "num_tokens": 432269.0, "completions/mean_length": 9.75, "completions/min_length": 9.0, "completions/max_length": 10.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 9.75, "completions/min_terminated_length": 9.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.5, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.6017500162124634, "rewards/env_reward/std": 0.2969000041484833, "rewards/belief_accuracy/mean": 0.16290634870529175, "rewards/belief_accuracy/std": 0.0, "reward": 1.2146563529968262, "reward_std": 0.2969000041484833, "frac_reward_zero_std": 0.0, "completion_length": 10.0, "kl": 2.186647579073906, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.225, "step": 180 }, { "loss": 0.0161, "grad_norm": 0.004897190723568201, "learning_rate": 5.555555555555556e-06, "num_tokens": 434709.0, "completions/mean_length": 10.0, "completions/min_length": 10.0, "completions/max_length": 10.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 10.0, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.5, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.567300021648407, "rewards/env_reward/std": 0.0, "rewards/belief_accuracy/mean": 0.30849888920783997, "rewards/belief_accuracy/std": 0.0, "reward": 1.3257989883422852, "reward_std": 0.0, "frac_reward_zero_std": 1.0, "completion_length": 10.0, "kl": 0.4025897979736328, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.22625, "step": 181 }, { "loss": 0.0126, "grad_norm": 0.025149773806333542, "learning_rate": 5.277777777777778e-06, "num_tokens": 437149.0, "completions/mean_length": 10.0, "completions/min_length": 10.0, "completions/max_length": 10.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 10.0, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.5, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.34290000796318054, "rewards/env_reward/std": 0.0, "rewards/belief_accuracy/mean": 0.3198787569999695, "rewards/belief_accuracy/std": 0.0, "reward": 1.1127787828445435, "reward_std": 0.0, "frac_reward_zero_std": 1.0, "completion_length": 10.0, "kl": 0.3142901659011841, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.2275, "step": 182 }, { "loss": 0.0146, "grad_norm": 3.714738130569458, "learning_rate": 5e-06, "num_tokens": 439009.0, "completions/mean_length": 10.0, "completions/min_length": 10.0, "completions/max_length": 10.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 10.0, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.5, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.3617999851703644, "rewards/env_reward/std": 0.6311999559402466, "rewards/belief_accuracy/mean": 0.4503726065158844, "rewards/belief_accuracy/std": 0.018518522381782532, "reward": 1.2621725797653198, "reward_std": 0.6376118659973145, "frac_reward_zero_std": 0.0, "completion_length": 10.0, "kl": 0.3641320765018463, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.22875, "step": 183 }, { "loss": 0.0145, "grad_norm": 0.011211546137928963, "learning_rate": 4.722222222222222e-06, "num_tokens": 441449.0, "completions/mean_length": 10.0, "completions/min_length": 10.0, "completions/max_length": 10.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 10.0, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.5, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.41929998993873596, "rewards/env_reward/std": 0.0, "rewards/belief_accuracy/mean": 0.3073524236679077, "rewards/belief_accuracy/std": 0.0, "reward": 1.176652431488037, "reward_std": 0.0, "frac_reward_zero_std": 1.0, "completion_length": 10.0, "kl": 0.3627232015132904, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.23, "step": 184 }, { "loss": 0.0126, "grad_norm": 0.02891082689166069, "learning_rate": 4.444444444444445e-06, "num_tokens": 443749.0, "completions/mean_length": 10.0, "completions/min_length": 10.0, "completions/max_length": 10.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 10.0, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.5, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.4074999988079071, "rewards/env_reward/std": 0.0, "rewards/belief_accuracy/mean": 0.30849888920783997, "rewards/belief_accuracy/std": 0.0, "reward": 1.165998935699463, "reward_std": 0.0, "frac_reward_zero_std": 1.0, "completion_length": 10.0, "kl": 0.314136266708374, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.23125, "step": 185 }, { "loss": 0.0141, "grad_norm": 0.006373639218509197, "learning_rate": 4.166666666666667e-06, "num_tokens": 446189.0, "completions/mean_length": 10.0, "completions/min_length": 10.0, "completions/max_length": 10.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 10.0, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.5, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.39590001106262207, "rewards/env_reward/std": 0.0, "rewards/belief_accuracy/mean": 0.30849888920783997, "rewards/belief_accuracy/std": 0.0, "reward": 1.1543989181518555, "reward_std": 0.0, "frac_reward_zero_std": 1.0, "completion_length": 10.0, "kl": 0.3533334732055664, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.2325, "step": 186 }, { "loss": 0.0144, "grad_norm": 0.004680258687585592, "learning_rate": 3.888888888888889e-06, "num_tokens": 448629.0, "completions/mean_length": 10.0, "completions/min_length": 10.0, "completions/max_length": 10.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 10.0, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.5, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.05490000173449516, "rewards/env_reward/std": 0.0, "rewards/belief_accuracy/mean": 0.4364381432533264, "rewards/belief_accuracy/std": 0.0, "reward": 0.9413381814956665, "reward_std": 0.0, "frac_reward_zero_std": 1.0, "completion_length": 10.0, "kl": 0.36052942276000977, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.23375, "step": 187 }, { "loss": 0.0313, "grad_norm": 10.884200096130371, "learning_rate": 3.611111111111111e-06, "num_tokens": 451066.0, "completions/mean_length": 10.25, "completions/min_length": 10.0, "completions/max_length": 11.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 10.25, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 11.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.5, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.28040000796318054, "rewards/env_reward/std": 0.0, "rewards/belief_accuracy/mean": 0.285047709941864, "rewards/belief_accuracy/std": 0.15778866410255432, "reward": 1.0154476165771484, "reward_std": 0.15778863430023193, "frac_reward_zero_std": 0.0, "completion_length": 11.0, "kl": 0.7835219278931618, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.235, "step": 188 }, { "loss": 0.0168, "grad_norm": 7.003281593322754, "learning_rate": 3.3333333333333333e-06, "num_tokens": 453506.0, "completions/mean_length": 10.0, "completions/min_length": 10.0, "completions/max_length": 10.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 10.0, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.5, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 1.007699966430664, "rewards/env_reward/std": 0.0, "rewards/belief_accuracy/mean": 0.3920290470123291, "rewards/belief_accuracy/std": 0.03703702986240387, "reward": 1.8497289419174194, "reward_std": 0.03703709691762924, "frac_reward_zero_std": 0.0, "completion_length": 10.0, "kl": 0.4188368022441864, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.23625, "step": 189 }, { "loss": 0.0736, "grad_norm": 10.276236534118652, "learning_rate": 3.0555555555555556e-06, "num_tokens": 455984.0, "completions/mean_length": 19.5, "completions/min_length": 19.0, "completions/max_length": 21.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 19.5, "completions/min_terminated_length": 19.0, "completions/max_terminated_length": 21.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.5, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": -0.7950749397277832, "rewards/env_reward/std": 0.3755500018596649, "rewards/belief_accuracy/mean": 0.4963931441307068, "rewards/belief_accuracy/std": 0.0, "reward": 0.15131822228431702, "reward_std": 0.37554997205734253, "frac_reward_zero_std": 0.0, "completion_length": 21.0, "kl": 1.8405264019966125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.2375, "step": 190 }, { "loss": 0.0141, "grad_norm": 4.917500972747803, "learning_rate": 2.777777777777778e-06, "num_tokens": 458284.0, "completions/mean_length": 10.0, "completions/min_length": 10.0, "completions/max_length": 10.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 10.0, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.5, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.5060999989509583, "rewards/env_reward/std": 0.0, "rewards/belief_accuracy/mean": 0.3383972644805908, "rewards/belief_accuracy/std": 0.03703702986240387, "reward": 1.2944972515106201, "reward_std": 0.037037014961242676, "frac_reward_zero_std": 0.0, "completion_length": 10.0, "kl": 0.35147424787282944, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.23875, "step": 191 }, { "loss": 0.0237, "grad_norm": 12.188007354736328, "learning_rate": 2.5e-06, "num_tokens": 460395.0, "completions/mean_length": 9.75, "completions/min_length": 9.0, "completions/max_length": 10.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 9.75, "completions/min_terminated_length": 9.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.5, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.5211250185966492, "rewards/env_reward/std": 0.019750013947486877, "rewards/belief_accuracy/mean": 0.3198787569999695, "rewards/belief_accuracy/std": 0.0, "reward": 1.291003704071045, "reward_std": 0.019749999046325684, "frac_reward_zero_std": 0.0, "completion_length": 10.0, "kl": 0.5920824334025383, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.24, "step": 192 }, { "loss": 0.0165, "grad_norm": 0.019955696538090706, "learning_rate": 2.2222222222222225e-06, "num_tokens": 462835.0, "completions/mean_length": 10.0, "completions/min_length": 10.0, "completions/max_length": 10.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 10.0, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.5, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.04830000177025795, "rewards/env_reward/std": 0.0, "rewards/belief_accuracy/mean": 0.43159350752830505, "rewards/belief_accuracy/std": 0.0, "reward": 0.9298934936523438, "reward_std": 0.0, "frac_reward_zero_std": 1.0, "completion_length": 10.0, "kl": 0.41180703043937683, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.24125, "step": 193 }, { "loss": 0.0146, "grad_norm": 5.3022541999816895, "learning_rate": 1.9444444444444444e-06, "num_tokens": 465275.0, "completions/mean_length": 10.0, "completions/min_length": 10.0, "completions/max_length": 10.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 10.0, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.5, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": -0.07689999788999557, "rewards/env_reward/std": 0.0, "rewards/belief_accuracy/mean": 0.46380698680877686, "rewards/belief_accuracy/std": 0.02138333022594452, "reward": 0.8369070291519165, "reward_std": 0.02138333022594452, "frac_reward_zero_std": 0.0, "completion_length": 10.0, "kl": 0.3642955869436264, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.2425, "step": 194 }, { "loss": 0.0225, "grad_norm": 0.005025716498494148, "learning_rate": 1.6666666666666667e-06, "num_tokens": 467715.0, "completions/mean_length": 10.0, "completions/min_length": 10.0, "completions/max_length": 10.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 10.0, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.5, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.0786999985575676, "rewards/env_reward/std": 0.0, "rewards/belief_accuracy/mean": 0.42342302203178406, "rewards/belief_accuracy/std": 0.0, "reward": 0.9521230459213257, "reward_std": 0.0, "frac_reward_zero_std": 1.0, "completion_length": 10.0, "kl": 0.5623220801353455, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.24375, "step": 195 }, { "loss": 0.1786, "grad_norm": 21.385347366333008, "learning_rate": 1.388888888888889e-06, "num_tokens": 470156.0, "completions/mean_length": 10.25, "completions/min_length": 10.0, "completions/max_length": 11.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 10.25, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 11.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.5, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.5320000052452087, "rewards/env_reward/std": 0.0, "rewards/belief_accuracy/mean": 0.20086660981178284, "rewards/belief_accuracy/std": 0.0002678185701370239, "reward": 1.1828665733337402, "reward_std": 0.00026782354689203203, "frac_reward_zero_std": 0.0, "completion_length": 11.0, "kl": 4.466357246041298, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.245, "step": 196 }, { "loss": 0.0325, "grad_norm": 3.4839556217193604, "learning_rate": 1.1111111111111112e-06, "num_tokens": 472220.0, "completions/mean_length": 10.0, "completions/min_length": 10.0, "completions/max_length": 10.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 10.0, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.5, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": -0.07050000131130219, "rewards/env_reward/std": 0.0, "rewards/belief_accuracy/mean": 0.43227022886276245, "rewards/belief_accuracy/std": 0.07407407462596893, "reward": 0.8117702007293701, "reward_std": 0.07407406717538834, "frac_reward_zero_std": 0.0, "completion_length": 10.0, "kl": 0.8131380379199982, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.24625, "step": 197 }, { "loss": 0.0443, "grad_norm": 1.125584602355957, "learning_rate": 8.333333333333333e-07, "num_tokens": 474660.0, "completions/mean_length": 10.0, "completions/min_length": 10.0, "completions/max_length": 10.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 10.0, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.5, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.20149999856948853, "rewards/env_reward/std": 0.0, "rewards/belief_accuracy/mean": 0.2992396354675293, "rewards/belief_accuracy/std": 0.018518522381782532, "reward": 0.9507396221160889, "reward_std": 0.01851852796971798, "frac_reward_zero_std": 0.0, "completion_length": 10.0, "kl": 1.1076267883181572, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.2475, "step": 198 }, { "loss": 0.0145, "grad_norm": 0.013783140107989311, "learning_rate": 5.555555555555556e-07, "num_tokens": 477100.0, "completions/mean_length": 10.0, "completions/min_length": 10.0, "completions/max_length": 10.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 10.0, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.5, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.05009999871253967, "rewards/env_reward/std": 0.0, "rewards/belief_accuracy/mean": 0.3892820477485657, "rewards/belief_accuracy/std": 0.0, "reward": 0.8893821239471436, "reward_std": 0.0, "frac_reward_zero_std": 1.0, "completion_length": 10.0, "kl": 0.36227384209632874, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.24875, "step": 199 }, { "loss": 0.0208, "grad_norm": 0.012795673683285713, "learning_rate": 2.777777777777778e-07, "num_tokens": 479540.0, "completions/mean_length": 10.0, "completions/min_length": 10.0, "completions/max_length": 10.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 10.0, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.5, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.17980000376701355, "rewards/env_reward/std": 0.0, "rewards/belief_accuracy/mean": 0.3023443818092346, "rewards/belief_accuracy/std": 0.0, "reward": 0.9321444034576416, "reward_std": 0.0, "frac_reward_zero_std": 1.0, "completion_length": 10.0, "kl": 0.5199949741363525, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.25, "step": 200 }, { "train_runtime": 774.5612, "train_samples_per_second": 1.033, "train_steps_per_second": 0.258, "total_flos": 0.0, "train_loss": 0.024849860128597356, "epoch": 0.25, "step": 200 } ]