Files
rhythm-env-meta-trained-iter1/checkpoint-200/trainer_state.json
ModelHub XC 02f436bc80 初始化项目,由ModelHub XC社区提供模型
Model: InosLihka/rhythm-env-meta-trained-iter1
Source: Original Platform
2026-05-16 23:06:03 +08:00

6635 lines
236 KiB
JSON

{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.25,
"eval_steps": 500,
"global_step": 200,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completion_length": 20.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 20.0,
"completions/max_terminated_length": 20.0,
"completions/mean_length": 12.75,
"completions/mean_terminated_length": 12.75,
"completions/min_length": 9.0,
"completions/min_terminated_length": 9.0,
"epoch": 0.00125,
"frac_reward_zero_std": 0.0,
"grad_norm": 50.69855880737305,
"kl": 0.036144825629889965,
"learning_rate": 0.0,
"loss": 0.0014,
"num_tokens": 2451.0,
"reward": 0.7203128337860107,
"reward_std": 0.14423373341560364,
"rewards/action_legal/mean": 0.5,
"rewards/action_legal/std": 0.0,
"rewards/belief_accuracy/mean": 0.24216286838054657,
"rewards/belief_accuracy/std": 0.1988486498594284,
"rewards/env_reward/mean": 0.028150001540780067,
"rewards/env_reward/std": 0.07374775409698486,
"rewards/format_valid/mean": 1.0,
"rewards/format_valid/std": 0.0,
"step": 1
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completion_length": 10.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 10.0,
"completions/max_terminated_length": 10.0,
"completions/mean_length": 10.0,
"completions/mean_terminated_length": 10.0,
"completions/min_length": 10.0,
"completions/min_terminated_length": 10.0,
"epoch": 0.0025,
"frac_reward_zero_std": 0.0,
"grad_norm": 4.570699691772461,
"kl": 0.0023617089027538896,
"learning_rate": 2.5e-06,
"loss": 0.0001,
"num_tokens": 4891.0,
"reward": 0.2766309082508087,
"reward_std": 0.5998044610023499,
"rewards/action_legal/mean": 0.5,
"rewards/action_legal/std": 0.0,
"rewards/belief_accuracy/mean": 0.10680588334798813,
"rewards/belief_accuracy/std": 0.10040652006864548,
"rewards/env_reward/mean": -0.28017497062683105,
"rewards/env_reward/std": 0.6506500244140625,
"rewards/format_valid/mean": 1.0,
"rewards/format_valid/std": 0.0,
"step": 2
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completion_length": 21.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 21.0,
"completions/max_terminated_length": 21.0,
"completions/mean_length": 14.75,
"completions/mean_terminated_length": 14.75,
"completions/min_length": 10.0,
"completions/min_terminated_length": 10.0,
"epoch": 0.00375,
"frac_reward_zero_std": 0.0,
"grad_norm": 137.97557067871094,
"kl": 0.039296648057643324,
"learning_rate": 5e-06,
"loss": 0.0016,
"num_tokens": 7350.0,
"reward": 1.2048746347427368,
"reward_std": 0.21250373125076294,
"rewards/action_legal/mean": 0.5,
"rewards/action_legal/std": 0.0,
"rewards/belief_accuracy/mean": 0.3351996839046478,
"rewards/belief_accuracy/std": 0.11216925829648972,
"rewards/env_reward/mean": 0.41967499256134033,
"rewards/env_reward/std": 0.29905566573143005,
"rewards/format_valid/mean": 1.0,
"rewards/format_valid/std": 0.0,
"step": 3
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completion_length": 17.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 17.0,
"completions/max_terminated_length": 17.0,
"completions/mean_length": 13.25,
"completions/mean_terminated_length": 13.25,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 0.005,
"frac_reward_zero_std": 0.0,
"grad_norm": 31.324565887451172,
"kl": 0.058574457885697484,
"learning_rate": 7.5e-06,
"loss": 0.0023,
"num_tokens": 9803.0,
"reward": -0.5865602493286133,
"reward_std": 2.3446295261383057,
"rewards/action_legal/mean": 0.125,
"rewards/action_legal/std": 0.75,
"rewards/belief_accuracy/mean": 0.05693966522812843,
"rewards/belief_accuracy/std": 0.20072676241397858,
"rewards/env_reward/mean": -0.7559999823570251,
"rewards/env_reward/std": 1.496000051498413,
"rewards/format_valid/mean": 0.25,
"rewards/format_valid/std": 1.5,
"step": 4
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completion_length": 10.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 10.0,
"completions/max_terminated_length": 10.0,
"completions/mean_length": 8.25,
"completions/mean_terminated_length": 8.25,
"completions/min_length": 6.0,
"completions/min_terminated_length": 6.0,
"epoch": 0.00625,
"frac_reward_zero_std": 0.0,
"grad_norm": 89.02346801757812,
"kl": 0.0644997438066639,
"learning_rate": 1e-05,
"loss": 0.0026,
"num_tokens": 12236.0,
"reward": -1.5658241510391235,
"reward_std": 2.9457480907440186,
"rewards/action_legal/mean": -0.25,
"rewards/action_legal/std": 0.8660253882408142,
"rewards/belief_accuracy/mean": 0.015800803899765015,
"rewards/belief_accuracy/std": 0.26350587606430054,
"rewards/env_reward/mean": -1.356624960899353,
"rewards/env_reward/std": 1.9143962860107422,
"rewards/format_valid/mean": -0.5,
"rewards/format_valid/std": 1.7320507764816284,
"step": 5
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completion_length": 12.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 12.0,
"completions/max_terminated_length": 12.0,
"completions/mean_length": 10.75,
"completions/mean_terminated_length": 10.75,
"completions/min_length": 9.0,
"completions/min_terminated_length": 9.0,
"epoch": 0.0075,
"frac_reward_zero_std": 0.0,
"grad_norm": 106.08165740966797,
"kl": 0.053889136761426926,
"learning_rate": 1.25e-05,
"loss": 0.0022,
"num_tokens": 14679.0,
"reward": -0.519845187664032,
"reward_std": 2.4074840545654297,
"rewards/action_legal/mean": 0.125,
"rewards/action_legal/std": 0.75,
"rewards/belief_accuracy/mean": 0.16155476868152618,
"rewards/belief_accuracy/std": 0.25382331013679504,
"rewards/env_reward/mean": -0.7939000129699707,
"rewards/env_reward/std": 1.4925646781921387,
"rewards/format_valid/mean": 0.25,
"rewards/format_valid/std": 1.5,
"step": 6
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completion_length": 10.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 10.0,
"completions/max_terminated_length": 10.0,
"completions/mean_length": 9.5,
"completions/mean_terminated_length": 9.5,
"completions/min_length": 9.0,
"completions/min_terminated_length": 9.0,
"epoch": 0.00875,
"frac_reward_zero_std": 0.0,
"grad_norm": 3.7754714488983154,
"kl": 0.006218573806108907,
"learning_rate": 1.5e-05,
"loss": 0.0002,
"num_tokens": 17117.0,
"reward": 0.8104138374328613,
"reward_std": 0.08244366198778152,
"rewards/action_legal/mean": 0.5,
"rewards/action_legal/std": 0.0,
"rewards/belief_accuracy/mean": 0.08886384963989258,
"rewards/belief_accuracy/std": 0.13198231160640717,
"rewards/env_reward/mean": 0.2715499997138977,
"rewards/env_reward/std": 0.10536643117666245,
"rewards/format_valid/mean": 1.0,
"rewards/format_valid/std": 0.0,
"step": 7
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completion_length": 11.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 11.0,
"completions/max_terminated_length": 11.0,
"completions/mean_length": 10.25,
"completions/mean_terminated_length": 10.25,
"completions/min_length": 10.0,
"completions/min_terminated_length": 10.0,
"epoch": 0.01,
"frac_reward_zero_std": 0.0,
"grad_norm": 9.92833137512207,
"kl": 0.005196493031689897,
"learning_rate": 1.75e-05,
"loss": 0.0002,
"num_tokens": 19558.0,
"reward": 0.8396382927894592,
"reward_std": 0.9627154469490051,
"rewards/action_legal/mean": 0.5,
"rewards/action_legal/std": 0.0,
"rewards/belief_accuracy/mean": 0.1838883012533188,
"rewards/belief_accuracy/std": 0.13176093995571136,
"rewards/env_reward/mean": 0.2057500183582306,
"rewards/env_reward/std": 1.0211000442504883,
"rewards/format_valid/mean": 1.0,
"rewards/format_valid/std": 0.0,
"step": 8
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completion_length": 20.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 20.0,
"completions/max_terminated_length": 20.0,
"completions/mean_length": 14.75,
"completions/mean_terminated_length": 14.75,
"completions/min_length": 10.0,
"completions/min_terminated_length": 10.0,
"epoch": 0.01125,
"frac_reward_zero_std": 0.0,
"grad_norm": 6.379985809326172,
"kl": 0.008396895253099501,
"learning_rate": 2e-05,
"loss": 0.0003,
"num_tokens": 22017.0,
"reward": -0.6789295077323914,
"reward_std": 2.286468744277954,
"rewards/action_legal/mean": 0.125,
"rewards/action_legal/std": 0.75,
"rewards/belief_accuracy/mean": 0.11377047002315521,
"rewards/belief_accuracy/std": 0.26322486996650696,
"rewards/env_reward/mean": -0.9052000045776367,
"rewards/env_reward/std": 1.3965555429458618,
"rewards/format_valid/mean": 0.25,
"rewards/format_valid/std": 1.5,
"step": 9
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completion_length": 10.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 10.0,
"completions/max_terminated_length": 10.0,
"completions/mean_length": 9.75,
"completions/mean_terminated_length": 9.75,
"completions/min_length": 9.0,
"completions/min_terminated_length": 9.0,
"epoch": 0.0125,
"frac_reward_zero_std": 0.0,
"grad_norm": 8.207470893859863,
"kl": 0.008475569004076533,
"learning_rate": 2.25e-05,
"loss": 0.0003,
"num_tokens": 23876.0,
"reward": 0.5723661780357361,
"reward_std": 0.11954429745674133,
"rewards/action_legal/mean": 0.5,
"rewards/action_legal/std": 0.0,
"rewards/belief_accuracy/mean": 0.3318161964416504,
"rewards/belief_accuracy/std": 0.13663487136363983,
"rewards/env_reward/mean": -0.20945000648498535,
"rewards/env_reward/std": 0.026100002229213715,
"rewards/format_valid/mean": 1.0,
"rewards/format_valid/std": 0.0,
"step": 10
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completion_length": 10.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 10.0,
"completions/max_terminated_length": 10.0,
"completions/mean_length": 9.25,
"completions/mean_terminated_length": 9.25,
"completions/min_length": 9.0,
"completions/min_terminated_length": 9.0,
"epoch": 0.01375,
"frac_reward_zero_std": 0.0,
"grad_norm": 9.754448890686035,
"kl": 0.0007791817042743787,
"learning_rate": 2.5e-05,
"loss": 0.0,
"num_tokens": 26313.0,
"reward": 0.87220299243927,
"reward_std": 0.18807174265384674,
"rewards/action_legal/mean": 0.5,
"rewards/action_legal/std": 0.0,
"rewards/belief_accuracy/mean": 0.21230295300483704,
"rewards/belief_accuracy/std": 0.07961412519216537,
"rewards/env_reward/mean": 0.20990000665187836,
"rewards/env_reward/std": 0.1363999992609024,
"rewards/format_valid/mean": 1.0,
"rewards/format_valid/std": 0.0,
"step": 11
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completion_length": 14.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 14.0,
"completions/max_terminated_length": 14.0,
"completions/mean_length": 12.0,
"completions/mean_terminated_length": 12.0,
"completions/min_length": 10.0,
"completions/min_terminated_length": 10.0,
"epoch": 0.015,
"frac_reward_zero_std": 0.0,
"grad_norm": 21.532686233520508,
"kl": 0.03962589804723393,
"learning_rate": 2.7500000000000004e-05,
"loss": 0.0016,
"num_tokens": 28385.0,
"reward": 0.9086348414421082,
"reward_std": 0.2632542550563812,
"rewards/action_legal/mean": 0.5,
"rewards/action_legal/std": 0.0,
"rewards/belief_accuracy/mean": 0.2661348283290863,
"rewards/belief_accuracy/std": 0.16707739233970642,
"rewards/env_reward/mean": 0.19249999523162842,
"rewards/env_reward/std": 0.16419841349124908,
"rewards/format_valid/mean": 1.0,
"rewards/format_valid/std": 0.0,
"step": 12
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completion_length": 21.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 21.0,
"completions/max_terminated_length": 21.0,
"completions/mean_length": 15.25,
"completions/mean_terminated_length": 15.25,
"completions/min_length": 9.0,
"completions/min_terminated_length": 9.0,
"epoch": 0.01625,
"frac_reward_zero_std": 0.0,
"grad_norm": 6.38393497467041,
"kl": 0.09559060208266601,
"learning_rate": 3e-05,
"loss": 0.0038,
"num_tokens": 30846.0,
"reward": 0.7065698504447937,
"reward_std": 0.5689817667007446,
"rewards/action_legal/mean": 0.5,
"rewards/action_legal/std": 0.0,
"rewards/belief_accuracy/mean": 0.32079488039016724,
"rewards/belief_accuracy/std": 0.32852134108543396,
"rewards/env_reward/mean": -0.06422500312328339,
"rewards/env_reward/std": 0.4030914902687073,
"rewards/format_valid/mean": 1.0,
"rewards/format_valid/std": 0.0,
"step": 13
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completion_length": 20.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 20.0,
"completions/max_terminated_length": 20.0,
"completions/mean_length": 15.5,
"completions/mean_terminated_length": 15.5,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 0.0175,
"frac_reward_zero_std": 0.0,
"grad_norm": 61.281898498535156,
"kl": 0.061171281384304166,
"learning_rate": 3.2500000000000004e-05,
"loss": 0.0024,
"num_tokens": 33308.0,
"reward": -2.379248857498169,
"reward_std": 1.9871822595596313,
"rewards/action_legal/mean": -0.25,
"rewards/action_legal/std": 0.8660253882408142,
"rewards/belief_accuracy/mean": 0.0016010589897632599,
"rewards/belief_accuracy/std": 0.2347448617219925,
"rewards/env_reward/mean": -2.1558499336242676,
"rewards/env_reward/std": 0.9747405052185059,
"rewards/format_valid/mean": -0.5,
"rewards/format_valid/std": 1.7320507764816284,
"step": 14
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completion_length": 10.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 10.0,
"completions/max_terminated_length": 10.0,
"completions/mean_length": 9.25,
"completions/mean_terminated_length": 9.25,
"completions/min_length": 9.0,
"completions/min_terminated_length": 9.0,
"epoch": 0.01875,
"frac_reward_zero_std": 0.0,
"grad_norm": 11.862137794494629,
"kl": 0.12338710279436782,
"learning_rate": 3.5e-05,
"loss": 0.0049,
"num_tokens": 35033.0,
"reward": 0.5028390884399414,
"reward_std": 0.36507099866867065,
"rewards/action_legal/mean": 0.5,
"rewards/action_legal/std": 0.0,
"rewards/belief_accuracy/mean": 0.16708904504776,
"rewards/belief_accuracy/std": 0.07635380327701569,
"rewards/env_reward/mean": -0.11425000429153442,
"rewards/env_reward/std": 0.29398712515830994,
"rewards/format_valid/mean": 1.0,
"rewards/format_valid/std": 0.0,
"step": 15
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completion_length": 19.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 19.0,
"completions/max_terminated_length": 19.0,
"completions/mean_length": 15.0,
"completions/mean_terminated_length": 15.0,
"completions/min_length": 11.0,
"completions/min_terminated_length": 11.0,
"epoch": 0.02,
"frac_reward_zero_std": 0.0,
"grad_norm": 10.376911163330078,
"kl": 0.2386501464061439,
"learning_rate": 3.7500000000000003e-05,
"loss": 0.0095,
"num_tokens": 37493.0,
"reward": 0.49063006043434143,
"reward_std": 0.3236295282840729,
"rewards/action_legal/mean": 0.5,
"rewards/action_legal/std": 0.0,
"rewards/belief_accuracy/mean": 0.34598004817962646,
"rewards/belief_accuracy/std": 0.10989798605442047,
"rewards/env_reward/mean": -0.30535000562667847,
"rewards/env_reward/std": 0.29750001430511475,
"rewards/format_valid/mean": 1.0,
"rewards/format_valid/std": 0.0,
"step": 16
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completion_length": 10.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 10.0,
"completions/max_terminated_length": 10.0,
"completions/mean_length": 9.5,
"completions/mean_terminated_length": 9.5,
"completions/min_length": 9.0,
"completions/min_terminated_length": 9.0,
"epoch": 0.02125,
"frac_reward_zero_std": 0.0,
"grad_norm": 5.434854507446289,
"kl": 0.052817441552178934,
"learning_rate": 4e-05,
"loss": 0.0021,
"num_tokens": 39931.0,
"reward": -0.5519770383834839,
"reward_std": 2.3723113536834717,
"rewards/action_legal/mean": 0.125,
"rewards/action_legal/std": 0.75,
"rewards/belief_accuracy/mean": 0.029872901737689972,
"rewards/belief_accuracy/std": 0.17124304175376892,
"rewards/env_reward/mean": -0.694350004196167,
"rewards/env_reward/std": 1.5409032106399536,
"rewards/format_valid/mean": 0.25,
"rewards/format_valid/std": 1.5,
"step": 17
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completion_length": 10.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 10.0,
"completions/max_terminated_length": 10.0,
"completions/mean_length": 10.0,
"completions/mean_terminated_length": 10.0,
"completions/min_length": 10.0,
"completions/min_terminated_length": 10.0,
"epoch": 0.0225,
"frac_reward_zero_std": 0.0,
"grad_norm": 5.025228977203369,
"kl": 0.007609898108057678,
"learning_rate": 4.25e-05,
"loss": 0.0003,
"num_tokens": 42371.0,
"reward": 1.3588334321975708,
"reward_std": 0.29131758213043213,
"rewards/action_legal/mean": 0.5,
"rewards/action_legal/std": 0.0,
"rewards/belief_accuracy/mean": 0.3628334403038025,
"rewards/belief_accuracy/std": 0.06860831379890442,
"rewards/env_reward/mean": 0.5460000038146973,
"rewards/env_reward/std": 0.2443346381187439,
"rewards/format_valid/mean": 1.0,
"rewards/format_valid/std": 0.0,
"step": 18
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completion_length": 11.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 11.0,
"completions/max_terminated_length": 11.0,
"completions/mean_length": 10.25,
"completions/mean_terminated_length": 10.25,
"completions/min_length": 9.0,
"completions/min_terminated_length": 9.0,
"epoch": 0.02375,
"frac_reward_zero_std": 0.0,
"grad_norm": 12.366896629333496,
"kl": 0.17784105247119442,
"learning_rate": 4.5e-05,
"loss": 0.0071,
"num_tokens": 44812.0,
"reward": 0.3633672595024109,
"reward_std": 1.0452975034713745,
"rewards/action_legal/mean": 0.5,
"rewards/action_legal/std": 0.0,
"rewards/belief_accuracy/mean": 0.18611717224121094,
"rewards/belief_accuracy/std": 0.1490413397550583,
"rewards/env_reward/mean": -0.27274996042251587,
"rewards/env_reward/std": 1.072299599647522,
"rewards/format_valid/mean": 1.0,
"rewards/format_valid/std": 0.0,
"step": 19
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completion_length": 19.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 19.0,
"completions/max_terminated_length": 19.0,
"completions/mean_length": 14.25,
"completions/mean_terminated_length": 14.25,
"completions/min_length": 10.0,
"completions/min_terminated_length": 10.0,
"epoch": 0.025,
"frac_reward_zero_std": 0.0,
"grad_norm": 7.6515116691589355,
"kl": 0.6034408716950566,
"learning_rate": 4.75e-05,
"loss": 0.0241,
"num_tokens": 47269.0,
"reward": 0.7194468975067139,
"reward_std": 0.6191092133522034,
"rewards/action_legal/mean": 0.5,
"rewards/action_legal/std": 0.0,
"rewards/belief_accuracy/mean": 0.35667186975479126,
"rewards/belief_accuracy/std": 0.14748485386371613,
"rewards/env_reward/mean": -0.08722500503063202,
"rewards/env_reward/std": 0.626167356967926,
"rewards/format_valid/mean": 1.0,
"rewards/format_valid/std": 0.0,
"step": 20
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completion_length": 20.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 20.0,
"completions/max_terminated_length": 20.0,
"completions/mean_length": 14.5,
"completions/mean_terminated_length": 14.5,
"completions/min_length": 10.0,
"completions/min_terminated_length": 10.0,
"epoch": 0.02625,
"frac_reward_zero_std": 0.0,
"grad_norm": 17.213253021240234,
"kl": 0.33139822795055807,
"learning_rate": 5e-05,
"loss": 0.0133,
"num_tokens": 49727.0,
"reward": -1.8539845943450928,
"reward_std": 2.6750690937042236,
"rewards/action_legal/mean": -0.25,
"rewards/action_legal/std": 0.8660253882408142,
"rewards/belief_accuracy/mean": 0.07274026423692703,
"rewards/belief_accuracy/std": 0.3264457583427429,
"rewards/env_reward/mean": -1.7017250061035156,
"rewards/env_reward/std": 1.6725139617919922,
"rewards/format_valid/mean": -0.5,
"rewards/format_valid/std": 1.7320507764816284,
"step": 21
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completion_length": 12.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 12.0,
"completions/max_terminated_length": 12.0,
"completions/mean_length": 10.75,
"completions/mean_terminated_length": 10.75,
"completions/min_length": 10.0,
"completions/min_terminated_length": 10.0,
"epoch": 0.0275,
"frac_reward_zero_std": 0.0,
"grad_norm": 5.38259220123291,
"kl": 0.6713134994497523,
"learning_rate": 4.972222222222223e-05,
"loss": 0.0269,
"num_tokens": 52170.0,
"reward": 0.6025224924087524,
"reward_std": 0.7518510222434998,
"rewards/action_legal/mean": 0.5,
"rewards/action_legal/std": 0.0,
"rewards/belief_accuracy/mean": 0.23837246000766754,
"rewards/belief_accuracy/std": 0.15508003532886505,
"rewards/env_reward/mean": -0.08584998548030853,
"rewards/env_reward/std": 0.7486812472343445,
"rewards/format_valid/mean": 1.0,
"rewards/format_valid/std": 0.0,
"step": 22
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completion_length": 19.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 19.0,
"completions/max_terminated_length": 19.0,
"completions/mean_length": 12.0,
"completions/mean_terminated_length": 12.0,
"completions/min_length": 9.0,
"completions/min_terminated_length": 9.0,
"epoch": 0.02875,
"frac_reward_zero_std": 0.0,
"grad_norm": 5.713683605194092,
"kl": 0.4507274613715708,
"learning_rate": 4.9444444444444446e-05,
"loss": 0.018,
"num_tokens": 54618.0,
"reward": 0.6149874925613403,
"reward_std": 0.34224066138267517,
"rewards/action_legal/mean": 0.5,
"rewards/action_legal/std": 0.0,
"rewards/belief_accuracy/mean": 0.34318745136260986,
"rewards/belief_accuracy/std": 0.13306953012943268,
"rewards/env_reward/mean": -0.17819999158382416,
"rewards/env_reward/std": 0.41936323046684265,
"rewards/format_valid/mean": 1.0,
"rewards/format_valid/std": 0.0,
"step": 23
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completion_length": 21.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 21.0,
"completions/max_terminated_length": 21.0,
"completions/mean_length": 14.75,
"completions/mean_terminated_length": 14.75,
"completions/min_length": 11.0,
"completions/min_terminated_length": 11.0,
"epoch": 0.03,
"frac_reward_zero_std": 0.0,
"grad_norm": 233.1907501220703,
"kl": 0.5501463627442718,
"learning_rate": 4.9166666666666665e-05,
"loss": 0.022,
"num_tokens": 57077.0,
"reward": -2.9238851070404053,
"reward_std": 2.352229595184326,
"rewards/action_legal/mean": -0.625,
"rewards/action_legal/std": 0.75,
"rewards/belief_accuracy/mean": -0.052610140293836594,
"rewards/belief_accuracy/std": 0.2947797477245331,
"rewards/env_reward/mean": -2.308774948120117,
"rewards/env_reward/std": 1.382449984550476,
"rewards/format_valid/mean": -1.25,
"rewards/format_valid/std": 1.5,
"step": 24
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completion_length": 10.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 10.0,
"completions/max_terminated_length": 10.0,
"completions/mean_length": 9.5,
"completions/mean_terminated_length": 9.5,
"completions/min_length": 9.0,
"completions/min_terminated_length": 9.0,
"epoch": 0.03125,
"frac_reward_zero_std": 0.0,
"grad_norm": 7.147825241088867,
"kl": 0.6739767706021667,
"learning_rate": 4.888888888888889e-05,
"loss": 0.027,
"num_tokens": 59515.0,
"reward": 0.484980046749115,
"reward_std": 0.7028239965438843,
"rewards/action_legal/mean": 0.5,
"rewards/action_legal/std": 0.0,
"rewards/belief_accuracy/mean": 0.33073002099990845,
"rewards/belief_accuracy/std": 0.0814066082239151,
"rewards/env_reward/mean": -0.2957499921321869,
"rewards/env_reward/std": 0.6496629118919373,
"rewards/format_valid/mean": 1.0,
"rewards/format_valid/std": 0.0,
"step": 25
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completion_length": 10.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 10.0,
"completions/max_terminated_length": 10.0,
"completions/mean_length": 9.25,
"completions/mean_terminated_length": 9.25,
"completions/min_length": 9.0,
"completions/min_terminated_length": 9.0,
"epoch": 0.0325,
"frac_reward_zero_std": 0.0,
"grad_norm": 5.604952812194824,
"kl": 0.7961917221546173,
"learning_rate": 4.8611111111111115e-05,
"loss": 0.0318,
"num_tokens": 61952.0,
"reward": 0.6069967746734619,
"reward_std": 0.0925484374165535,
"rewards/action_legal/mean": 0.5,
"rewards/action_legal/std": 0.0,
"rewards/belief_accuracy/mean": 0.23919677734375,
"rewards/belief_accuracy/std": 0.09254845231771469,
"rewards/env_reward/mean": -0.08219999819993973,
"rewards/env_reward/std": 0.0,
"rewards/format_valid/mean": 1.0,
"rewards/format_valid/std": 0.0,
"step": 26
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completion_length": 20.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 20.0,
"completions/max_terminated_length": 20.0,
"completions/mean_length": 14.0,
"completions/mean_terminated_length": 14.0,
"completions/min_length": 10.0,
"completions/min_terminated_length": 10.0,
"epoch": 0.03375,
"frac_reward_zero_std": 0.0,
"grad_norm": 8.258967399597168,
"kl": 0.6395482331572566,
"learning_rate": 4.8333333333333334e-05,
"loss": 0.0256,
"num_tokens": 64408.0,
"reward": 0.7796441912651062,
"reward_std": 0.7857587933540344,
"rewards/action_legal/mean": 0.5,
"rewards/action_legal/std": 0.0,
"rewards/belief_accuracy/mean": 0.3164691925048828,
"rewards/belief_accuracy/std": 0.18967638909816742,
"rewards/env_reward/mean": 0.013175025582313538,
"rewards/env_reward/std": 0.7235533595085144,
"rewards/format_valid/mean": 1.0,
"rewards/format_valid/std": 0.0,
"step": 27
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completion_length": 10.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 10.0,
"completions/max_terminated_length": 10.0,
"completions/mean_length": 9.5,
"completions/mean_terminated_length": 9.5,
"completions/min_length": 9.0,
"completions/min_terminated_length": 9.0,
"epoch": 0.035,
"frac_reward_zero_std": 0.0,
"grad_norm": 5.232230186462402,
"kl": 0.7461551874876022,
"learning_rate": 4.805555555555556e-05,
"loss": 0.0298,
"num_tokens": 66706.0,
"reward": 1.1603848934173584,
"reward_std": 0.23410998284816742,
"rewards/action_legal/mean": 0.5,
"rewards/action_legal/std": 0.0,
"rewards/belief_accuracy/mean": 0.2731848359107971,
"rewards/belief_accuracy/std": 0.09957370162010193,
"rewards/env_reward/mean": 0.43720000982284546,
"rewards/env_reward/std": 0.28588902950286865,
"rewards/format_valid/mean": 1.0,
"rewards/format_valid/std": 0.0,
"step": 28
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completion_length": 10.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 10.0,
"completions/max_terminated_length": 10.0,
"completions/mean_length": 10.0,
"completions/mean_terminated_length": 10.0,
"completions/min_length": 10.0,
"completions/min_terminated_length": 10.0,
"epoch": 0.03625,
"frac_reward_zero_std": 0.0,
"grad_norm": 5.738325595855713,
"kl": 0.28069217689335346,
"learning_rate": 4.7777777777777784e-05,
"loss": 0.0112,
"num_tokens": 69146.0,
"reward": 0.8146038055419922,
"reward_std": 0.2780461311340332,
"rewards/action_legal/mean": 0.5,
"rewards/action_legal/std": 0.0,
"rewards/belief_accuracy/mean": 0.01810377836227417,
"rewards/belief_accuracy/std": 0.052378278225660324,
"rewards/env_reward/mean": 0.3465000092983246,
"rewards/env_reward/std": 0.2543998062610626,
"rewards/format_valid/mean": 1.0,
"rewards/format_valid/std": 0.0,
"step": 29
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completion_length": 12.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 12.0,
"completions/max_terminated_length": 12.0,
"completions/mean_length": 11.5,
"completions/mean_terminated_length": 11.5,
"completions/min_length": 10.0,
"completions/min_terminated_length": 10.0,
"epoch": 0.0375,
"frac_reward_zero_std": 0.0,
"grad_norm": 9.730031967163086,
"kl": 0.35769763961434364,
"learning_rate": 4.75e-05,
"loss": 0.0143,
"num_tokens": 71592.0,
"reward": 0.9769262671470642,
"reward_std": 0.8466672897338867,
"rewards/action_legal/mean": 0.5,
"rewards/action_legal/std": 0.0,
"rewards/belief_accuracy/mean": 0.3416762351989746,
"rewards/belief_accuracy/std": 0.08965588361024857,
"rewards/env_reward/mean": 0.18525001406669617,
"rewards/env_reward/std": 0.8912999629974365,
"rewards/format_valid/mean": 1.0,
"rewards/format_valid/std": 0.0,
"step": 30
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completion_length": 10.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 10.0,
"completions/max_terminated_length": 10.0,
"completions/mean_length": 9.75,
"completions/mean_terminated_length": 9.75,
"completions/min_length": 9.0,
"completions/min_terminated_length": 9.0,
"epoch": 0.03875,
"frac_reward_zero_std": 0.0,
"grad_norm": 5.303464412689209,
"kl": 0.2592179449275136,
"learning_rate": 4.722222222222222e-05,
"loss": 0.0104,
"num_tokens": 74031.0,
"reward": 0.9595953226089478,
"reward_std": 0.08771299570798874,
"rewards/action_legal/mean": 0.5,
"rewards/action_legal/std": 0.0,
"rewards/belief_accuracy/mean": 0.21054524183273315,
"rewards/belief_accuracy/std": 0.09055177122354507,
"rewards/env_reward/mean": 0.299049973487854,
"rewards/env_reward/std": 0.04009999334812164,
"rewards/format_valid/mean": 1.0,
"rewards/format_valid/std": 0.0,
"step": 31
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completion_length": 10.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 10.0,
"completions/max_terminated_length": 10.0,
"completions/mean_length": 9.5,
"completions/mean_terminated_length": 9.5,
"completions/min_length": 9.0,
"completions/min_terminated_length": 9.0,
"epoch": 0.04,
"frac_reward_zero_std": 0.0,
"grad_norm": 11.62190055847168,
"kl": 0.48394265957176685,
"learning_rate": 4.6944444444444446e-05,
"loss": 0.0194,
"num_tokens": 76469.0,
"reward": 0.9655871391296387,
"reward_std": 0.5988060832023621,
"rewards/action_legal/mean": 0.5,
"rewards/action_legal/std": 0.0,
"rewards/belief_accuracy/mean": 0.05113711208105087,
"rewards/belief_accuracy/std": 0.0761847198009491,
"rewards/env_reward/mean": 0.46445000171661377,
"rewards/env_reward/std": 0.5925008654594421,
"rewards/format_valid/mean": 1.0,
"rewards/format_valid/std": 0.0,
"step": 32
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completion_length": 10.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 10.0,
"completions/max_terminated_length": 10.0,
"completions/mean_length": 9.75,
"completions/mean_terminated_length": 9.75,
"completions/min_length": 9.0,
"completions/min_terminated_length": 9.0,
"epoch": 0.04125,
"frac_reward_zero_std": 0.0,
"grad_norm": 4.997323036193848,
"kl": 0.28875103616155684,
"learning_rate": 4.666666666666667e-05,
"loss": 0.0115,
"num_tokens": 78908.0,
"reward": 0.8634432554244995,
"reward_std": 0.2403155416250229,
"rewards/action_legal/mean": 0.5,
"rewards/action_legal/std": 0.0,
"rewards/belief_accuracy/mean": 0.23964327573776245,
"rewards/belief_accuracy/std": 0.16614758968353271,
"rewards/env_reward/mean": 0.1738000065088272,
"rewards/env_reward/std": 0.15648801624774933,
"rewards/format_valid/mean": 1.0,
"rewards/format_valid/std": 0.0,
"step": 33
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completion_length": 10.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 10.0,
"completions/max_terminated_length": 10.0,
"completions/mean_length": 9.5,
"completions/mean_terminated_length": 9.5,
"completions/min_length": 9.0,
"completions/min_terminated_length": 9.0,
"epoch": 0.0425,
"frac_reward_zero_std": 0.0,
"grad_norm": 7.761037826538086,
"kl": 0.6490175630897284,
"learning_rate": 4.638888888888889e-05,
"loss": 0.026,
"num_tokens": 80634.0,
"reward": 1.2778434753417969,
"reward_std": 0.5499616265296936,
"rewards/action_legal/mean": 0.5,
"rewards/action_legal/std": 0.0,
"rewards/belief_accuracy/mean": 0.18821844458580017,
"rewards/belief_accuracy/std": 0.017153441905975342,
"rewards/env_reward/mean": 0.6396250128746033,
"rewards/env_reward/std": 0.5599880218505859,
"rewards/format_valid/mean": 1.0,
"rewards/format_valid/std": 0.0,
"step": 34
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completion_length": 10.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 10.0,
"completions/max_terminated_length": 10.0,
"completions/mean_length": 9.25,
"completions/mean_terminated_length": 9.25,
"completions/min_length": 9.0,
"completions/min_terminated_length": 9.0,
"epoch": 0.04375,
"frac_reward_zero_std": 0.0,
"grad_norm": 6.997340202331543,
"kl": 0.8445844687521458,
"learning_rate": 4.6111111111111115e-05,
"loss": 0.0338,
"num_tokens": 83071.0,
"reward": 1.034692406654358,
"reward_std": 0.3406420350074768,
"rewards/action_legal/mean": 0.5,
"rewards/action_legal/std": 0.0,
"rewards/belief_accuracy/mean": 0.09596741199493408,
"rewards/belief_accuracy/std": 0.11219175159931183,
"rewards/env_reward/mean": 0.4887250065803528,
"rewards/env_reward/std": 0.24987412989139557,
"rewards/format_valid/mean": 1.0,
"rewards/format_valid/std": 0.0,
"step": 35
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completion_length": 21.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 21.0,
"completions/max_terminated_length": 21.0,
"completions/mean_length": 14.0,
"completions/mean_terminated_length": 14.0,
"completions/min_length": 11.0,
"completions/min_terminated_length": 11.0,
"epoch": 0.045,
"frac_reward_zero_std": 0.0,
"grad_norm": 4.503076553344727,
"kl": 0.7940250784158707,
"learning_rate": 4.5833333333333334e-05,
"loss": 0.0318,
"num_tokens": 85527.0,
"reward": 0.024980902671813965,
"reward_std": 0.5933173894882202,
"rewards/action_legal/mean": 0.5,
"rewards/action_legal/std": 0.0,
"rewards/belief_accuracy/mean": 0.27033090591430664,
"rewards/belief_accuracy/std": 0.22744685411453247,
"rewards/env_reward/mean": -0.6953500509262085,
"rewards/env_reward/std": 0.7904999852180481,
"rewards/format_valid/mean": 1.0,
"rewards/format_valid/std": 0.0,
"step": 36
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completion_length": 10.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 10.0,
"completions/max_terminated_length": 10.0,
"completions/mean_length": 9.5,
"completions/mean_terminated_length": 9.5,
"completions/min_length": 9.0,
"completions/min_terminated_length": 9.0,
"epoch": 0.04625,
"frac_reward_zero_std": 0.0,
"grad_norm": 2.819326877593994,
"kl": 0.7724389061331749,
"learning_rate": 4.555555555555556e-05,
"loss": 0.0309,
"num_tokens": 87957.0,
"reward": 0.9745460152626038,
"reward_std": 0.026238948106765747,
"rewards/action_legal/mean": 0.5,
"rewards/action_legal/std": 0.0,
"rewards/belief_accuracy/mean": 0.28547099232673645,
"rewards/belief_accuracy/std": 0.0,
"rewards/env_reward/mean": 0.23907500505447388,
"rewards/env_reward/std": 0.026238946244120598,
"rewards/format_valid/mean": 1.0,
"rewards/format_valid/std": 0.0,
"step": 37
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completion_length": 10.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 10.0,
"completions/max_terminated_length": 10.0,
"completions/mean_length": 10.0,
"completions/mean_terminated_length": 10.0,
"completions/min_length": 10.0,
"completions/min_terminated_length": 10.0,
"epoch": 0.0475,
"frac_reward_zero_std": 0.0,
"grad_norm": 3.8090322017669678,
"kl": 0.08815138379577547,
"learning_rate": 4.527777777777778e-05,
"loss": 0.0035,
"num_tokens": 90397.0,
"reward": 0.8214725255966187,
"reward_std": 0.10255800932645798,
"rewards/action_legal/mean": 0.5,
"rewards/action_legal/std": 0.0,
"rewards/belief_accuracy/mean": 0.336172491312027,
"rewards/belief_accuracy/std": 0.10255803912878036,
"rewards/env_reward/mean": 0.03530000150203705,
"rewards/env_reward/std": 0.0,
"rewards/format_valid/mean": 1.0,
"rewards/format_valid/std": 0.0,
"step": 38
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completion_length": 10.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 10.0,
"completions/max_terminated_length": 10.0,
"completions/mean_length": 9.75,
"completions/mean_terminated_length": 9.75,
"completions/min_length": 9.0,
"completions/min_terminated_length": 9.0,
"epoch": 0.04875,
"frac_reward_zero_std": 0.0,
"grad_norm": 5.734916687011719,
"kl": 0.5696801505982876,
"learning_rate": 4.5e-05,
"loss": 0.0228,
"num_tokens": 92640.0,
"reward": 0.8142017722129822,
"reward_std": 0.11311425268650055,
"rewards/action_legal/mean": 0.5,
"rewards/action_legal/std": 0.0,
"rewards/belief_accuracy/mean": 0.31915175914764404,
"rewards/belief_accuracy/std": 0.09154129028320312,
"rewards/env_reward/mean": 0.0450499951839447,
"rewards/env_reward/std": 0.1559000015258789,
"rewards/format_valid/mean": 1.0,
"rewards/format_valid/std": 0.0,
"step": 39
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completion_length": 21.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 21.0,
"completions/max_terminated_length": 21.0,
"completions/mean_length": 14.0,
"completions/mean_terminated_length": 14.0,
"completions/min_length": 11.0,
"completions/min_terminated_length": 11.0,
"epoch": 0.05,
"frac_reward_zero_std": 0.0,
"grad_norm": 90.21479034423828,
"kl": 0.8432229831814766,
"learning_rate": 4.472222222222223e-05,
"loss": 0.0337,
"num_tokens": 95096.0,
"reward": 0.5240219831466675,
"reward_std": 0.11077450960874557,
"rewards/action_legal/mean": 0.5,
"rewards/action_legal/std": 0.0,
"rewards/belief_accuracy/mean": 0.3270469307899475,
"rewards/belief_accuracy/std": 0.11481481790542603,
"rewards/env_reward/mean": -0.25302499532699585,
"rewards/env_reward/std": 0.014749996364116669,
"rewards/format_valid/mean": 1.0,
"rewards/format_valid/std": 0.0,
"step": 40
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completion_length": 10.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 10.0,
"completions/max_terminated_length": 10.0,
"completions/mean_length": 9.5,
"completions/mean_terminated_length": 9.5,
"completions/min_length": 9.0,
"completions/min_terminated_length": 9.0,
"epoch": 0.05125,
"frac_reward_zero_std": 0.0,
"grad_norm": 6.006437301635742,
"kl": 0.6199327223002911,
"learning_rate": 4.4444444444444447e-05,
"loss": 0.0248,
"num_tokens": 97206.0,
"reward": 1.490286946296692,
"reward_std": 0.1738055795431137,
"rewards/action_legal/mean": 0.5,
"rewards/action_legal/std": 0.0,
"rewards/belief_accuracy/mean": 0.2870370149612427,
"rewards/belief_accuracy/std": 0.11331170797348022,
"rewards/env_reward/mean": 0.753250002861023,
"rewards/env_reward/std": 0.12153223156929016,
"rewards/format_valid/mean": 1.0,
"rewards/format_valid/std": 0.0,
"step": 41
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completion_length": 19.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 19.0,
"completions/max_terminated_length": 19.0,
"completions/mean_length": 13.75,
"completions/mean_terminated_length": 13.75,
"completions/min_length": 10.0,
"completions/min_terminated_length": 10.0,
"epoch": 0.0525,
"frac_reward_zero_std": 0.0,
"grad_norm": 3.4820449352264404,
"kl": 0.9688376598060131,
"learning_rate": 4.4166666666666665e-05,
"loss": 0.0388,
"num_tokens": 99661.0,
"reward": 0.6457340717315674,
"reward_std": 0.43696078658103943,
"rewards/action_legal/mean": 0.5,
"rewards/action_legal/std": 0.0,
"rewards/belief_accuracy/mean": 0.1046341061592102,
"rewards/belief_accuracy/std": 0.3321184515953064,
"rewards/env_reward/mean": 0.09109999984502792,
"rewards/env_reward/std": 0.4848000109195709,
"rewards/format_valid/mean": 1.0,
"rewards/format_valid/std": 0.0,
"step": 42
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completion_length": 10.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 10.0,
"completions/max_terminated_length": 10.0,
"completions/mean_length": 9.75,
"completions/mean_terminated_length": 9.75,
"completions/min_length": 9.0,
"completions/min_terminated_length": 9.0,
"epoch": 0.05375,
"frac_reward_zero_std": 0.0,
"grad_norm": 8.73534870147705,
"kl": 0.5111057488247752,
"learning_rate": 4.388888888888889e-05,
"loss": 0.0204,
"num_tokens": 102100.0,
"reward": 1.3648006916046143,
"reward_std": 0.39195773005485535,
"rewards/action_legal/mean": 0.5,
"rewards/action_legal/std": 0.0,
"rewards/belief_accuracy/mean": 0.3190256357192993,
"rewards/belief_accuracy/std": 0.10691672563552856,
"rewards/env_reward/mean": 0.5957750082015991,
"rewards/env_reward/std": 0.364950031042099,
"rewards/format_valid/mean": 1.0,
"rewards/format_valid/std": 0.0,
"step": 43
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completion_length": 12.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 12.0,
"completions/max_terminated_length": 12.0,
"completions/mean_length": 11.25,
"completions/mean_terminated_length": 11.25,
"completions/min_length": 11.0,
"completions/min_terminated_length": 11.0,
"epoch": 0.055,
"frac_reward_zero_std": 0.0,
"grad_norm": 9.246126174926758,
"kl": 0.5820450782775879,
"learning_rate": 4.3611111111111116e-05,
"loss": 0.0233,
"num_tokens": 104545.0,
"reward": 0.44118252396583557,
"reward_std": 0.2176763415336609,
"rewards/action_legal/mean": 0.5,
"rewards/action_legal/std": 0.0,
"rewards/belief_accuracy/mean": -0.029667485505342484,
"rewards/belief_accuracy/std": 0.035023655742406845,
"rewards/env_reward/mean": 0.02084999904036522,
"rewards/env_reward/std": 0.25270000100135803,
"rewards/format_valid/mean": 1.0,
"rewards/format_valid/std": 0.0,
"step": 44
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completion_length": 10.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 10.0,
"completions/max_terminated_length": 10.0,
"completions/mean_length": 9.25,
"completions/mean_terminated_length": 9.25,
"completions/min_length": 9.0,
"completions/min_terminated_length": 9.0,
"epoch": 0.05625,
"frac_reward_zero_std": 0.0,
"grad_norm": 9.163407325744629,
"kl": 0.6694209240376949,
"learning_rate": 4.3333333333333334e-05,
"loss": 0.0268,
"num_tokens": 106982.0,
"reward": 0.571253776550293,
"reward_std": 0.33365941047668457,
"rewards/action_legal/mean": 0.5,
"rewards/action_legal/std": 0.0,
"rewards/belief_accuracy/mean": 0.2724537253379822,
"rewards/belief_accuracy/std": 0.03546027094125748,
"rewards/env_reward/mean": -0.15120001137256622,
"rewards/env_reward/std": 0.33800002932548523,
"rewards/format_valid/mean": 1.0,
"rewards/format_valid/std": 0.0,
"step": 45
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completion_length": 19.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 19.0,
"completions/max_terminated_length": 19.0,
"completions/mean_length": 12.75,
"completions/mean_terminated_length": 12.75,
"completions/min_length": 8.0,
"completions/min_terminated_length": 8.0,
"epoch": 0.0575,
"frac_reward_zero_std": 0.0,
"grad_norm": 11.127840042114258,
"kl": 1.121412256732583,
"learning_rate": 4.305555555555556e-05,
"loss": 0.0449,
"num_tokens": 109433.0,
"reward": 1.0277390480041504,
"reward_std": 0.8693292737007141,
"rewards/action_legal/mean": 0.5,
"rewards/action_legal/std": 0.0,
"rewards/belief_accuracy/mean": 0.3392890393733978,
"rewards/belief_accuracy/std": 0.12074092775583267,
"rewards/env_reward/mean": 0.23845000565052032,
"rewards/env_reward/std": 0.7578877210617065,
"rewards/format_valid/mean": 1.0,
"rewards/format_valid/std": 0.0,
"step": 46
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completion_length": 11.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 11.0,
"completions/max_terminated_length": 11.0,
"completions/mean_length": 10.5,
"completions/mean_terminated_length": 10.5,
"completions/min_length": 10.0,
"completions/min_terminated_length": 10.0,
"epoch": 0.05875,
"frac_reward_zero_std": 0.0,
"grad_norm": 11.611600875854492,
"kl": 0.673852413892746,
"learning_rate": 4.277777777777778e-05,
"loss": 0.027,
"num_tokens": 111875.0,
"reward": 1.4283732175827026,
"reward_std": 0.8459944128990173,
"rewards/action_legal/mean": 0.5,
"rewards/action_legal/std": 0.0,
"rewards/belief_accuracy/mean": 0.17277316749095917,
"rewards/belief_accuracy/std": 0.20537562668323517,
"rewards/env_reward/mean": 0.8055999875068665,
"rewards/env_reward/std": 0.8989343643188477,
"rewards/format_valid/mean": 1.0,
"rewards/format_valid/std": 0.0,
"step": 47
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completion_length": 20.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 20.0,
"completions/max_terminated_length": 20.0,
"completions/mean_length": 18.0,
"completions/mean_terminated_length": 18.0,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 0.06,
"frac_reward_zero_std": 0.0,
"grad_norm": 5.983867168426514,
"kl": 0.37127236742526293,
"learning_rate": 4.25e-05,
"loss": 0.0149,
"num_tokens": 114347.0,
"reward": 0.4236607849597931,
"reward_std": 0.28222158551216125,
"rewards/action_legal/mean": 0.5,
"rewards/action_legal/std": 0.0,
"rewards/belief_accuracy/mean": 0.1646607667207718,
"rewards/belief_accuracy/std": 0.1431625336408615,
"rewards/env_reward/mean": -0.19099999964237213,
"rewards/env_reward/std": 0.4099873900413513,
"rewards/format_valid/mean": 1.0,
"rewards/format_valid/std": 0.0,
"step": 48
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completion_length": 10.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 10.0,
"completions/max_terminated_length": 10.0,
"completions/mean_length": 9.5,
"completions/mean_terminated_length": 9.5,
"completions/min_length": 9.0,
"completions/min_terminated_length": 9.0,
"epoch": 0.06125,
"frac_reward_zero_std": 0.0,
"grad_norm": 12.0399808883667,
"kl": 2.329849496483803,
"learning_rate": 4.222222222222222e-05,
"loss": 0.0932,
"num_tokens": 116785.0,
"reward": 0.4499415159225464,
"reward_std": 0.15268102288246155,
"rewards/action_legal/mean": 0.5,
"rewards/action_legal/std": 0.0,
"rewards/belief_accuracy/mean": 0.09809152781963348,
"rewards/belief_accuracy/std": 0.055555559694767,
"rewards/env_reward/mean": -0.09814999997615814,
"rewards/env_reward/std": 0.13192453980445862,
"rewards/format_valid/mean": 1.0,
"rewards/format_valid/std": 0.0,
"step": 49
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completion_length": 9.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 9.0,
"completions/max_terminated_length": 9.0,
"completions/mean_length": 9.0,
"completions/mean_terminated_length": 9.0,
"completions/min_length": 9.0,
"completions/min_terminated_length": 9.0,
"epoch": 0.0625,
"frac_reward_zero_std": 0.0,
"grad_norm": 5.024685382843018,
"kl": 0.13551567122340202,
"learning_rate": 4.194444444444445e-05,
"loss": 0.0054,
"num_tokens": 119221.0,
"reward": 0.3680373430252075,
"reward_std": 0.0555555485188961,
"rewards/action_legal/mean": 0.5,
"rewards/action_legal/std": 0.0,
"rewards/belief_accuracy/mean": 0.14893732964992523,
"rewards/belief_accuracy/std": 0.0555555559694767,
"rewards/env_reward/mean": -0.23090000450611115,
"rewards/env_reward/std": 0.0,
"rewards/format_valid/mean": 1.0,
"rewards/format_valid/std": 0.0,
"step": 50
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completion_length": 10.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 10.0,
"completions/max_terminated_length": 10.0,
"completions/mean_length": 9.5,
"completions/mean_terminated_length": 9.5,
"completions/min_length": 9.0,
"completions/min_terminated_length": 9.0,
"epoch": 0.06375,
"frac_reward_zero_std": 0.0,
"grad_norm": 7.126793384552002,
"kl": 0.6491426480934024,
"learning_rate": 4.166666666666667e-05,
"loss": 0.026,
"num_tokens": 121659.0,
"reward": 0.792822003364563,
"reward_std": 0.08787091821432114,
"rewards/action_legal/mean": 0.5,
"rewards/action_legal/std": 0.0,
"rewards/belief_accuracy/mean": 0.3297470211982727,
"rewards/belief_accuracy/std": 0.0708894208073616,
"rewards/env_reward/mean": 0.013075001537799835,
"rewards/env_reward/std": 0.06034718081355095,
"rewards/format_valid/mean": 1.0,
"rewards/format_valid/std": 0.0,
"step": 51
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completion_length": 10.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 10.0,
"completions/max_terminated_length": 10.0,
"completions/mean_length": 9.5,
"completions/mean_terminated_length": 9.5,
"completions/min_length": 8.0,
"completions/min_terminated_length": 8.0,
"epoch": 0.065,
"frac_reward_zero_std": 0.0,
"grad_norm": 12.94462776184082,
"kl": 0.5436751991510391,
"learning_rate": 4.138888888888889e-05,
"loss": 0.0217,
"num_tokens": 124097.0,
"reward": -0.08772659301757812,
"reward_std": 2.6784932613372803,
"rewards/action_legal/mean": 0.125,
"rewards/action_legal/std": 0.75,
"rewards/belief_accuracy/mean": 0.11957336962223053,
"rewards/belief_accuracy/std": 0.2547529339790344,
"rewards/env_reward/mean": -0.3197999894618988,
"rewards/env_reward/std": 1.7868000268936157,
"rewards/format_valid/mean": 0.25,
"rewards/format_valid/std": 1.5,
"step": 52
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completion_length": 10.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 10.0,
"completions/max_terminated_length": 10.0,
"completions/mean_length": 9.25,
"completions/mean_terminated_length": 9.25,
"completions/min_length": 9.0,
"completions/min_terminated_length": 9.0,
"epoch": 0.06625,
"frac_reward_zero_std": 0.0,
"grad_norm": 12.370966911315918,
"kl": 1.4008168280124664,
"learning_rate": 4.111111111111111e-05,
"loss": 0.056,
"num_tokens": 126534.0,
"reward": 0.9119184017181396,
"reward_std": 0.18588221073150635,
"rewards/action_legal/mean": 0.5,
"rewards/action_legal/std": 0.0,
"rewards/belief_accuracy/mean": 0.19994337856769562,
"rewards/belief_accuracy/std": 0.052378278225660324,
"rewards/env_reward/mean": 0.2619749903678894,
"rewards/env_reward/std": 0.17835001647472382,
"rewards/format_valid/mean": 1.0,
"rewards/format_valid/std": 0.0,
"step": 53
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completion_length": 9.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 9.0,
"completions/max_terminated_length": 9.0,
"completions/mean_length": 9.0,
"completions/mean_terminated_length": 9.0,
"completions/min_length": 9.0,
"completions/min_terminated_length": 9.0,
"epoch": 0.0675,
"frac_reward_zero_std": 0.0,
"grad_norm": 4.3133320808410645,
"kl": 0.1781236482784152,
"learning_rate": 4.0833333333333334e-05,
"loss": 0.0071,
"num_tokens": 128642.0,
"reward": 1.097651481628418,
"reward_std": 0.16205953061580658,
"rewards/action_legal/mean": 0.5,
"rewards/action_legal/std": 0.0,
"rewards/belief_accuracy/mean": 0.3312014043331146,
"rewards/belief_accuracy/std": 0.038512155413627625,
"rewards/env_reward/mean": 0.31644999980926514,
"rewards/env_reward/std": 0.17150001227855682,
"rewards/format_valid/mean": 1.0,
"rewards/format_valid/std": 0.0,
"step": 54
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completion_length": 20.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 20.0,
"completions/max_terminated_length": 20.0,
"completions/mean_length": 16.75,
"completions/mean_terminated_length": 16.75,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 0.06875,
"frac_reward_zero_std": 0.0,
"grad_norm": 34.0543098449707,
"kl": 0.8935712603852153,
"learning_rate": 4.055555555555556e-05,
"loss": 0.0357,
"num_tokens": 131109.0,
"reward": -0.541115403175354,
"reward_std": 2.423954486846924,
"rewards/action_legal/mean": 0.125,
"rewards/action_legal/std": 0.75,
"rewards/belief_accuracy/mean": 0.2600095570087433,
"rewards/belief_accuracy/std": 0.3070025146007538,
"rewards/env_reward/mean": -0.9136250019073486,
"rewards/env_reward/std": 1.4743117094039917,
"rewards/format_valid/mean": 0.25,
"rewards/format_valid/std": 1.5,
"step": 55
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completion_length": 10.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 10.0,
"completions/max_terminated_length": 10.0,
"completions/mean_length": 9.75,
"completions/mean_terminated_length": 9.75,
"completions/min_length": 9.0,
"completions/min_terminated_length": 9.0,
"epoch": 0.07,
"frac_reward_zero_std": 0.0,
"grad_norm": 8.462577819824219,
"kl": 0.3837334793061018,
"learning_rate": 4.027777777777778e-05,
"loss": 0.0153,
"num_tokens": 133548.0,
"reward": 0.8409051895141602,
"reward_std": 0.5089951157569885,
"rewards/action_legal/mean": 0.5,
"rewards/action_legal/std": 0.0,
"rewards/belief_accuracy/mean": 0.3351301848888397,
"rewards/belief_accuracy/std": 0.03546025604009628,
"rewards/env_reward/mean": 0.055775001645088196,
"rewards/env_reward/std": 0.4801793694496155,
"rewards/format_valid/mean": 1.0,
"rewards/format_valid/std": 0.0,
"step": 56
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completion_length": 10.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 10.0,
"completions/max_terminated_length": 10.0,
"completions/mean_length": 9.75,
"completions/mean_terminated_length": 9.75,
"completions/min_length": 9.0,
"completions/min_terminated_length": 9.0,
"epoch": 0.07125,
"frac_reward_zero_std": 0.0,
"grad_norm": 6.602283477783203,
"kl": 0.4018232896924019,
"learning_rate": 4e-05,
"loss": 0.0161,
"num_tokens": 135987.0,
"reward": 1.0232038497924805,
"reward_std": 0.17945493757724762,
"rewards/action_legal/mean": 0.5,
"rewards/action_legal/std": 0.0,
"rewards/belief_accuracy/mean": 0.3198787569999695,
"rewards/belief_accuracy/std": 0.06048122048377991,
"rewards/env_reward/mean": 0.2533249855041504,
"rewards/env_reward/std": 0.19902949035167694,
"rewards/format_valid/mean": 1.0,
"rewards/format_valid/std": 0.0,
"step": 57
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completion_length": 16.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 16.0,
"completions/max_terminated_length": 16.0,
"completions/mean_length": 14.0,
"completions/mean_terminated_length": 14.0,
"completions/min_length": 13.0,
"completions/min_terminated_length": 13.0,
"epoch": 0.0725,
"frac_reward_zero_std": 0.0,
"grad_norm": 11.349449157714844,
"kl": 1.514005783945322,
"learning_rate": 3.972222222222222e-05,
"loss": 0.0606,
"num_tokens": 138443.0,
"reward": 0.7130297422409058,
"reward_std": 0.011807739734649658,
"rewards/action_legal/mean": 0.5,
"rewards/action_legal/std": 0.0,
"rewards/belief_accuracy/mean": 0.4381297826766968,
"rewards/belief_accuracy/std": 0.01180770993232727,
"rewards/env_reward/mean": -0.17509999871253967,
"rewards/env_reward/std": 0.0,
"rewards/format_valid/mean": 1.0,
"rewards/format_valid/std": 0.0,
"step": 58
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completion_length": 9.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 9.0,
"completions/max_terminated_length": 9.0,
"completions/mean_length": 9.0,
"completions/mean_terminated_length": 9.0,
"completions/min_length": 9.0,
"completions/min_terminated_length": 9.0,
"epoch": 0.07375,
"frac_reward_zero_std": 0.0,
"grad_norm": 3.675447702407837,
"kl": 0.2982510030269623,
"learning_rate": 3.944444444444445e-05,
"loss": 0.0119,
"num_tokens": 140371.0,
"reward": 0.7432596683502197,
"reward_std": 0.08212430030107498,
"rewards/action_legal/mean": 0.5,
"rewards/action_legal/std": 0.0,
"rewards/belief_accuracy/mean": 0.2348596453666687,
"rewards/belief_accuracy/std": 0.08212428539991379,
"rewards/env_reward/mean": 0.058400001376867294,
"rewards/env_reward/std": 0.0,
"rewards/format_valid/mean": 1.0,
"rewards/format_valid/std": 0.0,
"step": 59
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completion_length": 11.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 11.0,
"completions/max_terminated_length": 11.0,
"completions/mean_length": 10.5,
"completions/mean_terminated_length": 10.5,
"completions/min_length": 10.0,
"completions/min_terminated_length": 10.0,
"epoch": 0.075,
"frac_reward_zero_std": 0.0,
"grad_norm": 9.70500659942627,
"kl": 0.7789931371808052,
"learning_rate": 3.9166666666666665e-05,
"loss": 0.0312,
"num_tokens": 142813.0,
"reward": 0.7869753837585449,
"reward_std": 0.1802431344985962,
"rewards/action_legal/mean": 0.5,
"rewards/action_legal/std": 0.0,
"rewards/belief_accuracy/mean": 0.3499753177165985,
"rewards/belief_accuracy/std": 0.02043679915368557,
"rewards/env_reward/mean": -0.012999998405575752,
"rewards/env_reward/std": 0.165887251496315,
"rewards/format_valid/mean": 1.0,
"rewards/format_valid/std": 0.0,
"step": 60
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completion_length": 19.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 19.0,
"completions/max_terminated_length": 19.0,
"completions/mean_length": 18.0,
"completions/mean_terminated_length": 18.0,
"completions/min_length": 15.0,
"completions/min_terminated_length": 15.0,
"epoch": 0.07625,
"frac_reward_zero_std": 0.0,
"grad_norm": 9.680730819702148,
"kl": 0.49687472730875015,
"learning_rate": 3.888888888888889e-05,
"loss": 0.0199,
"num_tokens": 145285.0,
"reward": 1.379298210144043,
"reward_std": 0.29271385073661804,
"rewards/action_legal/mean": 0.5,
"rewards/action_legal/std": 0.0,
"rewards/belief_accuracy/mean": 0.48169824481010437,
"rewards/belief_accuracy/std": 0.02147861011326313,
"rewards/env_reward/mean": 0.44760000705718994,
"rewards/env_reward/std": 0.31296268105506897,
"rewards/format_valid/mean": 1.0,
"rewards/format_valid/std": 0.0,
"step": 61
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completion_length": 10.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 10.0,
"completions/max_terminated_length": 10.0,
"completions/mean_length": 9.5,
"completions/mean_terminated_length": 9.5,
"completions/min_length": 9.0,
"completions/min_terminated_length": 9.0,
"epoch": 0.0775,
"frac_reward_zero_std": 0.0,
"grad_norm": 9.184836387634277,
"kl": 0.6918958639726043,
"learning_rate": 3.8611111111111116e-05,
"loss": 0.0277,
"num_tokens": 147583.0,
"reward": 0.9556702375411987,
"reward_std": 0.3963484764099121,
"rewards/action_legal/mean": 0.5,
"rewards/action_legal/std": 0.0,
"rewards/belief_accuracy/mean": 0.3408702611923218,
"rewards/belief_accuracy/std": 0.10047954320907593,
"rewards/env_reward/mean": 0.1648000031709671,
"rewards/env_reward/std": 0.29756635427474976,
"rewards/format_valid/mean": 1.0,
"rewards/format_valid/std": 0.0,
"step": 62
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completion_length": 16.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 16.0,
"completions/max_terminated_length": 16.0,
"completions/mean_length": 13.5,
"completions/mean_terminated_length": 13.5,
"completions/min_length": 9.0,
"completions/min_terminated_length": 9.0,
"epoch": 0.07875,
"frac_reward_zero_std": 0.0,
"grad_norm": 9.352582931518555,
"kl": 0.6942682042717934,
"learning_rate": 3.8333333333333334e-05,
"loss": 0.0278,
"num_tokens": 150037.0,
"reward": -0.10456359386444092,
"reward_std": 2.6881396770477295,
"rewards/action_legal/mean": 0.125,
"rewards/action_legal/std": 0.75,
"rewards/belief_accuracy/mean": 0.22078636288642883,
"rewards/belief_accuracy/std": 0.28245842456817627,
"rewards/env_reward/mean": -0.4378499984741211,
"rewards/env_reward/std": 1.753225564956665,
"rewards/format_valid/mean": 0.25,
"rewards/format_valid/std": 1.5,
"step": 63
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completion_length": 10.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 10.0,
"completions/max_terminated_length": 10.0,
"completions/mean_length": 10.0,
"completions/mean_terminated_length": 10.0,
"completions/min_length": 10.0,
"completions/min_terminated_length": 10.0,
"epoch": 0.08,
"frac_reward_zero_std": 0.0,
"grad_norm": 5.730816841125488,
"kl": 0.1406126618385315,
"learning_rate": 3.805555555555555e-05,
"loss": 0.0056,
"num_tokens": 152477.0,
"reward": 0.8492586016654968,
"reward_std": 0.025813313201069832,
"rewards/action_legal/mean": 0.5,
"rewards/action_legal/std": 0.0,
"rewards/belief_accuracy/mean": 0.3603585958480835,
"rewards/belief_accuracy/std": 0.02581331506371498,
"rewards/env_reward/mean": 0.03889999911189079,
"rewards/env_reward/std": 0.0,
"rewards/format_valid/mean": 1.0,
"rewards/format_valid/std": 0.0,
"step": 64
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completion_length": 10.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 10.0,
"completions/max_terminated_length": 10.0,
"completions/mean_length": 9.75,
"completions/mean_terminated_length": 9.75,
"completions/min_length": 9.0,
"completions/min_terminated_length": 9.0,
"epoch": 0.08125,
"frac_reward_zero_std": 0.0,
"grad_norm": 12.032550811767578,
"kl": 0.7948775328695774,
"learning_rate": 3.777777777777778e-05,
"loss": 0.0318,
"num_tokens": 154916.0,
"reward": -0.33585548400878906,
"reward_std": 2.5095832347869873,
"rewards/action_legal/mean": 0.125,
"rewards/action_legal/std": 0.75,
"rewards/belief_accuracy/mean": 0.1766444593667984,
"rewards/belief_accuracy/std": 0.251135915517807,
"rewards/env_reward/mean": -0.625,
"rewards/env_reward/std": 1.5835769176483154,
"rewards/format_valid/mean": 0.25,
"rewards/format_valid/std": 1.5,
"step": 65
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completion_length": 10.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 10.0,
"completions/max_terminated_length": 10.0,
"completions/mean_length": 9.5,
"completions/mean_terminated_length": 9.5,
"completions/min_length": 9.0,
"completions/min_terminated_length": 9.0,
"epoch": 0.0825,
"frac_reward_zero_std": 0.0,
"grad_norm": 7.913240432739258,
"kl": 0.2136247158050537,
"learning_rate": 3.7500000000000003e-05,
"loss": 0.0085,
"num_tokens": 157354.0,
"reward": 1.6287837028503418,
"reward_std": 0.42615801095962524,
"rewards/action_legal/mean": 0.5,
"rewards/action_legal/std": 0.0,
"rewards/belief_accuracy/mean": 0.38048359751701355,
"rewards/belief_accuracy/std": 0.046978533267974854,
"rewards/env_reward/mean": 0.79830002784729,
"rewards/env_reward/std": 0.4454835057258606,
"rewards/format_valid/mean": 1.0,
"rewards/format_valid/std": 0.0,
"step": 66
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completion_length": 19.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 19.0,
"completions/max_terminated_length": 19.0,
"completions/mean_length": 19.0,
"completions/mean_terminated_length": 19.0,
"completions/min_length": 19.0,
"completions/min_terminated_length": 19.0,
"epoch": 0.08375,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.553076148033142,
"kl": 0.10467652417719364,
"learning_rate": 3.722222222222222e-05,
"loss": 0.0042,
"num_tokens": 159830.0,
"reward": 1.322935700416565,
"reward_std": 0.2903204560279846,
"rewards/action_legal/mean": 0.5,
"rewards/action_legal/std": 0.0,
"rewards/belief_accuracy/mean": 0.4757356643676758,
"rewards/belief_accuracy/std": 0.043959349393844604,
"rewards/env_reward/mean": 0.3972000181674957,
"rewards/env_reward/std": 0.3020000159740448,
"rewards/format_valid/mean": 1.0,
"rewards/format_valid/std": 0.0,
"step": 67
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completion_length": 10.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 10.0,
"completions/max_terminated_length": 10.0,
"completions/mean_length": 9.75,
"completions/mean_terminated_length": 9.75,
"completions/min_length": 9.0,
"completions/min_terminated_length": 9.0,
"epoch": 0.085,
"frac_reward_zero_std": 0.0,
"grad_norm": 6.090060710906982,
"kl": 1.6307630129158497,
"learning_rate": 3.694444444444445e-05,
"loss": 0.0652,
"num_tokens": 162269.0,
"reward": 0.7854204177856445,
"reward_std": 0.4746541976928711,
"rewards/action_legal/mean": 0.5,
"rewards/action_legal/std": 0.0,
"rewards/belief_accuracy/mean": 0.31664538383483887,
"rewards/belief_accuracy/std": 0.0875483974814415,
"rewards/env_reward/mean": 0.018775001168251038,
"rewards/env_reward/std": 0.4861546754837036,
"rewards/format_valid/mean": 1.0,
"rewards/format_valid/std": 0.0,
"step": 68
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completion_length": 10.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 10.0,
"completions/max_terminated_length": 10.0,
"completions/mean_length": 10.0,
"completions/mean_terminated_length": 10.0,
"completions/min_length": 10.0,
"completions/min_terminated_length": 10.0,
"epoch": 0.08625,
"frac_reward_zero_std": 0.0,
"grad_norm": 3.485288381576538,
"kl": 0.23944057151675224,
"learning_rate": 3.6666666666666666e-05,
"loss": 0.0096,
"num_tokens": 164709.0,
"reward": 1.390899419784546,
"reward_std": 0.020128881558775902,
"rewards/action_legal/mean": 0.5,
"rewards/action_legal/std": 0.0,
"rewards/belief_accuracy/mean": 0.36959928274154663,
"rewards/belief_accuracy/std": 0.02012895792722702,
"rewards/env_reward/mean": 0.5713000297546387,
"rewards/env_reward/std": 0.0,
"rewards/format_valid/mean": 1.0,
"rewards/format_valid/std": 0.0,
"step": 69
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completion_length": 10.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 10.0,
"completions/max_terminated_length": 10.0,
"completions/mean_length": 9.75,
"completions/mean_terminated_length": 9.75,
"completions/min_length": 9.0,
"completions/min_terminated_length": 9.0,
"epoch": 0.0875,
"frac_reward_zero_std": 0.0,
"grad_norm": 16.18677520751953,
"kl": 1.8300514370203018,
"learning_rate": 3.638888888888889e-05,
"loss": 0.0732,
"num_tokens": 167148.0,
"reward": 0.942622184753418,
"reward_std": 0.25368720293045044,
"rewards/action_legal/mean": 0.5,
"rewards/action_legal/std": 0.0,
"rewards/belief_accuracy/mean": 0.3244222104549408,
"rewards/belief_accuracy/std": 0.043074190616607666,
"rewards/env_reward/mean": 0.16820000112056732,
"rewards/env_reward/std": 0.28401991724967957,
"rewards/format_valid/mean": 1.0,
"rewards/format_valid/std": 0.0,
"step": 70
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completion_length": 10.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 10.0,
"completions/max_terminated_length": 10.0,
"completions/mean_length": 9.5,
"completions/mean_terminated_length": 9.5,
"completions/min_length": 9.0,
"completions/min_terminated_length": 9.0,
"epoch": 0.08875,
"frac_reward_zero_std": 0.0,
"grad_norm": 7.618915557861328,
"kl": 1.169862363487482,
"learning_rate": 3.611111111111111e-05,
"loss": 0.0468,
"num_tokens": 169586.0,
"reward": 0.8535665273666382,
"reward_std": 0.3577015995979309,
"rewards/action_legal/mean": 0.5,
"rewards/action_legal/std": 0.0,
"rewards/belief_accuracy/mean": 0.3871665596961975,
"rewards/belief_accuracy/std": 0.05502847209572792,
"rewards/env_reward/mean": 0.016399994492530823,
"rewards/env_reward/std": 0.3711565434932709,
"rewards/format_valid/mean": 1.0,
"rewards/format_valid/std": 0.0,
"step": 71
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completion_length": 10.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 10.0,
"completions/max_terminated_length": 10.0,
"completions/mean_length": 9.5,
"completions/mean_terminated_length": 9.5,
"completions/min_length": 9.0,
"completions/min_terminated_length": 9.0,
"epoch": 0.09,
"frac_reward_zero_std": 0.0,
"grad_norm": 17.216642379760742,
"kl": 0.6808963306248188,
"learning_rate": 3.5833333333333335e-05,
"loss": 0.0272,
"num_tokens": 171696.0,
"reward": 1.028957724571228,
"reward_std": 0.41150763630867004,
"rewards/action_legal/mean": 0.5,
"rewards/action_legal/std": 0.0,
"rewards/belief_accuracy/mean": 0.20865774154663086,
"rewards/belief_accuracy/std": 0.03286440670490265,
"rewards/env_reward/mean": 0.3703000247478485,
"rewards/env_reward/std": 0.4340519607067108,
"rewards/format_valid/mean": 1.0,
"rewards/format_valid/std": 0.0,
"step": 72
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completion_length": 21.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 21.0,
"completions/max_terminated_length": 21.0,
"completions/mean_length": 20.0,
"completions/mean_terminated_length": 20.0,
"completions/min_length": 19.0,
"completions/min_terminated_length": 19.0,
"epoch": 0.09125,
"frac_reward_zero_std": 0.0,
"grad_norm": 14.145647048950195,
"kl": 0.20024005696177483,
"learning_rate": 3.555555555555556e-05,
"loss": 0.008,
"num_tokens": 174176.0,
"reward": -0.4722920060157776,
"reward_std": 1.0149987936019897,
"rewards/action_legal/mean": 0.5,
"rewards/action_legal/std": 0.0,
"rewards/belief_accuracy/mean": 0.31225794553756714,
"rewards/belief_accuracy/std": 0.21262100338935852,
"rewards/env_reward/mean": -1.2345499992370605,
"rewards/env_reward/std": 1.2276198863983154,
"rewards/format_valid/mean": 1.0,
"rewards/format_valid/std": 0.0,
"step": 73
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completion_length": 10.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 10.0,
"completions/max_terminated_length": 10.0,
"completions/mean_length": 10.0,
"completions/mean_terminated_length": 10.0,
"completions/min_length": 10.0,
"completions/min_terminated_length": 10.0,
"epoch": 0.0925,
"frac_reward_zero_std": 0.0,
"grad_norm": 8.049619674682617,
"kl": 0.2335779219865799,
"learning_rate": 3.527777777777778e-05,
"loss": 0.0093,
"num_tokens": 176616.0,
"reward": 1.0211774110794067,
"reward_std": 0.14699670672416687,
"rewards/action_legal/mean": 0.5,
"rewards/action_legal/std": 0.0,
"rewards/belief_accuracy/mean": 0.2448773980140686,
"rewards/belief_accuracy/std": 0.14699672162532806,
"rewards/env_reward/mean": 0.3262999951839447,
"rewards/env_reward/std": 0.0,
"rewards/format_valid/mean": 1.0,
"rewards/format_valid/std": 0.0,
"step": 74
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completion_length": 20.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 20.0,
"completions/max_terminated_length": 20.0,
"completions/mean_length": 19.25,
"completions/mean_terminated_length": 19.25,
"completions/min_length": 19.0,
"completions/min_terminated_length": 19.0,
"epoch": 0.09375,
"frac_reward_zero_std": 0.0,
"grad_norm": 3.8269565105438232,
"kl": 0.15778795257210732,
"learning_rate": 3.5e-05,
"loss": 0.0063,
"num_tokens": 179093.0,
"reward": 0.2638227641582489,
"reward_std": 0.020399997010827065,
"rewards/action_legal/mean": 0.5,
"rewards/action_legal/std": 0.0,
"rewards/belief_accuracy/mean": 0.1281227469444275,
"rewards/belief_accuracy/std": 0.0,
"rewards/env_reward/mean": -0.314300000667572,
"rewards/env_reward/std": 0.020400002598762512,
"rewards/format_valid/mean": 1.0,
"rewards/format_valid/std": 0.0,
"step": 75
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completion_length": 20.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 20.0,
"completions/max_terminated_length": 20.0,
"completions/mean_length": 18.75,
"completions/mean_terminated_length": 18.75,
"completions/min_length": 17.0,
"completions/min_terminated_length": 17.0,
"epoch": 0.095,
"frac_reward_zero_std": 0.0,
"grad_norm": 7.611948490142822,
"kl": 0.389703668653965,
"learning_rate": 3.472222222222222e-05,
"loss": 0.0156,
"num_tokens": 181568.0,
"reward": 1.0119727849960327,
"reward_std": 0.6514222621917725,
"rewards/action_legal/mean": 0.5,
"rewards/action_legal/std": 0.0,
"rewards/belief_accuracy/mean": 0.165622740983963,
"rewards/belief_accuracy/std": 0.07500001043081284,
"rewards/env_reward/mean": 0.3963499963283539,
"rewards/env_reward/std": 0.6052363514900208,
"rewards/format_valid/mean": 1.0,
"rewards/format_valid/std": 0.0,
"step": 76
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completion_length": 10.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 10.0,
"completions/max_terminated_length": 10.0,
"completions/mean_length": 10.0,
"completions/mean_terminated_length": 10.0,
"completions/min_length": 10.0,
"completions/min_terminated_length": 10.0,
"epoch": 0.09625,
"frac_reward_zero_std": 1.0,
"grad_norm": 0.13189589977264404,
"kl": 0.16265837848186493,
"learning_rate": 3.444444444444445e-05,
"loss": 0.0065,
"num_tokens": 184008.0,
"reward": 1.5773634910583496,
"reward_std": 0.0,
"rewards/action_legal/mean": 0.5,
"rewards/action_legal/std": 0.0,
"rewards/belief_accuracy/mean": 0.4184635281562805,
"rewards/belief_accuracy/std": 0.0,
"rewards/env_reward/mean": 0.708899974822998,
"rewards/env_reward/std": 0.0,
"rewards/format_valid/mean": 1.0,
"rewards/format_valid/std": 0.0,
"step": 77
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completion_length": 10.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 10.0,
"completions/max_terminated_length": 10.0,
"completions/mean_length": 10.0,
"completions/mean_terminated_length": 10.0,
"completions/min_length": 10.0,
"completions/min_terminated_length": 10.0,
"epoch": 0.0975,
"frac_reward_zero_std": 0.0,
"grad_norm": 6.107391834259033,
"kl": 0.2844863813370466,
"learning_rate": 3.4166666666666666e-05,
"loss": 0.0114,
"num_tokens": 186448.0,
"reward": 1.0842738151550293,
"reward_std": 0.12935107946395874,
"rewards/action_legal/mean": 0.5,
"rewards/action_legal/std": 0.0,
"rewards/belief_accuracy/mean": 0.24584877490997314,
"rewards/belief_accuracy/std": 0.06684443354606628,
"rewards/env_reward/mean": 0.38842499256134033,
"rewards/env_reward/std": 0.06755000352859497,
"rewards/format_valid/mean": 1.0,
"rewards/format_valid/std": 0.0,
"step": 78
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completion_length": 10.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 10.0,
"completions/max_terminated_length": 10.0,
"completions/mean_length": 9.75,
"completions/mean_terminated_length": 9.75,
"completions/min_length": 9.0,
"completions/min_terminated_length": 9.0,
"epoch": 0.09875,
"frac_reward_zero_std": 0.0,
"grad_norm": 8.824180603027344,
"kl": 0.6403497904539108,
"learning_rate": 3.388888888888889e-05,
"loss": 0.0256,
"num_tokens": 188887.0,
"reward": 1.0925590991973877,
"reward_std": 0.483914315700531,
"rewards/action_legal/mean": 0.5,
"rewards/action_legal/std": 0.0,
"rewards/belief_accuracy/mean": 0.19068412482738495,
"rewards/belief_accuracy/std": 0.03546026349067688,
"rewards/env_reward/mean": 0.4518749713897705,
"rewards/env_reward/std": 0.4644499719142914,
"rewards/format_valid/mean": 1.0,
"rewards/format_valid/std": 0.0,
"step": 79
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completion_length": 10.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 10.0,
"completions/max_terminated_length": 10.0,
"completions/mean_length": 10.0,
"completions/mean_terminated_length": 10.0,
"completions/min_length": 10.0,
"completions/min_terminated_length": 10.0,
"epoch": 0.1,
"frac_reward_zero_std": 0.0,
"grad_norm": 3.3538818359375,
"kl": 0.2827349714934826,
"learning_rate": 3.3611111111111116e-05,
"loss": 0.0113,
"num_tokens": 191327.0,
"reward": 0.9939346313476562,
"reward_std": 0.06064464524388313,
"rewards/action_legal/mean": 0.5,
"rewards/action_legal/std": 0.0,
"rewards/belief_accuracy/mean": 0.3592345714569092,
"rewards/belief_accuracy/std": 0.060644667595624924,
"rewards/env_reward/mean": 0.18469999730587006,
"rewards/env_reward/std": 0.0,
"rewards/format_valid/mean": 1.0,
"rewards/format_valid/std": 0.0,
"step": 80
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completion_length": 19.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 19.0,
"completions/max_terminated_length": 19.0,
"completions/mean_length": 19.0,
"completions/mean_terminated_length": 19.0,
"completions/min_length": 19.0,
"completions/min_terminated_length": 19.0,
"epoch": 0.10125,
"frac_reward_zero_std": 0.0,
"grad_norm": 23.832109451293945,
"kl": 2.242331273853779,
"learning_rate": 3.3333333333333335e-05,
"loss": 0.0897,
"num_tokens": 193803.0,
"reward": 0.8711313009262085,
"reward_std": 0.09817580133676529,
"rewards/action_legal/mean": 0.5,
"rewards/action_legal/std": 0.0,
"rewards/belief_accuracy/mean": 0.25543123483657837,
"rewards/belief_accuracy/std": 0.09817582368850708,
"rewards/env_reward/mean": 0.1657000035047531,
"rewards/env_reward/std": 0.0,
"rewards/format_valid/mean": 1.0,
"rewards/format_valid/std": 0.0,
"step": 81
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completion_length": 10.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 10.0,
"completions/max_terminated_length": 10.0,
"completions/mean_length": 10.0,
"completions/mean_terminated_length": 10.0,
"completions/min_length": 10.0,
"completions/min_terminated_length": 10.0,
"epoch": 0.1025,
"frac_reward_zero_std": 0.0,
"grad_norm": 3.796640634536743,
"kl": 0.35036180168390274,
"learning_rate": 3.3055555555555553e-05,
"loss": 0.014,
"num_tokens": 196243.0,
"reward": 0.9582550525665283,
"reward_std": 0.03546029329299927,
"rewards/action_legal/mean": 0.5,
"rewards/action_legal/std": 0.0,
"rewards/belief_accuracy/mean": 0.3403550088405609,
"rewards/belief_accuracy/std": 0.03546027094125748,
"rewards/env_reward/mean": 0.1678999960422516,
"rewards/env_reward/std": 0.0,
"rewards/format_valid/mean": 1.0,
"rewards/format_valid/std": 0.0,
"step": 82
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completion_length": 19.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 19.0,
"completions/max_terminated_length": 19.0,
"completions/mean_length": 14.75,
"completions/mean_terminated_length": 14.75,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 0.10375,
"frac_reward_zero_std": 0.0,
"grad_norm": 10.267955780029297,
"kl": 0.7201052233576775,
"learning_rate": 3.277777777777778e-05,
"loss": 0.0288,
"num_tokens": 198702.0,
"reward": 1.0548349618911743,
"reward_std": 0.16117516160011292,
"rewards/action_legal/mean": 0.5,
"rewards/action_legal/std": 0.0,
"rewards/belief_accuracy/mean": 0.2837349474430084,
"rewards/belief_accuracy/std": 0.16117514669895172,
"rewards/env_reward/mean": 0.32109999656677246,
"rewards/env_reward/std": 0.0,
"rewards/format_valid/mean": 1.0,
"rewards/format_valid/std": 0.0,
"step": 83
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completion_length": 19.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 19.0,
"completions/max_terminated_length": 19.0,
"completions/mean_length": 19.0,
"completions/mean_terminated_length": 19.0,
"completions/min_length": 19.0,
"completions/min_terminated_length": 19.0,
"epoch": 0.105,
"frac_reward_zero_std": 0.0,
"grad_norm": 4.409769535064697,
"kl": 0.16481813788414001,
"learning_rate": 3.2500000000000004e-05,
"loss": 0.0066,
"num_tokens": 201178.0,
"reward": 0.8341628909111023,
"reward_std": 0.02475076913833618,
"rewards/action_legal/mean": 0.5,
"rewards/action_legal/std": 0.0,
"rewards/belief_accuracy/mean": 0.2675629258155823,
"rewards/belief_accuracy/std": 0.024750780314207077,
"rewards/env_reward/mean": 0.11659999936819077,
"rewards/env_reward/std": 0.0,
"rewards/format_valid/mean": 1.0,
"rewards/format_valid/std": 0.0,
"step": 84
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completion_length": 10.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 10.0,
"completions/max_terminated_length": 10.0,
"completions/mean_length": 10.0,
"completions/mean_terminated_length": 10.0,
"completions/min_length": 10.0,
"completions/min_terminated_length": 10.0,
"epoch": 0.10625,
"frac_reward_zero_std": 0.0,
"grad_norm": 2.341743230819702,
"kl": 0.24941369146108627,
"learning_rate": 3.222222222222223e-05,
"loss": 0.01,
"num_tokens": 203618.0,
"reward": 0.9240851402282715,
"reward_std": 0.02967216819524765,
"rewards/action_legal/mean": 0.5,
"rewards/action_legal/std": 0.0,
"rewards/belief_accuracy/mean": 0.43518519401550293,
"rewards/belief_accuracy/std": 0.029672183096408844,
"rewards/env_reward/mean": 0.03889999911189079,
"rewards/env_reward/std": 0.0,
"rewards/format_valid/mean": 1.0,
"rewards/format_valid/std": 0.0,
"step": 85
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completion_length": 10.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 10.0,
"completions/max_terminated_length": 10.0,
"completions/mean_length": 10.0,
"completions/mean_terminated_length": 10.0,
"completions/min_length": 10.0,
"completions/min_terminated_length": 10.0,
"epoch": 0.1075,
"frac_reward_zero_std": 0.0,
"grad_norm": 4.166858196258545,
"kl": 0.33342748135328293,
"learning_rate": 3.194444444444444e-05,
"loss": 0.0133,
"num_tokens": 206058.0,
"reward": 1.0291411876678467,
"reward_std": 0.037037014961242676,
"rewards/action_legal/mean": 0.5,
"rewards/action_legal/std": 0.0,
"rewards/belief_accuracy/mean": 0.3840411305427551,
"rewards/belief_accuracy/std": 0.037037044763565063,
"rewards/env_reward/mean": 0.19509999454021454,
"rewards/env_reward/std": 0.0,
"rewards/format_valid/mean": 1.0,
"rewards/format_valid/std": 0.0,
"step": 86
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completion_length": 19.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 19.0,
"completions/max_terminated_length": 19.0,
"completions/mean_length": 16.75,
"completions/mean_terminated_length": 16.75,
"completions/min_length": 10.0,
"completions/min_terminated_length": 10.0,
"epoch": 0.10875,
"frac_reward_zero_std": 0.0,
"grad_norm": 10.752022743225098,
"kl": 0.17901013372465968,
"learning_rate": 3.1666666666666666e-05,
"loss": 0.0072,
"num_tokens": 208525.0,
"reward": 1.822479248046875,
"reward_std": 0.16067416965961456,
"rewards/action_legal/mean": 0.5,
"rewards/action_legal/std": 0.0,
"rewards/belief_accuracy/mean": 0.26387926936149597,
"rewards/belief_accuracy/std": 0.16067418456077576,
"rewards/env_reward/mean": 1.1086000204086304,
"rewards/env_reward/std": 0.0,
"rewards/format_valid/mean": 1.0,
"rewards/format_valid/std": 0.0,
"step": 87
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completion_length": 10.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 10.0,
"completions/max_terminated_length": 10.0,
"completions/mean_length": 10.0,
"completions/mean_terminated_length": 10.0,
"completions/min_length": 10.0,
"completions/min_terminated_length": 10.0,
"epoch": 0.11,
"frac_reward_zero_std": 0.0,
"grad_norm": 4.2524333000183105,
"kl": 0.32108578458428383,
"learning_rate": 3.138888888888889e-05,
"loss": 0.0128,
"num_tokens": 210965.0,
"reward": 1.3185707330703735,
"reward_std": 0.04781458154320717,
"rewards/action_legal/mean": 0.5,
"rewards/action_legal/std": 0.0,
"rewards/belief_accuracy/mean": 0.23227064311504364,
"rewards/belief_accuracy/std": 0.04781460389494896,
"rewards/env_reward/mean": 0.6363000273704529,
"rewards/env_reward/std": 0.0,
"rewards/format_valid/mean": 1.0,
"rewards/format_valid/std": 0.0,
"step": 88
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completion_length": 10.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 10.0,
"completions/max_terminated_length": 10.0,
"completions/mean_length": 10.0,
"completions/mean_terminated_length": 10.0,
"completions/min_length": 10.0,
"completions/min_terminated_length": 10.0,
"epoch": 0.11125,
"frac_reward_zero_std": 0.0,
"grad_norm": 3.387347936630249,
"kl": 0.5034720227122307,
"learning_rate": 3.111111111111111e-05,
"loss": 0.0201,
"num_tokens": 213029.0,
"reward": 0.9789755344390869,
"reward_std": 0.08584359288215637,
"rewards/action_legal/mean": 0.5,
"rewards/action_legal/std": 0.0,
"rewards/belief_accuracy/mean": 0.23877549171447754,
"rewards/belief_accuracy/std": 0.08584360778331757,
"rewards/env_reward/mean": 0.29019999504089355,
"rewards/env_reward/std": 0.0,
"rewards/format_valid/mean": 1.0,
"rewards/format_valid/std": 0.0,
"step": 89
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completion_length": 10.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 10.0,
"completions/max_terminated_length": 10.0,
"completions/mean_length": 10.0,
"completions/mean_terminated_length": 10.0,
"completions/min_length": 10.0,
"completions/min_terminated_length": 10.0,
"epoch": 0.1125,
"frac_reward_zero_std": 1.0,
"grad_norm": 0.03340158984065056,
"kl": 0.26204439997673035,
"learning_rate": 3.0833333333333335e-05,
"loss": 0.0105,
"num_tokens": 215469.0,
"reward": 1.36446213722229,
"reward_std": 0.0,
"rewards/action_legal/mean": 0.5,
"rewards/action_legal/std": 0.0,
"rewards/belief_accuracy/mean": 0.3786620497703552,
"rewards/belief_accuracy/std": 0.0,
"rewards/env_reward/mean": 0.5357999801635742,
"rewards/env_reward/std": 0.0,
"rewards/format_valid/mean": 1.0,
"rewards/format_valid/std": 0.0,
"step": 90
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completion_length": 10.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 10.0,
"completions/max_terminated_length": 10.0,
"completions/mean_length": 10.0,
"completions/mean_terminated_length": 10.0,
"completions/min_length": 10.0,
"completions/min_terminated_length": 10.0,
"epoch": 0.11375,
"frac_reward_zero_std": 0.0,
"grad_norm": 2.6622090339660645,
"kl": 0.24875067919492722,
"learning_rate": 3.055555555555556e-05,
"loss": 0.0099,
"num_tokens": 217909.0,
"reward": 1.3570003509521484,
"reward_std": 0.037037014961242676,
"rewards/action_legal/mean": 0.5,
"rewards/action_legal/std": 0.0,
"rewards/belief_accuracy/mean": 0.22560039162635803,
"rewards/belief_accuracy/std": 0.03703703731298447,
"rewards/env_reward/mean": 0.6814000010490417,
"rewards/env_reward/std": 0.0,
"rewards/format_valid/mean": 1.0,
"rewards/format_valid/std": 0.0,
"step": 91
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completion_length": 10.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 10.0,
"completions/max_terminated_length": 10.0,
"completions/mean_length": 9.75,
"completions/mean_terminated_length": 9.75,
"completions/min_length": 9.0,
"completions/min_terminated_length": 9.0,
"epoch": 0.115,
"frac_reward_zero_std": 0.0,
"grad_norm": 4.632673263549805,
"kl": 1.328475397080183,
"learning_rate": 3.0277777777777776e-05,
"loss": 0.0531,
"num_tokens": 220348.0,
"reward": 0.8873113393783569,
"reward_std": 0.0031414825934916735,
"rewards/action_legal/mean": 0.5,
"rewards/action_legal/std": 0.0,
"rewards/belief_accuracy/mean": 0.4066362977027893,
"rewards/belief_accuracy/std": 0.03470852971076965,
"rewards/env_reward/mean": 0.030675001442432404,
"rewards/env_reward/std": 0.03785000368952751,
"rewards/format_valid/mean": 1.0,
"rewards/format_valid/std": 0.0,
"step": 92
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completion_length": 10.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 10.0,
"completions/max_terminated_length": 10.0,
"completions/mean_length": 10.0,
"completions/mean_terminated_length": 10.0,
"completions/min_length": 10.0,
"completions/min_terminated_length": 10.0,
"epoch": 0.11625,
"frac_reward_zero_std": 0.0,
"grad_norm": 4.490170001983643,
"kl": 0.32097524777054787,
"learning_rate": 3e-05,
"loss": 0.0128,
"num_tokens": 222788.0,
"reward": 1.0041425228118896,
"reward_std": 0.02265910431742668,
"rewards/action_legal/mean": 0.5,
"rewards/action_legal/std": 0.0,
"rewards/belief_accuracy/mean": 0.4476425051689148,
"rewards/belief_accuracy/std": 0.022659141570329666,
"rewards/env_reward/mean": 0.10649999976158142,
"rewards/env_reward/std": 0.0,
"rewards/format_valid/mean": 1.0,
"rewards/format_valid/std": 0.0,
"step": 93
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completion_length": 10.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 10.0,
"completions/max_terminated_length": 10.0,
"completions/mean_length": 10.0,
"completions/mean_terminated_length": 10.0,
"completions/min_length": 10.0,
"completions/min_terminated_length": 10.0,
"epoch": 0.1175,
"frac_reward_zero_std": 0.0,
"grad_norm": 4.904633045196533,
"kl": 0.28252891823649406,
"learning_rate": 2.9722222222222223e-05,
"loss": 0.0113,
"num_tokens": 224648.0,
"reward": 1.3599534034729004,
"reward_std": 0.5514536499977112,
"rewards/action_legal/mean": 0.5,
"rewards/action_legal/std": 0.0,
"rewards/belief_accuracy/mean": 0.442878395318985,
"rewards/belief_accuracy/std": 0.018518522381782532,
"rewards/env_reward/mean": 0.467074990272522,
"rewards/env_reward/std": 0.5573499798774719,
"rewards/format_valid/mean": 1.0,
"rewards/format_valid/std": 0.0,
"step": 94
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completion_length": 20.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 20.0,
"completions/max_terminated_length": 20.0,
"completions/mean_length": 19.75,
"completions/mean_terminated_length": 19.75,
"completions/min_length": 19.0,
"completions/min_terminated_length": 19.0,
"epoch": 0.11875,
"frac_reward_zero_std": 0.0,
"grad_norm": 3.4134881496429443,
"kl": 0.26468512788414955,
"learning_rate": 2.9444444444444448e-05,
"loss": 0.0106,
"num_tokens": 227127.0,
"reward": 0.9643936157226562,
"reward_std": 0.25506067276000977,
"rewards/action_legal/mean": 0.5,
"rewards/action_legal/std": 0.0,
"rewards/belief_accuracy/mean": 0.34791865944862366,
"rewards/belief_accuracy/std": 0.017089322209358215,
"rewards/env_reward/mean": 0.16647499799728394,
"rewards/env_reward/std": 0.2721500098705292,
"rewards/format_valid/mean": 1.0,
"rewards/format_valid/std": 0.0,
"step": 95
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completion_length": 10.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 10.0,
"completions/max_terminated_length": 10.0,
"completions/mean_length": 10.0,
"completions/mean_terminated_length": 10.0,
"completions/min_length": 10.0,
"completions/min_terminated_length": 10.0,
"epoch": 0.12,
"frac_reward_zero_std": 1.0,
"grad_norm": 0.03512604907155037,
"kl": 0.3145686089992523,
"learning_rate": 2.916666666666667e-05,
"loss": 0.0126,
"num_tokens": 229567.0,
"reward": 1.2496230602264404,
"reward_std": 0.0,
"rewards/action_legal/mean": 0.5,
"rewards/action_legal/std": 0.0,
"rewards/belief_accuracy/mean": 0.42342302203178406,
"rewards/belief_accuracy/std": 0.0,
"rewards/env_reward/mean": 0.37619999051094055,
"rewards/env_reward/std": 0.0,
"rewards/format_valid/mean": 1.0,
"rewards/format_valid/std": 0.0,
"step": 96
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completion_length": 10.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 10.0,
"completions/max_terminated_length": 10.0,
"completions/mean_length": 10.0,
"completions/mean_terminated_length": 10.0,
"completions/min_length": 10.0,
"completions/min_terminated_length": 10.0,
"epoch": 0.12125,
"frac_reward_zero_std": 0.0,
"grad_norm": 3.007344961166382,
"kl": 0.2418195605278015,
"learning_rate": 2.8888888888888888e-05,
"loss": 0.0097,
"num_tokens": 232007.0,
"reward": 0.9147475957870483,
"reward_std": 0.013101109303534031,
"rewards/action_legal/mean": 0.5,
"rewards/action_legal/std": 0.0,
"rewards/belief_accuracy/mean": 0.4202476143836975,
"rewards/belief_accuracy/std": 0.013101109303534031,
"rewards/env_reward/mean": 0.04450000077486038,
"rewards/env_reward/std": 0.0,
"rewards/format_valid/mean": 1.0,
"rewards/format_valid/std": 0.0,
"step": 97
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completion_length": 10.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 10.0,
"completions/max_terminated_length": 10.0,
"completions/mean_length": 10.0,
"completions/mean_terminated_length": 10.0,
"completions/min_length": 10.0,
"completions/min_terminated_length": 10.0,
"epoch": 0.1225,
"frac_reward_zero_std": 1.0,
"grad_norm": 0.023203933611512184,
"kl": 0.27170511707663536,
"learning_rate": 2.861111111111111e-05,
"loss": 0.0109,
"num_tokens": 234447.0,
"reward": 1.4455524682998657,
"reward_std": 0.0,
"rewards/action_legal/mean": 0.5,
"rewards/action_legal/std": 0.0,
"rewards/belief_accuracy/mean": 0.3073524236679077,
"rewards/belief_accuracy/std": 0.0,
"rewards/env_reward/mean": 0.6881999969482422,
"rewards/env_reward/std": 0.0,
"rewards/format_valid/mean": 1.0,
"rewards/format_valid/std": 0.0,
"step": 98
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completion_length": 19.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 19.0,
"completions/max_terminated_length": 19.0,
"completions/mean_length": 19.0,
"completions/mean_terminated_length": 19.0,
"completions/min_length": 19.0,
"completions/min_terminated_length": 19.0,
"epoch": 0.12375,
"frac_reward_zero_std": 1.0,
"grad_norm": 0.017642363905906677,
"kl": 0.09895136207342148,
"learning_rate": 2.8333333333333335e-05,
"loss": 0.004,
"num_tokens": 236923.0,
"reward": 1.683054804801941,
"reward_std": 0.0,
"rewards/action_legal/mean": 0.5,
"rewards/action_legal/std": 0.0,
"rewards/belief_accuracy/mean": 0.433454692363739,
"rewards/belief_accuracy/std": 0.0,
"rewards/env_reward/mean": 0.7996000051498413,
"rewards/env_reward/std": 0.0,
"rewards/format_valid/mean": 1.0,
"rewards/format_valid/std": 0.0,
"step": 99
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completion_length": 10.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 10.0,
"completions/max_terminated_length": 10.0,
"completions/mean_length": 10.0,
"completions/mean_terminated_length": 10.0,
"completions/min_length": 10.0,
"completions/min_terminated_length": 10.0,
"epoch": 0.125,
"frac_reward_zero_std": 1.0,
"grad_norm": 0.03634629398584366,
"kl": 0.417863130569458,
"learning_rate": 2.8055555555555557e-05,
"loss": 0.0167,
"num_tokens": 239363.0,
"reward": 0.9032934904098511,
"reward_std": 0.0,
"rewards/action_legal/mean": 0.5,
"rewards/action_legal/std": 0.0,
"rewards/belief_accuracy/mean": 0.43159350752830505,
"rewards/belief_accuracy/std": 0.0,
"rewards/env_reward/mean": 0.021700000390410423,
"rewards/env_reward/std": 0.0,
"rewards/format_valid/mean": 1.0,
"rewards/format_valid/std": 0.0,
"step": 100
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completion_length": 19.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 19.0,
"completions/max_terminated_length": 19.0,
"completions/mean_length": 16.5,
"completions/mean_terminated_length": 16.5,
"completions/min_length": 10.0,
"completions/min_terminated_length": 10.0,
"epoch": 0.12625,
"frac_reward_zero_std": 0.0,
"grad_norm": 168.017333984375,
"kl": 1.8653298392891884,
"learning_rate": 2.777777777777778e-05,
"loss": 0.0746,
"num_tokens": 241829.0,
"reward": 1.0519425868988037,
"reward_std": 0.14819319546222687,
"rewards/action_legal/mean": 0.5,
"rewards/action_legal/std": 0.0,
"rewards/belief_accuracy/mean": 0.2862425148487091,
"rewards/belief_accuracy/std": 0.14819316565990448,
"rewards/env_reward/mean": 0.3156999945640564,
"rewards/env_reward/std": 0.0,
"rewards/format_valid/mean": 1.0,
"rewards/format_valid/std": 0.0,
"step": 101
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completion_length": 10.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 10.0,
"completions/max_terminated_length": 10.0,
"completions/mean_length": 10.0,
"completions/mean_terminated_length": 10.0,
"completions/min_length": 10.0,
"completions/min_terminated_length": 10.0,
"epoch": 0.1275,
"frac_reward_zero_std": 0.0,
"grad_norm": 3.334230661392212,
"kl": 0.5032302476465702,
"learning_rate": 2.7500000000000004e-05,
"loss": 0.0201,
"num_tokens": 244269.0,
"reward": 0.8271099328994751,
"reward_std": 0.03421219810843468,
"rewards/action_legal/mean": 0.5,
"rewards/action_legal/std": 0.0,
"rewards/belief_accuracy/mean": 0.2945099174976349,
"rewards/belief_accuracy/std": 0.034212201833724976,
"rewards/env_reward/mean": 0.08259999752044678,
"rewards/env_reward/std": 0.0,
"rewards/format_valid/mean": 1.0,
"rewards/format_valid/std": 0.0,
"step": 102
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completion_length": 10.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 10.0,
"completions/max_terminated_length": 10.0,
"completions/mean_length": 10.0,
"completions/mean_terminated_length": 10.0,
"completions/min_length": 10.0,
"completions/min_terminated_length": 10.0,
"epoch": 0.12875,
"frac_reward_zero_std": 0.0,
"grad_norm": 4.814326286315918,
"kl": 0.3283202312886715,
"learning_rate": 2.7222222222222223e-05,
"loss": 0.0131,
"num_tokens": 246709.0,
"reward": 1.1225488185882568,
"reward_std": 0.01851852796971798,
"rewards/action_legal/mean": 0.5,
"rewards/action_legal/std": 0.0,
"rewards/belief_accuracy/mean": 0.35364869236946106,
"rewards/belief_accuracy/std": 0.018518522381782532,
"rewards/env_reward/mean": 0.3188999891281128,
"rewards/env_reward/std": 0.0,
"rewards/format_valid/mean": 1.0,
"rewards/format_valid/std": 0.0,
"step": 103
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completion_length": 19.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 19.0,
"completions/max_terminated_length": 19.0,
"completions/mean_length": 14.5,
"completions/mean_terminated_length": 14.5,
"completions/min_length": 10.0,
"completions/min_terminated_length": 10.0,
"epoch": 0.13,
"frac_reward_zero_std": 0.0,
"grad_norm": 7.209584712982178,
"kl": 0.2050139382481575,
"learning_rate": 2.6944444444444445e-05,
"loss": 0.0082,
"num_tokens": 249167.0,
"reward": 1.2894783020019531,
"reward_std": 0.008995347656309605,
"rewards/action_legal/mean": 0.5,
"rewards/action_legal/std": 0.0,
"rewards/belief_accuracy/mean": 0.46567827463150024,
"rewards/belief_accuracy/std": 0.008995355106890202,
"rewards/env_reward/mean": 0.37380000948905945,
"rewards/env_reward/std": 0.0,
"rewards/format_valid/mean": 1.0,
"rewards/format_valid/std": 0.0,
"step": 104
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completion_length": 10.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 10.0,
"completions/max_terminated_length": 10.0,
"completions/mean_length": 10.0,
"completions/mean_terminated_length": 10.0,
"completions/min_length": 10.0,
"completions/min_terminated_length": 10.0,
"epoch": 0.13125,
"frac_reward_zero_std": 1.0,
"grad_norm": 0.03365113213658333,
"kl": 0.7158948183059692,
"learning_rate": 2.6666666666666667e-05,
"loss": 0.0286,
"num_tokens": 251607.0,
"reward": 1.0311956405639648,
"reward_std": 0.0,
"rewards/action_legal/mean": 0.5,
"rewards/action_legal/std": 0.0,
"rewards/belief_accuracy/mean": 0.16369564831256866,
"rewards/belief_accuracy/std": 0.0,
"rewards/env_reward/mean": 0.41749998927116394,
"rewards/env_reward/std": 0.0,
"rewards/format_valid/mean": 1.0,
"rewards/format_valid/std": 0.0,
"step": 105
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completion_length": 10.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 10.0,
"completions/max_terminated_length": 10.0,
"completions/mean_length": 10.0,
"completions/mean_terminated_length": 10.0,
"completions/min_length": 10.0,
"completions/min_terminated_length": 10.0,
"epoch": 0.1325,
"frac_reward_zero_std": 1.0,
"grad_norm": 0.0734815001487732,
"kl": 0.490933895111084,
"learning_rate": 2.6388888888888892e-05,
"loss": 0.0196,
"num_tokens": 254047.0,
"reward": 0.8992230296134949,
"reward_std": 0.0,
"rewards/action_legal/mean": 0.5,
"rewards/action_legal/std": 0.0,
"rewards/belief_accuracy/mean": 0.42342302203178406,
"rewards/belief_accuracy/std": 0.0,
"rewards/env_reward/mean": 0.025800000876188278,
"rewards/env_reward/std": 0.0,
"rewards/format_valid/mean": 1.0,
"rewards/format_valid/std": 0.0,
"step": 106
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completion_length": 11.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 11.0,
"completions/max_terminated_length": 11.0,
"completions/mean_length": 10.25,
"completions/mean_terminated_length": 10.25,
"completions/min_length": 10.0,
"completions/min_terminated_length": 10.0,
"epoch": 0.13375,
"frac_reward_zero_std": 0.0,
"grad_norm": 13.114912033081055,
"kl": 0.8195095211267471,
"learning_rate": 2.6111111111111114e-05,
"loss": 0.0328,
"num_tokens": 256488.0,
"reward": 1.0691735744476318,
"reward_std": 0.018518486991524696,
"rewards/action_legal/mean": 0.5,
"rewards/action_legal/std": 0.0,
"rewards/belief_accuracy/mean": 0.35887354612350464,
"rewards/belief_accuracy/std": 0.018518522381782532,
"rewards/env_reward/mean": 0.26030001044273376,
"rewards/env_reward/std": 0.0,
"rewards/format_valid/mean": 1.0,
"rewards/format_valid/std": 0.0,
"step": 107
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completion_length": 19.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 19.0,
"completions/max_terminated_length": 19.0,
"completions/mean_length": 15.5,
"completions/mean_terminated_length": 15.5,
"completions/min_length": 11.0,
"completions/min_terminated_length": 11.0,
"epoch": 0.135,
"frac_reward_zero_std": 0.0,
"grad_norm": 9.382083892822266,
"kl": 0.31994709372520447,
"learning_rate": 2.5833333333333336e-05,
"loss": 0.0128,
"num_tokens": 258950.0,
"reward": -0.6886433959007263,
"reward_std": 2.2765731811523438,
"rewards/action_legal/mean": 0.125,
"rewards/action_legal/std": 0.75,
"rewards/belief_accuracy/mean": 0.14408157765865326,
"rewards/belief_accuracy/std": 0.24749916791915894,
"rewards/env_reward/mean": -0.9452250003814697,
"rewards/env_reward/std": 1.3756073713302612,
"rewards/format_valid/mean": 0.25,
"rewards/format_valid/std": 1.5,
"step": 108
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completion_length": 20.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 20.0,
"completions/max_terminated_length": 20.0,
"completions/mean_length": 17.0,
"completions/mean_terminated_length": 17.0,
"completions/min_length": 10.0,
"completions/min_terminated_length": 10.0,
"epoch": 0.13625,
"frac_reward_zero_std": 0.0,
"grad_norm": 49.812625885009766,
"kl": 0.27156141633167863,
"learning_rate": 2.5555555555555554e-05,
"loss": 0.0109,
"num_tokens": 261418.0,
"reward": 1.521310567855835,
"reward_std": 0.3812306821346283,
"rewards/action_legal/mean": 0.5,
"rewards/action_legal/std": 0.0,
"rewards/belief_accuracy/mean": 0.4087354242801666,
"rewards/belief_accuracy/std": 0.049438536167144775,
"rewards/env_reward/mean": 0.6625750064849854,
"rewards/env_reward/std": 0.3948500156402588,
"rewards/format_valid/mean": 1.0,
"rewards/format_valid/std": 0.0,
"step": 109
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completion_length": 19.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 19.0,
"completions/max_terminated_length": 19.0,
"completions/mean_length": 19.0,
"completions/mean_terminated_length": 19.0,
"completions/min_length": 19.0,
"completions/min_terminated_length": 19.0,
"epoch": 0.1375,
"frac_reward_zero_std": 0.0,
"grad_norm": 2.5267906188964844,
"kl": 0.17876873165369034,
"learning_rate": 2.527777777777778e-05,
"loss": 0.0072,
"num_tokens": 263894.0,
"reward": 1.0311747789382935,
"reward_std": 0.014142122119665146,
"rewards/action_legal/mean": 0.5,
"rewards/action_legal/std": 0.0,
"rewards/belief_accuracy/mean": 0.449174702167511,
"rewards/belief_accuracy/std": 0.01414213702082634,
"rewards/env_reward/mean": 0.13199999928474426,
"rewards/env_reward/std": 0.0,
"rewards/format_valid/mean": 1.0,
"rewards/format_valid/std": 0.0,
"step": 110
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completion_length": 19.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 19.0,
"completions/max_terminated_length": 19.0,
"completions/mean_length": 19.0,
"completions/mean_terminated_length": 19.0,
"completions/min_length": 19.0,
"completions/min_terminated_length": 19.0,
"epoch": 0.13875,
"frac_reward_zero_std": 1.0,
"grad_norm": 0.0028416195418685675,
"kl": 0.04003394767642021,
"learning_rate": 2.5e-05,
"loss": 0.0016,
"num_tokens": 266370.0,
"reward": 1.4576547145843506,
"reward_std": 0.0,
"rewards/action_legal/mean": 0.5,
"rewards/action_legal/std": 0.0,
"rewards/belief_accuracy/mean": 0.433454692363739,
"rewards/belief_accuracy/std": 0.0,
"rewards/env_reward/mean": 0.5741999745368958,
"rewards/env_reward/std": 0.0,
"rewards/format_valid/mean": 1.0,
"rewards/format_valid/std": 0.0,
"step": 111
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completion_length": 19.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 19.0,
"completions/max_terminated_length": 19.0,
"completions/mean_length": 16.75,
"completions/mean_terminated_length": 16.75,
"completions/min_length": 10.0,
"completions/min_terminated_length": 10.0,
"epoch": 0.14,
"frac_reward_zero_std": 0.0,
"grad_norm": 1877.4913330078125,
"kl": 25.26655474305153,
"learning_rate": 2.4722222222222223e-05,
"loss": 1.0107,
"num_tokens": 268837.0,
"reward": 0.9704756736755371,
"reward_std": 0.2818722426891327,
"rewards/action_legal/mean": 0.5,
"rewards/action_legal/std": 0.0,
"rewards/belief_accuracy/mean": 0.31272566318511963,
"rewards/belief_accuracy/std": 0.15558846294879913,
"rewards/env_reward/mean": 0.20774999260902405,
"rewards/env_reward/std": 0.14230000972747803,
"rewards/format_valid/mean": 1.0,
"rewards/format_valid/std": 0.0,
"step": 112
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completion_length": 12.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 12.0,
"completions/max_terminated_length": 12.0,
"completions/mean_length": 10.5,
"completions/mean_terminated_length": 10.5,
"completions/min_length": 10.0,
"completions/min_terminated_length": 10.0,
"epoch": 0.14125,
"frac_reward_zero_std": 0.0,
"grad_norm": 10.757664680480957,
"kl": 0.7676222324371338,
"learning_rate": 2.4444444444444445e-05,
"loss": 0.0307,
"num_tokens": 271271.0,
"reward": 0.9339514970779419,
"reward_std": 0.15512926876544952,
"rewards/action_legal/mean": 0.5,
"rewards/action_legal/std": 0.0,
"rewards/belief_accuracy/mean": 0.3786514401435852,
"rewards/belief_accuracy/std": 0.15512926876544952,
"rewards/env_reward/mean": 0.10530000180006027,
"rewards/env_reward/std": 0.0,
"rewards/format_valid/mean": 1.0,
"rewards/format_valid/std": 0.0,
"step": 113
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completion_length": 10.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 10.0,
"completions/max_terminated_length": 10.0,
"completions/mean_length": 10.0,
"completions/mean_terminated_length": 10.0,
"completions/min_length": 10.0,
"completions/min_terminated_length": 10.0,
"epoch": 0.1425,
"frac_reward_zero_std": 1.0,
"grad_norm": 0.01638762652873993,
"kl": 0.33665189146995544,
"learning_rate": 2.4166666666666667e-05,
"loss": 0.0135,
"num_tokens": 273711.0,
"reward": 1.2136327028274536,
"reward_std": 0.0,
"rewards/action_legal/mean": 0.5,
"rewards/action_legal/std": 0.0,
"rewards/belief_accuracy/mean": 0.20073269307613373,
"rewards/belief_accuracy/std": 0.0,
"rewards/env_reward/mean": 0.5629000067710876,
"rewards/env_reward/std": 0.0,
"rewards/format_valid/mean": 1.0,
"rewards/format_valid/std": 0.0,
"step": 114
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completion_length": 10.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 10.0,
"completions/max_terminated_length": 10.0,
"completions/mean_length": 10.0,
"completions/mean_terminated_length": 10.0,
"completions/min_length": 10.0,
"completions/min_terminated_length": 10.0,
"epoch": 0.14375,
"frac_reward_zero_std": 0.0,
"grad_norm": 4.445044040679932,
"kl": 0.3578999266028404,
"learning_rate": 2.3888888888888892e-05,
"loss": 0.0143,
"num_tokens": 276151.0,
"reward": 1.4394323825836182,
"reward_std": 0.015065747313201427,
"rewards/action_legal/mean": 0.5,
"rewards/action_legal/std": 0.0,
"rewards/belief_accuracy/mean": 0.4536323547363281,
"rewards/belief_accuracy/std": 0.015065711922943592,
"rewards/env_reward/mean": 0.5357999801635742,
"rewards/env_reward/std": 0.0,
"rewards/format_valid/mean": 1.0,
"rewards/format_valid/std": 0.0,
"step": 115
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completion_length": 10.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 10.0,
"completions/max_terminated_length": 10.0,
"completions/mean_length": 10.0,
"completions/mean_terminated_length": 10.0,
"completions/min_length": 10.0,
"completions/min_terminated_length": 10.0,
"epoch": 0.145,
"frac_reward_zero_std": 1.0,
"grad_norm": 0.026247208938002586,
"kl": 0.5255059599876404,
"learning_rate": 2.361111111111111e-05,
"loss": 0.021,
"num_tokens": 278591.0,
"reward": 1.06673264503479,
"reward_std": 0.0,
"rewards/action_legal/mean": 0.5,
"rewards/action_legal/std": 0.0,
"rewards/belief_accuracy/mean": 0.20073269307613373,
"rewards/belief_accuracy/std": 0.0,
"rewards/env_reward/mean": 0.41600000858306885,
"rewards/env_reward/std": 0.0,
"rewards/format_valid/mean": 1.0,
"rewards/format_valid/std": 0.0,
"step": 116
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completion_length": 18.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 18.0,
"completions/max_terminated_length": 18.0,
"completions/mean_length": 13.5,
"completions/mean_terminated_length": 13.5,
"completions/min_length": 10.0,
"completions/min_terminated_length": 10.0,
"epoch": 0.14625,
"frac_reward_zero_std": 0.0,
"grad_norm": 4.0344319343566895,
"kl": 0.4771764427423477,
"learning_rate": 2.3333333333333336e-05,
"loss": 0.0191,
"num_tokens": 281045.0,
"reward": 1.7174423933029175,
"reward_std": 0.06313853710889816,
"rewards/action_legal/mean": 0.5,
"rewards/action_legal/std": 0.0,
"rewards/belief_accuracy/mean": 0.4052424132823944,
"rewards/belief_accuracy/std": 0.06313848495483398,
"rewards/env_reward/mean": 0.8622000217437744,
"rewards/env_reward/std": 0.0,
"rewards/format_valid/mean": 1.0,
"rewards/format_valid/std": 0.0,
"step": 117
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completion_length": 19.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 19.0,
"completions/max_terminated_length": 19.0,
"completions/mean_length": 16.75,
"completions/mean_terminated_length": 16.75,
"completions/min_length": 10.0,
"completions/min_terminated_length": 10.0,
"epoch": 0.1475,
"frac_reward_zero_std": 0.0,
"grad_norm": 76.42330169677734,
"kl": 0.23224885016679764,
"learning_rate": 2.3055555555555558e-05,
"loss": 0.0093,
"num_tokens": 283512.0,
"reward": 1.405627965927124,
"reward_std": 0.06796585023403168,
"rewards/action_legal/mean": 0.5,
"rewards/action_legal/std": 0.0,
"rewards/belief_accuracy/mean": 0.4252280294895172,
"rewards/belief_accuracy/std": 0.06796582788228989,
"rewards/env_reward/mean": 0.5303999781608582,
"rewards/env_reward/std": 0.0,
"rewards/format_valid/mean": 1.0,
"rewards/format_valid/std": 0.0,
"step": 118
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completion_length": 10.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 10.0,
"completions/max_terminated_length": 10.0,
"completions/mean_length": 10.0,
"completions/mean_terminated_length": 10.0,
"completions/min_length": 10.0,
"completions/min_terminated_length": 10.0,
"epoch": 0.14875,
"frac_reward_zero_std": 1.0,
"grad_norm": 0.012090546078979969,
"kl": 0.26135221123695374,
"learning_rate": 2.277777777777778e-05,
"loss": 0.0105,
"num_tokens": 285952.0,
"reward": 1.2826112508773804,
"reward_std": 0.0,
"rewards/action_legal/mean": 0.5,
"rewards/action_legal/std": 0.0,
"rewards/belief_accuracy/mean": 0.4255111813545227,
"rewards/belief_accuracy/std": 0.0,
"rewards/env_reward/mean": 0.40709999203681946,
"rewards/env_reward/std": 0.0,
"rewards/format_valid/mean": 1.0,
"rewards/format_valid/std": 0.0,
"step": 119
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completion_length": 10.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 10.0,
"completions/max_terminated_length": 10.0,
"completions/mean_length": 10.0,
"completions/mean_terminated_length": 10.0,
"completions/min_length": 10.0,
"completions/min_terminated_length": 10.0,
"epoch": 0.15,
"frac_reward_zero_std": 0.0,
"grad_norm": 5.484664440155029,
"kl": 0.5242489501833916,
"learning_rate": 2.25e-05,
"loss": 0.021,
"num_tokens": 288392.0,
"reward": 0.8988614082336426,
"reward_std": 0.015609226189553738,
"rewards/action_legal/mean": 0.5,
"rewards/action_legal/std": 0.0,
"rewards/belief_accuracy/mean": 0.40826135873794556,
"rewards/belief_accuracy/std": 0.01560924481600523,
"rewards/env_reward/mean": 0.0406000018119812,
"rewards/env_reward/std": 0.0,
"rewards/format_valid/mean": 1.0,
"rewards/format_valid/std": 0.0,
"step": 120
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completion_length": 10.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 10.0,
"completions/max_terminated_length": 10.0,
"completions/mean_length": 10.0,
"completions/mean_terminated_length": 10.0,
"completions/min_length": 10.0,
"completions/min_terminated_length": 10.0,
"epoch": 0.15125,
"frac_reward_zero_std": 0.0,
"grad_norm": 5.3713908195495605,
"kl": 0.4266308322548866,
"learning_rate": 2.2222222222222223e-05,
"loss": 0.0171,
"num_tokens": 290832.0,
"reward": 1.5641981363296509,
"reward_std": 0.035460252314805984,
"rewards/action_legal/mean": 0.5,
"rewards/action_legal/std": 0.0,
"rewards/belief_accuracy/mean": 0.21689815819263458,
"rewards/belief_accuracy/std": 0.03546026349067688,
"rewards/env_reward/mean": 0.8973000049591064,
"rewards/env_reward/std": 0.0,
"rewards/format_valid/mean": 1.0,
"rewards/format_valid/std": 0.0,
"step": 121
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completion_length": 19.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 19.0,
"completions/max_terminated_length": 19.0,
"completions/mean_length": 14.5,
"completions/mean_terminated_length": 14.5,
"completions/min_length": 10.0,
"completions/min_terminated_length": 10.0,
"epoch": 0.1525,
"frac_reward_zero_std": 0.0,
"grad_norm": 75.12261199951172,
"kl": 0.46135450154542923,
"learning_rate": 2.1944444444444445e-05,
"loss": 0.0185,
"num_tokens": 293290.0,
"reward": 0.3108258843421936,
"reward_std": 0.05725647509098053,
"rewards/action_legal/mean": 0.5,
"rewards/action_legal/std": 0.0,
"rewards/belief_accuracy/mean": 0.26912587881088257,
"rewards/belief_accuracy/std": 0.05725647136569023,
"rewards/env_reward/mean": -0.4083000123500824,
"rewards/env_reward/std": 0.0,
"rewards/format_valid/mean": 1.0,
"rewards/format_valid/std": 0.0,
"step": 122
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completion_length": 11.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 11.0,
"completions/max_terminated_length": 11.0,
"completions/mean_length": 10.25,
"completions/mean_terminated_length": 10.25,
"completions/min_length": 10.0,
"completions/min_terminated_length": 10.0,
"epoch": 0.15375,
"frac_reward_zero_std": 0.0,
"grad_norm": 3.971485137939453,
"kl": 0.45348528772592545,
"learning_rate": 2.1666666666666667e-05,
"loss": 0.0181,
"num_tokens": 295731.0,
"reward": 1.198211669921875,
"reward_std": 0.2306346893310547,
"rewards/action_legal/mean": 0.5,
"rewards/action_legal/std": 0.0,
"rewards/belief_accuracy/mean": 0.3166116774082184,
"rewards/belief_accuracy/std": 0.018518507480621338,
"rewards/env_reward/mean": 0.43160000443458557,
"rewards/env_reward/std": 0.22380001842975616,
"rewards/format_valid/mean": 1.0,
"rewards/format_valid/std": 0.0,
"step": 123
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completion_length": 19.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 19.0,
"completions/max_terminated_length": 19.0,
"completions/mean_length": 17.5,
"completions/mean_terminated_length": 17.5,
"completions/min_length": 13.0,
"completions/min_terminated_length": 13.0,
"epoch": 0.155,
"frac_reward_zero_std": 0.0,
"grad_norm": 25.937440872192383,
"kl": 0.490961529314518,
"learning_rate": 2.138888888888889e-05,
"loss": 0.0196,
"num_tokens": 298201.0,
"reward": 1.179431676864624,
"reward_std": 0.13312996923923492,
"rewards/action_legal/mean": 0.5,
"rewards/action_legal/std": 0.0,
"rewards/belief_accuracy/mean": 0.3432316482067108,
"rewards/belief_accuracy/std": 0.13312995433807373,
"rewards/env_reward/mean": 0.3862000107765198,
"rewards/env_reward/std": 0.0,
"rewards/format_valid/mean": 1.0,
"rewards/format_valid/std": 0.0,
"step": 124
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completion_length": 10.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 10.0,
"completions/max_terminated_length": 10.0,
"completions/mean_length": 10.0,
"completions/mean_terminated_length": 10.0,
"completions/min_length": 10.0,
"completions/min_terminated_length": 10.0,
"epoch": 0.15625,
"frac_reward_zero_std": 0.0,
"grad_norm": 4.3738932609558105,
"kl": 0.4266118183732033,
"learning_rate": 2.111111111111111e-05,
"loss": 0.0171,
"num_tokens": 300641.0,
"reward": 1.021052360534668,
"reward_std": 0.3019999861717224,
"rewards/action_legal/mean": 0.5,
"rewards/action_legal/std": 0.0,
"rewards/belief_accuracy/mean": 0.3073524236679077,
"rewards/belief_accuracy/std": 0.0,
"rewards/env_reward/mean": 0.263700008392334,
"rewards/env_reward/std": 0.3020000159740448,
"rewards/format_valid/mean": 1.0,
"rewards/format_valid/std": 0.0,
"step": 125
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completion_length": 10.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 10.0,
"completions/max_terminated_length": 10.0,
"completions/mean_length": 10.0,
"completions/mean_terminated_length": 10.0,
"completions/min_length": 10.0,
"completions/min_terminated_length": 10.0,
"epoch": 0.1575,
"frac_reward_zero_std": 0.0,
"grad_norm": 2.936905860900879,
"kl": 0.4430236220359802,
"learning_rate": 2.0833333333333336e-05,
"loss": 0.0177,
"num_tokens": 303081.0,
"reward": 0.9750161170959473,
"reward_std": 0.0325876884162426,
"rewards/action_legal/mean": 0.5,
"rewards/action_legal/std": 0.0,
"rewards/belief_accuracy/mean": 0.34071606397628784,
"rewards/belief_accuracy/std": 0.0325876921415329,
"rewards/env_reward/mean": 0.1843000054359436,
"rewards/env_reward/std": 0.0,
"rewards/format_valid/mean": 1.0,
"rewards/format_valid/std": 0.0,
"step": 126
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completion_length": 10.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 10.0,
"completions/max_terminated_length": 10.0,
"completions/mean_length": 10.0,
"completions/mean_terminated_length": 10.0,
"completions/min_length": 10.0,
"completions/min_terminated_length": 10.0,
"epoch": 0.15875,
"frac_reward_zero_std": 1.0,
"grad_norm": 0.03275100514292717,
"kl": 0.2940594553947449,
"learning_rate": 2.0555555555555555e-05,
"loss": 0.0118,
"num_tokens": 305521.0,
"reward": 0.8977112174034119,
"reward_std": 0.0,
"rewards/action_legal/mean": 0.5,
"rewards/action_legal/std": 0.0,
"rewards/belief_accuracy/mean": 0.4255111813545227,
"rewards/belief_accuracy/std": 0.0,
"rewards/env_reward/mean": 0.022199999541044235,
"rewards/env_reward/std": 0.0,
"rewards/format_valid/mean": 1.0,
"rewards/format_valid/std": 0.0,
"step": 127
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completion_length": 10.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 10.0,
"completions/max_terminated_length": 10.0,
"completions/mean_length": 10.0,
"completions/mean_terminated_length": 10.0,
"completions/min_length": 10.0,
"completions/min_terminated_length": 10.0,
"epoch": 0.16,
"frac_reward_zero_std": 1.0,
"grad_norm": 0.03054218925535679,
"kl": 0.3323657214641571,
"learning_rate": 2.027777777777778e-05,
"loss": 0.0133,
"num_tokens": 307961.0,
"reward": 0.9899099469184875,
"reward_std": 0.0,
"rewards/action_legal/mean": 0.5,
"rewards/action_legal/std": 0.0,
"rewards/belief_accuracy/mean": 0.3570099174976349,
"rewards/belief_accuracy/std": 0.0,
"rewards/env_reward/mean": 0.18289999663829803,
"rewards/env_reward/std": 0.0,
"rewards/format_valid/mean": 1.0,
"rewards/format_valid/std": 0.0,
"step": 128
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completion_length": 10.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 10.0,
"completions/max_terminated_length": 10.0,
"completions/mean_length": 10.0,
"completions/mean_terminated_length": 10.0,
"completions/min_length": 10.0,
"completions/min_terminated_length": 10.0,
"epoch": 0.16125,
"frac_reward_zero_std": 0.0,
"grad_norm": 2.7979254722595215,
"kl": 0.2442447803914547,
"learning_rate": 2e-05,
"loss": 0.0098,
"num_tokens": 310401.0,
"reward": 1.0102381706237793,
"reward_std": 0.01851852796971798,
"rewards/action_legal/mean": 0.5,
"rewards/action_legal/std": 0.0,
"rewards/belief_accuracy/mean": 0.32913801074028015,
"rewards/belief_accuracy/std": 0.018518522381782532,
"rewards/env_reward/mean": 0.23109999299049377,
"rewards/env_reward/std": 0.0,
"rewards/format_valid/mean": 1.0,
"rewards/format_valid/std": 0.0,
"step": 129
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completion_length": 20.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 20.0,
"completions/max_terminated_length": 20.0,
"completions/mean_length": 17.25,
"completions/mean_terminated_length": 17.25,
"completions/min_length": 10.0,
"completions/min_terminated_length": 10.0,
"epoch": 0.1625,
"frac_reward_zero_std": 0.0,
"grad_norm": 81.0110092163086,
"kl": 0.3928138017654419,
"learning_rate": 1.9722222222222224e-05,
"loss": 0.0157,
"num_tokens": 312870.0,
"reward": 1.5793039798736572,
"reward_std": 0.4587002098560333,
"rewards/action_legal/mean": 0.5,
"rewards/action_legal/std": 0.0,
"rewards/belief_accuracy/mean": 0.42725393176078796,
"rewards/belief_accuracy/std": 0.012401506304740906,
"rewards/env_reward/mean": 0.702049970626831,
"rewards/env_reward/std": 0.4657484292984009,
"rewards/format_valid/mean": 1.0,
"rewards/format_valid/std": 0.0,
"step": 130
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completion_length": 10.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 10.0,
"completions/max_terminated_length": 10.0,
"completions/mean_length": 10.0,
"completions/mean_terminated_length": 10.0,
"completions/min_length": 10.0,
"completions/min_terminated_length": 10.0,
"epoch": 0.16375,
"frac_reward_zero_std": 1.0,
"grad_norm": 0.06868572533130646,
"kl": 0.5432239770889282,
"learning_rate": 1.9444444444444445e-05,
"loss": 0.0217,
"num_tokens": 315310.0,
"reward": 0.906711220741272,
"reward_std": 0.0,
"rewards/action_legal/mean": 0.5,
"rewards/action_legal/std": 0.0,
"rewards/belief_accuracy/mean": 0.4255111813545227,
"rewards/belief_accuracy/std": 0.0,
"rewards/env_reward/mean": 0.031199999153614044,
"rewards/env_reward/std": 0.0,
"rewards/format_valid/mean": 1.0,
"rewards/format_valid/std": 0.0,
"step": 131
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completion_length": 10.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 10.0,
"completions/max_terminated_length": 10.0,
"completions/mean_length": 10.0,
"completions/mean_terminated_length": 10.0,
"completions/min_length": 10.0,
"completions/min_terminated_length": 10.0,
"epoch": 0.165,
"frac_reward_zero_std": 1.0,
"grad_norm": 0.019419007003307343,
"kl": 0.48082277178764343,
"learning_rate": 1.9166666666666667e-05,
"loss": 0.0192,
"num_tokens": 317750.0,
"reward": 1.2387326955795288,
"reward_std": 0.0,
"rewards/action_legal/mean": 0.5,
"rewards/action_legal/std": 0.0,
"rewards/belief_accuracy/mean": 0.20073269307613373,
"rewards/belief_accuracy/std": 0.0,
"rewards/env_reward/mean": 0.5879999995231628,
"rewards/env_reward/std": 0.0,
"rewards/format_valid/mean": 1.0,
"rewards/format_valid/std": 0.0,
"step": 132
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completion_length": 16.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 16.0,
"completions/max_terminated_length": 16.0,
"completions/mean_length": 13.5,
"completions/mean_terminated_length": 13.5,
"completions/min_length": 10.0,
"completions/min_terminated_length": 10.0,
"epoch": 0.16625,
"frac_reward_zero_std": 0.0,
"grad_norm": 88.00366973876953,
"kl": 1.527749940752983,
"learning_rate": 1.888888888888889e-05,
"loss": 0.0611,
"num_tokens": 320204.0,
"reward": 2.232346773147583,
"reward_std": 0.7095935940742493,
"rewards/action_legal/mean": 0.5,
"rewards/action_legal/std": 0.0,
"rewards/belief_accuracy/mean": 0.37157177925109863,
"rewards/belief_accuracy/std": 0.1919439733028412,
"rewards/env_reward/mean": 1.4107749462127686,
"rewards/env_reward/std": 0.745449960231781,
"rewards/format_valid/mean": 1.0,
"rewards/format_valid/std": 0.0,
"step": 133
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completion_length": 10.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 10.0,
"completions/max_terminated_length": 10.0,
"completions/mean_length": 10.0,
"completions/mean_terminated_length": 10.0,
"completions/min_length": 10.0,
"completions/min_terminated_length": 10.0,
"epoch": 0.1675,
"frac_reward_zero_std": 1.0,
"grad_norm": 0.016352592036128044,
"kl": 0.43096375465393066,
"learning_rate": 1.861111111111111e-05,
"loss": 0.0172,
"num_tokens": 322644.0,
"reward": 1.5087559223175049,
"reward_std": 0.0,
"rewards/action_legal/mean": 0.5,
"rewards/action_legal/std": 0.0,
"rewards/belief_accuracy/mean": 0.28115594387054443,
"rewards/belief_accuracy/std": 0.0,
"rewards/env_reward/mean": 0.7775999903678894,
"rewards/env_reward/std": 0.0,
"rewards/format_valid/mean": 1.0,
"rewards/format_valid/std": 0.0,
"step": 134
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completion_length": 10.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 10.0,
"completions/max_terminated_length": 10.0,
"completions/mean_length": 10.0,
"completions/mean_terminated_length": 10.0,
"completions/min_length": 10.0,
"completions/min_terminated_length": 10.0,
"epoch": 0.16875,
"frac_reward_zero_std": 1.0,
"grad_norm": 0.042022716253995895,
"kl": 0.4099663197994232,
"learning_rate": 1.8333333333333333e-05,
"loss": 0.0164,
"num_tokens": 325084.0,
"reward": 1.783276081085205,
"reward_std": 0.0,
"rewards/action_legal/mean": 0.5,
"rewards/action_legal/std": 0.0,
"rewards/belief_accuracy/mean": 0.24467593431472778,
"rewards/belief_accuracy/std": 0.0,
"rewards/env_reward/mean": 1.0886000394821167,
"rewards/env_reward/std": 0.0,
"rewards/format_valid/mean": 1.0,
"rewards/format_valid/std": 0.0,
"step": 135
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completion_length": 19.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 19.0,
"completions/max_terminated_length": 19.0,
"completions/mean_length": 16.75,
"completions/mean_terminated_length": 16.75,
"completions/min_length": 10.0,
"completions/min_terminated_length": 10.0,
"epoch": 0.17,
"frac_reward_zero_std": 0.0,
"grad_norm": 4.926192760467529,
"kl": 0.3908762261271477,
"learning_rate": 1.8055555555555555e-05,
"loss": 0.0156,
"num_tokens": 327551.0,
"reward": 1.8185992240905762,
"reward_std": 0.004498743452131748,
"rewards/action_legal/mean": 0.5,
"rewards/action_legal/std": 0.0,
"rewards/belief_accuracy/mean": 0.47779929637908936,
"rewards/belief_accuracy/std": 0.004498769994825125,
"rewards/env_reward/mean": 0.8907999992370605,
"rewards/env_reward/std": 0.0,
"rewards/format_valid/mean": 1.0,
"rewards/format_valid/std": 0.0,
"step": 136
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completion_length": 19.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 19.0,
"completions/max_terminated_length": 19.0,
"completions/mean_length": 19.0,
"completions/mean_terminated_length": 19.0,
"completions/min_length": 19.0,
"completions/min_terminated_length": 19.0,
"epoch": 0.17125,
"frac_reward_zero_std": 0.0,
"grad_norm": 2.6473772525787354,
"kl": 0.22215565294027328,
"learning_rate": 1.777777777777778e-05,
"loss": 0.0089,
"num_tokens": 330027.0,
"reward": 2.0623245239257812,
"reward_std": 0.014299631118774414,
"rewards/action_legal/mean": 0.5,
"rewards/action_legal/std": 0.0,
"rewards/belief_accuracy/mean": 0.47632455825805664,
"rewards/belief_accuracy/std": 0.014299660921096802,
"rewards/env_reward/mean": 1.1360000371932983,
"rewards/env_reward/std": 0.0,
"rewards/format_valid/mean": 1.0,
"rewards/format_valid/std": 0.0,
"step": 137
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completion_length": 10.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 10.0,
"completions/max_terminated_length": 10.0,
"completions/mean_length": 10.0,
"completions/mean_terminated_length": 10.0,
"completions/min_length": 10.0,
"completions/min_terminated_length": 10.0,
"epoch": 0.1725,
"frac_reward_zero_std": 0.0,
"grad_norm": 2.4564273357391357,
"kl": 0.5230707004666328,
"learning_rate": 1.75e-05,
"loss": 0.0209,
"num_tokens": 332275.0,
"reward": 0.8312469720840454,
"reward_std": 0.04355309158563614,
"rewards/action_legal/mean": 0.5,
"rewards/action_legal/std": 0.0,
"rewards/belief_accuracy/mean": 0.4006469249725342,
"rewards/belief_accuracy/std": 0.04355309531092644,
"rewards/env_reward/mean": -0.01940000057220459,
"rewards/env_reward/std": 0.0,
"rewards/format_valid/mean": 1.0,
"rewards/format_valid/std": 0.0,
"step": 138
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completion_length": 19.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 19.0,
"completions/max_terminated_length": 19.0,
"completions/mean_length": 19.0,
"completions/mean_terminated_length": 19.0,
"completions/min_length": 19.0,
"completions/min_terminated_length": 19.0,
"epoch": 0.17375,
"frac_reward_zero_std": 0.0,
"grad_norm": 3.3987679481506348,
"kl": 0.11173266544938087,
"learning_rate": 1.7222222222222224e-05,
"loss": 0.0045,
"num_tokens": 334751.0,
"reward": 0.670301616191864,
"reward_std": 0.1809031218290329,
"rewards/action_legal/mean": 0.5,
"rewards/action_legal/std": 0.0,
"rewards/belief_accuracy/mean": 0.3405015766620636,
"rewards/belief_accuracy/std": 0.18090307712554932,
"rewards/env_reward/mean": -0.12020000070333481,
"rewards/env_reward/std": 0.0,
"rewards/format_valid/mean": 1.0,
"rewards/format_valid/std": 0.0,
"step": 139
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completion_length": 10.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 10.0,
"completions/max_terminated_length": 10.0,
"completions/mean_length": 10.0,
"completions/mean_terminated_length": 10.0,
"completions/min_length": 10.0,
"completions/min_terminated_length": 10.0,
"epoch": 0.175,
"frac_reward_zero_std": 0.0,
"grad_norm": 2.6056313514709473,
"kl": 0.3194083571434021,
"learning_rate": 1.6944444444444446e-05,
"loss": 0.0128,
"num_tokens": 337191.0,
"reward": 1.473215103149414,
"reward_std": 0.018518567085266113,
"rewards/action_legal/mean": 0.5,
"rewards/action_legal/std": 0.0,
"rewards/belief_accuracy/mean": 0.2904151976108551,
"rewards/belief_accuracy/std": 0.018518522381782532,
"rewards/env_reward/mean": 0.7328000068664551,
"rewards/env_reward/std": 0.0,
"rewards/format_valid/mean": 1.0,
"rewards/format_valid/std": 0.0,
"step": 140
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completion_length": 10.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 10.0,
"completions/max_terminated_length": 10.0,
"completions/mean_length": 10.0,
"completions/mean_terminated_length": 10.0,
"completions/min_length": 10.0,
"completions/min_terminated_length": 10.0,
"epoch": 0.17625,
"frac_reward_zero_std": 1.0,
"grad_norm": 0.011740055866539478,
"kl": 0.3923957645893097,
"learning_rate": 1.6666666666666667e-05,
"loss": 0.0157,
"num_tokens": 339631.0,
"reward": 0.9226230382919312,
"reward_std": 0.0,
"rewards/action_legal/mean": 0.5,
"rewards/action_legal/std": 0.0,
"rewards/belief_accuracy/mean": 0.42342302203178406,
"rewards/belief_accuracy/std": 0.0,
"rewards/env_reward/mean": 0.04919999837875366,
"rewards/env_reward/std": 0.0,
"rewards/format_valid/mean": 1.0,
"rewards/format_valid/std": 0.0,
"step": 141
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completion_length": 10.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 10.0,
"completions/max_terminated_length": 10.0,
"completions/mean_length": 10.0,
"completions/mean_terminated_length": 10.0,
"completions/min_length": 10.0,
"completions/min_terminated_length": 10.0,
"epoch": 0.1775,
"frac_reward_zero_std": 0.0,
"grad_norm": 2.64264178276062,
"kl": 0.41386187076568604,
"learning_rate": 1.638888888888889e-05,
"loss": 0.0166,
"num_tokens": 342071.0,
"reward": 0.9231228828430176,
"reward_std": 0.018518507480621338,
"rewards/action_legal/mean": 0.5,
"rewards/action_legal/std": 0.0,
"rewards/belief_accuracy/mean": 0.380022794008255,
"rewards/belief_accuracy/std": 0.018518507480621338,
"rewards/env_reward/mean": 0.09309999644756317,
"rewards/env_reward/std": 0.0,
"rewards/format_valid/mean": 1.0,
"rewards/format_valid/std": 0.0,
"step": 142
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completion_length": 10.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 10.0,
"completions/max_terminated_length": 10.0,
"completions/mean_length": 10.0,
"completions/mean_terminated_length": 10.0,
"completions/min_length": 10.0,
"completions/min_terminated_length": 10.0,
"epoch": 0.17875,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.626895785331726,
"kl": 0.317696675658226,
"learning_rate": 1.6111111111111115e-05,
"loss": 0.0127,
"num_tokens": 344511.0,
"reward": 1.314621925354004,
"reward_std": 0.01851852796971798,
"rewards/action_legal/mean": 0.5,
"rewards/action_legal/std": 0.0,
"rewards/belief_accuracy/mean": 0.4132220447063446,
"rewards/belief_accuracy/std": 0.018518522381782532,
"rewards/env_reward/mean": 0.4514000117778778,
"rewards/env_reward/std": 0.0,
"rewards/format_valid/mean": 1.0,
"rewards/format_valid/std": 0.0,
"step": 143
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completion_length": 20.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 20.0,
"completions/max_terminated_length": 20.0,
"completions/mean_length": 19.5,
"completions/mean_terminated_length": 19.5,
"completions/min_length": 19.0,
"completions/min_terminated_length": 19.0,
"epoch": 0.18,
"frac_reward_zero_std": 0.0,
"grad_norm": 3.746852159500122,
"kl": 0.10339224711060524,
"learning_rate": 1.5833333333333333e-05,
"loss": 0.0041,
"num_tokens": 346989.0,
"reward": 1.2542515993118286,
"reward_std": 0.20949891209602356,
"rewards/action_legal/mean": 0.5,
"rewards/action_legal/std": 0.0,
"rewards/belief_accuracy/mean": 0.4730015695095062,
"rewards/belief_accuracy/std": 0.04833333194255829,
"rewards/env_reward/mean": 0.33124998211860657,
"rewards/env_reward/std": 0.23365363478660583,
"rewards/format_valid/mean": 1.0,
"rewards/format_valid/std": 0.0,
"step": 144
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completion_length": 10.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 10.0,
"completions/max_terminated_length": 10.0,
"completions/mean_length": 10.0,
"completions/mean_terminated_length": 10.0,
"completions/min_length": 10.0,
"completions/min_terminated_length": 10.0,
"epoch": 0.18125,
"frac_reward_zero_std": 0.0,
"grad_norm": 3.7674238681793213,
"kl": 0.21675339713692665,
"learning_rate": 1.5555555555555555e-05,
"loss": 0.0087,
"num_tokens": 349425.0,
"reward": 0.8702070116996765,
"reward_std": 0.037037014961242676,
"rewards/action_legal/mean": 0.5,
"rewards/action_legal/std": 0.0,
"rewards/belief_accuracy/mean": 0.46380698680877686,
"rewards/belief_accuracy/std": 0.03703702986240387,
"rewards/env_reward/mean": -0.04360000044107437,
"rewards/env_reward/std": 0.0,
"rewards/format_valid/mean": 1.0,
"rewards/format_valid/std": 0.0,
"step": 145
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completion_length": 10.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 10.0,
"completions/max_terminated_length": 10.0,
"completions/mean_length": 10.0,
"completions/mean_terminated_length": 10.0,
"completions/min_length": 10.0,
"completions/min_terminated_length": 10.0,
"epoch": 0.1825,
"frac_reward_zero_std": 1.0,
"grad_norm": 0.0504443384706974,
"kl": 0.5085489749908447,
"learning_rate": 1.527777777777778e-05,
"loss": 0.0203,
"num_tokens": 351357.0,
"reward": 1.031711220741272,
"reward_std": 0.0,
"rewards/action_legal/mean": 0.5,
"rewards/action_legal/std": 0.0,
"rewards/belief_accuracy/mean": 0.4255111813545227,
"rewards/belief_accuracy/std": 0.0,
"rewards/env_reward/mean": 0.15620000660419464,
"rewards/env_reward/std": 0.0,
"rewards/format_valid/mean": 1.0,
"rewards/format_valid/std": 0.0,
"step": 146
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completion_length": 10.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 10.0,
"completions/max_terminated_length": 10.0,
"completions/mean_length": 10.0,
"completions/mean_terminated_length": 10.0,
"completions/min_length": 10.0,
"completions/min_terminated_length": 10.0,
"epoch": 0.18375,
"frac_reward_zero_std": 1.0,
"grad_norm": 0.02717801183462143,
"kl": 0.48331472277641296,
"learning_rate": 1.5e-05,
"loss": 0.0193,
"num_tokens": 353797.0,
"reward": 1.2608559131622314,
"reward_std": 0.0,
"rewards/action_legal/mean": 0.5,
"rewards/action_legal/std": 0.0,
"rewards/belief_accuracy/mean": 0.28115594387054443,
"rewards/belief_accuracy/std": 0.0,
"rewards/env_reward/mean": 0.529699981212616,
"rewards/env_reward/std": 0.0,
"rewards/format_valid/mean": 1.0,
"rewards/format_valid/std": 0.0,
"step": 147
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completion_length": 19.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 19.0,
"completions/max_terminated_length": 19.0,
"completions/mean_length": 16.75,
"completions/mean_terminated_length": 16.75,
"completions/min_length": 13.0,
"completions/min_terminated_length": 13.0,
"epoch": 0.185,
"frac_reward_zero_std": 0.0,
"grad_norm": 8.648240089416504,
"kl": 0.40032833255827427,
"learning_rate": 1.4722222222222224e-05,
"loss": 0.016,
"num_tokens": 356264.0,
"reward": 1.746770977973938,
"reward_std": 0.5813379287719727,
"rewards/action_legal/mean": 0.5,
"rewards/action_legal/std": 0.0,
"rewards/belief_accuracy/mean": 0.2962958812713623,
"rewards/belief_accuracy/std": 0.19808298349380493,
"rewards/env_reward/mean": 1.0004750490188599,
"rewards/env_reward/std": 0.5624499917030334,
"rewards/format_valid/mean": 1.0,
"rewards/format_valid/std": 0.0,
"step": 148
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completion_length": 19.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 19.0,
"completions/max_terminated_length": 19.0,
"completions/mean_length": 18.0,
"completions/mean_terminated_length": 18.0,
"completions/min_length": 16.0,
"completions/min_terminated_length": 16.0,
"epoch": 0.18625,
"frac_reward_zero_std": 0.0,
"grad_norm": 4.142242908477783,
"kl": 0.28697329107671976,
"learning_rate": 1.4444444444444444e-05,
"loss": 0.0115,
"num_tokens": 358736.0,
"reward": 0.43696850538253784,
"reward_std": 3.024721622467041,
"rewards/action_legal/mean": 0.125,
"rewards/action_legal/std": 0.75,
"rewards/belief_accuracy/mean": 0.31194350123405457,
"rewards/belief_accuracy/std": 0.34197041392326355,
"rewards/env_reward/mean": 0.012525022029876709,
"rewards/env_reward/std": 2.008349895477295,
"rewards/format_valid/mean": 0.25,
"rewards/format_valid/std": 1.5,
"step": 149
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completion_length": 10.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 10.0,
"completions/max_terminated_length": 10.0,
"completions/mean_length": 10.0,
"completions/mean_terminated_length": 10.0,
"completions/min_length": 10.0,
"completions/min_terminated_length": 10.0,
"epoch": 0.1875,
"frac_reward_zero_std": 0.0,
"grad_norm": 4.6960530281066895,
"kl": 0.5598425641655922,
"learning_rate": 1.4166666666666668e-05,
"loss": 0.0224,
"num_tokens": 361176.0,
"reward": 1.3302440643310547,
"reward_std": 0.011125604622066021,
"rewards/action_legal/mean": 0.5,
"rewards/action_legal/std": 0.0,
"rewards/belief_accuracy/mean": 0.4280441105365753,
"rewards/belief_accuracy/std": 0.011125624179840088,
"rewards/env_reward/mean": 0.4521999955177307,
"rewards/env_reward/std": 0.0,
"rewards/format_valid/mean": 1.0,
"rewards/format_valid/std": 0.0,
"step": 150
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completion_length": 10.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 10.0,
"completions/max_terminated_length": 10.0,
"completions/mean_length": 10.0,
"completions/mean_terminated_length": 10.0,
"completions/min_length": 10.0,
"completions/min_terminated_length": 10.0,
"epoch": 0.18875,
"frac_reward_zero_std": 1.0,
"grad_norm": 0.022711308673024178,
"kl": 0.3234124183654785,
"learning_rate": 1.388888888888889e-05,
"loss": 0.0129,
"num_tokens": 363616.0,
"reward": 1.4567558765411377,
"reward_std": 0.0,
"rewards/action_legal/mean": 0.5,
"rewards/action_legal/std": 0.0,
"rewards/belief_accuracy/mean": 0.28115594387054443,
"rewards/belief_accuracy/std": 0.0,
"rewards/env_reward/mean": 0.725600004196167,
"rewards/env_reward/std": 0.0,
"rewards/format_valid/mean": 1.0,
"rewards/format_valid/std": 0.0,
"step": 151
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completion_length": 10.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 10.0,
"completions/max_terminated_length": 10.0,
"completions/mean_length": 8.0,
"completions/mean_terminated_length": 8.0,
"completions/min_length": 6.0,
"completions/min_terminated_length": 6.0,
"epoch": 0.19,
"frac_reward_zero_std": 0.0,
"grad_norm": 87.73027038574219,
"kl": 0.8339565396308899,
"learning_rate": 1.3611111111111111e-05,
"loss": 0.0334,
"num_tokens": 366048.0,
"reward": -1.5026097297668457,
"reward_std": 2.999246120452881,
"rewards/action_legal/mean": -0.25,
"rewards/action_legal/std": 0.8660253882408142,
"rewards/belief_accuracy/mean": 0.044990174472332,
"rewards/belief_accuracy/std": 0.28329408168792725,
"rewards/env_reward/mean": -1.32260000705719,
"rewards/env_reward/std": 1.9368946552276611,
"rewards/format_valid/mean": -0.5,
"rewards/format_valid/std": 1.7320507764816284,
"step": 152
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completion_length": 10.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 10.0,
"completions/max_terminated_length": 10.0,
"completions/mean_length": 10.0,
"completions/mean_terminated_length": 10.0,
"completions/min_length": 10.0,
"completions/min_terminated_length": 10.0,
"epoch": 0.19125,
"frac_reward_zero_std": 0.0,
"grad_norm": 2.2898941040039062,
"kl": 0.4663423076272011,
"learning_rate": 1.3333333333333333e-05,
"loss": 0.0187,
"num_tokens": 368488.0,
"reward": 0.9306638240814209,
"reward_std": 0.018518507480621338,
"rewards/action_legal/mean": 0.5,
"rewards/action_legal/std": 0.0,
"rewards/belief_accuracy/mean": 0.4141637682914734,
"rewards/belief_accuracy/std": 0.018518522381782532,
"rewards/env_reward/mean": 0.06650000065565109,
"rewards/env_reward/std": 0.0,
"rewards/format_valid/mean": 1.0,
"rewards/format_valid/std": 0.0,
"step": 153
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completion_length": 10.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 10.0,
"completions/max_terminated_length": 10.0,
"completions/mean_length": 10.0,
"completions/mean_terminated_length": 10.0,
"completions/min_length": 10.0,
"completions/min_terminated_length": 10.0,
"epoch": 0.1925,
"frac_reward_zero_std": 1.0,
"grad_norm": 0.02326902747154236,
"kl": 0.4144258499145508,
"learning_rate": 1.3055555555555557e-05,
"loss": 0.0166,
"num_tokens": 370420.0,
"reward": 1.5288934707641602,
"reward_std": 0.0,
"rewards/action_legal/mean": 0.5,
"rewards/action_legal/std": 0.0,
"rewards/belief_accuracy/mean": 0.43159350752830505,
"rewards/belief_accuracy/std": 0.0,
"rewards/env_reward/mean": 0.6473000049591064,
"rewards/env_reward/std": 0.0,
"rewards/format_valid/mean": 1.0,
"rewards/format_valid/std": 0.0,
"step": 154
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completion_length": 19.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 19.0,
"completions/max_terminated_length": 19.0,
"completions/mean_length": 15.0,
"completions/mean_terminated_length": 15.0,
"completions/min_length": 11.0,
"completions/min_terminated_length": 11.0,
"epoch": 0.19375,
"frac_reward_zero_std": 0.0,
"grad_norm": 35.10660171508789,
"kl": 0.4181790351867676,
"learning_rate": 1.2777777777777777e-05,
"loss": 0.0167,
"num_tokens": 372880.0,
"reward": 1.6037037372589111,
"reward_std": 0.9571843147277832,
"rewards/action_legal/mean": 0.5,
"rewards/action_legal/std": 0.0,
"rewards/belief_accuracy/mean": 0.2437037080526352,
"rewards/belief_accuracy/std": 0.29311609268188477,
"rewards/env_reward/mean": 0.9100000262260437,
"rewards/env_reward/std": 0.664068341255188,
"rewards/format_valid/mean": 1.0,
"rewards/format_valid/std": 0.0,
"step": 155
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completion_length": 19.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 19.0,
"completions/max_terminated_length": 19.0,
"completions/mean_length": 19.0,
"completions/mean_terminated_length": 19.0,
"completions/min_length": 19.0,
"completions/min_terminated_length": 19.0,
"epoch": 0.195,
"frac_reward_zero_std": 1.0,
"grad_norm": 0.00849384069442749,
"kl": 0.05927176773548126,
"learning_rate": 1.25e-05,
"loss": 0.0024,
"num_tokens": 375356.0,
"reward": 1.6517739295959473,
"reward_std": 0.0,
"rewards/action_legal/mean": 0.5,
"rewards/action_legal/std": 0.0,
"rewards/belief_accuracy/mean": 0.49777403473854065,
"rewards/belief_accuracy/std": 0.0,
"rewards/env_reward/mean": 0.7039999961853027,
"rewards/env_reward/std": 0.0,
"rewards/format_valid/mean": 1.0,
"rewards/format_valid/std": 0.0,
"step": 156
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completion_length": 10.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 10.0,
"completions/max_terminated_length": 10.0,
"completions/mean_length": 10.0,
"completions/mean_terminated_length": 10.0,
"completions/min_length": 10.0,
"completions/min_terminated_length": 10.0,
"epoch": 0.19625,
"frac_reward_zero_std": 1.0,
"grad_norm": 0.01303598191589117,
"kl": 0.38526836037635803,
"learning_rate": 1.2222222222222222e-05,
"loss": 0.0154,
"num_tokens": 377796.0,
"reward": 1.3590935468673706,
"reward_std": 0.0,
"rewards/action_legal/mean": 0.5,
"rewards/action_legal/std": 0.0,
"rewards/belief_accuracy/mean": 0.43159350752830505,
"rewards/belief_accuracy/std": 0.0,
"rewards/env_reward/mean": 0.47749999165534973,
"rewards/env_reward/std": 0.0,
"rewards/format_valid/mean": 1.0,
"rewards/format_valid/std": 0.0,
"step": 157
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completion_length": 10.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 10.0,
"completions/max_terminated_length": 10.0,
"completions/mean_length": 10.0,
"completions/mean_terminated_length": 10.0,
"completions/min_length": 10.0,
"completions/min_terminated_length": 10.0,
"epoch": 0.1975,
"frac_reward_zero_std": 1.0,
"grad_norm": 0.026443013921380043,
"kl": 0.4196889400482178,
"learning_rate": 1.1944444444444446e-05,
"loss": 0.0168,
"num_tokens": 380236.0,
"reward": 0.9207112193107605,
"reward_std": 0.0,
"rewards/action_legal/mean": 0.5,
"rewards/action_legal/std": 0.0,
"rewards/belief_accuracy/mean": 0.4255111813545227,
"rewards/belief_accuracy/std": 0.0,
"rewards/env_reward/mean": 0.04520000144839287,
"rewards/env_reward/std": 0.0,
"rewards/format_valid/mean": 1.0,
"rewards/format_valid/std": 0.0,
"step": 158
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completion_length": 10.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 10.0,
"completions/max_terminated_length": 10.0,
"completions/mean_length": 10.0,
"completions/mean_terminated_length": 10.0,
"completions/min_length": 10.0,
"completions/min_terminated_length": 10.0,
"epoch": 0.19875,
"frac_reward_zero_std": 0.0,
"grad_norm": 2.1190378665924072,
"kl": 0.40614624321460724,
"learning_rate": 1.1666666666666668e-05,
"loss": 0.0162,
"num_tokens": 382676.0,
"reward": 1.4092152118682861,
"reward_std": 0.018518486991524696,
"rewards/action_legal/mean": 0.5,
"rewards/action_legal/std": 0.0,
"rewards/belief_accuracy/mean": 0.2904151976108551,
"rewards/belief_accuracy/std": 0.018518522381782532,
"rewards/env_reward/mean": 0.6687999963760376,
"rewards/env_reward/std": 0.0,
"rewards/format_valid/mean": 1.0,
"rewards/format_valid/std": 0.0,
"step": 159
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completion_length": 19.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 19.0,
"completions/max_terminated_length": 19.0,
"completions/mean_length": 19.0,
"completions/mean_terminated_length": 19.0,
"completions/min_length": 19.0,
"completions/min_terminated_length": 19.0,
"epoch": 0.2,
"frac_reward_zero_std": 1.0,
"grad_norm": 0.06362934410572052,
"kl": 0.16141445934772491,
"learning_rate": 1.138888888888889e-05,
"loss": 0.0065,
"num_tokens": 385152.0,
"reward": 1.6058154106140137,
"reward_std": 0.0,
"rewards/action_legal/mean": 0.5,
"rewards/action_legal/std": 0.0,
"rewards/belief_accuracy/mean": 0.4977153241634369,
"rewards/belief_accuracy/std": 0.0,
"rewards/env_reward/mean": 0.6581000089645386,
"rewards/env_reward/std": 0.0,
"rewards/format_valid/mean": 1.0,
"rewards/format_valid/std": 0.0,
"step": 160
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completion_length": 10.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 10.0,
"completions/max_terminated_length": 10.0,
"completions/mean_length": 10.0,
"completions/mean_terminated_length": 10.0,
"completions/min_length": 10.0,
"completions/min_terminated_length": 10.0,
"epoch": 0.20125,
"frac_reward_zero_std": 1.0,
"grad_norm": 0.02474510669708252,
"kl": 0.33240804076194763,
"learning_rate": 1.1111111111111112e-05,
"loss": 0.0133,
"num_tokens": 387592.0,
"reward": 0.913393497467041,
"reward_std": 0.0,
"rewards/action_legal/mean": 0.5,
"rewards/action_legal/std": 0.0,
"rewards/belief_accuracy/mean": 0.43159350752830505,
"rewards/belief_accuracy/std": 0.0,
"rewards/env_reward/mean": 0.03180000185966492,
"rewards/env_reward/std": 0.0,
"rewards/format_valid/mean": 1.0,
"rewards/format_valid/std": 0.0,
"step": 161
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completion_length": 10.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 10.0,
"completions/max_terminated_length": 10.0,
"completions/mean_length": 10.0,
"completions/mean_terminated_length": 10.0,
"completions/min_length": 10.0,
"completions/min_terminated_length": 10.0,
"epoch": 0.2025,
"frac_reward_zero_std": 1.0,
"grad_norm": 0.015563507564365864,
"kl": 0.36354348063468933,
"learning_rate": 1.0833333333333334e-05,
"loss": 0.0145,
"num_tokens": 390032.0,
"reward": 1.2903891801834106,
"reward_std": 0.0,
"rewards/action_legal/mean": 0.5,
"rewards/action_legal/std": 0.0,
"rewards/belief_accuracy/mean": 0.25078916549682617,
"rewards/belief_accuracy/std": 0.0,
"rewards/env_reward/mean": 0.5896000266075134,
"rewards/env_reward/std": 0.0,
"rewards/format_valid/mean": 1.0,
"rewards/format_valid/std": 0.0,
"step": 162
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completion_length": 10.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 10.0,
"completions/max_terminated_length": 10.0,
"completions/mean_length": 10.0,
"completions/mean_terminated_length": 10.0,
"completions/min_length": 10.0,
"completions/min_terminated_length": 10.0,
"epoch": 0.20375,
"frac_reward_zero_std": 0.0,
"grad_norm": 2.4728994369506836,
"kl": 0.3627878651022911,
"learning_rate": 1.0555555555555555e-05,
"loss": 0.0145,
"num_tokens": 392472.0,
"reward": 1.258821964263916,
"reward_std": 0.01851852796971798,
"rewards/action_legal/mean": 0.5,
"rewards/action_legal/std": 0.0,
"rewards/belief_accuracy/mean": 0.4132220447063446,
"rewards/belief_accuracy/std": 0.018518522381782532,
"rewards/env_reward/mean": 0.39559999108314514,
"rewards/env_reward/std": 0.0,
"rewards/format_valid/mean": 1.0,
"rewards/format_valid/std": 0.0,
"step": 163
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completion_length": 10.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 10.0,
"completions/max_terminated_length": 10.0,
"completions/mean_length": 10.0,
"completions/mean_terminated_length": 10.0,
"completions/min_length": 10.0,
"completions/min_terminated_length": 10.0,
"epoch": 0.205,
"frac_reward_zero_std": 0.0,
"grad_norm": 4.23980188369751,
"kl": 0.2861982248723507,
"learning_rate": 1.0277777777777777e-05,
"loss": 0.0114,
"num_tokens": 394912.0,
"reward": 1.0106734037399292,
"reward_std": 0.011475821025669575,
"rewards/action_legal/mean": 0.5,
"rewards/action_legal/std": 0.0,
"rewards/belief_accuracy/mean": 0.41977328062057495,
"rewards/belief_accuracy/std": 0.011475772596895695,
"rewards/env_reward/mean": 0.14090000092983246,
"rewards/env_reward/std": 0.0,
"rewards/format_valid/mean": 1.0,
"rewards/format_valid/std": 0.0,
"step": 164
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completion_length": 10.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 10.0,
"completions/max_terminated_length": 10.0,
"completions/mean_length": 10.0,
"completions/mean_terminated_length": 10.0,
"completions/min_length": 10.0,
"completions/min_terminated_length": 10.0,
"epoch": 0.20625,
"frac_reward_zero_std": 1.0,
"grad_norm": 0.021868258714675903,
"kl": 0.39394283294677734,
"learning_rate": 1e-05,
"loss": 0.0158,
"num_tokens": 397352.0,
"reward": 0.9160443544387817,
"reward_std": 0.0,
"rewards/action_legal/mean": 0.5,
"rewards/action_legal/std": 0.0,
"rewards/belief_accuracy/mean": 0.3023443818092346,
"rewards/belief_accuracy/std": 0.0,
"rewards/env_reward/mean": 0.16369999945163727,
"rewards/env_reward/std": 0.0,
"rewards/format_valid/mean": 1.0,
"rewards/format_valid/std": 0.0,
"step": 165
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completion_length": 10.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 10.0,
"completions/max_terminated_length": 10.0,
"completions/mean_length": 10.0,
"completions/mean_terminated_length": 10.0,
"completions/min_length": 10.0,
"completions/min_terminated_length": 10.0,
"epoch": 0.2075,
"frac_reward_zero_std": 0.0,
"grad_norm": 3.604858636856079,
"kl": 0.6312557309865952,
"learning_rate": 9.722222222222223e-06,
"loss": 0.0253,
"num_tokens": 399080.0,
"reward": 0.9826427698135376,
"reward_std": 0.03478008508682251,
"rewards/action_legal/mean": 0.5,
"rewards/action_legal/std": 0.0,
"rewards/belief_accuracy/mean": 0.35074275732040405,
"rewards/belief_accuracy/std": 0.034780099987983704,
"rewards/env_reward/mean": 0.1818999946117401,
"rewards/env_reward/std": 0.0,
"rewards/format_valid/mean": 1.0,
"rewards/format_valid/std": 0.0,
"step": 166
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completion_length": 10.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 10.0,
"completions/max_terminated_length": 10.0,
"completions/mean_length": 10.0,
"completions/mean_terminated_length": 10.0,
"completions/min_length": 10.0,
"completions/min_terminated_length": 10.0,
"epoch": 0.20875,
"frac_reward_zero_std": 0.0,
"grad_norm": 5.561725616455078,
"kl": 1.2403500601649284,
"learning_rate": 9.444444444444445e-06,
"loss": 0.0496,
"num_tokens": 400808.0,
"reward": 1.0310287475585938,
"reward_std": 0.03295842558145523,
"rewards/action_legal/mean": 0.5,
"rewards/action_legal/std": 0.0,
"rewards/belief_accuracy/mean": 0.3198787569999695,
"rewards/belief_accuracy/std": 0.03024062141776085,
"rewards/env_reward/mean": 0.2611500024795532,
"rewards/env_reward/std": 0.013105858117341995,
"rewards/format_valid/mean": 1.0,
"rewards/format_valid/std": 0.0,
"step": 167
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completion_length": 19.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 19.0,
"completions/max_terminated_length": 19.0,
"completions/mean_length": 18.25,
"completions/mean_terminated_length": 18.25,
"completions/min_length": 16.0,
"completions/min_terminated_length": 16.0,
"epoch": 0.21,
"frac_reward_zero_std": 0.0,
"grad_norm": 5.700604438781738,
"kl": 0.3083411678671837,
"learning_rate": 9.166666666666666e-06,
"loss": 0.0123,
"num_tokens": 403281.0,
"reward": 1.0681264400482178,
"reward_std": 0.008333325386047363,
"rewards/action_legal/mean": 0.5,
"rewards/action_legal/std": 0.0,
"rewards/belief_accuracy/mean": 0.4922264814376831,
"rewards/belief_accuracy/std": 0.008333340287208557,
"rewards/env_reward/mean": 0.125900000333786,
"rewards/env_reward/std": 0.0,
"rewards/format_valid/mean": 1.0,
"rewards/format_valid/std": 0.0,
"step": 168
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completion_length": 10.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 10.0,
"completions/max_terminated_length": 10.0,
"completions/mean_length": 10.0,
"completions/mean_terminated_length": 10.0,
"completions/min_length": 10.0,
"completions/min_terminated_length": 10.0,
"epoch": 0.21125,
"frac_reward_zero_std": 1.0,
"grad_norm": 0.016459409147500992,
"kl": 0.4519377648830414,
"learning_rate": 8.88888888888889e-06,
"loss": 0.0181,
"num_tokens": 405721.0,
"reward": 1.4217381477355957,
"reward_std": 0.0,
"rewards/action_legal/mean": 0.5,
"rewards/action_legal/std": 0.0,
"rewards/belief_accuracy/mean": 0.4364381432533264,
"rewards/belief_accuracy/std": 0.0,
"rewards/env_reward/mean": 0.5353000164031982,
"rewards/env_reward/std": 0.0,
"rewards/format_valid/mean": 1.0,
"rewards/format_valid/std": 0.0,
"step": 169
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completion_length": 20.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 20.0,
"completions/max_terminated_length": 20.0,
"completions/mean_length": 19.5,
"completions/mean_terminated_length": 19.5,
"completions/min_length": 19.0,
"completions/min_terminated_length": 19.0,
"epoch": 0.2125,
"frac_reward_zero_std": 0.0,
"grad_norm": 6.101553916931152,
"kl": 0.4495365619659424,
"learning_rate": 8.611111111111112e-06,
"loss": 0.018,
"num_tokens": 408199.0,
"reward": 1.400689959526062,
"reward_std": 0.5047602653503418,
"rewards/action_legal/mean": 0.5,
"rewards/action_legal/std": 0.0,
"rewards/belief_accuracy/mean": 0.4664399027824402,
"rewards/belief_accuracy/std": 0.03808803856372833,
"rewards/env_reward/mean": 0.484250009059906,
"rewards/env_reward/std": 0.46667224168777466,
"rewards/format_valid/mean": 1.0,
"rewards/format_valid/std": 0.0,
"step": 170
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completion_length": 10.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 10.0,
"completions/max_terminated_length": 10.0,
"completions/mean_length": 10.0,
"completions/mean_terminated_length": 10.0,
"completions/min_length": 10.0,
"completions/min_terminated_length": 10.0,
"epoch": 0.21375,
"frac_reward_zero_std": 1.0,
"grad_norm": 0.00956246443092823,
"kl": 0.3575763404369354,
"learning_rate": 8.333333333333334e-06,
"loss": 0.0143,
"num_tokens": 410639.0,
"reward": 1.0793381929397583,
"reward_std": 0.0,
"rewards/action_legal/mean": 0.5,
"rewards/action_legal/std": 0.0,
"rewards/belief_accuracy/mean": 0.4364381432533264,
"rewards/belief_accuracy/std": 0.0,
"rewards/env_reward/mean": 0.19290000200271606,
"rewards/env_reward/std": 0.0,
"rewards/format_valid/mean": 1.0,
"rewards/format_valid/std": 0.0,
"step": 171
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completion_length": 10.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 10.0,
"completions/max_terminated_length": 10.0,
"completions/mean_length": 10.0,
"completions/mean_terminated_length": 10.0,
"completions/min_length": 10.0,
"completions/min_terminated_length": 10.0,
"epoch": 0.215,
"frac_reward_zero_std": 0.0,
"grad_norm": 3.3567819595336914,
"kl": 0.45299722999334335,
"learning_rate": 8.055555555555557e-06,
"loss": 0.0181,
"num_tokens": 413079.0,
"reward": 1.4047298431396484,
"reward_std": 0.01851852796971798,
"rewards/action_legal/mean": 0.5,
"rewards/action_legal/std": 0.0,
"rewards/belief_accuracy/mean": 0.2415299117565155,
"rewards/belief_accuracy/std": 0.018518514931201935,
"rewards/env_reward/mean": 0.7131999731063843,
"rewards/env_reward/std": 0.0,
"rewards/format_valid/mean": 1.0,
"rewards/format_valid/std": 0.0,
"step": 172
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completion_length": 19.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 19.0,
"completions/max_terminated_length": 19.0,
"completions/mean_length": 13.75,
"completions/mean_terminated_length": 13.75,
"completions/min_length": 10.0,
"completions/min_terminated_length": 10.0,
"epoch": 0.21625,
"frac_reward_zero_std": 0.0,
"grad_norm": 7.245166778564453,
"kl": 0.3327266201376915,
"learning_rate": 7.777777777777777e-06,
"loss": 0.0133,
"num_tokens": 415530.0,
"reward": 0.7125999927520752,
"reward_std": 0.04879846051335335,
"rewards/action_legal/mean": 0.5,
"rewards/action_legal/std": 0.0,
"rewards/belief_accuracy/mean": 0.45499998331069946,
"rewards/belief_accuracy/std": 0.04879846051335335,
"rewards/env_reward/mean": -0.1923999935388565,
"rewards/env_reward/std": 0.0,
"rewards/format_valid/mean": 1.0,
"rewards/format_valid/std": 0.0,
"step": 173
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completion_length": 10.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 10.0,
"completions/max_terminated_length": 10.0,
"completions/mean_length": 10.0,
"completions/mean_terminated_length": 10.0,
"completions/min_length": 10.0,
"completions/min_terminated_length": 10.0,
"epoch": 0.2175,
"frac_reward_zero_std": 1.0,
"grad_norm": 0.04452874884009361,
"kl": 0.37988075613975525,
"learning_rate": 7.5e-06,
"loss": 0.0152,
"num_tokens": 417970.0,
"reward": 0.9200787544250488,
"reward_std": 0.0,
"rewards/action_legal/mean": 0.5,
"rewards/action_legal/std": 0.0,
"rewards/belief_accuracy/mean": 0.3198787569999695,
"rewards/belief_accuracy/std": 0.0,
"rewards/env_reward/mean": 0.1501999944448471,
"rewards/env_reward/std": 0.0,
"rewards/format_valid/mean": 1.0,
"rewards/format_valid/std": 0.0,
"step": 174
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completion_length": 10.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 10.0,
"completions/max_terminated_length": 10.0,
"completions/mean_length": 10.0,
"completions/mean_terminated_length": 10.0,
"completions/min_length": 10.0,
"completions/min_terminated_length": 10.0,
"epoch": 0.21875,
"frac_reward_zero_std": 0.0,
"grad_norm": 4.498856544494629,
"kl": 0.4394312873482704,
"learning_rate": 7.222222222222222e-06,
"loss": 0.0176,
"num_tokens": 420034.0,
"reward": 1.2926833629608154,
"reward_std": 0.01282203197479248,
"rewards/action_legal/mean": 0.5,
"rewards/action_legal/std": 0.0,
"rewards/belief_accuracy/mean": 0.4527832865715027,
"rewards/belief_accuracy/std": 0.012822061777114868,
"rewards/env_reward/mean": 0.38989999890327454,
"rewards/env_reward/std": 0.0,
"rewards/format_valid/mean": 1.0,
"rewards/format_valid/std": 0.0,
"step": 175
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completion_length": 10.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 10.0,
"completions/max_terminated_length": 10.0,
"completions/mean_length": 10.0,
"completions/mean_terminated_length": 10.0,
"completions/min_length": 10.0,
"completions/min_terminated_length": 10.0,
"epoch": 0.22,
"frac_reward_zero_std": 1.0,
"grad_norm": 0.023712527006864548,
"kl": 0.3908577561378479,
"learning_rate": 6.944444444444445e-06,
"loss": 0.0156,
"num_tokens": 422474.0,
"reward": 1.4205559492111206,
"reward_std": 0.0,
"rewards/action_legal/mean": 0.5,
"rewards/action_legal/std": 0.0,
"rewards/belief_accuracy/mean": 0.28115594387054443,
"rewards/belief_accuracy/std": 0.0,
"rewards/env_reward/mean": 0.6894000172615051,
"rewards/env_reward/std": 0.0,
"rewards/format_valid/mean": 1.0,
"rewards/format_valid/std": 0.0,
"step": 176
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completion_length": 10.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 10.0,
"completions/max_terminated_length": 10.0,
"completions/mean_length": 10.0,
"completions/mean_terminated_length": 10.0,
"completions/min_length": 10.0,
"completions/min_terminated_length": 10.0,
"epoch": 0.22125,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.7105811834335327,
"kl": 0.3014954552054405,
"learning_rate": 6.666666666666667e-06,
"loss": 0.0121,
"num_tokens": 424914.0,
"reward": 0.9700735807418823,
"reward_std": 0.018518507480621338,
"rewards/action_legal/mean": 0.5,
"rewards/action_legal/std": 0.0,
"rewards/belief_accuracy/mean": 0.35887354612350464,
"rewards/belief_accuracy/std": 0.018518522381782532,
"rewards/env_reward/mean": 0.16120000183582306,
"rewards/env_reward/std": 0.0,
"rewards/format_valid/mean": 1.0,
"rewards/format_valid/std": 0.0,
"step": 177
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completion_length": 19.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 19.0,
"completions/max_terminated_length": 19.0,
"completions/mean_length": 19.0,
"completions/mean_terminated_length": 19.0,
"completions/min_length": 19.0,
"completions/min_terminated_length": 19.0,
"epoch": 0.2225,
"frac_reward_zero_std": 1.0,
"grad_norm": 0.1105613261461258,
"kl": 0.10958900302648544,
"learning_rate": 6.3888888888888885e-06,
"loss": 0.0044,
"num_tokens": 427390.0,
"reward": 2.8349690437316895,
"reward_std": 0.0,
"rewards/action_legal/mean": 0.5,
"rewards/action_legal/std": 0.0,
"rewards/belief_accuracy/mean": 0.49926915764808655,
"rewards/belief_accuracy/std": 0.0,
"rewards/env_reward/mean": 1.885699987411499,
"rewards/env_reward/std": 0.0,
"rewards/format_valid/mean": 1.0,
"rewards/format_valid/std": 0.0,
"step": 178
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completion_length": 10.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 10.0,
"completions/max_terminated_length": 10.0,
"completions/mean_length": 10.0,
"completions/mean_terminated_length": 10.0,
"completions/min_length": 10.0,
"completions/min_terminated_length": 10.0,
"epoch": 0.22375,
"frac_reward_zero_std": 0.0,
"grad_norm": 2.3042142391204834,
"kl": 0.4740464314818382,
"learning_rate": 6.111111111111111e-06,
"loss": 0.019,
"num_tokens": 429830.0,
"reward": 1.3142220973968506,
"reward_std": 0.018518486991524696,
"rewards/action_legal/mean": 0.5,
"rewards/action_legal/std": 0.0,
"rewards/belief_accuracy/mean": 0.4132220447063446,
"rewards/belief_accuracy/std": 0.018518522381782532,
"rewards/env_reward/mean": 0.45100000500679016,
"rewards/env_reward/std": 0.0,
"rewards/format_valid/mean": 1.0,
"rewards/format_valid/std": 0.0,
"step": 179
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completion_length": 10.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 10.0,
"completions/max_terminated_length": 10.0,
"completions/mean_length": 9.75,
"completions/mean_terminated_length": 9.75,
"completions/min_length": 9.0,
"completions/min_terminated_length": 9.0,
"epoch": 0.225,
"frac_reward_zero_std": 0.0,
"grad_norm": 6.963607311248779,
"kl": 2.186647579073906,
"learning_rate": 5.833333333333334e-06,
"loss": 0.0875,
"num_tokens": 432269.0,
"reward": 1.2146563529968262,
"reward_std": 0.2969000041484833,
"rewards/action_legal/mean": 0.5,
"rewards/action_legal/std": 0.0,
"rewards/belief_accuracy/mean": 0.16290634870529175,
"rewards/belief_accuracy/std": 0.0,
"rewards/env_reward/mean": 0.6017500162124634,
"rewards/env_reward/std": 0.2969000041484833,
"rewards/format_valid/mean": 1.0,
"rewards/format_valid/std": 0.0,
"step": 180
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completion_length": 10.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 10.0,
"completions/max_terminated_length": 10.0,
"completions/mean_length": 10.0,
"completions/mean_terminated_length": 10.0,
"completions/min_length": 10.0,
"completions/min_terminated_length": 10.0,
"epoch": 0.22625,
"frac_reward_zero_std": 1.0,
"grad_norm": 0.004897190723568201,
"kl": 0.4025897979736328,
"learning_rate": 5.555555555555556e-06,
"loss": 0.0161,
"num_tokens": 434709.0,
"reward": 1.3257989883422852,
"reward_std": 0.0,
"rewards/action_legal/mean": 0.5,
"rewards/action_legal/std": 0.0,
"rewards/belief_accuracy/mean": 0.30849888920783997,
"rewards/belief_accuracy/std": 0.0,
"rewards/env_reward/mean": 0.567300021648407,
"rewards/env_reward/std": 0.0,
"rewards/format_valid/mean": 1.0,
"rewards/format_valid/std": 0.0,
"step": 181
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completion_length": 10.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 10.0,
"completions/max_terminated_length": 10.0,
"completions/mean_length": 10.0,
"completions/mean_terminated_length": 10.0,
"completions/min_length": 10.0,
"completions/min_terminated_length": 10.0,
"epoch": 0.2275,
"frac_reward_zero_std": 1.0,
"grad_norm": 0.025149773806333542,
"kl": 0.3142901659011841,
"learning_rate": 5.277777777777778e-06,
"loss": 0.0126,
"num_tokens": 437149.0,
"reward": 1.1127787828445435,
"reward_std": 0.0,
"rewards/action_legal/mean": 0.5,
"rewards/action_legal/std": 0.0,
"rewards/belief_accuracy/mean": 0.3198787569999695,
"rewards/belief_accuracy/std": 0.0,
"rewards/env_reward/mean": 0.34290000796318054,
"rewards/env_reward/std": 0.0,
"rewards/format_valid/mean": 1.0,
"rewards/format_valid/std": 0.0,
"step": 182
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completion_length": 10.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 10.0,
"completions/max_terminated_length": 10.0,
"completions/mean_length": 10.0,
"completions/mean_terminated_length": 10.0,
"completions/min_length": 10.0,
"completions/min_terminated_length": 10.0,
"epoch": 0.22875,
"frac_reward_zero_std": 0.0,
"grad_norm": 3.714738130569458,
"kl": 0.3641320765018463,
"learning_rate": 5e-06,
"loss": 0.0146,
"num_tokens": 439009.0,
"reward": 1.2621725797653198,
"reward_std": 0.6376118659973145,
"rewards/action_legal/mean": 0.5,
"rewards/action_legal/std": 0.0,
"rewards/belief_accuracy/mean": 0.4503726065158844,
"rewards/belief_accuracy/std": 0.018518522381782532,
"rewards/env_reward/mean": 0.3617999851703644,
"rewards/env_reward/std": 0.6311999559402466,
"rewards/format_valid/mean": 1.0,
"rewards/format_valid/std": 0.0,
"step": 183
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completion_length": 10.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 10.0,
"completions/max_terminated_length": 10.0,
"completions/mean_length": 10.0,
"completions/mean_terminated_length": 10.0,
"completions/min_length": 10.0,
"completions/min_terminated_length": 10.0,
"epoch": 0.23,
"frac_reward_zero_std": 1.0,
"grad_norm": 0.011211546137928963,
"kl": 0.3627232015132904,
"learning_rate": 4.722222222222222e-06,
"loss": 0.0145,
"num_tokens": 441449.0,
"reward": 1.176652431488037,
"reward_std": 0.0,
"rewards/action_legal/mean": 0.5,
"rewards/action_legal/std": 0.0,
"rewards/belief_accuracy/mean": 0.3073524236679077,
"rewards/belief_accuracy/std": 0.0,
"rewards/env_reward/mean": 0.41929998993873596,
"rewards/env_reward/std": 0.0,
"rewards/format_valid/mean": 1.0,
"rewards/format_valid/std": 0.0,
"step": 184
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completion_length": 10.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 10.0,
"completions/max_terminated_length": 10.0,
"completions/mean_length": 10.0,
"completions/mean_terminated_length": 10.0,
"completions/min_length": 10.0,
"completions/min_terminated_length": 10.0,
"epoch": 0.23125,
"frac_reward_zero_std": 1.0,
"grad_norm": 0.02891082689166069,
"kl": 0.314136266708374,
"learning_rate": 4.444444444444445e-06,
"loss": 0.0126,
"num_tokens": 443749.0,
"reward": 1.165998935699463,
"reward_std": 0.0,
"rewards/action_legal/mean": 0.5,
"rewards/action_legal/std": 0.0,
"rewards/belief_accuracy/mean": 0.30849888920783997,
"rewards/belief_accuracy/std": 0.0,
"rewards/env_reward/mean": 0.4074999988079071,
"rewards/env_reward/std": 0.0,
"rewards/format_valid/mean": 1.0,
"rewards/format_valid/std": 0.0,
"step": 185
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completion_length": 10.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 10.0,
"completions/max_terminated_length": 10.0,
"completions/mean_length": 10.0,
"completions/mean_terminated_length": 10.0,
"completions/min_length": 10.0,
"completions/min_terminated_length": 10.0,
"epoch": 0.2325,
"frac_reward_zero_std": 1.0,
"grad_norm": 0.006373639218509197,
"kl": 0.3533334732055664,
"learning_rate": 4.166666666666667e-06,
"loss": 0.0141,
"num_tokens": 446189.0,
"reward": 1.1543989181518555,
"reward_std": 0.0,
"rewards/action_legal/mean": 0.5,
"rewards/action_legal/std": 0.0,
"rewards/belief_accuracy/mean": 0.30849888920783997,
"rewards/belief_accuracy/std": 0.0,
"rewards/env_reward/mean": 0.39590001106262207,
"rewards/env_reward/std": 0.0,
"rewards/format_valid/mean": 1.0,
"rewards/format_valid/std": 0.0,
"step": 186
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completion_length": 10.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 10.0,
"completions/max_terminated_length": 10.0,
"completions/mean_length": 10.0,
"completions/mean_terminated_length": 10.0,
"completions/min_length": 10.0,
"completions/min_terminated_length": 10.0,
"epoch": 0.23375,
"frac_reward_zero_std": 1.0,
"grad_norm": 0.004680258687585592,
"kl": 0.36052942276000977,
"learning_rate": 3.888888888888889e-06,
"loss": 0.0144,
"num_tokens": 448629.0,
"reward": 0.9413381814956665,
"reward_std": 0.0,
"rewards/action_legal/mean": 0.5,
"rewards/action_legal/std": 0.0,
"rewards/belief_accuracy/mean": 0.4364381432533264,
"rewards/belief_accuracy/std": 0.0,
"rewards/env_reward/mean": 0.05490000173449516,
"rewards/env_reward/std": 0.0,
"rewards/format_valid/mean": 1.0,
"rewards/format_valid/std": 0.0,
"step": 187
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completion_length": 11.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 11.0,
"completions/max_terminated_length": 11.0,
"completions/mean_length": 10.25,
"completions/mean_terminated_length": 10.25,
"completions/min_length": 10.0,
"completions/min_terminated_length": 10.0,
"epoch": 0.235,
"frac_reward_zero_std": 0.0,
"grad_norm": 10.884200096130371,
"kl": 0.7835219278931618,
"learning_rate": 3.611111111111111e-06,
"loss": 0.0313,
"num_tokens": 451066.0,
"reward": 1.0154476165771484,
"reward_std": 0.15778863430023193,
"rewards/action_legal/mean": 0.5,
"rewards/action_legal/std": 0.0,
"rewards/belief_accuracy/mean": 0.285047709941864,
"rewards/belief_accuracy/std": 0.15778866410255432,
"rewards/env_reward/mean": 0.28040000796318054,
"rewards/env_reward/std": 0.0,
"rewards/format_valid/mean": 1.0,
"rewards/format_valid/std": 0.0,
"step": 188
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completion_length": 10.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 10.0,
"completions/max_terminated_length": 10.0,
"completions/mean_length": 10.0,
"completions/mean_terminated_length": 10.0,
"completions/min_length": 10.0,
"completions/min_terminated_length": 10.0,
"epoch": 0.23625,
"frac_reward_zero_std": 0.0,
"grad_norm": 7.003281593322754,
"kl": 0.4188368022441864,
"learning_rate": 3.3333333333333333e-06,
"loss": 0.0168,
"num_tokens": 453506.0,
"reward": 1.8497289419174194,
"reward_std": 0.03703709691762924,
"rewards/action_legal/mean": 0.5,
"rewards/action_legal/std": 0.0,
"rewards/belief_accuracy/mean": 0.3920290470123291,
"rewards/belief_accuracy/std": 0.03703702986240387,
"rewards/env_reward/mean": 1.007699966430664,
"rewards/env_reward/std": 0.0,
"rewards/format_valid/mean": 1.0,
"rewards/format_valid/std": 0.0,
"step": 189
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completion_length": 21.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 21.0,
"completions/max_terminated_length": 21.0,
"completions/mean_length": 19.5,
"completions/mean_terminated_length": 19.5,
"completions/min_length": 19.0,
"completions/min_terminated_length": 19.0,
"epoch": 0.2375,
"frac_reward_zero_std": 0.0,
"grad_norm": 10.276236534118652,
"kl": 1.8405264019966125,
"learning_rate": 3.0555555555555556e-06,
"loss": 0.0736,
"num_tokens": 455984.0,
"reward": 0.15131822228431702,
"reward_std": 0.37554997205734253,
"rewards/action_legal/mean": 0.5,
"rewards/action_legal/std": 0.0,
"rewards/belief_accuracy/mean": 0.4963931441307068,
"rewards/belief_accuracy/std": 0.0,
"rewards/env_reward/mean": -0.7950749397277832,
"rewards/env_reward/std": 0.3755500018596649,
"rewards/format_valid/mean": 1.0,
"rewards/format_valid/std": 0.0,
"step": 190
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completion_length": 10.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 10.0,
"completions/max_terminated_length": 10.0,
"completions/mean_length": 10.0,
"completions/mean_terminated_length": 10.0,
"completions/min_length": 10.0,
"completions/min_terminated_length": 10.0,
"epoch": 0.23875,
"frac_reward_zero_std": 0.0,
"grad_norm": 4.917500972747803,
"kl": 0.35147424787282944,
"learning_rate": 2.777777777777778e-06,
"loss": 0.0141,
"num_tokens": 458284.0,
"reward": 1.2944972515106201,
"reward_std": 0.037037014961242676,
"rewards/action_legal/mean": 0.5,
"rewards/action_legal/std": 0.0,
"rewards/belief_accuracy/mean": 0.3383972644805908,
"rewards/belief_accuracy/std": 0.03703702986240387,
"rewards/env_reward/mean": 0.5060999989509583,
"rewards/env_reward/std": 0.0,
"rewards/format_valid/mean": 1.0,
"rewards/format_valid/std": 0.0,
"step": 191
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completion_length": 10.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 10.0,
"completions/max_terminated_length": 10.0,
"completions/mean_length": 9.75,
"completions/mean_terminated_length": 9.75,
"completions/min_length": 9.0,
"completions/min_terminated_length": 9.0,
"epoch": 0.24,
"frac_reward_zero_std": 0.0,
"grad_norm": 12.188007354736328,
"kl": 0.5920824334025383,
"learning_rate": 2.5e-06,
"loss": 0.0237,
"num_tokens": 460395.0,
"reward": 1.291003704071045,
"reward_std": 0.019749999046325684,
"rewards/action_legal/mean": 0.5,
"rewards/action_legal/std": 0.0,
"rewards/belief_accuracy/mean": 0.3198787569999695,
"rewards/belief_accuracy/std": 0.0,
"rewards/env_reward/mean": 0.5211250185966492,
"rewards/env_reward/std": 0.019750013947486877,
"rewards/format_valid/mean": 1.0,
"rewards/format_valid/std": 0.0,
"step": 192
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completion_length": 10.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 10.0,
"completions/max_terminated_length": 10.0,
"completions/mean_length": 10.0,
"completions/mean_terminated_length": 10.0,
"completions/min_length": 10.0,
"completions/min_terminated_length": 10.0,
"epoch": 0.24125,
"frac_reward_zero_std": 1.0,
"grad_norm": 0.019955696538090706,
"kl": 0.41180703043937683,
"learning_rate": 2.2222222222222225e-06,
"loss": 0.0165,
"num_tokens": 462835.0,
"reward": 0.9298934936523438,
"reward_std": 0.0,
"rewards/action_legal/mean": 0.5,
"rewards/action_legal/std": 0.0,
"rewards/belief_accuracy/mean": 0.43159350752830505,
"rewards/belief_accuracy/std": 0.0,
"rewards/env_reward/mean": 0.04830000177025795,
"rewards/env_reward/std": 0.0,
"rewards/format_valid/mean": 1.0,
"rewards/format_valid/std": 0.0,
"step": 193
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completion_length": 10.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 10.0,
"completions/max_terminated_length": 10.0,
"completions/mean_length": 10.0,
"completions/mean_terminated_length": 10.0,
"completions/min_length": 10.0,
"completions/min_terminated_length": 10.0,
"epoch": 0.2425,
"frac_reward_zero_std": 0.0,
"grad_norm": 5.3022541999816895,
"kl": 0.3642955869436264,
"learning_rate": 1.9444444444444444e-06,
"loss": 0.0146,
"num_tokens": 465275.0,
"reward": 0.8369070291519165,
"reward_std": 0.02138333022594452,
"rewards/action_legal/mean": 0.5,
"rewards/action_legal/std": 0.0,
"rewards/belief_accuracy/mean": 0.46380698680877686,
"rewards/belief_accuracy/std": 0.02138333022594452,
"rewards/env_reward/mean": -0.07689999788999557,
"rewards/env_reward/std": 0.0,
"rewards/format_valid/mean": 1.0,
"rewards/format_valid/std": 0.0,
"step": 194
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completion_length": 10.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 10.0,
"completions/max_terminated_length": 10.0,
"completions/mean_length": 10.0,
"completions/mean_terminated_length": 10.0,
"completions/min_length": 10.0,
"completions/min_terminated_length": 10.0,
"epoch": 0.24375,
"frac_reward_zero_std": 1.0,
"grad_norm": 0.005025716498494148,
"kl": 0.5623220801353455,
"learning_rate": 1.6666666666666667e-06,
"loss": 0.0225,
"num_tokens": 467715.0,
"reward": 0.9521230459213257,
"reward_std": 0.0,
"rewards/action_legal/mean": 0.5,
"rewards/action_legal/std": 0.0,
"rewards/belief_accuracy/mean": 0.42342302203178406,
"rewards/belief_accuracy/std": 0.0,
"rewards/env_reward/mean": 0.0786999985575676,
"rewards/env_reward/std": 0.0,
"rewards/format_valid/mean": 1.0,
"rewards/format_valid/std": 0.0,
"step": 195
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completion_length": 11.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 11.0,
"completions/max_terminated_length": 11.0,
"completions/mean_length": 10.25,
"completions/mean_terminated_length": 10.25,
"completions/min_length": 10.0,
"completions/min_terminated_length": 10.0,
"epoch": 0.245,
"frac_reward_zero_std": 0.0,
"grad_norm": 21.385347366333008,
"kl": 4.466357246041298,
"learning_rate": 1.388888888888889e-06,
"loss": 0.1786,
"num_tokens": 470156.0,
"reward": 1.1828665733337402,
"reward_std": 0.00026782354689203203,
"rewards/action_legal/mean": 0.5,
"rewards/action_legal/std": 0.0,
"rewards/belief_accuracy/mean": 0.20086660981178284,
"rewards/belief_accuracy/std": 0.0002678185701370239,
"rewards/env_reward/mean": 0.5320000052452087,
"rewards/env_reward/std": 0.0,
"rewards/format_valid/mean": 1.0,
"rewards/format_valid/std": 0.0,
"step": 196
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completion_length": 10.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 10.0,
"completions/max_terminated_length": 10.0,
"completions/mean_length": 10.0,
"completions/mean_terminated_length": 10.0,
"completions/min_length": 10.0,
"completions/min_terminated_length": 10.0,
"epoch": 0.24625,
"frac_reward_zero_std": 0.0,
"grad_norm": 3.4839556217193604,
"kl": 0.8131380379199982,
"learning_rate": 1.1111111111111112e-06,
"loss": 0.0325,
"num_tokens": 472220.0,
"reward": 0.8117702007293701,
"reward_std": 0.07407406717538834,
"rewards/action_legal/mean": 0.5,
"rewards/action_legal/std": 0.0,
"rewards/belief_accuracy/mean": 0.43227022886276245,
"rewards/belief_accuracy/std": 0.07407407462596893,
"rewards/env_reward/mean": -0.07050000131130219,
"rewards/env_reward/std": 0.0,
"rewards/format_valid/mean": 1.0,
"rewards/format_valid/std": 0.0,
"step": 197
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completion_length": 10.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 10.0,
"completions/max_terminated_length": 10.0,
"completions/mean_length": 10.0,
"completions/mean_terminated_length": 10.0,
"completions/min_length": 10.0,
"completions/min_terminated_length": 10.0,
"epoch": 0.2475,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.125584602355957,
"kl": 1.1076267883181572,
"learning_rate": 8.333333333333333e-07,
"loss": 0.0443,
"num_tokens": 474660.0,
"reward": 0.9507396221160889,
"reward_std": 0.01851852796971798,
"rewards/action_legal/mean": 0.5,
"rewards/action_legal/std": 0.0,
"rewards/belief_accuracy/mean": 0.2992396354675293,
"rewards/belief_accuracy/std": 0.018518522381782532,
"rewards/env_reward/mean": 0.20149999856948853,
"rewards/env_reward/std": 0.0,
"rewards/format_valid/mean": 1.0,
"rewards/format_valid/std": 0.0,
"step": 198
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completion_length": 10.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 10.0,
"completions/max_terminated_length": 10.0,
"completions/mean_length": 10.0,
"completions/mean_terminated_length": 10.0,
"completions/min_length": 10.0,
"completions/min_terminated_length": 10.0,
"epoch": 0.24875,
"frac_reward_zero_std": 1.0,
"grad_norm": 0.013783140107989311,
"kl": 0.36227384209632874,
"learning_rate": 5.555555555555556e-07,
"loss": 0.0145,
"num_tokens": 477100.0,
"reward": 0.8893821239471436,
"reward_std": 0.0,
"rewards/action_legal/mean": 0.5,
"rewards/action_legal/std": 0.0,
"rewards/belief_accuracy/mean": 0.3892820477485657,
"rewards/belief_accuracy/std": 0.0,
"rewards/env_reward/mean": 0.05009999871253967,
"rewards/env_reward/std": 0.0,
"rewards/format_valid/mean": 1.0,
"rewards/format_valid/std": 0.0,
"step": 199
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completion_length": 10.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 10.0,
"completions/max_terminated_length": 10.0,
"completions/mean_length": 10.0,
"completions/mean_terminated_length": 10.0,
"completions/min_length": 10.0,
"completions/min_terminated_length": 10.0,
"epoch": 0.25,
"frac_reward_zero_std": 1.0,
"grad_norm": 0.012795673683285713,
"kl": 0.5199949741363525,
"learning_rate": 2.777777777777778e-07,
"loss": 0.0208,
"num_tokens": 479540.0,
"reward": 0.9321444034576416,
"reward_std": 0.0,
"rewards/action_legal/mean": 0.5,
"rewards/action_legal/std": 0.0,
"rewards/belief_accuracy/mean": 0.3023443818092346,
"rewards/belief_accuracy/std": 0.0,
"rewards/env_reward/mean": 0.17980000376701355,
"rewards/env_reward/std": 0.0,
"rewards/format_valid/mean": 1.0,
"rewards/format_valid/std": 0.0,
"step": 200
}
],
"logging_steps": 1,
"max_steps": 200,
"num_input_tokens_seen": 479540,
"num_train_epochs": 1,
"save_steps": 250,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}