13235 lines
475 KiB
JSON
13235 lines
475 KiB
JSON
{
|
|
"best_global_step": null,
|
|
"best_metric": null,
|
|
"best_model_checkpoint": null,
|
|
"epoch": 0.3333333333333333,
|
|
"eval_steps": 500,
|
|
"global_step": 400,
|
|
"is_hyper_param_search": false,
|
|
"is_local_process_zero": true,
|
|
"is_world_process_zero": true,
|
|
"log_history": [
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 10.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 10.0,
|
|
"completions/max_terminated_length": 10.0,
|
|
"completions/mean_length": 9.5,
|
|
"completions/mean_terminated_length": 9.5,
|
|
"completions/min_length": 9.0,
|
|
"completions/min_terminated_length": 9.0,
|
|
"epoch": 0.0008333333333333334,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 2.0297584533691406,
|
|
"kl": 0.007555788848549128,
|
|
"learning_rate": 0.0,
|
|
"loss": 0.0003,
|
|
"num_tokens": 2438.0,
|
|
"reward": 0.5746050477027893,
|
|
"reward_std": 0.7637181282043457,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.16745585203170776,
|
|
"rewards/belief_accuracy/std": 0.03546026349067688,
|
|
"rewards/env_reward/mean": 0.014825001358985901,
|
|
"rewards/env_reward/std": 0.5229976177215576,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 1
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.25,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 10.0,
|
|
"completions/mean_length": 15.0,
|
|
"completions/mean_terminated_length": 9.333333969116211,
|
|
"completions/min_length": 9.0,
|
|
"completions/min_terminated_length": 9.0,
|
|
"epoch": 0.0016666666666666668,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 487.28997802734375,
|
|
"kl": 0.36721258889883757,
|
|
"learning_rate": 1.25e-06,
|
|
"loss": 0.0147,
|
|
"num_tokens": 4754.0,
|
|
"reward": 1.6277785301208496,
|
|
"reward_std": 0.3852787911891937,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.37001365423202515,
|
|
"rewards/belief_accuracy/std": 0.06095712259411812,
|
|
"rewards/env_reward/mean": 0.3118250072002411,
|
|
"rewards/env_reward/std": 0.26973089575767517,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 2
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 12.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 10.25,
|
|
"completions/mean_terminated_length": 10.25,
|
|
"completions/min_length": 9.0,
|
|
"completions/min_terminated_length": 9.0,
|
|
"epoch": 0.0025,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 11.092467308044434,
|
|
"kl": 0.014069308177568018,
|
|
"learning_rate": 2.5e-06,
|
|
"loss": 0.0006,
|
|
"num_tokens": 7195.0,
|
|
"reward": -0.9344245195388794,
|
|
"reward_std": 2.935027599334717,
|
|
"rewards/action_legal/mean": -0.25,
|
|
"rewards/action_legal/std": 0.5,
|
|
"rewards/belief_accuracy/mean": 0.03502512723207474,
|
|
"rewards/belief_accuracy/std": 0.24171829223632812,
|
|
"rewards/env_reward/mean": -0.6929999589920044,
|
|
"rewards/env_reward/std": 1.5391119718551636,
|
|
"rewards/format_valid/mean": 0.25,
|
|
"rewards/format_valid/std": 1.5,
|
|
"step": 3
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 18.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 18.0,
|
|
"completions/max_terminated_length": 18.0,
|
|
"completions/mean_length": 11.5,
|
|
"completions/mean_terminated_length": 11.5,
|
|
"completions/min_length": 9.0,
|
|
"completions/min_terminated_length": 9.0,
|
|
"epoch": 0.0033333333333333335,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 70.31130981445312,
|
|
"kl": 0.16654532926622778,
|
|
"learning_rate": 3.75e-06,
|
|
"loss": 0.0067,
|
|
"num_tokens": 9641.0,
|
|
"reward": 0.40939950942993164,
|
|
"reward_std": 0.6117480397224426,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.04012066870927811,
|
|
"rewards/belief_accuracy/std": 0.20485758781433105,
|
|
"rewards/env_reward/mean": 0.16352500021457672,
|
|
"rewards/env_reward/std": 0.12513433396816254,
|
|
"rewards/format_valid/mean": 0.875,
|
|
"rewards/format_valid/std": 0.25,
|
|
"step": 4
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 11.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 11.0,
|
|
"completions/max_terminated_length": 11.0,
|
|
"completions/mean_length": 10.0,
|
|
"completions/mean_terminated_length": 10.0,
|
|
"completions/min_length": 9.0,
|
|
"completions/min_terminated_length": 9.0,
|
|
"epoch": 0.004166666666666667,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 5.75938606262207,
|
|
"kl": 0.005745900256442837,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.0002,
|
|
"num_tokens": 12081.0,
|
|
"reward": 1.2169301509857178,
|
|
"reward_std": 0.5240868330001831,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.2165767103433609,
|
|
"rewards/belief_accuracy/std": 0.21178469061851501,
|
|
"rewards/env_reward/mean": 0.3447999954223633,
|
|
"rewards/env_reward/std": 0.1597922146320343,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 5
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 10.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 10.0,
|
|
"completions/max_terminated_length": 10.0,
|
|
"completions/mean_length": 9.25,
|
|
"completions/mean_terminated_length": 9.25,
|
|
"completions/min_length": 9.0,
|
|
"completions/min_terminated_length": 9.0,
|
|
"epoch": 0.005,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 6.055194854736328,
|
|
"kl": 0.0013933435111539438,
|
|
"learning_rate": 6.25e-06,
|
|
"loss": 0.0001,
|
|
"num_tokens": 14378.0,
|
|
"reward": 0.8722533583641052,
|
|
"reward_std": 0.5229519605636597,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.17082196474075317,
|
|
"rewards/belief_accuracy/std": 0.06307334452867508,
|
|
"rewards/env_reward/mean": 0.20652499794960022,
|
|
"rewards/env_reward/std": 0.31724998354911804,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 6
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 10.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 10.0,
|
|
"completions/max_terminated_length": 10.0,
|
|
"completions/mean_length": 9.75,
|
|
"completions/mean_terminated_length": 9.75,
|
|
"completions/min_length": 9.0,
|
|
"completions/min_terminated_length": 9.0,
|
|
"epoch": 0.005833333333333334,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 4.596080780029297,
|
|
"kl": 0.003983344242442399,
|
|
"learning_rate": 7.5e-06,
|
|
"loss": 0.0002,
|
|
"num_tokens": 16817.0,
|
|
"reward": 0.3588276505470276,
|
|
"reward_std": 0.5061821937561035,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.07134255766868591,
|
|
"rewards/belief_accuracy/std": 0.08802108466625214,
|
|
"rewards/env_reward/mean": 0.06319999694824219,
|
|
"rewards/env_reward/std": 0.1783815622329712,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 7
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 10.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 10.0,
|
|
"completions/max_terminated_length": 10.0,
|
|
"completions/mean_length": 9.25,
|
|
"completions/mean_terminated_length": 9.25,
|
|
"completions/min_length": 9.0,
|
|
"completions/min_terminated_length": 9.0,
|
|
"epoch": 0.006666666666666667,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 5.14266300201416,
|
|
"kl": 0.0045957728871144354,
|
|
"learning_rate": 8.75e-06,
|
|
"loss": 0.0002,
|
|
"num_tokens": 19254.0,
|
|
"reward": 0.9281585216522217,
|
|
"reward_std": 0.641987144947052,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.25081950426101685,
|
|
"rewards/belief_accuracy/std": 0.09036833792924881,
|
|
"rewards/env_reward/mean": 0.08379998803138733,
|
|
"rewards/env_reward/std": 0.4169001281261444,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 8
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 11.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 11.0,
|
|
"completions/max_terminated_length": 11.0,
|
|
"completions/mean_length": 10.0,
|
|
"completions/mean_terminated_length": 10.0,
|
|
"completions/min_length": 9.0,
|
|
"completions/min_terminated_length": 9.0,
|
|
"epoch": 0.0075,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 5.244797229766846,
|
|
"kl": 0.004008802643511444,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.0002,
|
|
"num_tokens": 21694.0,
|
|
"reward": -0.8402208089828491,
|
|
"reward_std": 2.993889331817627,
|
|
"rewards/action_legal/mean": -0.25,
|
|
"rewards/action_legal/std": 0.5,
|
|
"rewards/belief_accuracy/mean": 0.08231388032436371,
|
|
"rewards/belief_accuracy/std": 0.22892379760742188,
|
|
"rewards/env_reward/mean": -0.7247750163078308,
|
|
"rewards/env_reward/std": 1.522688627243042,
|
|
"rewards/format_valid/mean": 0.25,
|
|
"rewards/format_valid/std": 1.5,
|
|
"step": 9
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 25.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 25.0,
|
|
"completions/max_terminated_length": 25.0,
|
|
"completions/mean_length": 14.25,
|
|
"completions/mean_terminated_length": 14.25,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.008333333333333333,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 12.427257537841797,
|
|
"kl": 0.028211204218678176,
|
|
"learning_rate": 1.125e-05,
|
|
"loss": 0.0011,
|
|
"num_tokens": 24151.0,
|
|
"reward": 0.732832670211792,
|
|
"reward_std": 0.2891167104244232,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.1579858809709549,
|
|
"rewards/belief_accuracy/std": 0.0883205309510231,
|
|
"rewards/env_reward/mean": 0.1392499953508377,
|
|
"rewards/env_reward/std": 0.016723934561014175,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 10
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 10.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 10.0,
|
|
"completions/max_terminated_length": 10.0,
|
|
"completions/mean_length": 9.75,
|
|
"completions/mean_terminated_length": 9.75,
|
|
"completions/min_length": 9.0,
|
|
"completions/min_terminated_length": 9.0,
|
|
"epoch": 0.009166666666666667,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 3.1559700965881348,
|
|
"kl": 0.005673486019077245,
|
|
"learning_rate": 1.25e-05,
|
|
"loss": 0.0002,
|
|
"num_tokens": 25878.0,
|
|
"reward": -0.2948700189590454,
|
|
"reward_std": 3.3204774856567383,
|
|
"rewards/action_legal/mean": -0.25,
|
|
"rewards/action_legal/std": 0.5,
|
|
"rewards/belief_accuracy/mean": 0.16303499042987823,
|
|
"rewards/belief_accuracy/std": 0.24976226687431335,
|
|
"rewards/env_reward/mean": -0.5226500034332275,
|
|
"rewards/env_reward/std": 1.6547037363052368,
|
|
"rewards/format_valid/mean": 0.25,
|
|
"rewards/format_valid/std": 1.5,
|
|
"step": 11
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 10.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 10.0,
|
|
"completions/max_terminated_length": 10.0,
|
|
"completions/mean_length": 9.75,
|
|
"completions/mean_terminated_length": 9.75,
|
|
"completions/min_length": 9.0,
|
|
"completions/min_terminated_length": 9.0,
|
|
"epoch": 0.01,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 3.764303207397461,
|
|
"kl": 0.004842391535930801,
|
|
"learning_rate": 1.3750000000000002e-05,
|
|
"loss": 0.0002,
|
|
"num_tokens": 27605.0,
|
|
"reward": 0.6823896169662476,
|
|
"reward_std": 0.017910229042172432,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.22252154350280762,
|
|
"rewards/belief_accuracy/std": 0.036789700388908386,
|
|
"rewards/env_reward/mean": -0.023450002074241638,
|
|
"rewards/env_reward/std": 0.07271481305360794,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 12
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 25.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 25.0,
|
|
"completions/max_terminated_length": 25.0,
|
|
"completions/mean_length": 14.0,
|
|
"completions/mean_terminated_length": 14.0,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.010833333333333334,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 5.114142894744873,
|
|
"kl": 0.004877378873061389,
|
|
"learning_rate": 1.5e-05,
|
|
"loss": 0.0002,
|
|
"num_tokens": 30061.0,
|
|
"reward": 0.8373932242393494,
|
|
"reward_std": 0.4389983117580414,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.23391437530517578,
|
|
"rewards/belief_accuracy/std": 0.1463327705860138,
|
|
"rewards/env_reward/mean": 0.057100001722574234,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 13
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 10.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 10.0,
|
|
"completions/max_terminated_length": 10.0,
|
|
"completions/mean_length": 10.0,
|
|
"completions/mean_terminated_length": 10.0,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.011666666666666667,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 3.0929784774780273,
|
|
"kl": 0.0014771804580959724,
|
|
"learning_rate": 1.6250000000000002e-05,
|
|
"loss": 0.0001,
|
|
"num_tokens": 32501.0,
|
|
"reward": 0.20685642957687378,
|
|
"reward_std": 0.6949325203895569,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.19772300124168396,
|
|
"rewards/belief_accuracy/std": 0.1775362491607666,
|
|
"rewards/env_reward/mean": -0.29087501764297485,
|
|
"rewards/env_reward/std": 0.7713783979415894,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 14
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 10.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 10.0,
|
|
"completions/max_terminated_length": 10.0,
|
|
"completions/mean_length": 9.5,
|
|
"completions/mean_terminated_length": 9.5,
|
|
"completions/min_length": 9.0,
|
|
"completions/min_terminated_length": 9.0,
|
|
"epoch": 0.0125,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 6.996659755706787,
|
|
"kl": 0.007067542013828643,
|
|
"learning_rate": 1.75e-05,
|
|
"loss": 0.0003,
|
|
"num_tokens": 34939.0,
|
|
"reward": 0.9863791465759277,
|
|
"reward_std": 0.1628992110490799,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.18266388773918152,
|
|
"rewards/belief_accuracy/std": 0.0645347312092781,
|
|
"rewards/env_reward/mean": 0.25892502069473267,
|
|
"rewards/env_reward/std": 0.1236146092414856,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 15
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 10.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 10.0,
|
|
"completions/max_terminated_length": 10.0,
|
|
"completions/mean_length": 9.5,
|
|
"completions/mean_terminated_length": 9.5,
|
|
"completions/min_length": 9.0,
|
|
"completions/min_terminated_length": 9.0,
|
|
"epoch": 0.013333333333333334,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 5.10272216796875,
|
|
"kl": 0.004334542165452149,
|
|
"learning_rate": 1.8750000000000002e-05,
|
|
"loss": 0.0002,
|
|
"num_tokens": 37377.0,
|
|
"reward": 0.9267517924308777,
|
|
"reward_std": 0.1699640452861786,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.16352561116218567,
|
|
"rewards/belief_accuracy/std": 0.09197874367237091,
|
|
"rewards/env_reward/mean": 0.2574499845504761,
|
|
"rewards/env_reward/std": 0.2165336161851883,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 16
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 10.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 10.0,
|
|
"completions/max_terminated_length": 10.0,
|
|
"completions/mean_length": 9.5,
|
|
"completions/mean_terminated_length": 9.5,
|
|
"completions/min_length": 9.0,
|
|
"completions/min_terminated_length": 9.0,
|
|
"epoch": 0.014166666666666666,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 4.821376800537109,
|
|
"kl": 0.004073493357282132,
|
|
"learning_rate": 2e-05,
|
|
"loss": 0.0002,
|
|
"num_tokens": 39815.0,
|
|
"reward": -1.199587106704712,
|
|
"reward_std": 2.7017223834991455,
|
|
"rewards/action_legal/mean": -0.25,
|
|
"rewards/action_legal/std": 0.5,
|
|
"rewards/belief_accuracy/mean": 0.08026264607906342,
|
|
"rewards/belief_accuracy/std": 0.1924673169851303,
|
|
"rewards/env_reward/mean": -0.9602500200271606,
|
|
"rewards/env_reward/std": 1.3646483421325684,
|
|
"rewards/format_valid/mean": 0.25,
|
|
"rewards/format_valid/std": 1.5,
|
|
"step": 17
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 10.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 10.0,
|
|
"completions/max_terminated_length": 10.0,
|
|
"completions/mean_length": 9.5,
|
|
"completions/mean_terminated_length": 9.5,
|
|
"completions/min_length": 9.0,
|
|
"completions/min_terminated_length": 9.0,
|
|
"epoch": 0.015,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 4.039498805999756,
|
|
"kl": 0.003853246627841145,
|
|
"learning_rate": 2.125e-05,
|
|
"loss": 0.0002,
|
|
"num_tokens": 42253.0,
|
|
"reward": 1.1887301206588745,
|
|
"reward_std": 0.5090645551681519,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.1981642246246338,
|
|
"rewards/belief_accuracy/std": 0.08274012058973312,
|
|
"rewards/env_reward/mean": 0.3628250062465668,
|
|
"rewards/env_reward/std": 0.42794182896614075,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 18
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 10.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 10.0,
|
|
"completions/max_terminated_length": 10.0,
|
|
"completions/mean_length": 10.0,
|
|
"completions/mean_terminated_length": 10.0,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.015833333333333335,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 8.75765609741211,
|
|
"kl": 0.002594503777800128,
|
|
"learning_rate": 2.25e-05,
|
|
"loss": 0.0001,
|
|
"num_tokens": 44693.0,
|
|
"reward": 1.2963204383850098,
|
|
"reward_std": 1.6085225343704224,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.1160651445388794,
|
|
"rewards/belief_accuracy/std": 0.10867781937122345,
|
|
"rewards/env_reward/mean": 0.5987499952316284,
|
|
"rewards/env_reward/std": 0.9246196746826172,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 19
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 10.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 10.0,
|
|
"completions/max_terminated_length": 10.0,
|
|
"completions/mean_length": 9.75,
|
|
"completions/mean_terminated_length": 9.75,
|
|
"completions/min_length": 9.0,
|
|
"completions/min_terminated_length": 9.0,
|
|
"epoch": 0.016666666666666666,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 3.9152791500091553,
|
|
"kl": 0.0029762745389234624,
|
|
"learning_rate": 2.375e-05,
|
|
"loss": 0.0001,
|
|
"num_tokens": 46624.0,
|
|
"reward": 0.6017731428146362,
|
|
"reward_std": 0.15383398532867432,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.09383687376976013,
|
|
"rewards/belief_accuracy/std": 0.03703703731298447,
|
|
"rewards/env_reward/mean": 0.18017500638961792,
|
|
"rewards/env_reward/std": 0.15538449585437775,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 20
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 11.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 11.0,
|
|
"completions/max_terminated_length": 11.0,
|
|
"completions/mean_length": 10.5,
|
|
"completions/mean_terminated_length": 10.5,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.0175,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 4.0644450187683105,
|
|
"kl": 0.00437403135583736,
|
|
"learning_rate": 2.5e-05,
|
|
"loss": 0.0002,
|
|
"num_tokens": 49066.0,
|
|
"reward": -0.5255992412567139,
|
|
"reward_std": 3.1553311347961426,
|
|
"rewards/action_legal/mean": -0.25,
|
|
"rewards/action_legal/std": 0.5,
|
|
"rewards/belief_accuracy/mean": 0.122025266289711,
|
|
"rewards/belief_accuracy/std": 0.2217501848936081,
|
|
"rewards/env_reward/mean": -0.5944499969482422,
|
|
"rewards/env_reward/std": 1.6077580451965332,
|
|
"rewards/format_valid/mean": 0.25,
|
|
"rewards/format_valid/std": 1.5,
|
|
"step": 21
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 10.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 10.0,
|
|
"completions/max_terminated_length": 10.0,
|
|
"completions/mean_length": 9.75,
|
|
"completions/mean_terminated_length": 9.75,
|
|
"completions/min_length": 9.0,
|
|
"completions/min_terminated_length": 9.0,
|
|
"epoch": 0.018333333333333333,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 3.2595951557159424,
|
|
"kl": 0.0019797176646534353,
|
|
"learning_rate": 2.625e-05,
|
|
"loss": 0.0001,
|
|
"num_tokens": 51505.0,
|
|
"reward": 0.4400880038738251,
|
|
"reward_std": 0.5773349404335022,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.17279182374477386,
|
|
"rewards/belief_accuracy/std": 0.07431290298700333,
|
|
"rewards/env_reward/mean": -0.08552499860525131,
|
|
"rewards/env_reward/std": 0.2797339856624603,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 22
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.25,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 10.0,
|
|
"completions/mean_length": 15.25,
|
|
"completions/mean_terminated_length": 9.666666984558105,
|
|
"completions/min_length": 9.0,
|
|
"completions/min_terminated_length": 9.0,
|
|
"epoch": 0.019166666666666665,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 736.62890625,
|
|
"kl": 3.031015553511679,
|
|
"learning_rate": 2.7500000000000004e-05,
|
|
"loss": 0.1212,
|
|
"num_tokens": 53966.0,
|
|
"reward": 1.050592064857483,
|
|
"reward_std": 0.2744295299053192,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.2777556777000427,
|
|
"rewards/belief_accuracy/std": 0.10405799001455307,
|
|
"rewards/env_reward/mean": 0.1115499958395958,
|
|
"rewards/env_reward/std": 0.04209999740123749,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 23
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 11.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 11.0,
|
|
"completions/max_terminated_length": 11.0,
|
|
"completions/mean_length": 10.25,
|
|
"completions/mean_terminated_length": 10.25,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.02,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 4.14186954498291,
|
|
"kl": 0.01767307601403445,
|
|
"learning_rate": 2.8749999999999997e-05,
|
|
"loss": 0.0007,
|
|
"num_tokens": 56407.0,
|
|
"reward": -0.10755039006471634,
|
|
"reward_std": 1.0050357580184937,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.16075819730758667,
|
|
"rewards/belief_accuracy/std": 0.09638070315122604,
|
|
"rewards/env_reward/mean": -0.4265500009059906,
|
|
"rewards/env_reward/std": 0.7103733420372009,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 24
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 10.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 10.0,
|
|
"completions/max_terminated_length": 10.0,
|
|
"completions/mean_length": 9.75,
|
|
"completions/mean_terminated_length": 9.75,
|
|
"completions/min_length": 9.0,
|
|
"completions/min_terminated_length": 9.0,
|
|
"epoch": 0.020833333333333332,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 4.760839462280273,
|
|
"kl": 0.004387545719509944,
|
|
"learning_rate": 3e-05,
|
|
"loss": 0.0002,
|
|
"num_tokens": 58846.0,
|
|
"reward": -0.6063781976699829,
|
|
"reward_std": 3.0987460613250732,
|
|
"rewards/action_legal/mean": -0.25,
|
|
"rewards/action_legal/std": 0.5,
|
|
"rewards/belief_accuracy/mean": 0.08613643050193787,
|
|
"rewards/belief_accuracy/std": 0.20087628066539764,
|
|
"rewards/env_reward/mean": -0.5765249729156494,
|
|
"rewards/env_reward/std": 1.6289700269699097,
|
|
"rewards/format_valid/mean": 0.25,
|
|
"rewards/format_valid/std": 1.5,
|
|
"step": 25
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 10.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 10.0,
|
|
"completions/max_terminated_length": 10.0,
|
|
"completions/mean_length": 9.75,
|
|
"completions/mean_terminated_length": 9.75,
|
|
"completions/min_length": 9.0,
|
|
"completions/min_terminated_length": 9.0,
|
|
"epoch": 0.021666666666666667,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 4.131656646728516,
|
|
"kl": 0.006817424291511998,
|
|
"learning_rate": 3.125e-05,
|
|
"loss": 0.0003,
|
|
"num_tokens": 61285.0,
|
|
"reward": 1.4750633239746094,
|
|
"reward_std": 0.6433582305908203,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.22258360683918,
|
|
"rewards/belief_accuracy/std": 0.0710342675447464,
|
|
"rewards/env_reward/mean": 0.5048750042915344,
|
|
"rewards/env_reward/std": 0.30701497197151184,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 26
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 10.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 10.0,
|
|
"completions/max_terminated_length": 10.0,
|
|
"completions/mean_length": 9.75,
|
|
"completions/mean_terminated_length": 9.75,
|
|
"completions/min_length": 9.0,
|
|
"completions/min_terminated_length": 9.0,
|
|
"epoch": 0.0225,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 4.655338764190674,
|
|
"kl": 0.011261592793744057,
|
|
"learning_rate": 3.2500000000000004e-05,
|
|
"loss": 0.0005,
|
|
"num_tokens": 63724.0,
|
|
"reward": 1.4208177328109741,
|
|
"reward_std": 1.19350266456604,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.21906425058841705,
|
|
"rewards/belief_accuracy/std": 0.18924623727798462,
|
|
"rewards/env_reward/mean": 0.47574999928474426,
|
|
"rewards/env_reward/std": 0.4241226613521576,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 27
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.25,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 11.0,
|
|
"completions/mean_length": 15.25,
|
|
"completions/mean_terminated_length": 9.666666984558105,
|
|
"completions/min_length": 9.0,
|
|
"completions/min_terminated_length": 9.0,
|
|
"epoch": 0.023333333333333334,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 35.8610725402832,
|
|
"kl": 4.056387651711702,
|
|
"learning_rate": 3.375000000000001e-05,
|
|
"loss": 0.1623,
|
|
"num_tokens": 65857.0,
|
|
"reward": 0.2565092444419861,
|
|
"reward_std": 0.9514979124069214,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.22687393426895142,
|
|
"rewards/belief_accuracy/std": 0.07754969596862793,
|
|
"rewards/env_reward/mean": -0.31607499718666077,
|
|
"rewards/env_reward/std": 0.7444266080856323,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 28
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 10.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 10.0,
|
|
"completions/max_terminated_length": 10.0,
|
|
"completions/mean_length": 9.5,
|
|
"completions/mean_terminated_length": 9.5,
|
|
"completions/min_length": 9.0,
|
|
"completions/min_terminated_length": 9.0,
|
|
"epoch": 0.024166666666666666,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 3.129375696182251,
|
|
"kl": 0.06665351914125495,
|
|
"learning_rate": 3.5e-05,
|
|
"loss": 0.0027,
|
|
"num_tokens": 68295.0,
|
|
"reward": 1.5676257610321045,
|
|
"reward_std": 0.5401614904403687,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.23782525956630707,
|
|
"rewards/belief_accuracy/std": 0.16049730777740479,
|
|
"rewards/env_reward/mean": 0.5361000299453735,
|
|
"rewards/env_reward/std": 0.2634325325489044,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 29
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 10.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 10.0,
|
|
"completions/max_terminated_length": 10.0,
|
|
"completions/mean_length": 9.25,
|
|
"completions/mean_terminated_length": 9.25,
|
|
"completions/min_length": 9.0,
|
|
"completions/min_terminated_length": 9.0,
|
|
"epoch": 0.025,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 7.313047885894775,
|
|
"kl": 0.09575654342916096,
|
|
"learning_rate": 3.625e-05,
|
|
"loss": 0.0038,
|
|
"num_tokens": 70592.0,
|
|
"reward": 1.385237693786621,
|
|
"reward_std": 0.23877514898777008,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.25660422444343567,
|
|
"rewards/belief_accuracy/std": 0.030240608379244804,
|
|
"rewards/env_reward/mean": 0.3769500255584717,
|
|
"rewards/env_reward/std": 0.1939132809638977,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 30
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 15.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 15.0,
|
|
"completions/max_terminated_length": 15.0,
|
|
"completions/mean_length": 11.0,
|
|
"completions/mean_terminated_length": 11.0,
|
|
"completions/min_length": 9.0,
|
|
"completions/min_terminated_length": 9.0,
|
|
"epoch": 0.025833333333333333,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 302.0821838378906,
|
|
"kl": 0.17512649775017053,
|
|
"learning_rate": 3.7500000000000003e-05,
|
|
"loss": 0.007,
|
|
"num_tokens": 73036.0,
|
|
"reward": -0.8282430171966553,
|
|
"reward_std": 2.9490480422973633,
|
|
"rewards/action_legal/mean": -0.25,
|
|
"rewards/action_legal/std": 0.5,
|
|
"rewards/belief_accuracy/mean": 0.08568151295185089,
|
|
"rewards/belief_accuracy/std": 0.19252440333366394,
|
|
"rewards/env_reward/mean": -0.7235249876976013,
|
|
"rewards/env_reward/std": 1.5176500082015991,
|
|
"rewards/format_valid/mean": 0.25,
|
|
"rewards/format_valid/std": 1.5,
|
|
"step": 31
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.25,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 10.0,
|
|
"completions/mean_length": 15.5,
|
|
"completions/mean_terminated_length": 10.0,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.02666666666666667,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 78.20838165283203,
|
|
"kl": 9.407063644379377,
|
|
"learning_rate": 3.875e-05,
|
|
"loss": 0.3763,
|
|
"num_tokens": 75498.0,
|
|
"reward": -1.4438053369522095,
|
|
"reward_std": 2.5607211589813232,
|
|
"rewards/action_legal/mean": -0.25,
|
|
"rewards/action_legal/std": 0.5,
|
|
"rewards/belief_accuracy/mean": -0.07945596426725388,
|
|
"rewards/belief_accuracy/std": 0.0959252119064331,
|
|
"rewards/env_reward/mean": -0.8036249876022339,
|
|
"rewards/env_reward/std": 1.5019314289093018,
|
|
"rewards/format_valid/mean": 0.25,
|
|
"rewards/format_valid/std": 1.5,
|
|
"step": 32
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 10.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 10.0,
|
|
"completions/max_terminated_length": 10.0,
|
|
"completions/mean_length": 9.75,
|
|
"completions/mean_terminated_length": 9.75,
|
|
"completions/min_length": 9.0,
|
|
"completions/min_terminated_length": 9.0,
|
|
"epoch": 0.0275,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 3.190960168838501,
|
|
"kl": 0.06879310176009312,
|
|
"learning_rate": 4e-05,
|
|
"loss": 0.0028,
|
|
"num_tokens": 77225.0,
|
|
"reward": 1.5751994848251343,
|
|
"reward_std": 0.5590165257453918,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.25149983167648315,
|
|
"rewards/belief_accuracy/std": 0.07094359397888184,
|
|
"rewards/env_reward/mean": 0.5138000249862671,
|
|
"rewards/env_reward/std": 0.23183931410312653,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 33
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.25,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 24.0,
|
|
"completions/mean_length": 18.75,
|
|
"completions/mean_terminated_length": 14.333333969116211,
|
|
"completions/min_length": 9.0,
|
|
"completions/min_terminated_length": 9.0,
|
|
"epoch": 0.028333333333333332,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 28.640579223632812,
|
|
"kl": 0.42012222670018673,
|
|
"learning_rate": 4.125e-05,
|
|
"loss": 0.0168,
|
|
"num_tokens": 79700.0,
|
|
"reward": 0.3387056589126587,
|
|
"reward_std": 3.7822914123535156,
|
|
"rewards/action_legal/mean": -0.25,
|
|
"rewards/action_legal/std": 0.5,
|
|
"rewards/belief_accuracy/mean": 0.2497768998146057,
|
|
"rewards/belief_accuracy/std": 0.30190309882164,
|
|
"rewards/env_reward/mean": -0.2737500071525574,
|
|
"rewards/env_reward/std": 1.855374813079834,
|
|
"rewards/format_valid/mean": 0.25,
|
|
"rewards/format_valid/std": 1.5,
|
|
"step": 34
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 10.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 10.0,
|
|
"completions/max_terminated_length": 10.0,
|
|
"completions/mean_length": 9.5,
|
|
"completions/mean_terminated_length": 9.5,
|
|
"completions/min_length": 9.0,
|
|
"completions/min_terminated_length": 9.0,
|
|
"epoch": 0.029166666666666667,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 5.623953342437744,
|
|
"kl": 0.14891700155567378,
|
|
"learning_rate": 4.25e-05,
|
|
"loss": 0.006,
|
|
"num_tokens": 82138.0,
|
|
"reward": 0.45824581384658813,
|
|
"reward_std": 0.7270535230636597,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.25250691175460815,
|
|
"rewards/belief_accuracy/std": 0.1582489311695099,
|
|
"rewards/env_reward/mean": -0.23285000026226044,
|
|
"rewards/env_reward/std": 0.23289161920547485,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 35
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 11.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 11.0,
|
|
"completions/max_terminated_length": 11.0,
|
|
"completions/mean_length": 10.0,
|
|
"completions/mean_terminated_length": 10.0,
|
|
"completions/min_length": 9.0,
|
|
"completions/min_terminated_length": 9.0,
|
|
"epoch": 0.03,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 5.508930683135986,
|
|
"kl": 0.2548879988025874,
|
|
"learning_rate": 4.375e-05,
|
|
"loss": 0.0102,
|
|
"num_tokens": 84578.0,
|
|
"reward": 0.4626871347427368,
|
|
"reward_std": 0.8850942254066467,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.24804989993572235,
|
|
"rewards/belief_accuracy/std": 0.12259609997272491,
|
|
"rewards/env_reward/mean": -0.22097501158714294,
|
|
"rewards/env_reward/std": 0.6560747623443604,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 36
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 11.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 11.0,
|
|
"completions/max_terminated_length": 11.0,
|
|
"completions/mean_length": 10.25,
|
|
"completions/mean_terminated_length": 10.25,
|
|
"completions/min_length": 9.0,
|
|
"completions/min_terminated_length": 9.0,
|
|
"epoch": 0.030833333333333334,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 5.627114295959473,
|
|
"kl": 0.132804719673004,
|
|
"learning_rate": 4.5e-05,
|
|
"loss": 0.0053,
|
|
"num_tokens": 87019.0,
|
|
"reward": -2.2696728706359863,
|
|
"reward_std": 3.4521307945251465,
|
|
"rewards/action_legal/mean": -0.5,
|
|
"rewards/action_legal/std": 0.5773502588272095,
|
|
"rewards/belief_accuracy/mean": -0.0548660047352314,
|
|
"rewards/belief_accuracy/std": 0.19056659936904907,
|
|
"rewards/env_reward/mean": -1.3700499534606934,
|
|
"rewards/env_reward/std": 1.8821041584014893,
|
|
"rewards/format_valid/mean": -0.5,
|
|
"rewards/format_valid/std": 1.7320507764816284,
|
|
"step": 37
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 10.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 10.0,
|
|
"completions/max_terminated_length": 10.0,
|
|
"completions/mean_length": 9.5,
|
|
"completions/mean_terminated_length": 9.5,
|
|
"completions/min_length": 9.0,
|
|
"completions/min_terminated_length": 9.0,
|
|
"epoch": 0.03166666666666667,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 5.246164321899414,
|
|
"kl": 0.2298137085745111,
|
|
"learning_rate": 4.6250000000000006e-05,
|
|
"loss": 0.0092,
|
|
"num_tokens": 89457.0,
|
|
"reward": 1.1744983196258545,
|
|
"reward_std": 0.3355043828487396,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.29197028279304504,
|
|
"rewards/belief_accuracy/std": 0.12365185469388962,
|
|
"rewards/env_reward/mean": 0.1657249927520752,
|
|
"rewards/env_reward/std": 0.03991043195128441,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 38
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.25,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 11.0,
|
|
"completions/mean_length": 15.25,
|
|
"completions/mean_terminated_length": 9.666666984558105,
|
|
"completions/min_length": 9.0,
|
|
"completions/min_terminated_length": 9.0,
|
|
"epoch": 0.0325,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 12.651698112487793,
|
|
"kl": 2.4256066400557756,
|
|
"learning_rate": 4.75e-05,
|
|
"loss": 0.097,
|
|
"num_tokens": 91918.0,
|
|
"reward": -0.7721501588821411,
|
|
"reward_std": 1.5534520149230957,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.13362497091293335,
|
|
"rewards/belief_accuracy/std": 0.1170458197593689,
|
|
"rewards/env_reward/mean": -0.8153500556945801,
|
|
"rewards/env_reward/std": 0.9287000298500061,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 39
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 10.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 10.0,
|
|
"completions/max_terminated_length": 10.0,
|
|
"completions/mean_length": 9.5,
|
|
"completions/mean_terminated_length": 9.5,
|
|
"completions/min_length": 9.0,
|
|
"completions/min_terminated_length": 9.0,
|
|
"epoch": 0.03333333333333333,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 7.508624076843262,
|
|
"kl": 0.12869007809786126,
|
|
"learning_rate": 4.875e-05,
|
|
"loss": 0.0051,
|
|
"num_tokens": 93644.0,
|
|
"reward": 1.710510492324829,
|
|
"reward_std": 0.9325205683708191,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.3421034812927246,
|
|
"rewards/belief_accuracy/std": 0.03489655256271362,
|
|
"rewards/env_reward/mean": 0.4228000044822693,
|
|
"rewards/env_reward/std": 0.6864694952964783,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 40
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 10.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 10.0,
|
|
"completions/max_terminated_length": 10.0,
|
|
"completions/mean_length": 9.5,
|
|
"completions/mean_terminated_length": 9.5,
|
|
"completions/min_length": 9.0,
|
|
"completions/min_terminated_length": 9.0,
|
|
"epoch": 0.034166666666666665,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 4.437275409698486,
|
|
"kl": 0.23907954257447273,
|
|
"learning_rate": 5e-05,
|
|
"loss": 0.0096,
|
|
"num_tokens": 96082.0,
|
|
"reward": 1.1177518367767334,
|
|
"reward_std": 0.4233185946941376,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.1830797791481018,
|
|
"rewards/belief_accuracy/std": 0.0974152609705925,
|
|
"rewards/env_reward/mean": 0.345674991607666,
|
|
"rewards/env_reward/std": 0.2988019585609436,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 41
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 11.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 11.0,
|
|
"completions/max_terminated_length": 11.0,
|
|
"completions/mean_length": 9.75,
|
|
"completions/mean_terminated_length": 9.75,
|
|
"completions/min_length": 9.0,
|
|
"completions/min_terminated_length": 9.0,
|
|
"epoch": 0.035,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 4.4014458656311035,
|
|
"kl": 0.1710746451281011,
|
|
"learning_rate": 4.986111111111111e-05,
|
|
"loss": 0.0068,
|
|
"num_tokens": 98197.0,
|
|
"reward": -0.028865892440080643,
|
|
"reward_std": 1.0444624423980713,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.05849887803196907,
|
|
"rewards/belief_accuracy/std": 0.0555555559694767,
|
|
"rewards/env_reward/mean": -0.16957500576972961,
|
|
"rewards/env_reward/std": 0.7329205870628357,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 42
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 9.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 9.0,
|
|
"completions/max_terminated_length": 9.0,
|
|
"completions/mean_length": 9.0,
|
|
"completions/mean_terminated_length": 9.0,
|
|
"completions/min_length": 9.0,
|
|
"completions/min_terminated_length": 9.0,
|
|
"epoch": 0.035833333333333335,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 3.1398186683654785,
|
|
"kl": 0.044276596046984196,
|
|
"learning_rate": 4.972222222222223e-05,
|
|
"loss": 0.0018,
|
|
"num_tokens": 100633.0,
|
|
"reward": 0.06939443945884705,
|
|
"reward_std": 0.36057770252227783,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.18461482226848602,
|
|
"rewards/belief_accuracy/std": 0.1464960277080536,
|
|
"rewards/env_reward/mean": -0.3562999963760376,
|
|
"rewards/env_reward/std": 0.1712000072002411,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 43
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 10.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 10.0,
|
|
"completions/max_terminated_length": 10.0,
|
|
"completions/mean_length": 9.25,
|
|
"completions/mean_terminated_length": 9.25,
|
|
"completions/min_length": 9.0,
|
|
"completions/min_terminated_length": 9.0,
|
|
"epoch": 0.03666666666666667,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 4.34901237487793,
|
|
"kl": 0.3707070527598262,
|
|
"learning_rate": 4.958333333333334e-05,
|
|
"loss": 0.0148,
|
|
"num_tokens": 103070.0,
|
|
"reward": 0.9979052543640137,
|
|
"reward_std": 0.08536310493946075,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.28115594387054443,
|
|
"rewards/belief_accuracy/std": 0.12468524277210236,
|
|
"rewards/env_reward/mean": 0.06962499022483826,
|
|
"rewards/env_reward/std": 0.2918499708175659,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 44
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 10.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 10.0,
|
|
"completions/max_terminated_length": 10.0,
|
|
"completions/mean_length": 9.25,
|
|
"completions/mean_terminated_length": 9.25,
|
|
"completions/min_length": 9.0,
|
|
"completions/min_terminated_length": 9.0,
|
|
"epoch": 0.0375,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 2.1270384788513184,
|
|
"kl": 0.27486721728928387,
|
|
"learning_rate": 4.9444444444444446e-05,
|
|
"loss": 0.011,
|
|
"num_tokens": 105367.0,
|
|
"reward": 1.3586945533752441,
|
|
"reward_std": 0.09579288214445114,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.2644440233707428,
|
|
"rewards/belief_accuracy/std": 0.05163230374455452,
|
|
"rewards/env_reward/mean": 0.34357500076293945,
|
|
"rewards/env_reward/std": 0.07905000448226929,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 45
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 12.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 10.5,
|
|
"completions/mean_terminated_length": 10.5,
|
|
"completions/min_length": 9.0,
|
|
"completions/min_terminated_length": 9.0,
|
|
"epoch": 0.03833333333333333,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 11.598420143127441,
|
|
"kl": 0.35064559197053313,
|
|
"learning_rate": 4.930555555555556e-05,
|
|
"loss": 0.014,
|
|
"num_tokens": 107809.0,
|
|
"reward": -0.5861924886703491,
|
|
"reward_std": 3.1335580348968506,
|
|
"rewards/action_legal/mean": -0.25,
|
|
"rewards/action_legal/std": 0.5,
|
|
"rewards/belief_accuracy/mean": 0.06579001247882843,
|
|
"rewards/belief_accuracy/std": 0.18941286206245422,
|
|
"rewards/env_reward/mean": -0.5223749876022339,
|
|
"rewards/env_reward/std": 1.6578657627105713,
|
|
"rewards/format_valid/mean": 0.25,
|
|
"rewards/format_valid/std": 1.5,
|
|
"step": 46
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 10.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 10.0,
|
|
"completions/max_terminated_length": 10.0,
|
|
"completions/mean_length": 9.75,
|
|
"completions/mean_terminated_length": 9.75,
|
|
"completions/min_length": 9.0,
|
|
"completions/min_terminated_length": 9.0,
|
|
"epoch": 0.03916666666666667,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 3.6913492679595947,
|
|
"kl": 0.18329114187508821,
|
|
"learning_rate": 4.9166666666666665e-05,
|
|
"loss": 0.0073,
|
|
"num_tokens": 110248.0,
|
|
"reward": 1.4434185028076172,
|
|
"reward_std": 0.21208810806274414,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.302872896194458,
|
|
"rewards/belief_accuracy/std": 0.07409624010324478,
|
|
"rewards/env_reward/mean": 0.3232000172138214,
|
|
"rewards/env_reward/std": 0.1860000044107437,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 47
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 12.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 10.5,
|
|
"completions/mean_terminated_length": 10.5,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.04,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 7.727506160736084,
|
|
"kl": 0.6681761667132378,
|
|
"learning_rate": 4.902777777777778e-05,
|
|
"loss": 0.0267,
|
|
"num_tokens": 112690.0,
|
|
"reward": 1.4063067436218262,
|
|
"reward_std": 0.5394194722175598,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.23232725262641907,
|
|
"rewards/belief_accuracy/std": 0.16305728256702423,
|
|
"rewards/env_reward/mean": 0.4395500123500824,
|
|
"rewards/env_reward/std": 0.4166736900806427,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 48
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 10.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 10.0,
|
|
"completions/max_terminated_length": 10.0,
|
|
"completions/mean_length": 9.25,
|
|
"completions/mean_terminated_length": 9.25,
|
|
"completions/min_length": 9.0,
|
|
"completions/min_terminated_length": 9.0,
|
|
"epoch": 0.04083333333333333,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 4.620209217071533,
|
|
"kl": 0.32542235124856234,
|
|
"learning_rate": 4.888888888888889e-05,
|
|
"loss": 0.013,
|
|
"num_tokens": 115127.0,
|
|
"reward": 0.45589596033096313,
|
|
"reward_std": 0.40303459763526917,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.2432861626148224,
|
|
"rewards/belief_accuracy/std": 0.13326476514339447,
|
|
"rewards/env_reward/mean": -0.21597501635551453,
|
|
"rewards/env_reward/std": 0.0036500021815299988,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 49
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 28.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 28.0,
|
|
"completions/max_terminated_length": 28.0,
|
|
"completions/mean_length": 14.5,
|
|
"completions/mean_terminated_length": 14.5,
|
|
"completions/min_length": 9.0,
|
|
"completions/min_terminated_length": 9.0,
|
|
"epoch": 0.041666666666666664,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 15.720799446105957,
|
|
"kl": 1.7742180861532688,
|
|
"learning_rate": 4.875e-05,
|
|
"loss": 0.071,
|
|
"num_tokens": 117585.0,
|
|
"reward": -0.6876518726348877,
|
|
"reward_std": 3.0439605712890625,
|
|
"rewards/action_legal/mean": -0.25,
|
|
"rewards/action_legal/std": 0.5,
|
|
"rewards/belief_accuracy/mean": 0.14229519665241241,
|
|
"rewards/belief_accuracy/std": 0.23121023178100586,
|
|
"rewards/env_reward/mean": -0.743025004863739,
|
|
"rewards/env_reward/std": 1.5051672458648682,
|
|
"rewards/format_valid/mean": 0.25,
|
|
"rewards/format_valid/std": 1.5,
|
|
"step": 50
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 10.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 10.0,
|
|
"completions/max_terminated_length": 10.0,
|
|
"completions/mean_length": 9.75,
|
|
"completions/mean_terminated_length": 9.75,
|
|
"completions/min_length": 9.0,
|
|
"completions/min_terminated_length": 9.0,
|
|
"epoch": 0.0425,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 5.957500457763672,
|
|
"kl": 0.17545690201222897,
|
|
"learning_rate": 4.8611111111111115e-05,
|
|
"loss": 0.007,
|
|
"num_tokens": 119516.0,
|
|
"reward": 1.8904452323913574,
|
|
"reward_std": 0.34035390615463257,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.3019692301750183,
|
|
"rewards/belief_accuracy/std": 0.07230889797210693,
|
|
"rewards/env_reward/mean": 0.6230250000953674,
|
|
"rewards/env_reward/std": 0.24345001578330994,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 51
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 10.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 10.0,
|
|
"completions/max_terminated_length": 10.0,
|
|
"completions/mean_length": 9.5,
|
|
"completions/mean_terminated_length": 9.5,
|
|
"completions/min_length": 9.0,
|
|
"completions/min_terminated_length": 9.0,
|
|
"epoch": 0.043333333333333335,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 5.6418776512146,
|
|
"kl": 0.3459396343678236,
|
|
"learning_rate": 4.8472222222222224e-05,
|
|
"loss": 0.0138,
|
|
"num_tokens": 121954.0,
|
|
"reward": 0.47558823227882385,
|
|
"reward_std": 0.17540638148784637,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.24578773975372314,
|
|
"rewards/belief_accuracy/std": 0.12300436198711395,
|
|
"rewards/env_reward/mean": -0.20785000920295715,
|
|
"rewards/env_reward/std": 0.14000745117664337,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 52
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 10.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 10.0,
|
|
"completions/max_terminated_length": 10.0,
|
|
"completions/mean_length": 9.75,
|
|
"completions/mean_terminated_length": 9.75,
|
|
"completions/min_length": 9.0,
|
|
"completions/min_terminated_length": 9.0,
|
|
"epoch": 0.04416666666666667,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 6.052071571350098,
|
|
"kl": 0.3293638424947858,
|
|
"learning_rate": 4.8333333333333334e-05,
|
|
"loss": 0.0132,
|
|
"num_tokens": 124393.0,
|
|
"reward": 1.111875057220459,
|
|
"reward_std": 0.417526513338089,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.34039586782455444,
|
|
"rewards/belief_accuracy/std": 0.07877691090106964,
|
|
"rewards/env_reward/mean": 0.027125000953674316,
|
|
"rewards/env_reward/std": 0.1343500018119812,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 53
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 10.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 10.0,
|
|
"completions/max_terminated_length": 10.0,
|
|
"completions/mean_length": 10.0,
|
|
"completions/mean_terminated_length": 10.0,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.045,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 5.564804553985596,
|
|
"kl": 0.12563097476959229,
|
|
"learning_rate": 4.819444444444445e-05,
|
|
"loss": 0.005,
|
|
"num_tokens": 126833.0,
|
|
"reward": -0.37219369411468506,
|
|
"reward_std": 3.258995294570923,
|
|
"rewards/action_legal/mean": -0.25,
|
|
"rewards/action_legal/std": 0.5,
|
|
"rewards/belief_accuracy/mean": 0.1694604456424713,
|
|
"rewards/belief_accuracy/std": 0.25362879037857056,
|
|
"rewards/env_reward/mean": -0.5870499610900879,
|
|
"rewards/env_reward/std": 1.6088296175003052,
|
|
"rewards/format_valid/mean": 0.25,
|
|
"rewards/format_valid/std": 1.5,
|
|
"step": 54
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 10.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 10.0,
|
|
"completions/max_terminated_length": 10.0,
|
|
"completions/mean_length": 8.25,
|
|
"completions/mean_terminated_length": 8.25,
|
|
"completions/min_length": 5.0,
|
|
"completions/min_terminated_length": 5.0,
|
|
"epoch": 0.04583333333333333,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 35.32516860961914,
|
|
"kl": 0.4959399476647377,
|
|
"learning_rate": 4.805555555555556e-05,
|
|
"loss": 0.0198,
|
|
"num_tokens": 129266.0,
|
|
"reward": 0.5137417316436768,
|
|
"reward_std": 1.1124905347824097,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.1392139196395874,
|
|
"rewards/belief_accuracy/std": 0.24298755824565887,
|
|
"rewards/env_reward/mean": 0.0349000059068203,
|
|
"rewards/env_reward/std": 0.3493712842464447,
|
|
"rewards/format_valid/mean": 0.875,
|
|
"rewards/format_valid/std": 0.25,
|
|
"step": 55
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 10.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 10.0,
|
|
"completions/max_terminated_length": 10.0,
|
|
"completions/mean_length": 9.5,
|
|
"completions/mean_terminated_length": 9.5,
|
|
"completions/min_length": 9.0,
|
|
"completions/min_terminated_length": 9.0,
|
|
"epoch": 0.04666666666666667,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 3.5689568519592285,
|
|
"kl": 0.5865896865725517,
|
|
"learning_rate": 4.791666666666667e-05,
|
|
"loss": 0.0235,
|
|
"num_tokens": 131704.0,
|
|
"reward": 0.5574407577514648,
|
|
"reward_std": 0.4707197844982147,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.20113441348075867,
|
|
"rewards/belief_accuracy/std": 0.06771832704544067,
|
|
"rewards/env_reward/mean": -0.0639750063419342,
|
|
"rewards/env_reward/std": 0.37699612975120544,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 56
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 10.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 10.0,
|
|
"completions/max_terminated_length": 10.0,
|
|
"completions/mean_length": 9.5,
|
|
"completions/mean_terminated_length": 9.5,
|
|
"completions/min_length": 9.0,
|
|
"completions/min_terminated_length": 9.0,
|
|
"epoch": 0.0475,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 4.44246768951416,
|
|
"kl": 0.3799317330121994,
|
|
"learning_rate": 4.7777777777777784e-05,
|
|
"loss": 0.0152,
|
|
"num_tokens": 134142.0,
|
|
"reward": 0.5855016708374023,
|
|
"reward_std": 0.5364238023757935,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.24740056693553925,
|
|
"rewards/belief_accuracy/std": 0.19466152787208557,
|
|
"rewards/env_reward/mean": -0.13779999315738678,
|
|
"rewards/env_reward/std": 0.4736187756061554,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 57
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 10.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 10.0,
|
|
"completions/max_terminated_length": 10.0,
|
|
"completions/mean_length": 9.5,
|
|
"completions/mean_terminated_length": 9.5,
|
|
"completions/min_length": 9.0,
|
|
"completions/min_terminated_length": 9.0,
|
|
"epoch": 0.04833333333333333,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 2.86322021484375,
|
|
"kl": 0.3641611896455288,
|
|
"learning_rate": 4.7638888888888887e-05,
|
|
"loss": 0.0146,
|
|
"num_tokens": 136072.0,
|
|
"reward": 1.1454858779907227,
|
|
"reward_std": 0.34598207473754883,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.2346869856119156,
|
|
"rewards/belief_accuracy/std": 0.0894273892045021,
|
|
"rewards/env_reward/mean": 0.2609499990940094,
|
|
"rewards/env_reward/std": 0.06991129368543625,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 58
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.25,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 10.0,
|
|
"completions/mean_length": 15.25,
|
|
"completions/mean_terminated_length": 9.666666984558105,
|
|
"completions/min_length": 9.0,
|
|
"completions/min_terminated_length": 9.0,
|
|
"epoch": 0.049166666666666664,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 45.849605560302734,
|
|
"kl": 2.8015664890408516,
|
|
"learning_rate": 4.75e-05,
|
|
"loss": 0.1121,
|
|
"num_tokens": 138533.0,
|
|
"reward": 1.7629446983337402,
|
|
"reward_std": 0.3849683701992035,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.32485654950141907,
|
|
"rewards/belief_accuracy/std": 0.14582888782024384,
|
|
"rewards/env_reward/mean": 0.4922500252723694,
|
|
"rewards/env_reward/std": 0.1800999939441681,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 59
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 11.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 11.0,
|
|
"completions/max_terminated_length": 11.0,
|
|
"completions/mean_length": 10.25,
|
|
"completions/mean_terminated_length": 10.25,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.05,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 4.147298336029053,
|
|
"kl": 0.15783802792429924,
|
|
"learning_rate": 4.736111111111111e-05,
|
|
"loss": 0.0063,
|
|
"num_tokens": 140974.0,
|
|
"reward": 1.0111374855041504,
|
|
"reward_std": 1.9640129804611206,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.22329160571098328,
|
|
"rewards/belief_accuracy/std": 0.2285340428352356,
|
|
"rewards/env_reward/mean": 0.19417500495910645,
|
|
"rewards/env_reward/std": 0.9323619604110718,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 60
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 14.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 14.0,
|
|
"completions/max_terminated_length": 14.0,
|
|
"completions/mean_length": 10.75,
|
|
"completions/mean_terminated_length": 10.75,
|
|
"completions/min_length": 9.0,
|
|
"completions/min_terminated_length": 9.0,
|
|
"epoch": 0.050833333333333335,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 5.107398509979248,
|
|
"kl": 0.6699433820322156,
|
|
"learning_rate": 4.722222222222222e-05,
|
|
"loss": 0.0268,
|
|
"num_tokens": 143417.0,
|
|
"reward": 0.7414954900741577,
|
|
"reward_std": 0.12289471924304962,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.2869609594345093,
|
|
"rewards/belief_accuracy/std": 0.03287558630108833,
|
|
"rewards/env_reward/mean": -0.11292499303817749,
|
|
"rewards/env_reward/std": 0.02424999698996544,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 61
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 10.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 10.0,
|
|
"completions/max_terminated_length": 10.0,
|
|
"completions/mean_length": 10.0,
|
|
"completions/mean_terminated_length": 10.0,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.051666666666666666,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 1.9671472311019897,
|
|
"kl": 0.21522311307489872,
|
|
"learning_rate": 4.708333333333334e-05,
|
|
"loss": 0.0086,
|
|
"num_tokens": 145857.0,
|
|
"reward": 1.8655295372009277,
|
|
"reward_std": 0.22047176957130432,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.1922764927148819,
|
|
"rewards/belief_accuracy/std": 0.07349060475826263,
|
|
"rewards/env_reward/mean": 0.8258000016212463,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 62
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.25,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 10.0,
|
|
"completions/mean_length": 15.0,
|
|
"completions/mean_terminated_length": 9.333333969116211,
|
|
"completions/min_length": 9.0,
|
|
"completions/min_terminated_length": 9.0,
|
|
"epoch": 0.0525,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 46.615821838378906,
|
|
"kl": 7.5339599046856165,
|
|
"learning_rate": 4.6944444444444446e-05,
|
|
"loss": 0.3014,
|
|
"num_tokens": 148317.0,
|
|
"reward": -0.0844120979309082,
|
|
"reward_std": 3.4652018547058105,
|
|
"rewards/action_legal/mean": -0.25,
|
|
"rewards/action_legal/std": 0.5,
|
|
"rewards/belief_accuracy/mean": 0.16348761320114136,
|
|
"rewards/belief_accuracy/std": 0.24438007175922394,
|
|
"rewards/env_reward/mean": -0.38324999809265137,
|
|
"rewards/env_reward/std": 1.7579199075698853,
|
|
"rewards/format_valid/mean": 0.25,
|
|
"rewards/format_valid/std": 1.5,
|
|
"step": 63
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 10.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 10.0,
|
|
"completions/max_terminated_length": 10.0,
|
|
"completions/mean_length": 9.75,
|
|
"completions/mean_terminated_length": 9.75,
|
|
"completions/min_length": 9.0,
|
|
"completions/min_terminated_length": 9.0,
|
|
"epoch": 0.05333333333333334,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 3.313786506652832,
|
|
"kl": 0.31290814094245434,
|
|
"learning_rate": 4.6805555555555556e-05,
|
|
"loss": 0.0125,
|
|
"num_tokens": 150756.0,
|
|
"reward": 1.4420485496520996,
|
|
"reward_std": 0.30519527196884155,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.2965036630630493,
|
|
"rewards/belief_accuracy/std": 0.015631647780537605,
|
|
"rewards/env_reward/mean": 0.33502501249313354,
|
|
"rewards/env_reward/std": 0.1946970373392105,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 64
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 10.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 10.0,
|
|
"completions/max_terminated_length": 10.0,
|
|
"completions/mean_length": 9.75,
|
|
"completions/mean_terminated_length": 9.75,
|
|
"completions/min_length": 9.0,
|
|
"completions/min_terminated_length": 9.0,
|
|
"epoch": 0.05416666666666667,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 4.699985980987549,
|
|
"kl": 0.27691997960209846,
|
|
"learning_rate": 4.666666666666667e-05,
|
|
"loss": 0.0111,
|
|
"num_tokens": 153195.0,
|
|
"reward": 0.6549558639526367,
|
|
"reward_std": 0.4356115162372589,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.09438945353031158,
|
|
"rewards/belief_accuracy/std": 0.07635381072759628,
|
|
"rewards/env_reward/mean": 0.2145249992609024,
|
|
"rewards/env_reward/std": 0.4134778380393982,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 65
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 10.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 10.0,
|
|
"completions/max_terminated_length": 10.0,
|
|
"completions/mean_length": 9.75,
|
|
"completions/mean_terminated_length": 9.75,
|
|
"completions/min_length": 9.0,
|
|
"completions/min_terminated_length": 9.0,
|
|
"epoch": 0.055,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 2.48840594291687,
|
|
"kl": 0.23591649159789085,
|
|
"learning_rate": 4.652777777777778e-05,
|
|
"loss": 0.0094,
|
|
"num_tokens": 155634.0,
|
|
"reward": 1.1451208591461182,
|
|
"reward_std": 0.143072709441185,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.2777777910232544,
|
|
"rewards/belief_accuracy/std": 0.06283816695213318,
|
|
"rewards/env_reward/mean": 0.17452499270439148,
|
|
"rewards/env_reward/std": 0.19420935213565826,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 66
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 10.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 10.0,
|
|
"completions/max_terminated_length": 10.0,
|
|
"completions/mean_length": 9.5,
|
|
"completions/mean_terminated_length": 9.5,
|
|
"completions/min_length": 9.0,
|
|
"completions/min_terminated_length": 9.0,
|
|
"epoch": 0.05583333333333333,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 5.279496669769287,
|
|
"kl": 0.2593200672417879,
|
|
"learning_rate": 4.638888888888889e-05,
|
|
"loss": 0.0104,
|
|
"num_tokens": 158072.0,
|
|
"reward": 0.5202434062957764,
|
|
"reward_std": 0.3491542339324951,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.15712279081344604,
|
|
"rewards/belief_accuracy/std": 0.0946710854768753,
|
|
"rewards/env_reward/mean": -0.0007500015199184418,
|
|
"rewards/env_reward/std": 0.0485551580786705,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 67
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 10.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 10.0,
|
|
"completions/max_terminated_length": 10.0,
|
|
"completions/mean_length": 9.75,
|
|
"completions/mean_terminated_length": 9.75,
|
|
"completions/min_length": 9.0,
|
|
"completions/min_terminated_length": 9.0,
|
|
"epoch": 0.056666666666666664,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 5.2196364402771,
|
|
"kl": 0.22027479112148285,
|
|
"learning_rate": 4.6250000000000006e-05,
|
|
"loss": 0.0088,
|
|
"num_tokens": 160511.0,
|
|
"reward": 1.1925969123840332,
|
|
"reward_std": 0.4943404197692871,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.2660156190395355,
|
|
"rewards/belief_accuracy/std": 0.03546026349067688,
|
|
"rewards/env_reward/mean": 0.2296999990940094,
|
|
"rewards/env_reward/std": 0.3610000014305115,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 68
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 10.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 10.0,
|
|
"completions/max_terminated_length": 10.0,
|
|
"completions/mean_length": 9.75,
|
|
"completions/mean_terminated_length": 9.75,
|
|
"completions/min_length": 9.0,
|
|
"completions/min_terminated_length": 9.0,
|
|
"epoch": 0.0575,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 2.0931131839752197,
|
|
"kl": 0.31468209251761436,
|
|
"learning_rate": 4.6111111111111115e-05,
|
|
"loss": 0.0126,
|
|
"num_tokens": 162950.0,
|
|
"reward": 1.4288655519485474,
|
|
"reward_std": 0.21090544760227203,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.2787468433380127,
|
|
"rewards/belief_accuracy/std": 0.07509276270866394,
|
|
"rewards/env_reward/mean": 0.3617500066757202,
|
|
"rewards/env_reward/std": 0.12350000441074371,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 69
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 17.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 17.0,
|
|
"completions/max_terminated_length": 17.0,
|
|
"completions/mean_length": 11.25,
|
|
"completions/mean_terminated_length": 11.25,
|
|
"completions/min_length": 9.0,
|
|
"completions/min_terminated_length": 9.0,
|
|
"epoch": 0.058333333333333334,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 4.71396017074585,
|
|
"kl": 1.3337756432592869,
|
|
"learning_rate": 4.5972222222222225e-05,
|
|
"loss": 0.0534,
|
|
"num_tokens": 165395.0,
|
|
"reward": 1.2500306367874146,
|
|
"reward_std": 0.5057908892631531,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.30136024951934814,
|
|
"rewards/belief_accuracy/std": 0.09320782870054245,
|
|
"rewards/env_reward/mean": 0.1973000019788742,
|
|
"rewards/env_reward/std": 0.24144788086414337,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 70
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 10.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 10.0,
|
|
"completions/max_terminated_length": 10.0,
|
|
"completions/mean_length": 9.75,
|
|
"completions/mean_terminated_length": 9.75,
|
|
"completions/min_length": 9.0,
|
|
"completions/min_terminated_length": 9.0,
|
|
"epoch": 0.059166666666666666,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 3.5546116828918457,
|
|
"kl": 0.4766240194439888,
|
|
"learning_rate": 4.5833333333333334e-05,
|
|
"loss": 0.0191,
|
|
"num_tokens": 167834.0,
|
|
"reward": 0.970757007598877,
|
|
"reward_std": 0.37021511793136597,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.24463149905204773,
|
|
"rewards/belief_accuracy/std": 0.12909035384655,
|
|
"rewards/env_reward/mean": 0.12457499653100967,
|
|
"rewards/env_reward/std": 0.02005000039935112,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 71
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 16.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 16.0,
|
|
"completions/max_terminated_length": 16.0,
|
|
"completions/mean_length": 11.5,
|
|
"completions/mean_terminated_length": 11.5,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.06,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 2.085524559020996,
|
|
"kl": 1.051513284444809,
|
|
"learning_rate": 4.569444444444444e-05,
|
|
"loss": 0.0421,
|
|
"num_tokens": 170280.0,
|
|
"reward": 1.3046754598617554,
|
|
"reward_std": 0.1669362485408783,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.191875159740448,
|
|
"rewards/belief_accuracy/std": 0.0556454099714756,
|
|
"rewards/env_reward/mean": 0.4526999890804291,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 72
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.25,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 11.0,
|
|
"completions/mean_length": 15.5,
|
|
"completions/mean_terminated_length": 10.0,
|
|
"completions/min_length": 9.0,
|
|
"completions/min_terminated_length": 9.0,
|
|
"epoch": 0.060833333333333336,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 17.69377326965332,
|
|
"kl": 1.1864948254078627,
|
|
"learning_rate": 4.555555555555556e-05,
|
|
"loss": 0.0475,
|
|
"num_tokens": 172602.0,
|
|
"reward": -2.122437000274658,
|
|
"reward_std": 3.624743938446045,
|
|
"rewards/action_legal/mean": -0.5,
|
|
"rewards/action_legal/std": 0.5773502588272095,
|
|
"rewards/belief_accuracy/mean": 0.07886260747909546,
|
|
"rewards/belief_accuracy/std": 0.32305219769477844,
|
|
"rewards/env_reward/mean": -1.5393500328063965,
|
|
"rewards/env_reward/std": 1.6937329769134521,
|
|
"rewards/format_valid/mean": -0.5,
|
|
"rewards/format_valid/std": 1.7320507764816284,
|
|
"step": 73
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 10.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 10.0,
|
|
"completions/max_terminated_length": 10.0,
|
|
"completions/mean_length": 10.0,
|
|
"completions/mean_terminated_length": 10.0,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.06166666666666667,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 1.4644877910614014,
|
|
"kl": 0.1901670265942812,
|
|
"learning_rate": 4.541666666666667e-05,
|
|
"loss": 0.0076,
|
|
"num_tokens": 175042.0,
|
|
"reward": 1.3600037097930908,
|
|
"reward_std": 0.1827382892370224,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.24918042123317719,
|
|
"rewards/belief_accuracy/std": 0.07092051953077316,
|
|
"rewards/env_reward/mean": 0.3749750256538391,
|
|
"rewards/env_reward/std": 0.08854999393224716,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 74
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 12.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 10.5,
|
|
"completions/mean_terminated_length": 10.5,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.0625,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 6.083766937255859,
|
|
"kl": 0.6269553881138563,
|
|
"learning_rate": 4.527777777777778e-05,
|
|
"loss": 0.0251,
|
|
"num_tokens": 177484.0,
|
|
"reward": -0.3148350715637207,
|
|
"reward_std": 3.2907352447509766,
|
|
"rewards/action_legal/mean": -0.25,
|
|
"rewards/action_legal/std": 0.5,
|
|
"rewards/belief_accuracy/mean": 0.25676751136779785,
|
|
"rewards/belief_accuracy/std": 0.30544862151145935,
|
|
"rewards/env_reward/mean": -0.7234249711036682,
|
|
"rewards/env_reward/std": 1.5177685022354126,
|
|
"rewards/format_valid/mean": 0.25,
|
|
"rewards/format_valid/std": 1.5,
|
|
"step": 75
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 10.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 10.0,
|
|
"completions/max_terminated_length": 10.0,
|
|
"completions/mean_length": 9.75,
|
|
"completions/mean_terminated_length": 9.75,
|
|
"completions/min_length": 9.0,
|
|
"completions/min_terminated_length": 9.0,
|
|
"epoch": 0.06333333333333334,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 2.858494997024536,
|
|
"kl": 1.4018143992871046,
|
|
"learning_rate": 4.5138888888888894e-05,
|
|
"loss": 0.0561,
|
|
"num_tokens": 179923.0,
|
|
"reward": 1.0085630416870117,
|
|
"reward_std": 0.5302945375442505,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.1935710310935974,
|
|
"rewards/belief_accuracy/std": 0.07880179584026337,
|
|
"rewards/env_reward/mean": 0.2519000172615051,
|
|
"rewards/env_reward/std": 0.2782000005245209,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 76
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 10.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 10.0,
|
|
"completions/max_terminated_length": 10.0,
|
|
"completions/mean_length": 9.75,
|
|
"completions/mean_terminated_length": 9.75,
|
|
"completions/min_length": 9.0,
|
|
"completions/min_terminated_length": 9.0,
|
|
"epoch": 0.06416666666666666,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 4.091435432434082,
|
|
"kl": 0.4507467746734619,
|
|
"learning_rate": 4.5e-05,
|
|
"loss": 0.018,
|
|
"num_tokens": 182362.0,
|
|
"reward": 0.719524621963501,
|
|
"reward_std": 0.5247438549995422,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.32113736867904663,
|
|
"rewards/belief_accuracy/std": 0.06240236759185791,
|
|
"rewards/env_reward/mean": -0.19592499732971191,
|
|
"rewards/env_reward/std": 0.4110499918460846,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 77
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 10.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 10.0,
|
|
"completions/max_terminated_length": 10.0,
|
|
"completions/mean_length": 10.0,
|
|
"completions/mean_terminated_length": 10.0,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.065,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 3.242192268371582,
|
|
"kl": 0.6336696594953537,
|
|
"learning_rate": 4.486111111111111e-05,
|
|
"loss": 0.0253,
|
|
"num_tokens": 184802.0,
|
|
"reward": 1.1715035438537598,
|
|
"reward_std": 0.5318323373794556,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.31365954875946045,
|
|
"rewards/belief_accuracy/std": 0.16403073072433472,
|
|
"rewards/env_reward/mean": 0.12035000324249268,
|
|
"rewards/env_reward/std": 0.0738430991768837,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 78
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 10.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 10.0,
|
|
"completions/max_terminated_length": 10.0,
|
|
"completions/mean_length": 10.0,
|
|
"completions/mean_terminated_length": 10.0,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.06583333333333333,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 2.0246036052703857,
|
|
"kl": 0.21958505548536777,
|
|
"learning_rate": 4.472222222222223e-05,
|
|
"loss": 0.0088,
|
|
"num_tokens": 187242.0,
|
|
"reward": 0.20573002099990845,
|
|
"reward_std": 0.39767947793006897,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.18856000900268555,
|
|
"rewards/belief_accuracy/std": 0.034971851855516434,
|
|
"rewards/env_reward/mean": -0.2732999920845032,
|
|
"rewards/env_reward/std": 0.2833635210990906,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 79
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 10.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 10.0,
|
|
"completions/max_terminated_length": 10.0,
|
|
"completions/mean_length": 10.0,
|
|
"completions/mean_terminated_length": 10.0,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.06666666666666667,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 1.1923185586929321,
|
|
"kl": 0.3896181844174862,
|
|
"learning_rate": 4.458333333333334e-05,
|
|
"loss": 0.0156,
|
|
"num_tokens": 189682.0,
|
|
"reward": 0.9259074926376343,
|
|
"reward_std": 0.12676748633384705,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.366219162940979,
|
|
"rewards/belief_accuracy/std": 0.04225583001971245,
|
|
"rewards/env_reward/mean": -0.148499995470047,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 80
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 10.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 10.0,
|
|
"completions/max_terminated_length": 10.0,
|
|
"completions/mean_length": 10.0,
|
|
"completions/mean_terminated_length": 10.0,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.0675,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 0.7216500639915466,
|
|
"kl": 0.3197862319648266,
|
|
"learning_rate": 4.4444444444444447e-05,
|
|
"loss": 0.0128,
|
|
"num_tokens": 192122.0,
|
|
"reward": 0.9888283014297485,
|
|
"reward_std": 0.10651447623968124,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.2893427610397339,
|
|
"rewards/belief_accuracy/std": 0.03550482541322708,
|
|
"rewards/env_reward/mean": 0.047200001776218414,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 81
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 10.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 10.0,
|
|
"completions/max_terminated_length": 10.0,
|
|
"completions/mean_length": 9.5,
|
|
"completions/mean_terminated_length": 9.5,
|
|
"completions/min_length": 9.0,
|
|
"completions/min_terminated_length": 9.0,
|
|
"epoch": 0.06833333333333333,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 2.201247453689575,
|
|
"kl": 0.8603321500122547,
|
|
"learning_rate": 4.4305555555555556e-05,
|
|
"loss": 0.0344,
|
|
"num_tokens": 194232.0,
|
|
"reward": 0.7670385241508484,
|
|
"reward_std": 0.4939156472682953,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.31966283917427063,
|
|
"rewards/belief_accuracy/std": 0.07682497054338455,
|
|
"rewards/env_reward/mean": -0.16130000352859497,
|
|
"rewards/env_reward/std": 0.2710742652416229,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 82
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 10.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 10.0,
|
|
"completions/max_terminated_length": 10.0,
|
|
"completions/mean_length": 10.0,
|
|
"completions/mean_terminated_length": 10.0,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.06916666666666667,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 1.1376465559005737,
|
|
"kl": 0.2951663341373205,
|
|
"learning_rate": 4.4166666666666665e-05,
|
|
"loss": 0.0118,
|
|
"num_tokens": 196672.0,
|
|
"reward": 1.633320927619934,
|
|
"reward_std": 0.29003316164016724,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.2325861155986786,
|
|
"rewards/belief_accuracy/std": 0.10147262364625931,
|
|
"rewards/env_reward/mean": 0.5903750061988831,
|
|
"rewards/env_reward/std": 0.039749979972839355,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 83
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 22.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 22.0,
|
|
"completions/max_terminated_length": 22.0,
|
|
"completions/mean_length": 13.0,
|
|
"completions/mean_terminated_length": 13.0,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.07,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 12.477323532104492,
|
|
"kl": 0.8052617497742176,
|
|
"learning_rate": 4.402777777777778e-05,
|
|
"loss": 0.0322,
|
|
"num_tokens": 199124.0,
|
|
"reward": 1.5077629089355469,
|
|
"reward_std": 0.10474622994661331,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.340520977973938,
|
|
"rewards/belief_accuracy/std": 0.03491540253162384,
|
|
"rewards/env_reward/mean": 0.290800005197525,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 84
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 10.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 10.0,
|
|
"completions/max_terminated_length": 10.0,
|
|
"completions/mean_length": 10.0,
|
|
"completions/mean_terminated_length": 10.0,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.07083333333333333,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 2.133931875228882,
|
|
"kl": 0.46640536189079285,
|
|
"learning_rate": 4.388888888888889e-05,
|
|
"loss": 0.0187,
|
|
"num_tokens": 201564.0,
|
|
"reward": 1.811617374420166,
|
|
"reward_std": 0.4033048748970032,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.30618077516555786,
|
|
"rewards/belief_accuracy/std": 0.11218413710594177,
|
|
"rewards/env_reward/mean": 0.5620499849319458,
|
|
"rewards/env_reward/std": 0.06709998846054077,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 85
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 10.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 10.0,
|
|
"completions/max_terminated_length": 10.0,
|
|
"completions/mean_length": 10.0,
|
|
"completions/mean_terminated_length": 10.0,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.07166666666666667,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 1.235334038734436,
|
|
"kl": 0.34326545894145966,
|
|
"learning_rate": 4.375e-05,
|
|
"loss": 0.0137,
|
|
"num_tokens": 204004.0,
|
|
"reward": 1.235045313835144,
|
|
"reward_std": 0.25660011172294617,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.17671510577201843,
|
|
"rewards/belief_accuracy/std": 0.08553336560726166,
|
|
"rewards/env_reward/mean": 0.436599999666214,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 86
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 10.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 10.0,
|
|
"completions/max_terminated_length": 10.0,
|
|
"completions/mean_length": 9.75,
|
|
"completions/mean_terminated_length": 9.75,
|
|
"completions/min_length": 9.0,
|
|
"completions/min_terminated_length": 9.0,
|
|
"epoch": 0.0725,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 3.338106393814087,
|
|
"kl": 0.4051658548414707,
|
|
"learning_rate": 4.3611111111111116e-05,
|
|
"loss": 0.0162,
|
|
"num_tokens": 206303.0,
|
|
"reward": 1.0716049671173096,
|
|
"reward_std": 0.33843618631362915,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.28272244334220886,
|
|
"rewards/belief_accuracy/std": 0.06825492531061172,
|
|
"rewards/env_reward/mean": 0.11562499403953552,
|
|
"rewards/env_reward/std": 0.19865000247955322,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 87
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.25,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 10.0,
|
|
"completions/mean_length": 15.5,
|
|
"completions/mean_terminated_length": 10.0,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.07333333333333333,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 8.827055931091309,
|
|
"kl": 1.6250885128974915,
|
|
"learning_rate": 4.3472222222222225e-05,
|
|
"loss": 0.065,
|
|
"num_tokens": 208765.0,
|
|
"reward": 1.3299674987792969,
|
|
"reward_std": 0.1669912487268448,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.3536558151245117,
|
|
"rewards/belief_accuracy/std": 0.055663757026195526,
|
|
"rewards/env_reward/mean": 0.1459999978542328,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 88
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 10.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 10.0,
|
|
"completions/max_terminated_length": 10.0,
|
|
"completions/mean_length": 10.0,
|
|
"completions/mean_terminated_length": 10.0,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.07416666666666667,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 1.412042260169983,
|
|
"kl": 0.633812952786684,
|
|
"learning_rate": 4.3333333333333334e-05,
|
|
"loss": 0.0254,
|
|
"num_tokens": 211205.0,
|
|
"reward": 1.214733600616455,
|
|
"reward_std": 0.11018647253513336,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.31789451837539673,
|
|
"rewards/belief_accuracy/std": 0.03672884404659271,
|
|
"rewards/env_reward/mean": 0.14069999754428864,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 89
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 10.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 10.0,
|
|
"completions/max_terminated_length": 10.0,
|
|
"completions/mean_length": 10.0,
|
|
"completions/mean_terminated_length": 10.0,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.075,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 1.7908711433410645,
|
|
"kl": 0.5902487859129906,
|
|
"learning_rate": 4.319444444444445e-05,
|
|
"loss": 0.0236,
|
|
"num_tokens": 213645.0,
|
|
"reward": 0.9393336176872253,
|
|
"reward_std": 0.3621884286403656,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.272844523191452,
|
|
"rewards/belief_accuracy/std": 0.1207294762134552,
|
|
"rewards/env_reward/mean": 0.047200001776218414,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 90
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 18.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 18.0,
|
|
"completions/max_terminated_length": 18.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.07583333333333334,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 8.704280853271484,
|
|
"kl": 1.1360323503613472,
|
|
"learning_rate": 4.305555555555556e-05,
|
|
"loss": 0.0454,
|
|
"num_tokens": 216093.0,
|
|
"reward": 1.6989027261734009,
|
|
"reward_std": 0.16673420369625092,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.20553426444530487,
|
|
"rewards/belief_accuracy/std": 0.05557806044816971,
|
|
"rewards/env_reward/mean": 0.6881999969482422,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 91
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 10.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 10.0,
|
|
"completions/max_terminated_length": 10.0,
|
|
"completions/mean_length": 9.75,
|
|
"completions/mean_terminated_length": 9.75,
|
|
"completions/min_length": 9.0,
|
|
"completions/min_terminated_length": 9.0,
|
|
"epoch": 0.07666666666666666,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 4.743035793304443,
|
|
"kl": 0.7067995071411133,
|
|
"learning_rate": 4.291666666666667e-05,
|
|
"loss": 0.0283,
|
|
"num_tokens": 218204.0,
|
|
"reward": 1.5070440769195557,
|
|
"reward_std": 0.43784603476524353,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.24255633354187012,
|
|
"rewards/belief_accuracy/std": 0.10362043231725693,
|
|
"rewards/env_reward/mean": 0.48625001311302185,
|
|
"rewards/env_reward/std": 0.08950001001358032,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 92
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 10.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 10.0,
|
|
"completions/max_terminated_length": 10.0,
|
|
"completions/mean_length": 10.0,
|
|
"completions/mean_terminated_length": 10.0,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.0775,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 1.3156030178070068,
|
|
"kl": 0.2781078703701496,
|
|
"learning_rate": 4.277777777777778e-05,
|
|
"loss": 0.0111,
|
|
"num_tokens": 220644.0,
|
|
"reward": 1.9765175580978394,
|
|
"reward_std": 0.13653874397277832,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.36422252655029297,
|
|
"rewards/belief_accuracy/std": 0.045512910932302475,
|
|
"rewards/env_reward/mean": 0.555899977684021,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 93
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 10.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 10.0,
|
|
"completions/max_terminated_length": 10.0,
|
|
"completions/mean_length": 10.0,
|
|
"completions/mean_terminated_length": 10.0,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.07833333333333334,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 1.8292455673217773,
|
|
"kl": 0.19214149564504623,
|
|
"learning_rate": 4.263888888888889e-05,
|
|
"loss": 0.0077,
|
|
"num_tokens": 223084.0,
|
|
"reward": 1.740675449371338,
|
|
"reward_std": 0.15911391377449036,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.3569084405899048,
|
|
"rewards/belief_accuracy/std": 0.05303797498345375,
|
|
"rewards/env_reward/mean": 0.4133000075817108,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 94
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 10.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 10.0,
|
|
"completions/max_terminated_length": 10.0,
|
|
"completions/mean_length": 10.0,
|
|
"completions/mean_terminated_length": 10.0,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.07916666666666666,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 3.2383766174316406,
|
|
"kl": 0.572553887963295,
|
|
"learning_rate": 4.25e-05,
|
|
"loss": 0.0229,
|
|
"num_tokens": 225524.0,
|
|
"reward": 1.7529411315917969,
|
|
"reward_std": 0.2737933397293091,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.22559703886508942,
|
|
"rewards/belief_accuracy/std": 0.09126444160938263,
|
|
"rewards/env_reward/mean": 0.6840999722480774,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 95
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 10.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 10.0,
|
|
"completions/max_terminated_length": 10.0,
|
|
"completions/mean_length": 9.75,
|
|
"completions/mean_terminated_length": 9.75,
|
|
"completions/min_length": 9.0,
|
|
"completions/min_terminated_length": 9.0,
|
|
"epoch": 0.08,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 1.971638798713684,
|
|
"kl": 0.5628382042050362,
|
|
"learning_rate": 4.236111111111111e-05,
|
|
"loss": 0.0225,
|
|
"num_tokens": 227963.0,
|
|
"reward": 1.681844711303711,
|
|
"reward_std": 0.03878229856491089,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.34314823150634766,
|
|
"rewards/belief_accuracy/std": 0.012927442789077759,
|
|
"rewards/env_reward/mean": 0.4016000032424927,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 96
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 10.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 10.0,
|
|
"completions/max_terminated_length": 10.0,
|
|
"completions/mean_length": 10.0,
|
|
"completions/mean_terminated_length": 10.0,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.08083333333333333,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 1.9865643978118896,
|
|
"kl": 0.22193955443799496,
|
|
"learning_rate": 4.222222222222222e-05,
|
|
"loss": 0.0089,
|
|
"num_tokens": 230403.0,
|
|
"reward": 1.3422417640686035,
|
|
"reward_std": 0.10156978666782379,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.33229726552963257,
|
|
"rewards/belief_accuracy/std": 0.03385661542415619,
|
|
"rewards/env_reward/mean": 0.19689999520778656,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 97
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 10.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 10.0,
|
|
"completions/max_terminated_length": 10.0,
|
|
"completions/mean_length": 10.0,
|
|
"completions/mean_terminated_length": 10.0,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.08166666666666667,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 1.4794961214065552,
|
|
"kl": 0.2080898880958557,
|
|
"learning_rate": 4.208333333333334e-05,
|
|
"loss": 0.0083,
|
|
"num_tokens": 232843.0,
|
|
"reward": -0.02455127239227295,
|
|
"reward_std": 3.4839961528778076,
|
|
"rewards/action_legal/mean": -0.25,
|
|
"rewards/action_legal/std": 0.5,
|
|
"rewards/belief_accuracy/mean": 0.16232874989509583,
|
|
"rewards/belief_accuracy/std": 0.2421344518661499,
|
|
"rewards/env_reward/mean": -0.3410249948501587,
|
|
"rewards/env_reward/std": 1.7726500034332275,
|
|
"rewards/format_valid/mean": 0.25,
|
|
"rewards/format_valid/std": 1.5,
|
|
"step": 98
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 10.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 10.0,
|
|
"completions/max_terminated_length": 10.0,
|
|
"completions/mean_length": 10.0,
|
|
"completions/mean_terminated_length": 10.0,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.0825,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 2.3845081329345703,
|
|
"kl": 0.34126274287700653,
|
|
"learning_rate": 4.194444444444445e-05,
|
|
"loss": 0.0137,
|
|
"num_tokens": 235283.0,
|
|
"reward": 1.6563684940338135,
|
|
"reward_std": 0.09499318897724152,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.337656170129776,
|
|
"rewards/belief_accuracy/std": 0.031664397567510605,
|
|
"rewards/env_reward/mean": 0.39559999108314514,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 99
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.25,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 10.0,
|
|
"completions/mean_length": 15.0,
|
|
"completions/mean_terminated_length": 9.333333969116211,
|
|
"completions/min_length": 9.0,
|
|
"completions/min_terminated_length": 9.0,
|
|
"epoch": 0.08333333333333333,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 6.240288257598877,
|
|
"kl": 0.7164427638053894,
|
|
"learning_rate": 4.1805555555555556e-05,
|
|
"loss": 0.0287,
|
|
"num_tokens": 237235.0,
|
|
"reward": 1.2336010932922363,
|
|
"reward_std": 0.6653106212615967,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.18393371999263763,
|
|
"rewards/belief_accuracy/std": 0.15957118570804596,
|
|
"rewards/env_reward/mean": 0.4211999773979187,
|
|
"rewards/env_reward/std": 0.41892534494400024,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 100
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 10.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 10.0,
|
|
"completions/max_terminated_length": 10.0,
|
|
"completions/mean_length": 10.0,
|
|
"completions/mean_terminated_length": 10.0,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.08416666666666667,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 2.361682176589966,
|
|
"kl": 0.2631294522434473,
|
|
"learning_rate": 4.166666666666667e-05,
|
|
"loss": 0.0105,
|
|
"num_tokens": 239675.0,
|
|
"reward": 1.886284351348877,
|
|
"reward_std": 0.20986472070217133,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.24569477140903473,
|
|
"rewards/belief_accuracy/std": 0.06995491683483124,
|
|
"rewards/env_reward/mean": 0.7328000068664551,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 101
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 10.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 10.0,
|
|
"completions/max_terminated_length": 10.0,
|
|
"completions/mean_length": 10.0,
|
|
"completions/mean_terminated_length": 10.0,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.085,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 1.8391385078430176,
|
|
"kl": 0.24248860776424408,
|
|
"learning_rate": 4.152777777777778e-05,
|
|
"loss": 0.0097,
|
|
"num_tokens": 241607.0,
|
|
"reward": 1.4985287189483643,
|
|
"reward_std": 0.12530909478664398,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.3252928853034973,
|
|
"rewards/belief_accuracy/std": 0.041769690811634064,
|
|
"rewards/env_reward/mean": 0.3151000142097473,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 102
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.25,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 10.0,
|
|
"completions/mean_length": 15.5,
|
|
"completions/mean_terminated_length": 10.0,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.08583333333333333,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 21.890640258789062,
|
|
"kl": 0.5941529143601656,
|
|
"learning_rate": 4.138888888888889e-05,
|
|
"loss": 0.0238,
|
|
"num_tokens": 244069.0,
|
|
"reward": 0.44205623865127563,
|
|
"reward_std": 0.18975834548473358,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.22688540816307068,
|
|
"rewards/belief_accuracy/std": 0.06325279176235199,
|
|
"rewards/env_reward/mean": -0.1923999935388565,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 103
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 10.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 10.0,
|
|
"completions/max_terminated_length": 10.0,
|
|
"completions/mean_length": 10.0,
|
|
"completions/mean_terminated_length": 10.0,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.08666666666666667,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 2.787078380584717,
|
|
"kl": 0.2444281131029129,
|
|
"learning_rate": 4.125e-05,
|
|
"loss": 0.0098,
|
|
"num_tokens": 246509.0,
|
|
"reward": 1.6253175735473633,
|
|
"reward_std": 0.14385031163692474,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.3388558626174927,
|
|
"rewards/belief_accuracy/std": 0.04795009270310402,
|
|
"rewards/env_reward/mean": 0.3725000023841858,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 104
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 10.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 10.0,
|
|
"completions/max_terminated_length": 10.0,
|
|
"completions/mean_length": 10.0,
|
|
"completions/mean_terminated_length": 10.0,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.0875,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 1.7878473997116089,
|
|
"kl": 0.2907773107290268,
|
|
"learning_rate": 4.111111111111111e-05,
|
|
"loss": 0.0116,
|
|
"num_tokens": 248949.0,
|
|
"reward": 1.292004108428955,
|
|
"reward_std": 0.19697506725788116,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.26045137643814087,
|
|
"rewards/belief_accuracy/std": 0.06565836817026138,
|
|
"rewards/env_reward/mean": 0.30709999799728394,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 105
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 10.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 10.0,
|
|
"completions/max_terminated_length": 10.0,
|
|
"completions/mean_length": 10.0,
|
|
"completions/mean_terminated_length": 10.0,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.08833333333333333,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 0.9713439345359802,
|
|
"kl": 0.5633938759565353,
|
|
"learning_rate": 4.0972222222222225e-05,
|
|
"loss": 0.0225,
|
|
"num_tokens": 251389.0,
|
|
"reward": 1.2565593719482422,
|
|
"reward_std": 0.05555558204650879,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.37923645973205566,
|
|
"rewards/belief_accuracy/std": 0.018518522381782532,
|
|
"rewards/env_reward/mean": 0.045899998396635056,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 106
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 10.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 10.0,
|
|
"completions/max_terminated_length": 10.0,
|
|
"completions/mean_length": 10.0,
|
|
"completions/mean_terminated_length": 10.0,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.08916666666666667,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 1.3064583539962769,
|
|
"kl": 0.3265066295862198,
|
|
"learning_rate": 4.0833333333333334e-05,
|
|
"loss": 0.0131,
|
|
"num_tokens": 253829.0,
|
|
"reward": 1.9252145290374756,
|
|
"reward_std": 0.06414999067783356,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.2697714567184448,
|
|
"rewards/belief_accuracy/std": 0.02138334885239601,
|
|
"rewards/env_reward/mean": 0.7106000185012817,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 107
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 10.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 10.0,
|
|
"completions/max_terminated_length": 10.0,
|
|
"completions/mean_length": 10.0,
|
|
"completions/mean_terminated_length": 10.0,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.09,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 2.1808035373687744,
|
|
"kl": 0.33351393789052963,
|
|
"learning_rate": 4.0694444444444444e-05,
|
|
"loss": 0.0133,
|
|
"num_tokens": 256269.0,
|
|
"reward": 0.6878499984741211,
|
|
"reward_std": 0.24345053732395172,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.1825166642665863,
|
|
"rewards/belief_accuracy/std": 0.08115018159151077,
|
|
"rewards/env_reward/mean": 0.06019999831914902,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 108
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 10.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 10.0,
|
|
"completions/max_terminated_length": 10.0,
|
|
"completions/mean_length": 10.0,
|
|
"completions/mean_terminated_length": 10.0,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.09083333333333334,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 1.2036610841751099,
|
|
"kl": 0.23895448073744774,
|
|
"learning_rate": 4.055555555555556e-05,
|
|
"loss": 0.0096,
|
|
"num_tokens": 258709.0,
|
|
"reward": 1.0095196962356567,
|
|
"reward_std": 0.07835428416728973,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.28903988003730774,
|
|
"rewards/belief_accuracy/std": 0.026118090376257896,
|
|
"rewards/env_reward/mean": 0.06159999966621399,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 109
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 10.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 10.0,
|
|
"completions/max_terminated_length": 10.0,
|
|
"completions/mean_length": 9.75,
|
|
"completions/mean_terminated_length": 9.75,
|
|
"completions/min_length": 9.0,
|
|
"completions/min_terminated_length": 9.0,
|
|
"epoch": 0.09166666666666666,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 3.1538102626800537,
|
|
"kl": 0.4214295297861099,
|
|
"learning_rate": 4.041666666666667e-05,
|
|
"loss": 0.0169,
|
|
"num_tokens": 260824.0,
|
|
"reward": 1.422867774963379,
|
|
"reward_std": 0.5747905373573303,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.3610350489616394,
|
|
"rewards/belief_accuracy/std": 0.08047888427972794,
|
|
"rewards/env_reward/mean": 0.19317498803138733,
|
|
"rewards/env_reward/std": 0.26084083318710327,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 110
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.25,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 10.0,
|
|
"completions/mean_length": 15.5,
|
|
"completions/mean_terminated_length": 10.0,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.0925,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 2.361459255218506,
|
|
"kl": 0.5647962130606174,
|
|
"learning_rate": 4.027777777777778e-05,
|
|
"loss": 0.0226,
|
|
"num_tokens": 263286.0,
|
|
"reward": 0.67694091796875,
|
|
"reward_std": 0.2001858800649643,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.30193030834198,
|
|
"rewards/belief_accuracy/std": 0.0667286217212677,
|
|
"rewards/env_reward/mean": -0.1859000027179718,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 111
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 10.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 10.0,
|
|
"completions/max_terminated_length": 10.0,
|
|
"completions/mean_length": 9.25,
|
|
"completions/mean_terminated_length": 9.25,
|
|
"completions/min_length": 9.0,
|
|
"completions/min_terminated_length": 9.0,
|
|
"epoch": 0.09333333333333334,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 4.111879348754883,
|
|
"kl": 0.48548348993062973,
|
|
"learning_rate": 4.0138888888888894e-05,
|
|
"loss": 0.0194,
|
|
"num_tokens": 265395.0,
|
|
"reward": 1.0738086700439453,
|
|
"reward_std": 0.5100609660148621,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.32505708932876587,
|
|
"rewards/belief_accuracy/std": 0.06295914947986603,
|
|
"rewards/env_reward/mean": 0.03242500126361847,
|
|
"rewards/env_reward/std": 0.22145001590251923,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 112
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 10.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 10.0,
|
|
"completions/max_terminated_length": 10.0,
|
|
"completions/mean_length": 10.0,
|
|
"completions/mean_terminated_length": 10.0,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.09416666666666666,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 2.7761991024017334,
|
|
"kl": 0.44813764840364456,
|
|
"learning_rate": 4e-05,
|
|
"loss": 0.0179,
|
|
"num_tokens": 267835.0,
|
|
"reward": 1.1807773113250732,
|
|
"reward_std": 0.09589369595050812,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.36737579107284546,
|
|
"rewards/belief_accuracy/std": 0.031964562833309174,
|
|
"rewards/env_reward/mean": 0.019099999219179153,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 113
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 10.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 10.0,
|
|
"completions/max_terminated_length": 10.0,
|
|
"completions/mean_length": 10.0,
|
|
"completions/mean_terminated_length": 10.0,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.095,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 2.152681827545166,
|
|
"kl": 0.19383717328310013,
|
|
"learning_rate": 3.986111111111111e-05,
|
|
"loss": 0.0078,
|
|
"num_tokens": 270275.0,
|
|
"reward": 1.1284863948822021,
|
|
"reward_std": 0.14397799968719482,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.34184545278549194,
|
|
"rewards/belief_accuracy/std": 0.04799266904592514,
|
|
"rewards/env_reward/mean": 0.03530000150203705,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 114
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 10.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 10.0,
|
|
"completions/max_terminated_length": 10.0,
|
|
"completions/mean_length": 10.0,
|
|
"completions/mean_terminated_length": 10.0,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.09583333333333334,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 1.2871294021606445,
|
|
"kl": 0.35344624519348145,
|
|
"learning_rate": 3.972222222222222e-05,
|
|
"loss": 0.0141,
|
|
"num_tokens": 272715.0,
|
|
"reward": 2.019130229949951,
|
|
"reward_std": 0.05555558204650879,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.2832767069339752,
|
|
"rewards/belief_accuracy/std": 0.018518522381782532,
|
|
"rewards/env_reward/mean": 0.7462000250816345,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 115
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 10.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 10.0,
|
|
"completions/max_terminated_length": 10.0,
|
|
"completions/mean_length": 9.5,
|
|
"completions/mean_terminated_length": 9.5,
|
|
"completions/min_length": 9.0,
|
|
"completions/min_terminated_length": 9.0,
|
|
"epoch": 0.09666666666666666,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 4.147212982177734,
|
|
"kl": 0.33963513001799583,
|
|
"learning_rate": 3.958333333333333e-05,
|
|
"loss": 0.0136,
|
|
"num_tokens": 274441.0,
|
|
"reward": 0.8702385425567627,
|
|
"reward_std": 0.22163395583629608,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.33136284351348877,
|
|
"rewards/belief_accuracy/std": 0.05834528058767319,
|
|
"rewards/env_reward/mean": -0.11590000241994858,
|
|
"rewards/env_reward/std": 0.22251078486442566,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 116
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.25,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 10.0,
|
|
"completions/mean_length": 15.5,
|
|
"completions/mean_terminated_length": 10.0,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.0975,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 3.5441172122955322,
|
|
"kl": 0.2427344862371683,
|
|
"learning_rate": 3.944444444444445e-05,
|
|
"loss": 0.0097,
|
|
"num_tokens": 276903.0,
|
|
"reward": 1.56264066696167,
|
|
"reward_std": 0.22222228348255157,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.3731135427951813,
|
|
"rewards/belief_accuracy/std": 0.07407407462596893,
|
|
"rewards/env_reward/mean": 0.2621999979019165,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 117
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 10.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 10.0,
|
|
"completions/max_terminated_length": 10.0,
|
|
"completions/mean_length": 9.5,
|
|
"completions/mean_terminated_length": 9.5,
|
|
"completions/min_length": 9.0,
|
|
"completions/min_terminated_length": 9.0,
|
|
"epoch": 0.09833333333333333,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 4.864804267883301,
|
|
"kl": 0.4837944805622101,
|
|
"learning_rate": 3.9305555555555556e-05,
|
|
"loss": 0.0194,
|
|
"num_tokens": 279013.0,
|
|
"reward": 1.7349917888641357,
|
|
"reward_std": 0.3733745217323303,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.3373388946056366,
|
|
"rewards/belief_accuracy/std": 0.03984580561518669,
|
|
"rewards/env_reward/mean": 0.4486500024795532,
|
|
"rewards/env_reward/std": 0.24901118874549866,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 118
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 10.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 10.0,
|
|
"completions/max_terminated_length": 10.0,
|
|
"completions/mean_length": 10.0,
|
|
"completions/mean_terminated_length": 10.0,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.09916666666666667,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 2.8735086917877197,
|
|
"kl": 0.2612016424536705,
|
|
"learning_rate": 3.9166666666666665e-05,
|
|
"loss": 0.0104,
|
|
"num_tokens": 281453.0,
|
|
"reward": 1.2192261219024658,
|
|
"reward_std": 0.18506474792957306,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.2666420042514801,
|
|
"rewards/belief_accuracy/std": 0.061688266694545746,
|
|
"rewards/env_reward/mean": 0.24619999527931213,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 119
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 10.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 10.0,
|
|
"completions/max_terminated_length": 10.0,
|
|
"completions/mean_length": 10.0,
|
|
"completions/mean_terminated_length": 10.0,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.1,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 1.73070228099823,
|
|
"kl": 0.5154507085680962,
|
|
"learning_rate": 3.902777777777778e-05,
|
|
"loss": 0.0206,
|
|
"num_tokens": 283893.0,
|
|
"reward": 1.4962975978851318,
|
|
"reward_std": 0.13757789134979248,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.31319916248321533,
|
|
"rewards/belief_accuracy/std": 0.045859288424253464,
|
|
"rewards/env_reward/mean": 0.337799996137619,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 120
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 16.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 16.0,
|
|
"completions/max_terminated_length": 16.0,
|
|
"completions/mean_length": 11.5,
|
|
"completions/mean_terminated_length": 11.5,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.10083333333333333,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 1.9448511600494385,
|
|
"kl": 0.602842066437006,
|
|
"learning_rate": 3.888888888888889e-05,
|
|
"loss": 0.0241,
|
|
"num_tokens": 286339.0,
|
|
"reward": 1.649629831314087,
|
|
"reward_std": 0.10207336395978928,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.4207599461078644,
|
|
"rewards/belief_accuracy/std": 0.0340244434773922,
|
|
"rewards/env_reward/mean": 0.2249000072479248,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 121
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 10.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 10.0,
|
|
"completions/max_terminated_length": 10.0,
|
|
"completions/mean_length": 10.0,
|
|
"completions/mean_terminated_length": 10.0,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.10166666666666667,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 3.713958501815796,
|
|
"kl": 0.38420072570443153,
|
|
"learning_rate": 3.875e-05,
|
|
"loss": 0.0154,
|
|
"num_tokens": 288779.0,
|
|
"reward": 1.3020188808441162,
|
|
"reward_std": 0.17013852298259735,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.20623962581157684,
|
|
"rewards/belief_accuracy/std": 0.05671283230185509,
|
|
"rewards/env_reward/mean": 0.4221999943256378,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 122
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 10.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 10.0,
|
|
"completions/max_terminated_length": 10.0,
|
|
"completions/mean_length": 10.0,
|
|
"completions/mean_terminated_length": 10.0,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.1025,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 2.8310298919677734,
|
|
"kl": 0.19116491079330444,
|
|
"learning_rate": 3.8611111111111116e-05,
|
|
"loss": 0.0076,
|
|
"num_tokens": 291219.0,
|
|
"reward": 0.7856130599975586,
|
|
"reward_std": 0.1111110970377922,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.3014543652534485,
|
|
"rewards/belief_accuracy/std": 0.03703703731298447,
|
|
"rewards/env_reward/mean": -0.11249999701976776,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 123
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 16.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 16.0,
|
|
"completions/max_terminated_length": 16.0,
|
|
"completions/mean_length": 11.5,
|
|
"completions/mean_terminated_length": 11.5,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.10333333333333333,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 6.588038444519043,
|
|
"kl": 0.5185928121209145,
|
|
"learning_rate": 3.8472222222222225e-05,
|
|
"loss": 0.0207,
|
|
"num_tokens": 293665.0,
|
|
"reward": 1.210307240486145,
|
|
"reward_std": 0.10638084262609482,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.22401908040046692,
|
|
"rewards/belief_accuracy/std": 0.03546026349067688,
|
|
"rewards/env_reward/mean": 0.3255000114440918,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 124
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 11.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 11.0,
|
|
"completions/max_terminated_length": 11.0,
|
|
"completions/mean_length": 10.25,
|
|
"completions/mean_terminated_length": 10.25,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.10416666666666667,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 3.076918363571167,
|
|
"kl": 0.277778223156929,
|
|
"learning_rate": 3.8333333333333334e-05,
|
|
"loss": 0.0111,
|
|
"num_tokens": 296106.0,
|
|
"reward": 1.3835383653640747,
|
|
"reward_std": 0.09814538806676865,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.3789128065109253,
|
|
"rewards/belief_accuracy/std": 0.03271512687206268,
|
|
"rewards/env_reward/mean": 0.13120000064373016,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 125
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 10.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 10.0,
|
|
"completions/max_terminated_length": 10.0,
|
|
"completions/mean_length": 10.0,
|
|
"completions/mean_terminated_length": 10.0,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.105,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 1.4967398643493652,
|
|
"kl": 0.3801155686378479,
|
|
"learning_rate": 3.8194444444444444e-05,
|
|
"loss": 0.0152,
|
|
"num_tokens": 298546.0,
|
|
"reward": 1.9718296527862549,
|
|
"reward_std": 0.14045073091983795,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.35695987939834595,
|
|
"rewards/belief_accuracy/std": 0.04681692272424698,
|
|
"rewards/env_reward/mean": 0.567300021648407,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 126
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 10.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 10.0,
|
|
"completions/max_terminated_length": 10.0,
|
|
"completions/mean_length": 10.0,
|
|
"completions/mean_terminated_length": 10.0,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.10583333333333333,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 2.761016368865967,
|
|
"kl": 0.32585281878709793,
|
|
"learning_rate": 3.805555555555555e-05,
|
|
"loss": 0.013,
|
|
"num_tokens": 300658.0,
|
|
"reward": 2.205615282058716,
|
|
"reward_std": 0.4304605722427368,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.339000940322876,
|
|
"rewards/belief_accuracy/std": 0.02138334885239601,
|
|
"rewards/env_reward/mean": 0.7590750455856323,
|
|
"rewards/env_reward/std": 0.2601499855518341,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 127
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 14.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 14.0,
|
|
"completions/max_terminated_length": 14.0,
|
|
"completions/mean_length": 11.0,
|
|
"completions/mean_terminated_length": 11.0,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.10666666666666667,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 5.960784435272217,
|
|
"kl": 1.1858660280704498,
|
|
"learning_rate": 3.791666666666667e-05,
|
|
"loss": 0.0474,
|
|
"num_tokens": 303102.0,
|
|
"reward": 1.9060065746307373,
|
|
"reward_std": 0.24637286365032196,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.3400188684463501,
|
|
"rewards/belief_accuracy/std": 0.08212429285049438,
|
|
"rewards/env_reward/mean": 0.5572999715805054,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 128
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 12.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 10.5,
|
|
"completions/mean_terminated_length": 10.5,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.1075,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 3.856426954269409,
|
|
"kl": 0.44381893426179886,
|
|
"learning_rate": 3.777777777777778e-05,
|
|
"loss": 0.0178,
|
|
"num_tokens": 305220.0,
|
|
"reward": 1.8646245002746582,
|
|
"reward_std": 0.11111116409301758,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.18142487108707428,
|
|
"rewards/belief_accuracy/std": 0.03703703731298447,
|
|
"rewards/env_reward/mean": 0.8468999862670898,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 129
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 10.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 10.0,
|
|
"completions/max_terminated_length": 10.0,
|
|
"completions/mean_length": 10.0,
|
|
"completions/mean_terminated_length": 10.0,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.10833333333333334,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 2.8265440464019775,
|
|
"kl": 0.38515937700867653,
|
|
"learning_rate": 3.763888888888889e-05,
|
|
"loss": 0.0154,
|
|
"num_tokens": 307152.0,
|
|
"reward": 1.3294965028762817,
|
|
"reward_std": 0.09841816127300262,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.36771130561828613,
|
|
"rewards/belief_accuracy/std": 0.07701562345027924,
|
|
"rewards/env_reward/mean": 0.11757499724626541,
|
|
"rewards/env_reward/std": 0.18415001034736633,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 130
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 10.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 10.0,
|
|
"completions/max_terminated_length": 10.0,
|
|
"completions/mean_length": 10.0,
|
|
"completions/mean_terminated_length": 10.0,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.10916666666666666,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 4.664041519165039,
|
|
"kl": 0.21777665056288242,
|
|
"learning_rate": 3.7500000000000003e-05,
|
|
"loss": 0.0087,
|
|
"num_tokens": 309592.0,
|
|
"reward": 1.1950082778930664,
|
|
"reward_std": 0.15713481605052948,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.3575194478034973,
|
|
"rewards/belief_accuracy/std": 0.052378278225660324,
|
|
"rewards/env_reward/mean": 0.04830000177025795,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 131
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 13.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 13.0,
|
|
"completions/max_terminated_length": 13.0,
|
|
"completions/mean_length": 10.75,
|
|
"completions/mean_terminated_length": 10.75,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.11,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 2.597440481185913,
|
|
"kl": 1.7791741862893105,
|
|
"learning_rate": 3.736111111111111e-05,
|
|
"loss": 0.0712,
|
|
"num_tokens": 312035.0,
|
|
"reward": 2.0074949264526367,
|
|
"reward_std": 0.09213761240243912,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.4375982880592346,
|
|
"rewards/belief_accuracy/std": 0.030712537467479706,
|
|
"rewards/env_reward/mean": 0.42980000376701355,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 132
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 10.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 10.0,
|
|
"completions/max_terminated_length": 10.0,
|
|
"completions/mean_length": 10.0,
|
|
"completions/mean_terminated_length": 10.0,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.11083333333333334,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 1.8326061964035034,
|
|
"kl": 0.35671964287757874,
|
|
"learning_rate": 3.722222222222222e-05,
|
|
"loss": 0.0143,
|
|
"num_tokens": 314335.0,
|
|
"reward": 1.8839524984359741,
|
|
"reward_std": 0.048975855112075806,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.43421751260757446,
|
|
"rewards/belief_accuracy/std": 0.016325272619724274,
|
|
"rewards/env_reward/mean": 0.35420000553131104,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 133
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 10.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 10.0,
|
|
"completions/max_terminated_length": 10.0,
|
|
"completions/mean_length": 10.0,
|
|
"completions/mean_terminated_length": 10.0,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.11166666666666666,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 2.641361951828003,
|
|
"kl": 0.3387632220983505,
|
|
"learning_rate": 3.708333333333334e-05,
|
|
"loss": 0.0136,
|
|
"num_tokens": 316447.0,
|
|
"reward": 2.0452380180358887,
|
|
"reward_std": 0.317529559135437,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.443654328584671,
|
|
"rewards/belief_accuracy/std": 0.021532177925109863,
|
|
"rewards/env_reward/mean": 0.4428499937057495,
|
|
"rewards/env_reward/std": 0.1823849380016327,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 134
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 10.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 10.0,
|
|
"completions/max_terminated_length": 10.0,
|
|
"completions/mean_length": 10.0,
|
|
"completions/mean_terminated_length": 10.0,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.1125,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 23.68372917175293,
|
|
"kl": 9.03631467744708,
|
|
"learning_rate": 3.694444444444445e-05,
|
|
"loss": 0.3615,
|
|
"num_tokens": 318887.0,
|
|
"reward": 1.3748897314071655,
|
|
"reward_std": 0.185723215341568,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.35687991976737976,
|
|
"rewards/belief_accuracy/std": 0.06190773472189903,
|
|
"rewards/env_reward/mean": 0.16949999332427979,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 135
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 10.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 10.0,
|
|
"completions/max_terminated_length": 10.0,
|
|
"completions/mean_length": 10.0,
|
|
"completions/mean_terminated_length": 10.0,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.11333333333333333,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 1.9297913312911987,
|
|
"kl": 0.3083711676299572,
|
|
"learning_rate": 3.6805555555555556e-05,
|
|
"loss": 0.0123,
|
|
"num_tokens": 321327.0,
|
|
"reward": 2.360555648803711,
|
|
"reward_std": 0.05555558204650879,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.25393518805503845,
|
|
"rewards/belief_accuracy/std": 0.018518522381782532,
|
|
"rewards/env_reward/mean": 1.0325000286102295,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 136
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.25,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 27.0,
|
|
"completions/mean_length": 19.75,
|
|
"completions/mean_terminated_length": 15.666666984558105,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.11416666666666667,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 4.193662166595459,
|
|
"kl": 0.33080876618623734,
|
|
"learning_rate": 3.6666666666666666e-05,
|
|
"loss": 0.0132,
|
|
"num_tokens": 323806.0,
|
|
"reward": 1.2503528594970703,
|
|
"reward_std": 0.10516492277383804,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.3806675672531128,
|
|
"rewards/belief_accuracy/std": 0.03505498543381691,
|
|
"rewards/env_reward/mean": 0.03889999911189079,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 137
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 10.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 10.0,
|
|
"completions/max_terminated_length": 10.0,
|
|
"completions/mean_length": 10.0,
|
|
"completions/mean_terminated_length": 10.0,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.115,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 1.0646451711654663,
|
|
"kl": 0.4817114397883415,
|
|
"learning_rate": 3.6527777777777775e-05,
|
|
"loss": 0.0193,
|
|
"num_tokens": 326246.0,
|
|
"reward": 1.6292959451675415,
|
|
"reward_std": 0.10631169378757477,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.42438197135925293,
|
|
"rewards/belief_accuracy/std": 0.03543723374605179,
|
|
"rewards/env_reward/mean": 0.20409999787807465,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 138
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 10.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 10.0,
|
|
"completions/max_terminated_length": 10.0,
|
|
"completions/mean_length": 10.0,
|
|
"completions/mean_terminated_length": 10.0,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.11583333333333333,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 2.5362420082092285,
|
|
"kl": 0.3246644027531147,
|
|
"learning_rate": 3.638888888888889e-05,
|
|
"loss": 0.013,
|
|
"num_tokens": 328686.0,
|
|
"reward": 1.1507596969604492,
|
|
"reward_std": 0.09127989411354065,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.3048698902130127,
|
|
"rewards/belief_accuracy/std": 0.028253890573978424,
|
|
"rewards/env_reward/mean": 0.12409999966621399,
|
|
"rewards/env_reward/std": 0.02619999647140503,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 139
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 10.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 10.0,
|
|
"completions/max_terminated_length": 10.0,
|
|
"completions/mean_length": 10.0,
|
|
"completions/mean_terminated_length": 10.0,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.11666666666666667,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 2.2923882007598877,
|
|
"kl": 0.3384244814515114,
|
|
"learning_rate": 3.625e-05,
|
|
"loss": 0.0135,
|
|
"num_tokens": 330414.0,
|
|
"reward": 1.2623817920684814,
|
|
"reward_std": 0.3048122227191925,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.28640225529670715,
|
|
"rewards/belief_accuracy/std": 0.01621234230697155,
|
|
"rewards/env_reward/mean": 0.23544999957084656,
|
|
"rewards/env_reward/std": 0.234499990940094,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 140
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 10.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 10.0,
|
|
"completions/max_terminated_length": 10.0,
|
|
"completions/mean_length": 10.0,
|
|
"completions/mean_terminated_length": 10.0,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.1175,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 1.7288119792938232,
|
|
"kl": 0.36574723571538925,
|
|
"learning_rate": 3.611111111111111e-05,
|
|
"loss": 0.0146,
|
|
"num_tokens": 332854.0,
|
|
"reward": 1.373946189880371,
|
|
"reward_std": 0.09072189033031464,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.27031537890434265,
|
|
"rewards/belief_accuracy/std": 0.03024062141776085,
|
|
"rewards/env_reward/mean": 0.34200000762939453,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 141
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 10.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 10.0,
|
|
"completions/max_terminated_length": 10.0,
|
|
"completions/mean_length": 10.0,
|
|
"completions/mean_terminated_length": 10.0,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.11833333333333333,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 2.0891668796539307,
|
|
"kl": 0.3505154550075531,
|
|
"learning_rate": 3.5972222222222225e-05,
|
|
"loss": 0.014,
|
|
"num_tokens": 334970.0,
|
|
"reward": 0.7695170640945435,
|
|
"reward_std": 0.18945620954036713,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.2285890281200409,
|
|
"rewards/belief_accuracy/std": 0.06315205991268158,
|
|
"rewards/env_reward/mean": 0.02250000089406967,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 142
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 10.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 10.0,
|
|
"completions/max_terminated_length": 10.0,
|
|
"completions/mean_length": 10.0,
|
|
"completions/mean_terminated_length": 10.0,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.11916666666666667,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 3.0063207149505615,
|
|
"kl": 0.2529054693877697,
|
|
"learning_rate": 3.5833333333333335e-05,
|
|
"loss": 0.0101,
|
|
"num_tokens": 337410.0,
|
|
"reward": 0.9304206371307373,
|
|
"reward_std": 0.055555541068315506,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.35887354612350464,
|
|
"rewards/belief_accuracy/std": 0.018518522381782532,
|
|
"rewards/env_reward/mean": -0.13079999387264252,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 143
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 10.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 10.0,
|
|
"completions/max_terminated_length": 10.0,
|
|
"completions/mean_length": 10.0,
|
|
"completions/mean_terminated_length": 10.0,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.12,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 1.956833839416504,
|
|
"kl": 0.5815155953168869,
|
|
"learning_rate": 3.5694444444444444e-05,
|
|
"loss": 0.0233,
|
|
"num_tokens": 339850.0,
|
|
"reward": 1.0631434917449951,
|
|
"reward_std": 0.166666641831398,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.1861644983291626,
|
|
"rewards/belief_accuracy/std": 0.0555555522441864,
|
|
"rewards/env_reward/mean": 0.30309998989105225,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 144
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.25,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 11.0,
|
|
"completions/mean_length": 15.75,
|
|
"completions/mean_terminated_length": 10.333333969116211,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.12083333333333333,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 6.329380035400391,
|
|
"kl": 0.8242634683847427,
|
|
"learning_rate": 3.555555555555556e-05,
|
|
"loss": 0.033,
|
|
"num_tokens": 342313.0,
|
|
"reward": 1.0987353324890137,
|
|
"reward_std": 0.14344383776187897,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.3755284249782562,
|
|
"rewards/belief_accuracy/std": 0.04781460389494896,
|
|
"rewards/env_reward/mean": -0.051899999380111694,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 145
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 10.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 10.0,
|
|
"completions/max_terminated_length": 10.0,
|
|
"completions/mean_length": 10.0,
|
|
"completions/mean_terminated_length": 10.0,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.12166666666666667,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 1.634685754776001,
|
|
"kl": 0.39855731278657913,
|
|
"learning_rate": 3.541666666666667e-05,
|
|
"loss": 0.0159,
|
|
"num_tokens": 344753.0,
|
|
"reward": 1.4183378219604492,
|
|
"reward_std": 0.06828323751688004,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.31596264243125916,
|
|
"rewards/belief_accuracy/std": 0.02276109904050827,
|
|
"rewards/env_reward/mean": 0.28029999136924744,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 146
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 10.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 10.0,
|
|
"completions/max_terminated_length": 10.0,
|
|
"completions/mean_length": 10.0,
|
|
"completions/mean_terminated_length": 10.0,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.1225,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 1.3948684930801392,
|
|
"kl": 0.38383544236421585,
|
|
"learning_rate": 3.527777777777778e-05,
|
|
"loss": 0.0154,
|
|
"num_tokens": 347193.0,
|
|
"reward": 1.1515657901763916,
|
|
"reward_std": 0.06009494140744209,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.2850886285305023,
|
|
"rewards/belief_accuracy/std": 0.020031645894050598,
|
|
"rewards/env_reward/mean": 0.16419999301433563,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 147
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 10.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 10.0,
|
|
"completions/max_terminated_length": 10.0,
|
|
"completions/mean_length": 10.0,
|
|
"completions/mean_terminated_length": 10.0,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.12333333333333334,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 1.6996270418167114,
|
|
"kl": 0.6058650612831116,
|
|
"learning_rate": 3.513888888888889e-05,
|
|
"loss": 0.0242,
|
|
"num_tokens": 349633.0,
|
|
"reward": 1.1596319675445557,
|
|
"reward_std": 0.10638084262609482,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.30072730779647827,
|
|
"rewards/belief_accuracy/std": 0.03546027094125748,
|
|
"rewards/env_reward/mean": 0.13830000162124634,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 148
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 10.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 10.0,
|
|
"completions/max_terminated_length": 10.0,
|
|
"completions/mean_length": 10.0,
|
|
"completions/mean_terminated_length": 10.0,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.12416666666666666,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 2.426046848297119,
|
|
"kl": 0.34162479639053345,
|
|
"learning_rate": 3.5e-05,
|
|
"loss": 0.0137,
|
|
"num_tokens": 352073.0,
|
|
"reward": 1.1792874336242676,
|
|
"reward_std": 0.09853797405958176,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.31412917375564575,
|
|
"rewards/belief_accuracy/std": 0.03284599259495735,
|
|
"rewards/env_reward/mean": 0.12460000067949295,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 149
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 19.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 19.0,
|
|
"completions/max_terminated_length": 19.0,
|
|
"completions/mean_length": 12.25,
|
|
"completions/mean_terminated_length": 12.25,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.125,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 7.012240409851074,
|
|
"kl": 0.6527384743094444,
|
|
"learning_rate": 3.486111111111111e-05,
|
|
"loss": 0.0261,
|
|
"num_tokens": 354522.0,
|
|
"reward": 0.9352785348892212,
|
|
"reward_std": 0.07883862406015396,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.26061785221099854,
|
|
"rewards/belief_accuracy/std": 0.030240608379244804,
|
|
"rewards/env_reward/mean": 0.06894999742507935,
|
|
"rewards/env_reward/std": 0.010099999606609344,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 150
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 14.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 14.0,
|
|
"completions/max_terminated_length": 14.0,
|
|
"completions/mean_length": 11.0,
|
|
"completions/mean_terminated_length": 11.0,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.12583333333333332,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 12.943872451782227,
|
|
"kl": 0.6852549761533737,
|
|
"learning_rate": 3.472222222222222e-05,
|
|
"loss": 0.0274,
|
|
"num_tokens": 356966.0,
|
|
"reward": 1.8197388648986816,
|
|
"reward_std": 0.12830005586147308,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.28171297907829285,
|
|
"rewards/belief_accuracy/std": 0.04276669770479202,
|
|
"rewards/env_reward/mean": 0.6164000034332275,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 151
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 11.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 11.0,
|
|
"completions/max_terminated_length": 11.0,
|
|
"completions/mean_length": 10.25,
|
|
"completions/mean_terminated_length": 10.25,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.12666666666666668,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 2.472937822341919,
|
|
"kl": 0.3669867143034935,
|
|
"learning_rate": 3.458333333333333e-05,
|
|
"loss": 0.0147,
|
|
"num_tokens": 359407.0,
|
|
"reward": 1.2027755975723267,
|
|
"reward_std": 0.02803901769220829,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.3199460506439209,
|
|
"rewards/belief_accuracy/std": 0.009259253740310669,
|
|
"rewards/env_reward/mean": 0.12862500548362732,
|
|
"rewards/env_reward/std": 0.012850001454353333,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 152
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 10.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 10.0,
|
|
"completions/max_terminated_length": 10.0,
|
|
"completions/mean_length": 10.0,
|
|
"completions/mean_terminated_length": 10.0,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.1275,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 1.7880311012268066,
|
|
"kl": 0.8225667700171471,
|
|
"learning_rate": 3.444444444444445e-05,
|
|
"loss": 0.0329,
|
|
"num_tokens": 361519.0,
|
|
"reward": 2.0395917892456055,
|
|
"reward_std": 0.3787125051021576,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.3322972357273102,
|
|
"rewards/belief_accuracy/std": 0.03908432275056839,
|
|
"rewards/env_reward/mean": 0.6618000268936157,
|
|
"rewards/env_reward/std": 0.20403560996055603,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 153
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 10.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 10.0,
|
|
"completions/max_terminated_length": 10.0,
|
|
"completions/mean_length": 10.0,
|
|
"completions/mean_terminated_length": 10.0,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.12833333333333333,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 2.15445613861084,
|
|
"kl": 0.41129303723573685,
|
|
"learning_rate": 3.430555555555556e-05,
|
|
"loss": 0.0165,
|
|
"num_tokens": 363959.0,
|
|
"reward": 0.5192468166351318,
|
|
"reward_std": 0.08724445104598999,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.17216560244560242,
|
|
"rewards/belief_accuracy/std": 0.018518514931201935,
|
|
"rewards/env_reward/mean": -0.03149999678134918,
|
|
"rewards/env_reward/std": 0.09520000219345093,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 154
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 10.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 10.0,
|
|
"completions/max_terminated_length": 10.0,
|
|
"completions/mean_length": 10.0,
|
|
"completions/mean_terminated_length": 10.0,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.12916666666666668,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 1.1737233400344849,
|
|
"kl": 0.35325073450803757,
|
|
"learning_rate": 3.4166666666666666e-05,
|
|
"loss": 0.0141,
|
|
"num_tokens": 366399.0,
|
|
"reward": 0.9217313528060913,
|
|
"reward_std": 0.055555522441864014,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.2698771059513092,
|
|
"rewards/belief_accuracy/std": 0.018518507480621338,
|
|
"rewards/env_reward/mean": 0.0414000004529953,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 155
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 13.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 13.0,
|
|
"completions/max_terminated_length": 13.0,
|
|
"completions/mean_length": 10.75,
|
|
"completions/mean_terminated_length": 10.75,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.13,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 11.461524963378906,
|
|
"kl": 0.7672573626041412,
|
|
"learning_rate": 3.402777777777778e-05,
|
|
"loss": 0.0307,
|
|
"num_tokens": 368842.0,
|
|
"reward": 1.8642927408218384,
|
|
"reward_std": 0.12469932436943054,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.40906423330307007,
|
|
"rewards/belief_accuracy/std": 0.04156642034649849,
|
|
"rewards/env_reward/mean": 0.391400009393692,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 156
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 10.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 10.0,
|
|
"completions/max_terminated_length": 10.0,
|
|
"completions/mean_length": 10.0,
|
|
"completions/mean_terminated_length": 10.0,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.13083333333333333,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 1.6521883010864258,
|
|
"kl": 0.3942847102880478,
|
|
"learning_rate": 3.388888888888889e-05,
|
|
"loss": 0.0158,
|
|
"num_tokens": 371282.0,
|
|
"reward": 1.6033741235733032,
|
|
"reward_std": 0.05555558204650879,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.37149137258529663,
|
|
"rewards/belief_accuracy/std": 0.018518522381782532,
|
|
"rewards/env_reward/mean": 0.29260000586509705,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 157
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 10.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 10.0,
|
|
"completions/max_terminated_length": 10.0,
|
|
"completions/mean_length": 10.0,
|
|
"completions/mean_terminated_length": 10.0,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.13166666666666665,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 2.972644329071045,
|
|
"kl": 0.40470410883426666,
|
|
"learning_rate": 3.375000000000001e-05,
|
|
"loss": 0.0162,
|
|
"num_tokens": 373722.0,
|
|
"reward": 0.5326824188232422,
|
|
"reward_std": 0.02388697862625122,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.29159414768218994,
|
|
"rewards/belief_accuracy/std": 0.007962316274642944,
|
|
"rewards/env_reward/mean": -0.2614000141620636,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 158
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 10.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 10.0,
|
|
"completions/max_terminated_length": 10.0,
|
|
"completions/mean_length": 10.0,
|
|
"completions/mean_terminated_length": 10.0,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.1325,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 2.185356855392456,
|
|
"kl": 0.3720128685235977,
|
|
"learning_rate": 3.3611111111111116e-05,
|
|
"loss": 0.0149,
|
|
"num_tokens": 376162.0,
|
|
"reward": 1.7469711303710938,
|
|
"reward_std": 0.039430540055036545,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.3106195032596588,
|
|
"rewards/belief_accuracy/std": 0.018518522381782532,
|
|
"rewards/env_reward/mean": 0.5100749731063843,
|
|
"rewards/env_reward/std": 0.01074999663978815,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 159
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 15.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 15.0,
|
|
"completions/max_terminated_length": 15.0,
|
|
"completions/mean_length": 11.25,
|
|
"completions/mean_terminated_length": 11.25,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.13333333333333333,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 6.548361301422119,
|
|
"kl": 0.527301162481308,
|
|
"learning_rate": 3.347222222222222e-05,
|
|
"loss": 0.0211,
|
|
"num_tokens": 378607.0,
|
|
"reward": 1.302304744720459,
|
|
"reward_std": 0.1430416703224182,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.3570098876953125,
|
|
"rewards/belief_accuracy/std": 0.030240608379244804,
|
|
"rewards/env_reward/mean": 0.12084999680519104,
|
|
"rewards/env_reward/std": 0.07372763007879257,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 160
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 10.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 10.0,
|
|
"completions/max_terminated_length": 10.0,
|
|
"completions/mean_length": 10.0,
|
|
"completions/mean_terminated_length": 10.0,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.13416666666666666,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 4.99193000793457,
|
|
"kl": 0.3444783613085747,
|
|
"learning_rate": 3.3333333333333335e-05,
|
|
"loss": 0.0138,
|
|
"num_tokens": 381047.0,
|
|
"reward": 1.1595714092254639,
|
|
"reward_std": 0.08464870601892471,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.30360713601112366,
|
|
"rewards/belief_accuracy/std": 0.02821623906493187,
|
|
"rewards/env_reward/mean": 0.13249999284744263,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 161
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.25,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 10.0,
|
|
"completions/mean_length": 15.5,
|
|
"completions/mean_terminated_length": 10.0,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.135,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 2.1529526710510254,
|
|
"kl": 0.4582265689969063,
|
|
"learning_rate": 3.3194444444444444e-05,
|
|
"loss": 0.0183,
|
|
"num_tokens": 383509.0,
|
|
"reward": 1.349330186843872,
|
|
"reward_std": 0.10180103778839111,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.3788100481033325,
|
|
"rewards/belief_accuracy/std": 0.03024062141776085,
|
|
"rewards/env_reward/mean": 0.10860000550746918,
|
|
"rewards/env_reward/std": 0.009930425323545933,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 162
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 10.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 10.0,
|
|
"completions/max_terminated_length": 10.0,
|
|
"completions/mean_length": 10.0,
|
|
"completions/mean_terminated_length": 10.0,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.13583333333333333,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 1.7175657749176025,
|
|
"kl": 0.37873465567827225,
|
|
"learning_rate": 3.3055555555555553e-05,
|
|
"loss": 0.0151,
|
|
"num_tokens": 385625.0,
|
|
"reward": 1.6891059875488281,
|
|
"reward_std": 0.6854898929595947,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.328631192445755,
|
|
"rewards/belief_accuracy/std": 0.024180419743061066,
|
|
"rewards/env_reward/mean": 0.4354749917984009,
|
|
"rewards/env_reward/std": 0.4491499960422516,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 163
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 10.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 10.0,
|
|
"completions/max_terminated_length": 10.0,
|
|
"completions/mean_length": 10.0,
|
|
"completions/mean_terminated_length": 10.0,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.13666666666666666,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 4.273981094360352,
|
|
"kl": 0.3159671910107136,
|
|
"learning_rate": 3.291666666666667e-05,
|
|
"loss": 0.0126,
|
|
"num_tokens": 388065.0,
|
|
"reward": 2.1328177452087402,
|
|
"reward_std": 0.10624248534440994,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.44292253255844116,
|
|
"rewards/belief_accuracy/std": 0.035414163023233414,
|
|
"rewards/env_reward/mean": 0.5026999711990356,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 164
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 10.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 10.0,
|
|
"completions/max_terminated_length": 10.0,
|
|
"completions/mean_length": 10.0,
|
|
"completions/mean_terminated_length": 10.0,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.1375,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 3.6132190227508545,
|
|
"kl": 0.4029811918735504,
|
|
"learning_rate": 3.277777777777778e-05,
|
|
"loss": 0.0161,
|
|
"num_tokens": 390505.0,
|
|
"reward": 0.6693507432937622,
|
|
"reward_std": 0.06415002793073654,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.29170024394989014,
|
|
"rewards/belief_accuracy/std": 0.02138334885239601,
|
|
"rewards/env_reward/mean": -0.1704999953508377,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 165
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 17.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 17.0,
|
|
"completions/max_terminated_length": 17.0,
|
|
"completions/mean_length": 12.75,
|
|
"completions/mean_terminated_length": 12.75,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.13833333333333334,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 6.097934246063232,
|
|
"kl": 1.0165367349982262,
|
|
"learning_rate": 3.263888888888889e-05,
|
|
"loss": 0.0407,
|
|
"num_tokens": 392956.0,
|
|
"reward": 1.1792399883270264,
|
|
"reward_std": 0.04536087065935135,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.35646331310272217,
|
|
"rewards/belief_accuracy/std": 0.015120310708880424,
|
|
"rewards/env_reward/mean": 0.039900001138448715,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 166
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 14.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 14.0,
|
|
"completions/max_terminated_length": 14.0,
|
|
"completions/mean_length": 11.0,
|
|
"completions/mean_terminated_length": 11.0,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.13916666666666666,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 6.14641809463501,
|
|
"kl": 0.4241368919610977,
|
|
"learning_rate": 3.2500000000000004e-05,
|
|
"loss": 0.017,
|
|
"num_tokens": 395400.0,
|
|
"reward": 1.125349521636963,
|
|
"reward_std": 0.05555558204650879,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.30679982900619507,
|
|
"rewards/belief_accuracy/std": 0.018518522381782532,
|
|
"rewards/env_reward/mean": 0.10329999774694443,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 167
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.25,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 21.0,
|
|
"completions/mean_length": 18.25,
|
|
"completions/mean_terminated_length": 13.666666984558105,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.14,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 5.288893222808838,
|
|
"kl": 0.6163829118013382,
|
|
"learning_rate": 3.236111111111111e-05,
|
|
"loss": 0.0247,
|
|
"num_tokens": 397873.0,
|
|
"reward": 1.3380489349365234,
|
|
"reward_std": 0.051960092037916183,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.2795746326446533,
|
|
"rewards/belief_accuracy/std": 0.018518522381782532,
|
|
"rewards/env_reward/mean": 0.29954999685287476,
|
|
"rewards/env_reward/std": 0.03827832639217377,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 168
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 22.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 22.0,
|
|
"completions/max_terminated_length": 22.0,
|
|
"completions/mean_length": 14.0,
|
|
"completions/mean_terminated_length": 14.0,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.14083333333333334,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 6.591170787811279,
|
|
"kl": 0.5263254791498184,
|
|
"learning_rate": 3.222222222222223e-05,
|
|
"loss": 0.0211,
|
|
"num_tokens": 400329.0,
|
|
"reward": 1.7974923849105835,
|
|
"reward_std": 0.030818266794085503,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.3927474319934845,
|
|
"rewards/belief_accuracy/std": 0.010272752493619919,
|
|
"rewards/env_reward/mean": 0.37950000166893005,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 169
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 14.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 14.0,
|
|
"completions/max_terminated_length": 14.0,
|
|
"completions/mean_length": 11.0,
|
|
"completions/mean_terminated_length": 11.0,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.14166666666666666,
|
|
"frac_reward_zero_std": 1.0,
|
|
"grad_norm": 2.159360885620117,
|
|
"kl": 0.7782266139984131,
|
|
"learning_rate": 3.208333333333334e-05,
|
|
"loss": 0.0311,
|
|
"num_tokens": 402773.0,
|
|
"reward": 1.3888577222824097,
|
|
"reward_std": 0.0,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.34553590416908264,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": 0.20149999856948853,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 170
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 26.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 26.0,
|
|
"completions/max_terminated_length": 26.0,
|
|
"completions/mean_length": 14.0,
|
|
"completions/mean_terminated_length": 14.0,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.1425,
|
|
"frac_reward_zero_std": 1.0,
|
|
"grad_norm": 2.983961343765259,
|
|
"kl": 1.172668918967247,
|
|
"learning_rate": 3.194444444444444e-05,
|
|
"loss": 0.0469,
|
|
"num_tokens": 405229.0,
|
|
"reward": 1.5690133571624756,
|
|
"reward_std": 0.0,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.4289877712726593,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": 0.15469999611377716,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 171
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 10.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 10.0,
|
|
"completions/max_terminated_length": 10.0,
|
|
"completions/mean_length": 10.0,
|
|
"completions/mean_terminated_length": 10.0,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.14333333333333334,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 2.0448641777038574,
|
|
"kl": 0.31760095804929733,
|
|
"learning_rate": 3.180555555555556e-05,
|
|
"loss": 0.0127,
|
|
"num_tokens": 407669.0,
|
|
"reward": 1.2858712673187256,
|
|
"reward_std": 0.013649980537593365,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.3892820477485657,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": 0.04535000026226044,
|
|
"rewards/env_reward/std": 0.009099999442696571,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 172
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 17.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 17.0,
|
|
"completions/max_terminated_length": 17.0,
|
|
"completions/mean_length": 11.75,
|
|
"completions/mean_terminated_length": 11.75,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.14416666666666667,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 9.481169700622559,
|
|
"kl": 1.9302819445729256,
|
|
"learning_rate": 3.1666666666666666e-05,
|
|
"loss": 0.0772,
|
|
"num_tokens": 410116.0,
|
|
"reward": 1.5479769706726074,
|
|
"reward_std": 0.05555546283721924,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.32457566261291504,
|
|
"rewards/belief_accuracy/std": 0.018518507480621338,
|
|
"rewards/env_reward/mean": 0.34950000047683716,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 173
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 14.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 14.0,
|
|
"completions/max_terminated_length": 14.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.145,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 5.64962100982666,
|
|
"kl": 0.6141732186079025,
|
|
"learning_rate": 3.1527777777777775e-05,
|
|
"loss": 0.0246,
|
|
"num_tokens": 412564.0,
|
|
"reward": 2.0601048469543457,
|
|
"reward_std": 0.4084498882293701,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.3570099174976349,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": 0.6260499954223633,
|
|
"rewards/env_reward/std": 0.27230000495910645,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 174
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 14.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 14.0,
|
|
"completions/max_terminated_length": 14.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.14583333333333334,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 19.281583786010742,
|
|
"kl": 0.6186791881918907,
|
|
"learning_rate": 3.138888888888889e-05,
|
|
"loss": 0.0247,
|
|
"num_tokens": 415012.0,
|
|
"reward": 1.3272333145141602,
|
|
"reward_std": 0.06415006518363953,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.2631944417953491,
|
|
"rewards/belief_accuracy/std": 0.02138334885239601,
|
|
"rewards/env_reward/mean": 0.32510000467300415,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 175
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 10.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 10.0,
|
|
"completions/max_terminated_length": 10.0,
|
|
"completions/mean_length": 10.0,
|
|
"completions/mean_terminated_length": 10.0,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.14666666666666667,
|
|
"frac_reward_zero_std": 1.0,
|
|
"grad_norm": 0.0038161983247846365,
|
|
"kl": 0.32898589968681335,
|
|
"learning_rate": 3.125e-05,
|
|
"loss": 0.0132,
|
|
"num_tokens": 417452.0,
|
|
"reward": 1.3439536094665527,
|
|
"reward_std": 0.0,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.26061785221099854,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": 0.34139999747276306,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 176
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 10.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 10.0,
|
|
"completions/max_terminated_length": 10.0,
|
|
"completions/mean_length": 10.0,
|
|
"completions/mean_terminated_length": 10.0,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.1475,
|
|
"frac_reward_zero_std": 1.0,
|
|
"grad_norm": 0.02249019406735897,
|
|
"kl": 0.5504751205444336,
|
|
"learning_rate": 3.111111111111111e-05,
|
|
"loss": 0.022,
|
|
"num_tokens": 419892.0,
|
|
"reward": 1.3552190065383911,
|
|
"reward_std": 0.0,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.42342302203178406,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": 0.02329999953508377,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 177
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 10.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 10.0,
|
|
"completions/max_terminated_length": 10.0,
|
|
"completions/mean_length": 10.0,
|
|
"completions/mean_terminated_length": 10.0,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.14833333333333334,
|
|
"frac_reward_zero_std": 1.0,
|
|
"grad_norm": 0.004471380263566971,
|
|
"kl": 0.3276918828487396,
|
|
"learning_rate": 3.0972222222222226e-05,
|
|
"loss": 0.0131,
|
|
"num_tokens": 422332.0,
|
|
"reward": 1.6074572801589966,
|
|
"reward_std": 0.0,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.3073524236679077,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": 0.4235999882221222,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 178
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 14.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 14.0,
|
|
"completions/max_terminated_length": 14.0,
|
|
"completions/mean_length": 11.0,
|
|
"completions/mean_terminated_length": 11.0,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.14916666666666667,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 12.483266830444336,
|
|
"kl": 0.5065018609166145,
|
|
"learning_rate": 3.0833333333333335e-05,
|
|
"loss": 0.0203,
|
|
"num_tokens": 424776.0,
|
|
"reward": 1.071094274520874,
|
|
"reward_std": 0.012580275535583496,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.2935647964477539,
|
|
"rewards/belief_accuracy/std": 0.004193440079689026,
|
|
"rewards/env_reward/mean": 0.09359999746084213,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 179
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.5,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 10.0,
|
|
"completions/mean_length": 21.0,
|
|
"completions/mean_terminated_length": 10.0,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.15,
|
|
"frac_reward_zero_std": 1.0,
|
|
"grad_norm": 0.9969475865364075,
|
|
"kl": 0.444540049880743,
|
|
"learning_rate": 3.069444444444445e-05,
|
|
"loss": 0.0178,
|
|
"num_tokens": 427260.0,
|
|
"reward": 1.8959006071090698,
|
|
"reward_std": 0.0,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.38445019721984863,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": 0.4616999924182892,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 180
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 13.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 13.0,
|
|
"completions/max_terminated_length": 13.0,
|
|
"completions/mean_length": 10.75,
|
|
"completions/mean_terminated_length": 10.75,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.15083333333333335,
|
|
"frac_reward_zero_std": 1.0,
|
|
"grad_norm": 10.70042610168457,
|
|
"kl": 1.6388273686170578,
|
|
"learning_rate": 3.055555555555556e-05,
|
|
"loss": 0.0656,
|
|
"num_tokens": 429703.0,
|
|
"reward": 1.3662413358688354,
|
|
"reward_std": 0.0,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.4158470928668976,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": 0.04580000042915344,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 181
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 14.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 14.0,
|
|
"completions/max_terminated_length": 14.0,
|
|
"completions/mean_length": 14.0,
|
|
"completions/mean_terminated_length": 14.0,
|
|
"completions/min_length": 14.0,
|
|
"completions/min_terminated_length": 14.0,
|
|
"epoch": 0.15166666666666667,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 2.0670108795166016,
|
|
"kl": 1.061010330915451,
|
|
"learning_rate": 3.0416666666666666e-05,
|
|
"loss": 0.0424,
|
|
"num_tokens": 432159.0,
|
|
"reward": 1.199580192565918,
|
|
"reward_std": 0.059318430721759796,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.3253183662891388,
|
|
"rewards/belief_accuracy/std": 0.018518522381782532,
|
|
"rewards/env_reward/mean": 0.11574999988079071,
|
|
"rewards/env_reward/std": 0.004099187441170216,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 182
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 14.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 14.0,
|
|
"completions/max_terminated_length": 14.0,
|
|
"completions/mean_length": 11.0,
|
|
"completions/mean_terminated_length": 11.0,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.1525,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 4.715631484985352,
|
|
"kl": 0.733733519911766,
|
|
"learning_rate": 3.0277777777777776e-05,
|
|
"loss": 0.0293,
|
|
"num_tokens": 434487.0,
|
|
"reward": 1.3816564083099365,
|
|
"reward_std": 0.19201354682445526,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.22851046919822693,
|
|
"rewards/belief_accuracy/std": 0.018518514931201935,
|
|
"rewards/env_reward/mean": 0.4307500123977661,
|
|
"rewards/env_reward/std": 0.1354999989271164,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 183
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 14.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 14.0,
|
|
"completions/max_terminated_length": 14.0,
|
|
"completions/mean_length": 11.0,
|
|
"completions/mean_terminated_length": 11.0,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.15333333333333332,
|
|
"frac_reward_zero_std": 1.0,
|
|
"grad_norm": 3.4244911670684814,
|
|
"kl": 0.615076094865799,
|
|
"learning_rate": 3.0138888888888888e-05,
|
|
"loss": 0.0246,
|
|
"num_tokens": 436931.0,
|
|
"reward": 1.1164329051971436,
|
|
"reward_std": 0.0,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.33457762002944946,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": 0.04179999977350235,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 184
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.25,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 14.0,
|
|
"completions/mean_length": 18.5,
|
|
"completions/mean_terminated_length": 14.0,
|
|
"completions/min_length": 14.0,
|
|
"completions/min_terminated_length": 14.0,
|
|
"epoch": 0.15416666666666667,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 7.267205238342285,
|
|
"kl": 0.7707809656858444,
|
|
"learning_rate": 3e-05,
|
|
"loss": 0.0308,
|
|
"num_tokens": 439405.0,
|
|
"reward": 0.9776316285133362,
|
|
"reward_std": 0.05555548146367073,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.3959105610847473,
|
|
"rewards/belief_accuracy/std": 0.018518507480621338,
|
|
"rewards/env_reward/mean": -0.17339999973773956,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 185
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 10.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 10.0,
|
|
"completions/max_terminated_length": 10.0,
|
|
"completions/mean_length": 10.0,
|
|
"completions/mean_terminated_length": 10.0,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.155,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 4.635013103485107,
|
|
"kl": 0.3450919836759567,
|
|
"learning_rate": 2.9861111111111113e-05,
|
|
"loss": 0.0138,
|
|
"num_tokens": 441845.0,
|
|
"reward": 1.654306173324585,
|
|
"reward_std": 0.06415006518363953,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.40296870470046997,
|
|
"rewards/belief_accuracy/std": 0.02138334885239601,
|
|
"rewards/env_reward/mean": 0.2635999917984009,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 186
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.25,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 10.0,
|
|
"completions/mean_length": 15.5,
|
|
"completions/mean_terminated_length": 10.0,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.15583333333333332,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 409.9649963378906,
|
|
"kl": 0.933860570192337,
|
|
"learning_rate": 2.9722222222222223e-05,
|
|
"loss": 0.0374,
|
|
"num_tokens": 444307.0,
|
|
"reward": 1.4882175922393799,
|
|
"reward_std": 0.005075931549072266,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.310455858707428,
|
|
"rewards/belief_accuracy/std": 0.0016919821500778198,
|
|
"rewards/env_reward/mean": 0.3379000127315521,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 187
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 14.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 14.0,
|
|
"completions/max_terminated_length": 14.0,
|
|
"completions/mean_length": 14.0,
|
|
"completions/mean_terminated_length": 14.0,
|
|
"completions/min_length": 14.0,
|
|
"completions/min_terminated_length": 14.0,
|
|
"epoch": 0.15666666666666668,
|
|
"frac_reward_zero_std": 1.0,
|
|
"grad_norm": 0.20684058964252472,
|
|
"kl": 1.212807685136795,
|
|
"learning_rate": 2.9583333333333335e-05,
|
|
"loss": 0.0485,
|
|
"num_tokens": 446763.0,
|
|
"reward": 1.4152576923370361,
|
|
"reward_std": 0.0,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.34553590416908264,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": 0.2190999984741211,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 188
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 10.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 10.0,
|
|
"completions/max_terminated_length": 10.0,
|
|
"completions/mean_length": 10.0,
|
|
"completions/mean_terminated_length": 10.0,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.1575,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 2.166232109069824,
|
|
"kl": 0.45745784789323807,
|
|
"learning_rate": 2.9444444444444448e-05,
|
|
"loss": 0.0183,
|
|
"num_tokens": 448695.0,
|
|
"reward": 1.4528826475143433,
|
|
"reward_std": 0.2088000327348709,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.40842756628990173,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": 0.11839999258518219,
|
|
"rewards/env_reward/std": 0.13920000195503235,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 189
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 14.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 14.0,
|
|
"completions/max_terminated_length": 14.0,
|
|
"completions/mean_length": 14.0,
|
|
"completions/mean_terminated_length": 14.0,
|
|
"completions/min_length": 14.0,
|
|
"completions/min_terminated_length": 14.0,
|
|
"epoch": 0.15833333333333333,
|
|
"frac_reward_zero_std": 1.0,
|
|
"grad_norm": 0.026986636221408844,
|
|
"kl": 1.0310405790805817,
|
|
"learning_rate": 2.9305555555555557e-05,
|
|
"loss": 0.0412,
|
|
"num_tokens": 451151.0,
|
|
"reward": 0.7838923931121826,
|
|
"reward_std": 0.0,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.22358080744743347,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": 0.042100001126527786,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 190
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 10.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 10.0,
|
|
"completions/max_terminated_length": 10.0,
|
|
"completions/mean_length": 10.0,
|
|
"completions/mean_terminated_length": 10.0,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.15916666666666668,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 2.072092056274414,
|
|
"kl": 0.402814045548439,
|
|
"learning_rate": 2.916666666666667e-05,
|
|
"loss": 0.0161,
|
|
"num_tokens": 453591.0,
|
|
"reward": 1.2498445510864258,
|
|
"reward_std": 0.1845216602087021,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.4303731918334961,
|
|
"rewards/belief_accuracy/std": 0.018518507480621338,
|
|
"rewards/env_reward/mean": -0.060850001871585846,
|
|
"rewards/env_reward/std": 0.13030001521110535,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 191
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.25,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 14.0,
|
|
"completions/mean_length": 16.5,
|
|
"completions/mean_terminated_length": 11.333333969116211,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.16,
|
|
"frac_reward_zero_std": 1.0,
|
|
"grad_norm": 4.364622116088867,
|
|
"kl": 0.520959883928299,
|
|
"learning_rate": 2.9027777777777782e-05,
|
|
"loss": 0.0208,
|
|
"num_tokens": 456057.0,
|
|
"reward": 1.8143787384033203,
|
|
"reward_std": 0.0,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.28782621026039124,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": 0.600600004196167,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 192
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.25,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 14.0,
|
|
"completions/mean_length": 18.5,
|
|
"completions/mean_terminated_length": 14.0,
|
|
"completions/min_length": 14.0,
|
|
"completions/min_terminated_length": 14.0,
|
|
"epoch": 0.16083333333333333,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 6.7475361824035645,
|
|
"kl": 0.5597751773893833,
|
|
"learning_rate": 2.8888888888888888e-05,
|
|
"loss": 0.0224,
|
|
"num_tokens": 458531.0,
|
|
"reward": -0.030727386474609375,
|
|
"reward_std": 3.4795150756835938,
|
|
"rewards/action_legal/mean": -0.25,
|
|
"rewards/action_legal/std": 0.5,
|
|
"rewards/belief_accuracy/mean": 0.23410753905773163,
|
|
"rewards/belief_accuracy/std": 0.2894050180912018,
|
|
"rewards/env_reward/mean": -0.4886999726295471,
|
|
"rewards/env_reward/std": 1.6742000579833984,
|
|
"rewards/format_valid/mean": 0.25,
|
|
"rewards/format_valid/std": 1.5,
|
|
"step": 193
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 19.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 19.0,
|
|
"completions/max_terminated_length": 19.0,
|
|
"completions/mean_length": 15.25,
|
|
"completions/mean_terminated_length": 15.25,
|
|
"completions/min_length": 14.0,
|
|
"completions/min_terminated_length": 14.0,
|
|
"epoch": 0.16166666666666665,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 2.9581692218780518,
|
|
"kl": 0.9280332177877426,
|
|
"learning_rate": 2.8749999999999997e-05,
|
|
"loss": 0.0371,
|
|
"num_tokens": 460992.0,
|
|
"reward": 2.1100287437438965,
|
|
"reward_std": 0.2535001337528229,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.28782621026039124,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": 0.7976999878883362,
|
|
"rewards/env_reward/std": 0.1690000295639038,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 194
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 14.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 14.0,
|
|
"completions/max_terminated_length": 14.0,
|
|
"completions/mean_length": 14.0,
|
|
"completions/mean_terminated_length": 14.0,
|
|
"completions/min_length": 14.0,
|
|
"completions/min_terminated_length": 14.0,
|
|
"epoch": 0.1625,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 2.230289936065674,
|
|
"kl": 0.9568361341953278,
|
|
"learning_rate": 2.861111111111111e-05,
|
|
"loss": 0.0383,
|
|
"num_tokens": 463448.0,
|
|
"reward": 1.7242618799209595,
|
|
"reward_std": 0.059025008231401443,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.46602481603622437,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": 0.18412499129772186,
|
|
"rewards/env_reward/std": 0.0393499955534935,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 195
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 16.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 16.0,
|
|
"completions/max_terminated_length": 16.0,
|
|
"completions/mean_length": 13.0,
|
|
"completions/mean_terminated_length": 13.0,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.16333333333333333,
|
|
"frac_reward_zero_std": 1.0,
|
|
"grad_norm": 0.7262094616889954,
|
|
"kl": 0.7856011986732483,
|
|
"learning_rate": 2.8472222222222223e-05,
|
|
"loss": 0.0314,
|
|
"num_tokens": 465900.0,
|
|
"reward": 0.789142370223999,
|
|
"reward_std": 0.0,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.22358080744743347,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": 0.04560000076889992,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 196
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 10.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 10.0,
|
|
"completions/max_terminated_length": 10.0,
|
|
"completions/mean_length": 10.0,
|
|
"completions/mean_terminated_length": 10.0,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.16416666666666666,
|
|
"frac_reward_zero_std": 1.0,
|
|
"grad_norm": 0.02980131469666958,
|
|
"kl": 0.49150124192237854,
|
|
"learning_rate": 2.8333333333333335e-05,
|
|
"loss": 0.0197,
|
|
"num_tokens": 468340.0,
|
|
"reward": 1.2274935245513916,
|
|
"reward_std": 0.0,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.269497811794281,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": 0.2460000067949295,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 197
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 14.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 14.0,
|
|
"completions/max_terminated_length": 14.0,
|
|
"completions/mean_length": 11.0,
|
|
"completions/mean_terminated_length": 11.0,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.165,
|
|
"frac_reward_zero_std": 1.0,
|
|
"grad_norm": 0.6604632139205933,
|
|
"kl": 0.6369934007525444,
|
|
"learning_rate": 2.8194444444444445e-05,
|
|
"loss": 0.0255,
|
|
"num_tokens": 470784.0,
|
|
"reward": 1.088317632675171,
|
|
"reward_std": 0.0,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.4617558717727661,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": -0.2312999963760376,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 198
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 30.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 30.0,
|
|
"completions/max_terminated_length": 30.0,
|
|
"completions/mean_length": 18.0,
|
|
"completions/mean_terminated_length": 18.0,
|
|
"completions/min_length": 14.0,
|
|
"completions/min_terminated_length": 14.0,
|
|
"epoch": 0.16583333333333333,
|
|
"frac_reward_zero_std": 1.0,
|
|
"grad_norm": 1.6423118114471436,
|
|
"kl": 1.3361373841762543,
|
|
"learning_rate": 2.8055555555555557e-05,
|
|
"loss": 0.0534,
|
|
"num_tokens": 473256.0,
|
|
"reward": 1.2390079498291016,
|
|
"reward_std": 0.0,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.386385977268219,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": 0.019899999722838402,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 199
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 14.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 14.0,
|
|
"completions/max_terminated_length": 14.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.16666666666666666,
|
|
"frac_reward_zero_std": 1.0,
|
|
"grad_norm": 0.3381861746311188,
|
|
"kl": 0.7480977475643158,
|
|
"learning_rate": 2.791666666666667e-05,
|
|
"loss": 0.0299,
|
|
"num_tokens": 475704.0,
|
|
"reward": 1.2790579795837402,
|
|
"reward_std": 0.0,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.386385977268219,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": 0.04659999907016754,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 200
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.25,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 14.0,
|
|
"completions/mean_length": 18.5,
|
|
"completions/mean_terminated_length": 14.0,
|
|
"completions/min_length": 14.0,
|
|
"completions/min_terminated_length": 14.0,
|
|
"epoch": 0.1675,
|
|
"frac_reward_zero_std": 1.0,
|
|
"grad_norm": 3.254331588745117,
|
|
"kl": 1.571349710226059,
|
|
"learning_rate": 2.777777777777778e-05,
|
|
"loss": 0.0629,
|
|
"num_tokens": 478178.0,
|
|
"reward": 1.3384069204330444,
|
|
"reward_std": 0.0,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.37561896443367004,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": 0.10769999772310257,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 201
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 13.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 13.0,
|
|
"completions/max_terminated_length": 13.0,
|
|
"completions/mean_length": 10.75,
|
|
"completions/mean_terminated_length": 10.75,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.16833333333333333,
|
|
"frac_reward_zero_std": 1.0,
|
|
"grad_norm": 0.34370848536491394,
|
|
"kl": 0.48092682659626007,
|
|
"learning_rate": 2.7638888888888892e-05,
|
|
"loss": 0.0192,
|
|
"num_tokens": 480621.0,
|
|
"reward": 1.2113056182861328,
|
|
"reward_std": 0.0,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.3113018572330475,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": 0.15160000324249268,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 202
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.25,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 10.0,
|
|
"completions/mean_length": 15.5,
|
|
"completions/mean_terminated_length": 10.0,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.16916666666666666,
|
|
"frac_reward_zero_std": 1.0,
|
|
"grad_norm": 21.64349937438965,
|
|
"kl": 0.2917898967862129,
|
|
"learning_rate": 2.7500000000000004e-05,
|
|
"loss": 0.0117,
|
|
"num_tokens": 483083.0,
|
|
"reward": 1.6178982257843018,
|
|
"reward_std": 0.0,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.29648277163505554,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": 0.4523000121116638,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 203
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 10.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 10.0,
|
|
"completions/max_terminated_length": 10.0,
|
|
"completions/mean_length": 10.0,
|
|
"completions/mean_terminated_length": 10.0,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.17,
|
|
"frac_reward_zero_std": 1.0,
|
|
"grad_norm": 0.006457047536969185,
|
|
"kl": 0.4465770423412323,
|
|
"learning_rate": 2.7361111111111114e-05,
|
|
"loss": 0.0179,
|
|
"num_tokens": 485523.0,
|
|
"reward": 1.2813693284988403,
|
|
"reward_std": 0.0,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.39455646276474,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": 0.03180000185966492,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 204
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 17.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 17.0,
|
|
"completions/max_terminated_length": 17.0,
|
|
"completions/mean_length": 11.5,
|
|
"completions/mean_terminated_length": 11.5,
|
|
"completions/min_length": 9.0,
|
|
"completions/min_terminated_length": 9.0,
|
|
"epoch": 0.17083333333333334,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 2.0349791049957275,
|
|
"kl": 0.9023318737745285,
|
|
"learning_rate": 2.7222222222222223e-05,
|
|
"loss": 0.0361,
|
|
"num_tokens": 487829.0,
|
|
"reward": 1.9587321281433105,
|
|
"reward_std": 0.12462284415960312,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.3476565480232239,
|
|
"rewards/belief_accuracy/std": 0.018518522381782532,
|
|
"rewards/env_reward/mean": 0.5771750211715698,
|
|
"rewards/env_reward/std": 0.09858879446983337,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 205
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 20.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 20.0,
|
|
"completions/max_terminated_length": 20.0,
|
|
"completions/mean_length": 12.5,
|
|
"completions/mean_terminated_length": 12.5,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.17166666666666666,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 17.28134536743164,
|
|
"kl": 0.6101161614060402,
|
|
"learning_rate": 2.7083333333333332e-05,
|
|
"loss": 0.0244,
|
|
"num_tokens": 490279.0,
|
|
"reward": 1.4633797407150269,
|
|
"reward_std": 0.007893800735473633,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.44797658920288086,
|
|
"rewards/belief_accuracy/std": 0.002631261944770813,
|
|
"rewards/env_reward/mean": 0.046300001442432404,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 206
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 10.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 10.0,
|
|
"completions/max_terminated_length": 10.0,
|
|
"completions/mean_length": 10.0,
|
|
"completions/mean_terminated_length": 10.0,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.1725,
|
|
"frac_reward_zero_std": 1.0,
|
|
"grad_norm": 0.001407866133376956,
|
|
"kl": 0.5468098521232605,
|
|
"learning_rate": 2.6944444444444445e-05,
|
|
"loss": 0.0219,
|
|
"num_tokens": 492719.0,
|
|
"reward": 1.327993392944336,
|
|
"reward_std": 0.0,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.269497811794281,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": 0.31299999356269836,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 207
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 19.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 19.0,
|
|
"completions/max_terminated_length": 19.0,
|
|
"completions/mean_length": 12.25,
|
|
"completions/mean_terminated_length": 12.25,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.17333333333333334,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 5.544673919677734,
|
|
"kl": 0.5928681045770645,
|
|
"learning_rate": 2.6805555555555557e-05,
|
|
"loss": 0.0237,
|
|
"num_tokens": 495028.0,
|
|
"reward": 1.942160725593567,
|
|
"reward_std": 0.36202511191368103,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.29648277163505554,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": 0.6684750318527222,
|
|
"rewards/env_reward/std": 0.24135003983974457,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 208
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 26.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 26.0,
|
|
"completions/max_terminated_length": 26.0,
|
|
"completions/mean_length": 15.0,
|
|
"completions/mean_terminated_length": 15.0,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.17416666666666666,
|
|
"frac_reward_zero_std": 1.0,
|
|
"grad_norm": 0.25935015082359314,
|
|
"kl": 0.5315433144569397,
|
|
"learning_rate": 2.6666666666666667e-05,
|
|
"loss": 0.0213,
|
|
"num_tokens": 497488.0,
|
|
"reward": 2.0421676635742188,
|
|
"reward_std": 0.0,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.4617558717727661,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": 0.40459999442100525,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 209
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 10.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 10.0,
|
|
"completions/max_terminated_length": 10.0,
|
|
"completions/mean_length": 10.0,
|
|
"completions/mean_terminated_length": 10.0,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.175,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 4.167694091796875,
|
|
"kl": 0.49413691461086273,
|
|
"learning_rate": 2.652777777777778e-05,
|
|
"loss": 0.0198,
|
|
"num_tokens": 499928.0,
|
|
"reward": 1.8865565061569214,
|
|
"reward_std": 0.05555558204650879,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.2970854640007019,
|
|
"rewards/belief_accuracy/std": 0.018518522381782532,
|
|
"rewards/env_reward/mean": 0.6302000284194946,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 210
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 10.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 10.0,
|
|
"completions/max_terminated_length": 10.0,
|
|
"completions/mean_length": 10.0,
|
|
"completions/mean_terminated_length": 10.0,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.17583333333333334,
|
|
"frac_reward_zero_std": 1.0,
|
|
"grad_norm": 0.009997401386499405,
|
|
"kl": 0.4289768636226654,
|
|
"learning_rate": 2.6388888888888892e-05,
|
|
"loss": 0.0172,
|
|
"num_tokens": 502368.0,
|
|
"reward": 2.1709790229797363,
|
|
"reward_std": 0.0,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.3181929886341095,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": 0.7775999903678894,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 211
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 10.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 10.0,
|
|
"completions/max_terminated_length": 10.0,
|
|
"completions/mean_length": 10.0,
|
|
"completions/mean_terminated_length": 10.0,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.17666666666666667,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 4.784295558929443,
|
|
"kl": 0.4447771683335304,
|
|
"learning_rate": 2.625e-05,
|
|
"loss": 0.0178,
|
|
"num_tokens": 504808.0,
|
|
"reward": 1.6073195934295654,
|
|
"reward_std": 0.05555550381541252,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.3476565480232239,
|
|
"rewards/belief_accuracy/std": 0.018518522381782532,
|
|
"rewards/env_reward/mean": 0.34290000796318054,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 212
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 10.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 10.0,
|
|
"completions/max_terminated_length": 10.0,
|
|
"completions/mean_length": 10.0,
|
|
"completions/mean_terminated_length": 10.0,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.1775,
|
|
"frac_reward_zero_std": 1.0,
|
|
"grad_norm": 0.017436975613236427,
|
|
"kl": 0.4346160590648651,
|
|
"learning_rate": 2.6111111111111114e-05,
|
|
"loss": 0.0174,
|
|
"num_tokens": 507248.0,
|
|
"reward": 1.607659101486206,
|
|
"reward_std": 0.0,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.2377697229385376,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": 0.5629000067710876,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 213
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.25,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 10.0,
|
|
"completions/mean_length": 15.5,
|
|
"completions/mean_terminated_length": 10.0,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.17833333333333334,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 12.183658599853516,
|
|
"kl": 0.6210581138730049,
|
|
"learning_rate": 2.5972222222222226e-05,
|
|
"loss": 0.0248,
|
|
"num_tokens": 509202.0,
|
|
"reward": -0.27445781230926514,
|
|
"reward_std": 3.327219009399414,
|
|
"rewards/action_legal/mean": -0.25,
|
|
"rewards/action_legal/std": 0.5,
|
|
"rewards/belief_accuracy/mean": 0.18266406655311584,
|
|
"rewards/belief_accuracy/std": 0.25510939955711365,
|
|
"rewards/env_reward/mean": -0.5482999682426453,
|
|
"rewards/env_reward/std": 1.6436470746994019,
|
|
"rewards/format_valid/mean": 0.25,
|
|
"rewards/format_valid/std": 1.5,
|
|
"step": 214
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 10.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 10.0,
|
|
"completions/max_terminated_length": 10.0,
|
|
"completions/mean_length": 10.0,
|
|
"completions/mean_terminated_length": 10.0,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.17916666666666667,
|
|
"frac_reward_zero_std": 1.0,
|
|
"grad_norm": 0.0025398649740964174,
|
|
"kl": 0.5393896698951721,
|
|
"learning_rate": 2.5833333333333336e-05,
|
|
"loss": 0.0216,
|
|
"num_tokens": 511642.0,
|
|
"reward": 0.9280062317848206,
|
|
"reward_std": 0.0,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.3102187514305115,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": -0.035100001841783524,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 215
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 10.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 10.0,
|
|
"completions/max_terminated_length": 10.0,
|
|
"completions/mean_length": 10.0,
|
|
"completions/mean_terminated_length": 10.0,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.18,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 3.7643086910247803,
|
|
"kl": 0.5103324726223946,
|
|
"learning_rate": 2.5694444444444445e-05,
|
|
"loss": 0.0204,
|
|
"num_tokens": 514082.0,
|
|
"reward": 1.2094720602035522,
|
|
"reward_std": 0.05555558204650879,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.34864068031311035,
|
|
"rewards/belief_accuracy/std": 0.018518522381782532,
|
|
"rewards/env_reward/mean": 0.07569999992847443,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 216
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 15.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 15.0,
|
|
"completions/max_terminated_length": 15.0,
|
|
"completions/mean_length": 11.75,
|
|
"completions/mean_terminated_length": 11.75,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.18083333333333335,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 4.647409439086914,
|
|
"kl": 0.8914579898118973,
|
|
"learning_rate": 2.5555555555555554e-05,
|
|
"loss": 0.0357,
|
|
"num_tokens": 515817.0,
|
|
"reward": 1.9434584379196167,
|
|
"reward_std": 0.26885563135147095,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.41240280866622925,
|
|
"rewards/belief_accuracy/std": 0.046603914350271225,
|
|
"rewards/env_reward/mean": 0.4375,
|
|
"rewards/env_reward/std": 0.22679999470710754,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 217
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 10.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 10.0,
|
|
"completions/max_terminated_length": 10.0,
|
|
"completions/mean_length": 10.0,
|
|
"completions/mean_terminated_length": 10.0,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.18166666666666667,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 4.6908979415893555,
|
|
"kl": 0.5759952142834663,
|
|
"learning_rate": 2.5416666666666667e-05,
|
|
"loss": 0.023,
|
|
"num_tokens": 518257.0,
|
|
"reward": 1.0635591745376587,
|
|
"reward_std": 0.055555541068315506,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.3233530819416046,
|
|
"rewards/belief_accuracy/std": 0.018518522381782532,
|
|
"rewards/env_reward/mean": 0.028999999165534973,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 218
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 11.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 11.0,
|
|
"completions/max_terminated_length": 11.0,
|
|
"completions/mean_length": 10.25,
|
|
"completions/mean_terminated_length": 10.25,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.1825,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 5.2558979988098145,
|
|
"kl": 0.39761848002672195,
|
|
"learning_rate": 2.527777777777778e-05,
|
|
"loss": 0.0159,
|
|
"num_tokens": 520698.0,
|
|
"reward": 1.4361964464187622,
|
|
"reward_std": 0.05555558204650879,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.44556546211242676,
|
|
"rewards/belief_accuracy/std": 0.018518522381782532,
|
|
"rewards/env_reward/mean": 0.032999999821186066,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 219
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.25,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 10.0,
|
|
"completions/mean_length": 15.5,
|
|
"completions/mean_terminated_length": 10.0,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.18333333333333332,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 13.562455177307129,
|
|
"kl": 0.3410550318658352,
|
|
"learning_rate": 2.513888888888889e-05,
|
|
"loss": 0.0136,
|
|
"num_tokens": 523160.0,
|
|
"reward": 1.1828861236572266,
|
|
"reward_std": 0.05555546283721924,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.366778701543808,
|
|
"rewards/belief_accuracy/std": 0.018518507480621338,
|
|
"rewards/env_reward/mean": 0.021700000390410423,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 220
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 16.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 16.0,
|
|
"completions/max_terminated_length": 16.0,
|
|
"completions/mean_length": 11.5,
|
|
"completions/mean_terminated_length": 11.5,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.18416666666666667,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 31.85221290588379,
|
|
"kl": 1.1215265393257141,
|
|
"learning_rate": 2.5e-05,
|
|
"loss": 0.0449,
|
|
"num_tokens": 525606.0,
|
|
"reward": 1.623335361480713,
|
|
"reward_std": 0.07827307283878326,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.43429505825042725,
|
|
"rewards/belief_accuracy/std": 0.026091037318110466,
|
|
"rewards/env_reward/mean": 0.18029999732971191,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 221
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 10.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 10.0,
|
|
"completions/max_terminated_length": 10.0,
|
|
"completions/mean_length": 10.0,
|
|
"completions/mean_terminated_length": 10.0,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.185,
|
|
"frac_reward_zero_std": 1.0,
|
|
"grad_norm": 0.04103556647896767,
|
|
"kl": 0.6520499587059021,
|
|
"learning_rate": 2.4861111111111114e-05,
|
|
"loss": 0.0261,
|
|
"num_tokens": 528046.0,
|
|
"reward": 1.719389796257019,
|
|
"reward_std": 0.0,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.3248632550239563,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": 0.46320000290870667,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 222
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.25,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 14.0,
|
|
"completions/mean_length": 16.5,
|
|
"completions/mean_terminated_length": 11.333333969116211,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.18583333333333332,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 25.01701545715332,
|
|
"kl": 0.5154426582157612,
|
|
"learning_rate": 2.4722222222222223e-05,
|
|
"loss": 0.0206,
|
|
"num_tokens": 530512.0,
|
|
"reward": 1.1061065196990967,
|
|
"reward_std": 0.09622509032487869,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.3286855220794678,
|
|
"rewards/belief_accuracy/std": 0.03207503259181976,
|
|
"rewards/env_reward/mean": 0.04670000076293945,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 223
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 18.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 18.0,
|
|
"completions/max_terminated_length": 18.0,
|
|
"completions/mean_length": 14.5,
|
|
"completions/mean_terminated_length": 14.5,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.18666666666666668,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 21.70404052734375,
|
|
"kl": 0.6355895921587944,
|
|
"learning_rate": 2.4583333333333332e-05,
|
|
"loss": 0.0254,
|
|
"num_tokens": 532970.0,
|
|
"reward": 1.7521222829818726,
|
|
"reward_std": 0.05555558204650879,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.44232407212257385,
|
|
"rewards/belief_accuracy/std": 0.018518522381782532,
|
|
"rewards/env_reward/mean": 0.2500999867916107,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 224
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 10.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 10.0,
|
|
"completions/max_terminated_length": 10.0,
|
|
"completions/mean_length": 10.0,
|
|
"completions/mean_terminated_length": 10.0,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.1875,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 3.6001179218292236,
|
|
"kl": 0.509816125035286,
|
|
"learning_rate": 2.4444444444444445e-05,
|
|
"loss": 0.0204,
|
|
"num_tokens": 535410.0,
|
|
"reward": 1.56881582736969,
|
|
"reward_std": 0.06415006518363953,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.37087196111679077,
|
|
"rewards/belief_accuracy/std": 0.02138334885239601,
|
|
"rewards/env_reward/mean": 0.27079999446868896,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 225
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 17.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 17.0,
|
|
"completions/max_terminated_length": 17.0,
|
|
"completions/mean_length": 13.0,
|
|
"completions/mean_terminated_length": 13.0,
|
|
"completions/min_length": 9.0,
|
|
"completions/min_terminated_length": 9.0,
|
|
"epoch": 0.18833333333333332,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 4.272892951965332,
|
|
"kl": 0.7432731539011002,
|
|
"learning_rate": 2.4305555555555558e-05,
|
|
"loss": 0.0297,
|
|
"num_tokens": 537862.0,
|
|
"reward": 1.6716521978378296,
|
|
"reward_std": 0.282707542181015,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.3669257164001465,
|
|
"rewards/belief_accuracy/std": 0.02138334885239601,
|
|
"rewards/env_reward/mean": 0.34724998474121094,
|
|
"rewards/env_reward/std": 0.20989999175071716,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 226
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 10.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 10.0,
|
|
"completions/max_terminated_length": 10.0,
|
|
"completions/mean_length": 10.0,
|
|
"completions/mean_terminated_length": 10.0,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.18916666666666668,
|
|
"frac_reward_zero_std": 1.0,
|
|
"grad_norm": 0.01726493611931801,
|
|
"kl": 0.46494513750076294,
|
|
"learning_rate": 2.4166666666666667e-05,
|
|
"loss": 0.0186,
|
|
"num_tokens": 540302.0,
|
|
"reward": 0.8716050386428833,
|
|
"reward_std": 0.0,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.32821834087371826,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": -0.10869999974966049,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 227
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 19.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 19.0,
|
|
"completions/max_terminated_length": 19.0,
|
|
"completions/mean_length": 13.75,
|
|
"completions/mean_terminated_length": 13.75,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.19,
|
|
"frac_reward_zero_std": 1.0,
|
|
"grad_norm": 0.8162057399749756,
|
|
"kl": 0.9498792588710785,
|
|
"learning_rate": 2.402777777777778e-05,
|
|
"loss": 0.038,
|
|
"num_tokens": 542757.0,
|
|
"reward": 2.149008274078369,
|
|
"reward_std": 0.0,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.3575194478034973,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": 0.6843000054359436,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 228
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 10.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 10.0,
|
|
"completions/max_terminated_length": 10.0,
|
|
"completions/mean_length": 10.0,
|
|
"completions/mean_terminated_length": 10.0,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.19083333333333333,
|
|
"frac_reward_zero_std": 1.0,
|
|
"grad_norm": 0.008917681872844696,
|
|
"kl": 0.48936086893081665,
|
|
"learning_rate": 2.3888888888888892e-05,
|
|
"loss": 0.0196,
|
|
"num_tokens": 545197.0,
|
|
"reward": 1.225246787071228,
|
|
"reward_std": 0.0,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.3493489623069763,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": 0.08479999750852585,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 229
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 16.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 16.0,
|
|
"completions/max_terminated_length": 16.0,
|
|
"completions/mean_length": 11.75,
|
|
"completions/mean_terminated_length": 11.75,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.19166666666666668,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 10.427980422973633,
|
|
"kl": 0.6280895695090294,
|
|
"learning_rate": 2.375e-05,
|
|
"loss": 0.0251,
|
|
"num_tokens": 547644.0,
|
|
"reward": 1.8005164861679077,
|
|
"reward_std": 0.025754213333129883,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.3572555184364319,
|
|
"rewards/belief_accuracy/std": 0.008584737777709961,
|
|
"rewards/env_reward/mean": 0.45249998569488525,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 230
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 10.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 10.0,
|
|
"completions/max_terminated_length": 10.0,
|
|
"completions/mean_length": 10.0,
|
|
"completions/mean_terminated_length": 10.0,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.1925,
|
|
"frac_reward_zero_std": 1.0,
|
|
"grad_norm": 0.009933769702911377,
|
|
"kl": 0.6541983485221863,
|
|
"learning_rate": 2.361111111111111e-05,
|
|
"loss": 0.0262,
|
|
"num_tokens": 550084.0,
|
|
"reward": 1.3975412845611572,
|
|
"reward_std": 0.0,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.23698042333126068,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": 0.4244000017642975,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 231
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 14.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 14.0,
|
|
"completions/max_terminated_length": 14.0,
|
|
"completions/mean_length": 11.0,
|
|
"completions/mean_terminated_length": 11.0,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.19333333333333333,
|
|
"frac_reward_zero_std": 1.0,
|
|
"grad_norm": 7.480578899383545,
|
|
"kl": 1.1603283882141113,
|
|
"learning_rate": 2.3472222222222223e-05,
|
|
"loss": 0.0464,
|
|
"num_tokens": 552528.0,
|
|
"reward": 1.1860084533691406,
|
|
"reward_std": 0.0,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.3575194478034973,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": 0.04230000078678131,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 232
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 14.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 14.0,
|
|
"completions/max_terminated_length": 14.0,
|
|
"completions/mean_length": 11.0,
|
|
"completions/mean_terminated_length": 11.0,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.19416666666666665,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 3.052133798599243,
|
|
"kl": 0.8705815225839615,
|
|
"learning_rate": 2.3333333333333336e-05,
|
|
"loss": 0.0348,
|
|
"num_tokens": 554260.0,
|
|
"reward": 0.02076089382171631,
|
|
"reward_std": 3.515756607055664,
|
|
"rewards/action_legal/mean": -0.25,
|
|
"rewards/action_legal/std": 0.5,
|
|
"rewards/belief_accuracy/mean": 0.21635779738426208,
|
|
"rewards/belief_accuracy/std": 0.28025466203689575,
|
|
"rewards/env_reward/mean": -0.4188750088214874,
|
|
"rewards/env_reward/std": 1.720750093460083,
|
|
"rewards/format_valid/mean": 0.25,
|
|
"rewards/format_valid/std": 1.5,
|
|
"step": 233
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 10.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 10.0,
|
|
"completions/max_terminated_length": 10.0,
|
|
"completions/mean_length": 10.0,
|
|
"completions/mean_terminated_length": 10.0,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.195,
|
|
"frac_reward_zero_std": 1.0,
|
|
"grad_norm": 0.004772593267261982,
|
|
"kl": 0.6416133642196655,
|
|
"learning_rate": 2.3194444444444445e-05,
|
|
"loss": 0.0257,
|
|
"num_tokens": 556700.0,
|
|
"reward": 2.1188900470733643,
|
|
"reward_std": 0.0,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.3552300035953522,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": 0.6687999963760376,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 234
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 23.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 23.0,
|
|
"completions/max_terminated_length": 23.0,
|
|
"completions/mean_length": 13.25,
|
|
"completions/mean_terminated_length": 13.25,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.19583333333333333,
|
|
"frac_reward_zero_std": 1.0,
|
|
"grad_norm": 0.5717759132385254,
|
|
"kl": 0.6454999148845673,
|
|
"learning_rate": 2.3055555555555558e-05,
|
|
"loss": 0.0258,
|
|
"num_tokens": 559153.0,
|
|
"reward": 2.3321728706359863,
|
|
"reward_std": 0.0,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.45852428674697876,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": 0.6043999791145325,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 235
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 10.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 10.0,
|
|
"completions/max_terminated_length": 10.0,
|
|
"completions/mean_length": 10.0,
|
|
"completions/mean_terminated_length": 10.0,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.19666666666666666,
|
|
"frac_reward_zero_std": 1.0,
|
|
"grad_norm": 0.0032116181682795286,
|
|
"kl": 0.5121519565582275,
|
|
"learning_rate": 2.2916666666666667e-05,
|
|
"loss": 0.0205,
|
|
"num_tokens": 561593.0,
|
|
"reward": 1.2194421291351318,
|
|
"reward_std": 0.0,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.3623640537261963,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": 0.05490000173449516,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 236
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 20.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 20.0,
|
|
"completions/max_terminated_length": 20.0,
|
|
"completions/mean_length": 12.5,
|
|
"completions/mean_terminated_length": 12.5,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.1975,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 7.710559844970703,
|
|
"kl": 0.8115118741989136,
|
|
"learning_rate": 2.277777777777778e-05,
|
|
"loss": 0.0325,
|
|
"num_tokens": 563535.0,
|
|
"reward": 2.505500316619873,
|
|
"reward_std": 0.45341095328330994,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.24253758788108826,
|
|
"rewards/belief_accuracy/std": 0.03546026349067688,
|
|
"rewards/env_reward/mean": 1.151924967765808,
|
|
"rewards/env_reward/std": 0.281749963760376,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 237
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 10.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 10.0,
|
|
"completions/max_terminated_length": 10.0,
|
|
"completions/mean_length": 10.0,
|
|
"completions/mean_terminated_length": 10.0,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.19833333333333333,
|
|
"frac_reward_zero_std": 1.0,
|
|
"grad_norm": 0.0054448568262159824,
|
|
"kl": 0.6401026844978333,
|
|
"learning_rate": 2.263888888888889e-05,
|
|
"loss": 0.0256,
|
|
"num_tokens": 565975.0,
|
|
"reward": 1.4314594268798828,
|
|
"reward_std": 0.0,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.3922864496707916,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": 0.1363999992609024,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 238
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 13.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 13.0,
|
|
"completions/max_terminated_length": 13.0,
|
|
"completions/mean_length": 10.75,
|
|
"completions/mean_terminated_length": 10.75,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.19916666666666666,
|
|
"frac_reward_zero_std": 1.0,
|
|
"grad_norm": 0.48050761222839355,
|
|
"kl": 0.6103616803884506,
|
|
"learning_rate": 2.25e-05,
|
|
"loss": 0.0244,
|
|
"num_tokens": 568418.0,
|
|
"reward": 1.5885515213012695,
|
|
"reward_std": 0.0,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.2559005320072174,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": 0.5138999819755554,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 239
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 10.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 10.0,
|
|
"completions/max_terminated_length": 10.0,
|
|
"completions/mean_length": 10.0,
|
|
"completions/mean_terminated_length": 10.0,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.2,
|
|
"frac_reward_zero_std": 1.0,
|
|
"grad_norm": 0.010801396332681179,
|
|
"kl": 0.5299305319786072,
|
|
"learning_rate": 2.2361111111111114e-05,
|
|
"loss": 0.0212,
|
|
"num_tokens": 570858.0,
|
|
"reward": 1.7121706008911133,
|
|
"reward_std": 0.0,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.44220685958862305,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": 0.22370000183582306,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 240
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 10.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 10.0,
|
|
"completions/max_terminated_length": 10.0,
|
|
"completions/mean_length": 10.0,
|
|
"completions/mean_terminated_length": 10.0,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.20083333333333334,
|
|
"frac_reward_zero_std": 1.0,
|
|
"grad_norm": 0.005526633467525244,
|
|
"kl": 0.647596538066864,
|
|
"learning_rate": 2.2222222222222223e-05,
|
|
"loss": 0.0259,
|
|
"num_tokens": 573298.0,
|
|
"reward": 1.6188621520996094,
|
|
"reward_std": 0.0,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.25235405564308167,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": 0.5411999821662903,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 241
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 16.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 16.0,
|
|
"completions/max_terminated_length": 16.0,
|
|
"completions/mean_length": 11.5,
|
|
"completions/mean_terminated_length": 11.5,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.20166666666666666,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 3.6879804134368896,
|
|
"kl": 0.4897921085357666,
|
|
"learning_rate": 2.2083333333333333e-05,
|
|
"loss": 0.0196,
|
|
"num_tokens": 575744.0,
|
|
"reward": 1.4994770288467407,
|
|
"reward_std": 0.05555558204650879,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.3609590232372284,
|
|
"rewards/belief_accuracy/std": 0.018518522381782532,
|
|
"rewards/env_reward/mean": 0.2443999946117401,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 242
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 10.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 10.0,
|
|
"completions/max_terminated_length": 10.0,
|
|
"completions/mean_length": 10.0,
|
|
"completions/mean_terminated_length": 10.0,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.2025,
|
|
"frac_reward_zero_std": 1.0,
|
|
"grad_norm": 0.002229674719274044,
|
|
"kl": 0.5125965476036072,
|
|
"learning_rate": 2.1944444444444445e-05,
|
|
"loss": 0.0205,
|
|
"num_tokens": 578184.0,
|
|
"reward": 1.6194214820861816,
|
|
"reward_std": 0.0,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.44499048590660095,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": 0.15629999339580536,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 243
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 10.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 10.0,
|
|
"completions/max_terminated_length": 10.0,
|
|
"completions/mean_length": 10.0,
|
|
"completions/mean_terminated_length": 10.0,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.20333333333333334,
|
|
"frac_reward_zero_std": 1.0,
|
|
"grad_norm": 0.019604183733463287,
|
|
"kl": 0.488160103559494,
|
|
"learning_rate": 2.1805555555555558e-05,
|
|
"loss": 0.0195,
|
|
"num_tokens": 580624.0,
|
|
"reward": 1.335284948348999,
|
|
"reward_std": 0.0,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.2332783341407776,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": 0.3903000056743622,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 244
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 16.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 16.0,
|
|
"completions/max_terminated_length": 16.0,
|
|
"completions/mean_length": 11.5,
|
|
"completions/mean_terminated_length": 11.5,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.20416666666666666,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 12.400581359863281,
|
|
"kl": 0.5316231548786163,
|
|
"learning_rate": 2.1666666666666667e-05,
|
|
"loss": 0.0213,
|
|
"num_tokens": 583070.0,
|
|
"reward": 1.7589713335037231,
|
|
"reward_std": 0.05555558204650879,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.37080714106559753,
|
|
"rewards/belief_accuracy/std": 0.018518522381782532,
|
|
"rewards/env_reward/mean": 0.3977000117301941,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 245
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 10.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 10.0,
|
|
"completions/max_terminated_length": 10.0,
|
|
"completions/mean_length": 10.0,
|
|
"completions/mean_terminated_length": 10.0,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.205,
|
|
"frac_reward_zero_std": 1.0,
|
|
"grad_norm": 0.09725174307823181,
|
|
"kl": 0.49825286865234375,
|
|
"learning_rate": 2.152777777777778e-05,
|
|
"loss": 0.0199,
|
|
"num_tokens": 585374.0,
|
|
"reward": 2.471735954284668,
|
|
"reward_std": 0.0,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.35569527745246887,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": 0.9031000137329102,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 246
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 10.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 10.0,
|
|
"completions/max_terminated_length": 10.0,
|
|
"completions/mean_length": 10.0,
|
|
"completions/mean_terminated_length": 10.0,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.20583333333333334,
|
|
"frac_reward_zero_std": 1.0,
|
|
"grad_norm": 0.0035387862008064985,
|
|
"kl": 0.6661872863769531,
|
|
"learning_rate": 2.138888888888889e-05,
|
|
"loss": 0.0266,
|
|
"num_tokens": 587814.0,
|
|
"reward": 1.4029350280761719,
|
|
"reward_std": 0.0,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.2332783341407776,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": 0.43540000915527344,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 247
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 10.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 10.0,
|
|
"completions/max_terminated_length": 10.0,
|
|
"completions/mean_length": 10.0,
|
|
"completions/mean_terminated_length": 10.0,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.20666666666666667,
|
|
"frac_reward_zero_std": 1.0,
|
|
"grad_norm": 0.028803575783967972,
|
|
"kl": 0.5254324078559875,
|
|
"learning_rate": 2.125e-05,
|
|
"loss": 0.021,
|
|
"num_tokens": 590254.0,
|
|
"reward": 1.569321632385254,
|
|
"reward_std": 0.0,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.37139055132865906,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": 0.2700999975204468,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 248
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 10.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 10.0,
|
|
"completions/max_terminated_length": 10.0,
|
|
"completions/mean_length": 10.0,
|
|
"completions/mean_terminated_length": 10.0,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.2075,
|
|
"frac_reward_zero_std": 1.0,
|
|
"grad_norm": 0.0019563257228583097,
|
|
"kl": 0.4938497245311737,
|
|
"learning_rate": 2.111111111111111e-05,
|
|
"loss": 0.0198,
|
|
"num_tokens": 592694.0,
|
|
"reward": 1.2491594552993774,
|
|
"reward_std": 0.0,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.3335198163986206,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": 0.1324000060558319,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 249
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 10.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 10.0,
|
|
"completions/max_terminated_length": 10.0,
|
|
"completions/mean_length": 10.0,
|
|
"completions/mean_terminated_length": 10.0,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.20833333333333334,
|
|
"frac_reward_zero_std": 1.0,
|
|
"grad_norm": 0.028957560658454895,
|
|
"kl": 0.6057292819023132,
|
|
"learning_rate": 2.0972222222222223e-05,
|
|
"loss": 0.0242,
|
|
"num_tokens": 595134.0,
|
|
"reward": 1.4489554166793823,
|
|
"reward_std": 0.0,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.37641847133636475,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": 0.17980000376701355,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 250
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 10.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 10.0,
|
|
"completions/max_terminated_length": 10.0,
|
|
"completions/mean_length": 10.0,
|
|
"completions/mean_terminated_length": 10.0,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.20916666666666667,
|
|
"frac_reward_zero_std": 1.0,
|
|
"grad_norm": 0.007945528253912926,
|
|
"kl": 0.5903597474098206,
|
|
"learning_rate": 2.0833333333333336e-05,
|
|
"loss": 0.0236,
|
|
"num_tokens": 597574.0,
|
|
"reward": 1.621120572090149,
|
|
"reward_std": 0.0,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.44220685958862305,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": 0.16300000250339508,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 251
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 10.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 10.0,
|
|
"completions/max_terminated_length": 10.0,
|
|
"completions/mean_length": 10.0,
|
|
"completions/mean_terminated_length": 10.0,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.21,
|
|
"frac_reward_zero_std": 1.0,
|
|
"grad_norm": 0.0038991058245301247,
|
|
"kl": 0.5373863577842712,
|
|
"learning_rate": 2.0694444444444445e-05,
|
|
"loss": 0.0215,
|
|
"num_tokens": 600014.0,
|
|
"reward": 1.0271950960159302,
|
|
"reward_std": 0.0,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.2731817066669464,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": 0.10509999841451645,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 252
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 12.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 10.5,
|
|
"completions/mean_terminated_length": 10.5,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.21083333333333334,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 6.581563949584961,
|
|
"kl": 0.48605240881443024,
|
|
"learning_rate": 2.0555555555555555e-05,
|
|
"loss": 0.0194,
|
|
"num_tokens": 602456.0,
|
|
"reward": 1.0788397789001465,
|
|
"reward_std": 0.08333337306976318,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.3147966265678406,
|
|
"rewards/belief_accuracy/std": 0.027777792885899544,
|
|
"rewards/env_reward/mean": 0.056299999356269836,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 253
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 10.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 10.0,
|
|
"completions/max_terminated_length": 10.0,
|
|
"completions/mean_length": 10.0,
|
|
"completions/mean_terminated_length": 10.0,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.21166666666666667,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 5.765892505645752,
|
|
"kl": 0.4648255370557308,
|
|
"learning_rate": 2.0416666666666667e-05,
|
|
"loss": 0.0186,
|
|
"num_tokens": 604756.0,
|
|
"reward": 1.3159242868423462,
|
|
"reward_std": 0.05952895060181618,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.20506228506565094,
|
|
"rewards/belief_accuracy/std": 0.02138334885239601,
|
|
"rewards/env_reward/mean": 0.4338250160217285,
|
|
"rewards/env_reward/std": 0.043549999594688416,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 254
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 10.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 10.0,
|
|
"completions/max_terminated_length": 10.0,
|
|
"completions/mean_length": 10.0,
|
|
"completions/mean_terminated_length": 10.0,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.2125,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 13.353157043457031,
|
|
"kl": 3.2804705798625946,
|
|
"learning_rate": 2.027777777777778e-05,
|
|
"loss": 0.1312,
|
|
"num_tokens": 606484.0,
|
|
"reward": 1.7571148872375488,
|
|
"reward_std": 0.19245007634162903,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.25628823041915894,
|
|
"rewards/belief_accuracy/std": 0.06415002793073654,
|
|
"rewards/env_reward/mean": 0.6255000233650208,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 255
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 10.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 10.0,
|
|
"completions/max_terminated_length": 10.0,
|
|
"completions/mean_length": 10.0,
|
|
"completions/mean_terminated_length": 10.0,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.21333333333333335,
|
|
"frac_reward_zero_std": 1.0,
|
|
"grad_norm": 0.006790816783905029,
|
|
"kl": 0.6560365557670593,
|
|
"learning_rate": 2.013888888888889e-05,
|
|
"loss": 0.0262,
|
|
"num_tokens": 608924.0,
|
|
"reward": 2.025508403778076,
|
|
"reward_std": 0.0,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.3939528167247772,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": 0.5291000008583069,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 256
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 10.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 10.0,
|
|
"completions/max_terminated_length": 10.0,
|
|
"completions/mean_length": 10.0,
|
|
"completions/mean_terminated_length": 10.0,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.21416666666666667,
|
|
"frac_reward_zero_std": 1.0,
|
|
"grad_norm": 0.008915513753890991,
|
|
"kl": 0.7423607707023621,
|
|
"learning_rate": 2e-05,
|
|
"loss": 0.0297,
|
|
"num_tokens": 611364.0,
|
|
"reward": 1.8160593509674072,
|
|
"reward_std": 0.0,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.3922864496707916,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": 0.3928000032901764,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 257
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 10.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 10.0,
|
|
"completions/max_terminated_length": 10.0,
|
|
"completions/mean_length": 10.0,
|
|
"completions/mean_terminated_length": 10.0,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.215,
|
|
"frac_reward_zero_std": 1.0,
|
|
"grad_norm": 0.008568666875362396,
|
|
"kl": 0.5796423554420471,
|
|
"learning_rate": 1.986111111111111e-05,
|
|
"loss": 0.0232,
|
|
"num_tokens": 613804.0,
|
|
"reward": 1.3682849407196045,
|
|
"reward_std": 0.0,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.2332783341407776,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": 0.4122999906539917,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 258
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 10.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 10.0,
|
|
"completions/max_terminated_length": 10.0,
|
|
"completions/mean_length": 10.0,
|
|
"completions/mean_terminated_length": 10.0,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.21583333333333332,
|
|
"frac_reward_zero_std": 1.0,
|
|
"grad_norm": 0.015973366796970367,
|
|
"kl": 0.6325451135635376,
|
|
"learning_rate": 1.9722222222222224e-05,
|
|
"loss": 0.0253,
|
|
"num_tokens": 616244.0,
|
|
"reward": 1.995861291885376,
|
|
"reward_std": 0.0,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.396670401096344,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": 0.5038999915122986,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 259
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 18.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 18.0,
|
|
"completions/max_terminated_length": 18.0,
|
|
"completions/mean_length": 13.0,
|
|
"completions/mean_terminated_length": 13.0,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.21666666666666667,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 10.546822547912598,
|
|
"kl": 1.5618795603513718,
|
|
"learning_rate": 1.9583333333333333e-05,
|
|
"loss": 0.0625,
|
|
"num_tokens": 618696.0,
|
|
"reward": 1.645643949508667,
|
|
"reward_std": 0.2840888798236847,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.3621312975883484,
|
|
"rewards/belief_accuracy/std": 0.018518522381782532,
|
|
"rewards/env_reward/mean": 0.3395000100135803,
|
|
"rewards/env_reward/std": 0.1738000065088272,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 260
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 10.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 10.0,
|
|
"completions/max_terminated_length": 10.0,
|
|
"completions/mean_length": 10.0,
|
|
"completions/mean_terminated_length": 10.0,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.2175,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 5.100569248199463,
|
|
"kl": 0.5200706869363785,
|
|
"learning_rate": 1.9444444444444445e-05,
|
|
"loss": 0.0208,
|
|
"num_tokens": 621136.0,
|
|
"reward": 2.323005437850952,
|
|
"reward_std": 0.06415006518363953,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.33726853132247925,
|
|
"rewards/belief_accuracy/std": 0.02138333022594452,
|
|
"rewards/env_reward/mean": 0.8407999873161316,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 261
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 10.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 10.0,
|
|
"completions/max_terminated_length": 10.0,
|
|
"completions/mean_length": 10.0,
|
|
"completions/mean_terminated_length": 10.0,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.21833333333333332,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 1.2731825113296509,
|
|
"kl": 1.8271799832582474,
|
|
"learning_rate": 1.9305555555555558e-05,
|
|
"loss": 0.0731,
|
|
"num_tokens": 623576.0,
|
|
"reward": 1.965355634689331,
|
|
"reward_std": 0.11111104488372803,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.37815189361572266,
|
|
"rewards/belief_accuracy/std": 0.03703702986240387,
|
|
"rewards/env_reward/mean": 0.5206000208854675,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 262
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.25,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 10.0,
|
|
"completions/mean_length": 15.25,
|
|
"completions/mean_terminated_length": 9.666666984558105,
|
|
"completions/min_length": 9.0,
|
|
"completions/min_terminated_length": 9.0,
|
|
"epoch": 0.21916666666666668,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 2.086411952972412,
|
|
"kl": 0.7842119336128235,
|
|
"learning_rate": 1.9166666666666667e-05,
|
|
"loss": 0.0314,
|
|
"num_tokens": 625897.0,
|
|
"reward": 1.0389503240585327,
|
|
"reward_std": 0.5677647590637207,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.24253758788108826,
|
|
"rewards/belief_accuracy/std": 0.018518522381782532,
|
|
"rewards/env_reward/mean": 0.17422500252723694,
|
|
"rewards/env_reward/std": 0.36454999446868896,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 263
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 27.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 27.0,
|
|
"completions/max_terminated_length": 27.0,
|
|
"completions/mean_length": 14.25,
|
|
"completions/mean_terminated_length": 14.25,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.22,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 24.3664608001709,
|
|
"kl": 0.8927547335624695,
|
|
"learning_rate": 1.9027777777777776e-05,
|
|
"loss": 0.0357,
|
|
"num_tokens": 628354.0,
|
|
"reward": 1.613203763961792,
|
|
"reward_std": 0.11111104488372803,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.3737679123878479,
|
|
"rewards/belief_accuracy/std": 0.03703702986240387,
|
|
"rewards/env_reward/mean": 0.2946000099182129,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 264
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.25,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 10.0,
|
|
"completions/mean_length": 15.5,
|
|
"completions/mean_terminated_length": 10.0,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.22083333333333333,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 6.769376754760742,
|
|
"kl": 0.6946739554405212,
|
|
"learning_rate": 1.888888888888889e-05,
|
|
"loss": 0.0278,
|
|
"num_tokens": 630816.0,
|
|
"reward": 2.1883459091186523,
|
|
"reward_std": 0.005174954887479544,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.3575194478034973,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": 0.7105250358581543,
|
|
"rewards/env_reward/std": 0.0034499764442443848,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 265
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 10.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 10.0,
|
|
"completions/max_terminated_length": 10.0,
|
|
"completions/mean_length": 10.0,
|
|
"completions/mean_terminated_length": 10.0,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.22166666666666668,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 4.612954616546631,
|
|
"kl": 0.45268136262893677,
|
|
"learning_rate": 1.8750000000000002e-05,
|
|
"loss": 0.0181,
|
|
"num_tokens": 633256.0,
|
|
"reward": 1.149276852607727,
|
|
"reward_std": 0.06415006518363953,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.3394756317138672,
|
|
"rewards/belief_accuracy/std": 0.02138334885239601,
|
|
"rewards/env_reward/mean": 0.05389999970793724,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 266
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 10.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 10.0,
|
|
"completions/max_terminated_length": 10.0,
|
|
"completions/mean_length": 10.0,
|
|
"completions/mean_terminated_length": 10.0,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.2225,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 3.7529609203338623,
|
|
"kl": 0.2946026027202606,
|
|
"learning_rate": 1.861111111111111e-05,
|
|
"loss": 0.0118,
|
|
"num_tokens": 635696.0,
|
|
"reward": 1.2038955688476562,
|
|
"reward_std": 0.05555558204650879,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.28013184666633606,
|
|
"rewards/belief_accuracy/std": 0.018518522381782532,
|
|
"rewards/env_reward/mean": 0.20900000631809235,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 267
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 10.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 10.0,
|
|
"completions/max_terminated_length": 10.0,
|
|
"completions/mean_length": 10.0,
|
|
"completions/mean_terminated_length": 10.0,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.22333333333333333,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 5.185886383056641,
|
|
"kl": 0.4079572707414627,
|
|
"learning_rate": 1.8472222222222224e-05,
|
|
"loss": 0.0163,
|
|
"num_tokens": 638136.0,
|
|
"reward": 1.3853931427001953,
|
|
"reward_std": 0.04227517917752266,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.29433107376098633,
|
|
"rewards/belief_accuracy/std": 0.014091731980443,
|
|
"rewards/env_reward/mean": 0.30160000920295715,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 268
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.25,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 11.0,
|
|
"completions/mean_length": 15.75,
|
|
"completions/mean_terminated_length": 10.333333969116211,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.22416666666666665,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 4.883063793182373,
|
|
"kl": 0.3966846764087677,
|
|
"learning_rate": 1.8333333333333333e-05,
|
|
"loss": 0.0159,
|
|
"num_tokens": 640599.0,
|
|
"reward": 1.9803223609924316,
|
|
"reward_std": 0.13127277791500092,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.39204078912734985,
|
|
"rewards/belief_accuracy/std": 0.04375755414366722,
|
|
"rewards/env_reward/mean": 0.5027999877929688,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 269
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 10.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 10.0,
|
|
"completions/max_terminated_length": 10.0,
|
|
"completions/mean_length": 10.0,
|
|
"completions/mean_terminated_length": 10.0,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.225,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 3.5628440380096436,
|
|
"kl": 0.5333509296178818,
|
|
"learning_rate": 1.8194444444444445e-05,
|
|
"loss": 0.0213,
|
|
"num_tokens": 643039.0,
|
|
"reward": 1.0145008563995361,
|
|
"reward_std": 0.06414999067783356,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.4062002897262573,
|
|
"rewards/belief_accuracy/std": 0.02138334885239601,
|
|
"rewards/env_reward/mean": -0.16940000653266907,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 270
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 10.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 10.0,
|
|
"completions/max_terminated_length": 10.0,
|
|
"completions/mean_length": 10.0,
|
|
"completions/mean_terminated_length": 10.0,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.22583333333333333,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 3.8872599601745605,
|
|
"kl": 0.3496560603380203,
|
|
"learning_rate": 1.8055555555555555e-05,
|
|
"loss": 0.014,
|
|
"num_tokens": 645479.0,
|
|
"reward": 1.097575068473816,
|
|
"reward_std": 0.05555558204650879,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.3297416567802429,
|
|
"rewards/belief_accuracy/std": 0.018518522381782532,
|
|
"rewards/env_reward/mean": 0.03889999911189079,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 271
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 10.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 10.0,
|
|
"completions/max_terminated_length": 10.0,
|
|
"completions/mean_length": 10.0,
|
|
"completions/mean_terminated_length": 10.0,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.22666666666666666,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 4.416809558868408,
|
|
"kl": 0.5263015776872635,
|
|
"learning_rate": 1.7916666666666667e-05,
|
|
"loss": 0.0211,
|
|
"num_tokens": 647919.0,
|
|
"reward": 1.7342491149902344,
|
|
"reward_std": 0.30089032649993896,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.31531640887260437,
|
|
"rewards/belief_accuracy/std": 0.10029676556587219,
|
|
"rewards/env_reward/mean": 0.49219998717308044,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 272
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 24.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 24.0,
|
|
"completions/max_terminated_length": 24.0,
|
|
"completions/mean_length": 13.5,
|
|
"completions/mean_terminated_length": 13.5,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.2275,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 30.599613189697266,
|
|
"kl": 0.7304297238588333,
|
|
"learning_rate": 1.777777777777778e-05,
|
|
"loss": 0.0292,
|
|
"num_tokens": 650373.0,
|
|
"reward": 1.357532262802124,
|
|
"reward_std": 0.07906243950128555,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.2865440845489502,
|
|
"rewards/belief_accuracy/std": 0.026354150846600533,
|
|
"rewards/env_reward/mean": 0.2985999882221222,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 273
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 16.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 16.0,
|
|
"completions/max_terminated_length": 16.0,
|
|
"completions/mean_length": 15.5,
|
|
"completions/mean_terminated_length": 15.5,
|
|
"completions/min_length": 14.0,
|
|
"completions/min_terminated_length": 14.0,
|
|
"epoch": 0.22833333333333333,
|
|
"frac_reward_zero_std": 1.0,
|
|
"grad_norm": 0.20082256197929382,
|
|
"kl": 0.854352742433548,
|
|
"learning_rate": 1.763888888888889e-05,
|
|
"loss": 0.0342,
|
|
"num_tokens": 652835.0,
|
|
"reward": 0.8573161959648132,
|
|
"reward_std": 0.0,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.3652553856372833,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": -0.1923000067472458,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 274
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 10.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 10.0,
|
|
"completions/max_terminated_length": 10.0,
|
|
"completions/mean_length": 10.0,
|
|
"completions/mean_terminated_length": 10.0,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.22916666666666666,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 7.556239128112793,
|
|
"kl": 2.3944233655929565,
|
|
"learning_rate": 1.75e-05,
|
|
"loss": 0.0958,
|
|
"num_tokens": 654947.0,
|
|
"reward": 3.2761619091033936,
|
|
"reward_std": 0.23893354833126068,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.338287353515625,
|
|
"rewards/belief_accuracy/std": 0.07964453846216202,
|
|
"rewards/env_reward/mean": 1.4742000102996826,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 275
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 10.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 10.0,
|
|
"completions/max_terminated_length": 10.0,
|
|
"completions/mean_length": 10.0,
|
|
"completions/mean_terminated_length": 10.0,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.23,
|
|
"frac_reward_zero_std": 1.0,
|
|
"grad_norm": 0.03813393786549568,
|
|
"kl": 0.5205760598182678,
|
|
"learning_rate": 1.736111111111111e-05,
|
|
"loss": 0.0208,
|
|
"num_tokens": 657387.0,
|
|
"reward": 0.9992444515228271,
|
|
"reward_std": 0.0,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.2742648124694824,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": 0.08429999649524689,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 276
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 10.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 10.0,
|
|
"completions/max_terminated_length": 10.0,
|
|
"completions/mean_length": 10.0,
|
|
"completions/mean_terminated_length": 10.0,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.23083333333333333,
|
|
"frac_reward_zero_std": 1.0,
|
|
"grad_norm": 0.15289942920207977,
|
|
"kl": 0.33351561427116394,
|
|
"learning_rate": 1.7222222222222224e-05,
|
|
"loss": 0.0133,
|
|
"num_tokens": 659827.0,
|
|
"reward": 1.8367189168930054,
|
|
"reward_std": 0.0,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.3825729489326477,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": 0.4259999990463257,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 277
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.25,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 10.0,
|
|
"completions/mean_length": 15.5,
|
|
"completions/mean_terminated_length": 10.0,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.23166666666666666,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 8.329577445983887,
|
|
"kl": 0.5746472030878067,
|
|
"learning_rate": 1.7083333333333333e-05,
|
|
"loss": 0.023,
|
|
"num_tokens": 661781.0,
|
|
"reward": 1.5449297428131104,
|
|
"reward_std": 0.18311955034732819,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.3728598952293396,
|
|
"rewards/belief_accuracy/std": 0.061039846390485764,
|
|
"rewards/env_reward/mean": 0.250900000333786,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 278
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 10.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 10.0,
|
|
"completions/max_terminated_length": 10.0,
|
|
"completions/mean_length": 10.0,
|
|
"completions/mean_terminated_length": 10.0,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.2325,
|
|
"frac_reward_zero_std": 1.0,
|
|
"grad_norm": 0.014571048319339752,
|
|
"kl": 0.6032420992851257,
|
|
"learning_rate": 1.6944444444444446e-05,
|
|
"loss": 0.0241,
|
|
"num_tokens": 664221.0,
|
|
"reward": 1.3711457252502441,
|
|
"reward_std": 0.0,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.338581919670105,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": 0.20360000431537628,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 279
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 10.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 10.0,
|
|
"completions/max_terminated_length": 10.0,
|
|
"completions/mean_length": 10.0,
|
|
"completions/mean_terminated_length": 10.0,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.23333333333333334,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 4.0654096603393555,
|
|
"kl": 0.43393784016370773,
|
|
"learning_rate": 1.6805555555555558e-05,
|
|
"loss": 0.0174,
|
|
"num_tokens": 666661.0,
|
|
"reward": 2.640883445739746,
|
|
"reward_std": 0.05555558204650879,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.3465277850627899,
|
|
"rewards/belief_accuracy/std": 0.018518507480621338,
|
|
"rewards/env_reward/mean": 1.0341999530792236,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 280
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 11.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 11.0,
|
|
"completions/max_terminated_length": 11.0,
|
|
"completions/mean_length": 10.25,
|
|
"completions/mean_terminated_length": 10.25,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.23416666666666666,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 4.75065279006958,
|
|
"kl": 0.5964707285165787,
|
|
"learning_rate": 1.6666666666666667e-05,
|
|
"loss": 0.0239,
|
|
"num_tokens": 669102.0,
|
|
"reward": 0.99439537525177,
|
|
"reward_std": 0.055555522441864014,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.2916484773159027,
|
|
"rewards/belief_accuracy/std": 0.018518507480621338,
|
|
"rewards/env_reward/mean": 0.046300001442432404,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 281
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 10.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 10.0,
|
|
"completions/max_terminated_length": 10.0,
|
|
"completions/mean_length": 10.0,
|
|
"completions/mean_terminated_length": 10.0,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.235,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 4.148177623748779,
|
|
"kl": 0.5318765938282013,
|
|
"learning_rate": 1.6527777777777777e-05,
|
|
"loss": 0.0213,
|
|
"num_tokens": 670830.0,
|
|
"reward": 2.7913737297058105,
|
|
"reward_std": 0.24002742767333984,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.19624130427837372,
|
|
"rewards/belief_accuracy/std": 0.08000914752483368,
|
|
"rewards/env_reward/mean": 1.4350999593734741,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 282
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 10.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 10.0,
|
|
"completions/max_terminated_length": 10.0,
|
|
"completions/mean_length": 10.0,
|
|
"completions/mean_terminated_length": 10.0,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.23583333333333334,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 3.6992576122283936,
|
|
"kl": 0.5698364973068237,
|
|
"learning_rate": 1.638888888888889e-05,
|
|
"loss": 0.0228,
|
|
"num_tokens": 673270.0,
|
|
"reward": 1.100367784500122,
|
|
"reward_std": 0.06415006518363953,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.2708725929260254,
|
|
"rewards/belief_accuracy/std": 0.02138334885239601,
|
|
"rewards/env_reward/mean": 0.15850000083446503,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 283
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 10.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 10.0,
|
|
"completions/max_terminated_length": 10.0,
|
|
"completions/mean_length": 10.0,
|
|
"completions/mean_terminated_length": 10.0,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.23666666666666666,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 3.40875244140625,
|
|
"kl": 0.41486937925219536,
|
|
"learning_rate": 1.6250000000000002e-05,
|
|
"loss": 0.0166,
|
|
"num_tokens": 675598.0,
|
|
"reward": 1.5371708869934082,
|
|
"reward_std": 0.02686941623687744,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.32056111097335815,
|
|
"rewards/belief_accuracy/std": 0.018518522381782532,
|
|
"rewards/env_reward/mean": 0.35032498836517334,
|
|
"rewards/env_reward/std": 0.05494999885559082,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 284
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 10.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 10.0,
|
|
"completions/max_terminated_length": 10.0,
|
|
"completions/mean_length": 10.0,
|
|
"completions/mean_terminated_length": 10.0,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.2375,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 2.8202176094055176,
|
|
"kl": 0.45439669489860535,
|
|
"learning_rate": 1.6111111111111115e-05,
|
|
"loss": 0.0182,
|
|
"num_tokens": 678038.0,
|
|
"reward": 2.5910556316375732,
|
|
"reward_std": 0.06415006518363953,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.33726853132247925,
|
|
"rewards/belief_accuracy/std": 0.02138333022594452,
|
|
"rewards/env_reward/mean": 1.0195000171661377,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 285
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 10.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 10.0,
|
|
"completions/max_terminated_length": 10.0,
|
|
"completions/mean_length": 10.0,
|
|
"completions/mean_terminated_length": 10.0,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.23833333333333334,
|
|
"frac_reward_zero_std": 1.0,
|
|
"grad_norm": 0.09877491742372513,
|
|
"kl": 0.2931655943393707,
|
|
"learning_rate": 1.597222222222222e-05,
|
|
"loss": 0.0117,
|
|
"num_tokens": 680478.0,
|
|
"reward": 2.147361993789673,
|
|
"reward_std": 0.0,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.461570680141449,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": 0.47510001063346863,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 286
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 10.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 10.0,
|
|
"completions/max_terminated_length": 10.0,
|
|
"completions/mean_length": 10.0,
|
|
"completions/mean_terminated_length": 10.0,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.23916666666666667,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 4.7789435386657715,
|
|
"kl": 0.4500267207622528,
|
|
"learning_rate": 1.5833333333333333e-05,
|
|
"loss": 0.018,
|
|
"num_tokens": 682918.0,
|
|
"reward": 1.4638057947158813,
|
|
"reward_std": 0.055555541068315506,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.4030686020851135,
|
|
"rewards/belief_accuracy/std": 0.018518507480621338,
|
|
"rewards/env_reward/mean": 0.1363999992609024,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 287
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 10.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 10.0,
|
|
"completions/max_terminated_length": 10.0,
|
|
"completions/mean_length": 10.0,
|
|
"completions/mean_terminated_length": 10.0,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.24,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 2.197296380996704,
|
|
"kl": 0.2741189934313297,
|
|
"learning_rate": 1.5694444444444446e-05,
|
|
"loss": 0.011,
|
|
"num_tokens": 685358.0,
|
|
"reward": 1.9038467407226562,
|
|
"reward_std": 0.06415006518363953,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.2554989457130432,
|
|
"rewards/belief_accuracy/std": 0.021383339539170265,
|
|
"rewards/env_reward/mean": 0.7249000072479248,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 288
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 16.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 16.0,
|
|
"completions/max_terminated_length": 16.0,
|
|
"completions/mean_length": 11.5,
|
|
"completions/mean_terminated_length": 11.5,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.24083333333333334,
|
|
"frac_reward_zero_std": 1.0,
|
|
"grad_norm": 13.399361610412598,
|
|
"kl": 1.2592113390564919,
|
|
"learning_rate": 1.5555555555555555e-05,
|
|
"loss": 0.0504,
|
|
"num_tokens": 687804.0,
|
|
"reward": 1.0195121765136719,
|
|
"reward_std": 0.0,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.2638707160949707,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": 0.11860000342130661,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 289
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 16.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 16.0,
|
|
"completions/max_terminated_length": 16.0,
|
|
"completions/mean_length": 11.5,
|
|
"completions/mean_terminated_length": 11.5,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.24166666666666667,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 4.401183605194092,
|
|
"kl": 0.4531232714653015,
|
|
"learning_rate": 1.5416666666666668e-05,
|
|
"loss": 0.0181,
|
|
"num_tokens": 690250.0,
|
|
"reward": 1.837684154510498,
|
|
"reward_std": 0.06414999067783356,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.30634474754333496,
|
|
"rewards/belief_accuracy/std": 0.02138334885239601,
|
|
"rewards/env_reward/mean": 0.5791000127792358,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 290
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.25,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 14.0,
|
|
"completions/mean_length": 17.5,
|
|
"completions/mean_terminated_length": 12.666666984558105,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.2425,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 3.968140125274658,
|
|
"kl": 1.0730007663369179,
|
|
"learning_rate": 1.527777777777778e-05,
|
|
"loss": 0.0429,
|
|
"num_tokens": 692720.0,
|
|
"reward": 0.6950130462646484,
|
|
"reward_std": 0.1111110970377922,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.3014543652534485,
|
|
"rewards/belief_accuracy/std": 0.03703703731298447,
|
|
"rewards/env_reward/mean": -0.1729000061750412,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 291
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 14.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 14.0,
|
|
"completions/max_terminated_length": 14.0,
|
|
"completions/mean_length": 11.0,
|
|
"completions/mean_terminated_length": 11.0,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.24333333333333335,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 3.9793334007263184,
|
|
"kl": 0.43434225767850876,
|
|
"learning_rate": 1.5138888888888888e-05,
|
|
"loss": 0.0174,
|
|
"num_tokens": 695164.0,
|
|
"reward": 1.4266362190246582,
|
|
"reward_std": 0.30089032649993896,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.3198787569999695,
|
|
"rewards/belief_accuracy/std": 0.10029677301645279,
|
|
"rewards/env_reward/mean": 0.27799999713897705,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 292
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.25,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 10.0,
|
|
"completions/mean_length": 15.5,
|
|
"completions/mean_terminated_length": 10.0,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.24416666666666667,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 32.97469711303711,
|
|
"kl": 0.35747910663485527,
|
|
"learning_rate": 1.5e-05,
|
|
"loss": 0.0143,
|
|
"num_tokens": 697626.0,
|
|
"reward": 0.824181318283081,
|
|
"reward_std": 0.11453071981668472,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.2282104194164276,
|
|
"rewards/belief_accuracy/std": 0.03817690536379814,
|
|
"rewards/env_reward/mean": 0.059700001031160355,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 293
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 10.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 10.0,
|
|
"completions/max_terminated_length": 10.0,
|
|
"completions/mean_length": 10.0,
|
|
"completions/mean_terminated_length": 10.0,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.245,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 2.7478795051574707,
|
|
"kl": 0.8649156019091606,
|
|
"learning_rate": 1.4861111111111111e-05,
|
|
"loss": 0.0346,
|
|
"num_tokens": 700066.0,
|
|
"reward": 1.763013482093811,
|
|
"reward_std": 0.24866671860218048,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.3802044987678528,
|
|
"rewards/belief_accuracy/std": 0.08288891613483429,
|
|
"rewards/env_reward/mean": 0.3815999925136566,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 294
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.5,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 10.0,
|
|
"completions/mean_length": 21.0,
|
|
"completions/mean_terminated_length": 10.0,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.24583333333333332,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 5.457376956939697,
|
|
"kl": 0.4137156940996647,
|
|
"learning_rate": 1.4722222222222224e-05,
|
|
"loss": 0.0165,
|
|
"num_tokens": 702550.0,
|
|
"reward": 0.9406732320785522,
|
|
"reward_std": 0.055555522441864014,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.39694103598594666,
|
|
"rewards/belief_accuracy/std": 0.018518522381782532,
|
|
"rewards/env_reward/mean": -0.20010000467300415,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 295
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 10.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 10.0,
|
|
"completions/max_terminated_length": 10.0,
|
|
"completions/mean_length": 10.0,
|
|
"completions/mean_terminated_length": 10.0,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.24666666666666667,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 3.0129573345184326,
|
|
"kl": 0.43357478082180023,
|
|
"learning_rate": 1.4583333333333335e-05,
|
|
"loss": 0.0173,
|
|
"num_tokens": 704990.0,
|
|
"reward": 1.8325889110565186,
|
|
"reward_std": 0.06415006518363953,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.41454631090164185,
|
|
"rewards/belief_accuracy/std": 0.02138334885239601,
|
|
"rewards/env_reward/mean": 0.35929998755455017,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 296
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.25,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 24.0,
|
|
"completions/mean_length": 19.0,
|
|
"completions/mean_terminated_length": 14.666666984558105,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.2475,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 13.825785636901855,
|
|
"kl": 0.4190758764743805,
|
|
"learning_rate": 1.4444444444444444e-05,
|
|
"loss": 0.0168,
|
|
"num_tokens": 707466.0,
|
|
"reward": 1.711467981338501,
|
|
"reward_std": 0.2151748389005661,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.3586101531982422,
|
|
"rewards/belief_accuracy/std": 0.011781362816691399,
|
|
"rewards/env_reward/mean": 0.3904249966144562,
|
|
"rewards/env_reward/std": 0.12855000793933868,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 297
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 10.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 10.0,
|
|
"completions/max_terminated_length": 10.0,
|
|
"completions/mean_length": 10.0,
|
|
"completions/mean_terminated_length": 10.0,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.24833333333333332,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 2.5629937648773193,
|
|
"kl": 0.5514194816350937,
|
|
"learning_rate": 1.4305555555555555e-05,
|
|
"loss": 0.0221,
|
|
"num_tokens": 709906.0,
|
|
"reward": 1.3375012874603271,
|
|
"reward_std": 0.024810949340462685,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.34271708130836487,
|
|
"rewards/belief_accuracy/std": 0.008270323276519775,
|
|
"rewards/env_reward/mean": 0.1729000061750412,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 298
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.25,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 10.0,
|
|
"completions/mean_length": 15.5,
|
|
"completions/mean_terminated_length": 10.0,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.24916666666666668,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 8.018586158752441,
|
|
"kl": 0.4000145494937897,
|
|
"learning_rate": 1.4166666666666668e-05,
|
|
"loss": 0.016,
|
|
"num_tokens": 712368.0,
|
|
"reward": 2.0728981494903564,
|
|
"reward_std": 0.07826922833919525,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.436899334192276,
|
|
"rewards/belief_accuracy/std": 0.026089724153280258,
|
|
"rewards/env_reward/mean": 0.4747999906539917,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 299
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 10.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 10.0,
|
|
"completions/max_terminated_length": 10.0,
|
|
"completions/mean_length": 10.0,
|
|
"completions/mean_terminated_length": 10.0,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.25,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 2.438758373260498,
|
|
"kl": 0.3966354578733444,
|
|
"learning_rate": 1.4027777777777779e-05,
|
|
"loss": 0.0159,
|
|
"num_tokens": 714808.0,
|
|
"reward": 1.0465856790542603,
|
|
"reward_std": 0.06300617754459381,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.31399524211883545,
|
|
"rewards/belief_accuracy/std": 0.018518507480621338,
|
|
"rewards/env_reward/mean": 0.036400001496076584,
|
|
"rewards/env_reward/std": 0.01100000087171793,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 300
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 10.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 10.0,
|
|
"completions/max_terminated_length": 10.0,
|
|
"completions/mean_length": 10.0,
|
|
"completions/mean_terminated_length": 10.0,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.25083333333333335,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 2.277829170227051,
|
|
"kl": 0.46993373334407806,
|
|
"learning_rate": 1.388888888888889e-05,
|
|
"loss": 0.0188,
|
|
"num_tokens": 717248.0,
|
|
"reward": 1.1199527978897095,
|
|
"reward_std": 0.06415006518363953,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.339000940322876,
|
|
"rewards/belief_accuracy/std": 0.02138334885239601,
|
|
"rewards/env_reward/mean": 0.03530000150203705,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 301
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 14.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 14.0,
|
|
"completions/max_terminated_length": 14.0,
|
|
"completions/mean_length": 11.0,
|
|
"completions/mean_terminated_length": 11.0,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.25166666666666665,
|
|
"frac_reward_zero_std": 1.0,
|
|
"grad_norm": 0.5459198951721191,
|
|
"kl": 0.4824730008840561,
|
|
"learning_rate": 1.3750000000000002e-05,
|
|
"loss": 0.0193,
|
|
"num_tokens": 719692.0,
|
|
"reward": 1.5162436962127686,
|
|
"reward_std": 0.0,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.36154788732528687,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": 0.25440001487731934,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 302
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 10.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 10.0,
|
|
"completions/max_terminated_length": 10.0,
|
|
"completions/mean_length": 10.0,
|
|
"completions/mean_terminated_length": 10.0,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.2525,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 2.4194753170013428,
|
|
"kl": 0.4619780480861664,
|
|
"learning_rate": 1.3611111111111111e-05,
|
|
"loss": 0.0185,
|
|
"num_tokens": 721652.0,
|
|
"reward": 1.910367727279663,
|
|
"reward_std": 0.192450150847435,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.2708725929260254,
|
|
"rewards/belief_accuracy/std": 0.06415002793073654,
|
|
"rewards/env_reward/mean": 0.6984999775886536,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 303
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 10.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 10.0,
|
|
"completions/max_terminated_length": 10.0,
|
|
"completions/mean_length": 10.0,
|
|
"completions/mean_terminated_length": 10.0,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.25333333333333335,
|
|
"frac_reward_zero_std": 1.0,
|
|
"grad_norm": 0.019419284537434578,
|
|
"kl": 0.45894408226013184,
|
|
"learning_rate": 1.3472222222222222e-05,
|
|
"loss": 0.0184,
|
|
"num_tokens": 724092.0,
|
|
"reward": 1.2206957340240479,
|
|
"reward_std": 0.0,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.338581919670105,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": 0.10329999774694443,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 304
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.25,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 10.0,
|
|
"completions/mean_length": 15.5,
|
|
"completions/mean_terminated_length": 10.0,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.25416666666666665,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 2.1239161491394043,
|
|
"kl": 0.6261979192495346,
|
|
"learning_rate": 1.3333333333333333e-05,
|
|
"loss": 0.025,
|
|
"num_tokens": 726554.0,
|
|
"reward": 1.5227115154266357,
|
|
"reward_std": 0.11586559563875198,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.33021634817123413,
|
|
"rewards/belief_accuracy/std": 0.03546027094125748,
|
|
"rewards/env_reward/mean": 0.3213750123977661,
|
|
"rewards/env_reward/std": 0.04535000026226044,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 305
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 25.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 25.0,
|
|
"completions/max_terminated_length": 25.0,
|
|
"completions/mean_length": 13.75,
|
|
"completions/mean_terminated_length": 13.75,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.255,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 194.4773406982422,
|
|
"kl": 0.6245425343513489,
|
|
"learning_rate": 1.3194444444444446e-05,
|
|
"loss": 0.025,
|
|
"num_tokens": 729009.0,
|
|
"reward": 1.3024067878723145,
|
|
"reward_std": 0.034763336181640625,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.3904189467430115,
|
|
"rewards/belief_accuracy/std": 0.011587768793106079,
|
|
"rewards/env_reward/mean": 0.054099999368190765,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 306
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 10.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 10.0,
|
|
"completions/max_terminated_length": 10.0,
|
|
"completions/mean_length": 10.0,
|
|
"completions/mean_terminated_length": 10.0,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.25583333333333336,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 3.1589040756225586,
|
|
"kl": 0.33902474492788315,
|
|
"learning_rate": 1.3055555555555557e-05,
|
|
"loss": 0.0136,
|
|
"num_tokens": 731449.0,
|
|
"reward": 1.2196178436279297,
|
|
"reward_std": 0.06414999067783356,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.2708725929260254,
|
|
"rewards/belief_accuracy/std": 0.02138334885239601,
|
|
"rewards/env_reward/mean": 0.23800000548362732,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 307
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 10.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 10.0,
|
|
"completions/max_terminated_length": 10.0,
|
|
"completions/mean_length": 10.0,
|
|
"completions/mean_terminated_length": 10.0,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.25666666666666665,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 2.6437861919403076,
|
|
"kl": 0.6563503816723824,
|
|
"learning_rate": 1.2916666666666668e-05,
|
|
"loss": 0.0263,
|
|
"num_tokens": 733749.0,
|
|
"reward": 1.4235941171646118,
|
|
"reward_std": 0.07300972193479538,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.3557646870613098,
|
|
"rewards/belief_accuracy/std": 0.02433657832443714,
|
|
"rewards/env_reward/mean": 0.20419999957084656,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 308
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 10.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 10.0,
|
|
"completions/max_terminated_length": 10.0,
|
|
"completions/mean_length": 10.0,
|
|
"completions/mean_terminated_length": 10.0,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.2575,
|
|
"frac_reward_zero_std": 1.0,
|
|
"grad_norm": 0.04555347189307213,
|
|
"kl": 0.42974987626075745,
|
|
"learning_rate": 1.2777777777777777e-05,
|
|
"loss": 0.0172,
|
|
"num_tokens": 736189.0,
|
|
"reward": 1.604121446609497,
|
|
"reward_std": 0.0,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.44499048590660095,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": 0.1460999995470047,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 309
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 10.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 10.0,
|
|
"completions/max_terminated_length": 10.0,
|
|
"completions/mean_length": 10.0,
|
|
"completions/mean_terminated_length": 10.0,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.25833333333333336,
|
|
"frac_reward_zero_std": 1.0,
|
|
"grad_norm": 0.02474827878177166,
|
|
"kl": 0.7871623039245605,
|
|
"learning_rate": 1.263888888888889e-05,
|
|
"loss": 0.0315,
|
|
"num_tokens": 738629.0,
|
|
"reward": 1.3761944770812988,
|
|
"reward_std": 0.0,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.4330648183822632,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": 0.017999999225139618,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 310
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 10.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 10.0,
|
|
"completions/max_terminated_length": 10.0,
|
|
"completions/mean_length": 10.0,
|
|
"completions/mean_terminated_length": 10.0,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.25916666666666666,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 3.0570595264434814,
|
|
"kl": 0.3533485382795334,
|
|
"learning_rate": 1.25e-05,
|
|
"loss": 0.0141,
|
|
"num_tokens": 741069.0,
|
|
"reward": 1.1065177917480469,
|
|
"reward_std": 0.06415006518363953,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.2708725929260254,
|
|
"rewards/belief_accuracy/std": 0.02138334885239601,
|
|
"rewards/env_reward/mean": 0.16259999573230743,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 311
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.25,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 10.0,
|
|
"completions/mean_length": 15.5,
|
|
"completions/mean_terminated_length": 10.0,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.26,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 139.93460083007812,
|
|
"kl": 0.678299218416214,
|
|
"learning_rate": 1.2361111111111112e-05,
|
|
"loss": 0.0271,
|
|
"num_tokens": 743531.0,
|
|
"reward": 1.6729682683944702,
|
|
"reward_std": 0.08333329111337662,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.2697893977165222,
|
|
"rewards/belief_accuracy/std": 0.0277777761220932,
|
|
"rewards/env_reward/mean": 0.5424000024795532,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 312
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 10.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 10.0,
|
|
"completions/max_terminated_length": 10.0,
|
|
"completions/mean_length": 10.0,
|
|
"completions/mean_terminated_length": 10.0,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.2608333333333333,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 2.2570526599884033,
|
|
"kl": 0.4118284657597542,
|
|
"learning_rate": 1.2222222222222222e-05,
|
|
"loss": 0.0165,
|
|
"num_tokens": 745971.0,
|
|
"reward": 2.1772122383117676,
|
|
"reward_std": 0.05555558204650879,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.3459707498550415,
|
|
"rewards/belief_accuracy/std": 0.018518507480621338,
|
|
"rewards/env_reward/mean": 0.7261999845504761,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 313
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 11.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 11.0,
|
|
"completions/max_terminated_length": 11.0,
|
|
"completions/mean_length": 10.25,
|
|
"completions/mean_terminated_length": 10.25,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.26166666666666666,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 3.618257761001587,
|
|
"kl": 0.38145381957292557,
|
|
"learning_rate": 1.2083333333333333e-05,
|
|
"loss": 0.0153,
|
|
"num_tokens": 748412.0,
|
|
"reward": 1.8299237489700317,
|
|
"reward_std": 0.09756175428628922,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.38340792059898376,
|
|
"rewards/belief_accuracy/std": 0.03252057731151581,
|
|
"rewards/env_reward/mean": 0.4198000133037567,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 314
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.25,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 10.0,
|
|
"completions/mean_length": 15.5,
|
|
"completions/mean_terminated_length": 10.0,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.2625,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 10.187060356140137,
|
|
"kl": 0.5387748070061207,
|
|
"learning_rate": 1.1944444444444446e-05,
|
|
"loss": 0.0216,
|
|
"num_tokens": 750546.0,
|
|
"reward": 2.673368453979492,
|
|
"reward_std": 0.05555558204650879,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.261056125164032,
|
|
"rewards/belief_accuracy/std": 0.018518522381782532,
|
|
"rewards/env_reward/mean": 1.2267999649047852,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 315
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 10.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 10.0,
|
|
"completions/max_terminated_length": 10.0,
|
|
"completions/mean_length": 10.0,
|
|
"completions/mean_terminated_length": 10.0,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.2633333333333333,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 1.7571438550949097,
|
|
"kl": 0.5994225069880486,
|
|
"learning_rate": 1.1805555555555555e-05,
|
|
"loss": 0.024,
|
|
"num_tokens": 752986.0,
|
|
"reward": 1.0582326650619507,
|
|
"reward_std": 0.055555541068315506,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.37947753071784973,
|
|
"rewards/belief_accuracy/std": 0.018518507480621338,
|
|
"rewards/env_reward/mean": -0.0868000015616417,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 316
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 10.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 10.0,
|
|
"completions/max_terminated_length": 10.0,
|
|
"completions/mean_length": 10.0,
|
|
"completions/mean_terminated_length": 10.0,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.26416666666666666,
|
|
"frac_reward_zero_std": 1.0,
|
|
"grad_norm": 0.014529568143188953,
|
|
"kl": 0.601597785949707,
|
|
"learning_rate": 1.1666666666666668e-05,
|
|
"loss": 0.0241,
|
|
"num_tokens": 755426.0,
|
|
"reward": 1.4333548545837402,
|
|
"reward_std": 0.0,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.37021827697753906,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": 0.1817999929189682,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 317
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 10.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 10.0,
|
|
"completions/max_terminated_length": 10.0,
|
|
"completions/mean_length": 10.0,
|
|
"completions/mean_terminated_length": 10.0,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.265,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 2.389315366744995,
|
|
"kl": 0.7737997323274612,
|
|
"learning_rate": 1.1527777777777779e-05,
|
|
"loss": 0.0309,
|
|
"num_tokens": 757154.0,
|
|
"reward": 3.960230827331543,
|
|
"reward_std": 0.05555558204650879,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.408660352230072,
|
|
"rewards/belief_accuracy/std": 0.018518522381782532,
|
|
"rewards/env_reward/mean": 1.7894999980926514,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 318
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 10.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 10.0,
|
|
"completions/max_terminated_length": 10.0,
|
|
"completions/mean_length": 10.0,
|
|
"completions/mean_terminated_length": 10.0,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.2658333333333333,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 3.118506669998169,
|
|
"kl": 1.1970821470022202,
|
|
"learning_rate": 1.138888888888889e-05,
|
|
"loss": 0.0479,
|
|
"num_tokens": 759594.0,
|
|
"reward": 1.4754880666732788,
|
|
"reward_std": 0.1484978199005127,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.35916271805763245,
|
|
"rewards/belief_accuracy/std": 0.020031657069921494,
|
|
"rewards/env_reward/mean": 0.23199999332427979,
|
|
"rewards/env_reward/std": 0.07039999961853027,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 319
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 10.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 10.0,
|
|
"completions/max_terminated_length": 10.0,
|
|
"completions/mean_length": 10.0,
|
|
"completions/mean_terminated_length": 10.0,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.26666666666666666,
|
|
"frac_reward_zero_std": 1.0,
|
|
"grad_norm": 0.015209314413368702,
|
|
"kl": 0.6464384198188782,
|
|
"learning_rate": 1.125e-05,
|
|
"loss": 0.0259,
|
|
"num_tokens": 762034.0,
|
|
"reward": 1.3930593729019165,
|
|
"reward_std": 0.0,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.3922864496707916,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": 0.11079999804496765,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 320
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 19.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 19.0,
|
|
"completions/max_terminated_length": 19.0,
|
|
"completions/mean_length": 12.25,
|
|
"completions/mean_terminated_length": 12.25,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.2675,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 161.41494750976562,
|
|
"kl": 1.1002038344740868,
|
|
"learning_rate": 1.1111111111111112e-05,
|
|
"loss": 0.044,
|
|
"num_tokens": 764483.0,
|
|
"reward": 1.8057804107666016,
|
|
"reward_std": 0.03534410521388054,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.3586101531982422,
|
|
"rewards/belief_accuracy/std": 0.011781362816691399,
|
|
"rewards/env_reward/mean": 0.45329999923706055,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 321
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 10.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 10.0,
|
|
"completions/max_terminated_length": 10.0,
|
|
"completions/mean_length": 10.0,
|
|
"completions/mean_terminated_length": 10.0,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.2683333333333333,
|
|
"frac_reward_zero_std": 1.0,
|
|
"grad_norm": 0.044717781245708466,
|
|
"kl": 0.40166932344436646,
|
|
"learning_rate": 1.0972222222222223e-05,
|
|
"loss": 0.0161,
|
|
"num_tokens": 766923.0,
|
|
"reward": 1.76560640335083,
|
|
"reward_std": 0.0,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.42471882700920105,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": 0.29429998993873596,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 322
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 12.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 10.5,
|
|
"completions/mean_terminated_length": 10.5,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.26916666666666667,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 2.5786283016204834,
|
|
"kl": 0.4470795914530754,
|
|
"learning_rate": 1.0833333333333334e-05,
|
|
"loss": 0.0179,
|
|
"num_tokens": 769365.0,
|
|
"reward": 1.5160024166107178,
|
|
"reward_std": 0.030062079429626465,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.40731751918792725,
|
|
"rewards/belief_accuracy/std": 0.010020703077316284,
|
|
"rewards/env_reward/mean": 0.16269999742507935,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 323
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 10.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 10.0,
|
|
"completions/max_terminated_length": 10.0,
|
|
"completions/mean_length": 10.0,
|
|
"completions/mean_terminated_length": 10.0,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.27,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 1.7102688550949097,
|
|
"kl": 0.6622214764356613,
|
|
"learning_rate": 1.0694444444444444e-05,
|
|
"loss": 0.0265,
|
|
"num_tokens": 771477.0,
|
|
"reward": 1.6042454242706299,
|
|
"reward_std": 0.035124778747558594,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.34043174982070923,
|
|
"rewards/belief_accuracy/std": 0.011708259582519531,
|
|
"rewards/env_reward/mean": 0.35530000925064087,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 324
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 10.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 10.0,
|
|
"completions/max_terminated_length": 10.0,
|
|
"completions/mean_length": 10.0,
|
|
"completions/mean_terminated_length": 10.0,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.2708333333333333,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 2.304865837097168,
|
|
"kl": 0.3590489625930786,
|
|
"learning_rate": 1.0555555555555555e-05,
|
|
"loss": 0.0144,
|
|
"num_tokens": 773917.0,
|
|
"reward": 1.5945935249328613,
|
|
"reward_std": 0.048784732818603516,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.4178144931793213,
|
|
"rewards/belief_accuracy/std": 0.016261577606201172,
|
|
"rewards/env_reward/mean": 0.1941000074148178,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 325
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 23.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 23.0,
|
|
"completions/max_terminated_length": 23.0,
|
|
"completions/mean_length": 13.25,
|
|
"completions/mean_terminated_length": 13.25,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.27166666666666667,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 1.4902801513671875,
|
|
"kl": 0.4370143413543701,
|
|
"learning_rate": 1.0416666666666668e-05,
|
|
"loss": 0.0175,
|
|
"num_tokens": 775862.0,
|
|
"reward": 2.3113110065460205,
|
|
"reward_std": 0.05054942145943642,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.4199703335762024,
|
|
"rewards/belief_accuracy/std": 0.016849856823682785,
|
|
"rewards/env_reward/mean": 0.6675999760627747,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 326
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 10.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 10.0,
|
|
"completions/max_terminated_length": 10.0,
|
|
"completions/mean_length": 10.0,
|
|
"completions/mean_terminated_length": 10.0,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.2725,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 2.2359626293182373,
|
|
"kl": 0.7615313455462456,
|
|
"learning_rate": 1.0277777777777777e-05,
|
|
"loss": 0.0305,
|
|
"num_tokens": 778302.0,
|
|
"reward": 0.8328565359115601,
|
|
"reward_std": 0.11773625016212463,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.24623969197273254,
|
|
"rewards/belief_accuracy/std": 0.018518514931201935,
|
|
"rewards/env_reward/mean": 0.02942500077188015,
|
|
"rewards/env_reward/std": 0.05795000120997429,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 327
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 10.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 10.0,
|
|
"completions/max_terminated_length": 10.0,
|
|
"completions/mean_length": 10.0,
|
|
"completions/mean_terminated_length": 10.0,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.2733333333333333,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 2.020049810409546,
|
|
"kl": 0.3303503468632698,
|
|
"learning_rate": 1.013888888888889e-05,
|
|
"loss": 0.0132,
|
|
"num_tokens": 780742.0,
|
|
"reward": 2.2166571617126465,
|
|
"reward_std": 0.06414992362260818,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.27441903948783875,
|
|
"rewards/belief_accuracy/std": 0.02138333022594452,
|
|
"rewards/env_reward/mean": 0.8956000208854675,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 328
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 10.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 10.0,
|
|
"completions/max_terminated_length": 10.0,
|
|
"completions/mean_length": 10.0,
|
|
"completions/mean_terminated_length": 10.0,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.27416666666666667,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 2.1073155403137207,
|
|
"kl": 0.8350988179445267,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.0334,
|
|
"num_tokens": 783182.0,
|
|
"reward": 1.357542634010315,
|
|
"reward_std": 0.05555558204650879,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.265547513961792,
|
|
"rewards/belief_accuracy/std": 0.018518522381782532,
|
|
"rewards/env_reward/mean": 0.34060001373291016,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 329
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 10.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 10.0,
|
|
"completions/max_terminated_length": 10.0,
|
|
"completions/mean_length": 10.0,
|
|
"completions/mean_terminated_length": 10.0,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.275,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 2.754615306854248,
|
|
"kl": 0.42683811113238335,
|
|
"learning_rate": 9.861111111111112e-06,
|
|
"loss": 0.0171,
|
|
"num_tokens": 785622.0,
|
|
"reward": 1.3142235279083252,
|
|
"reward_std": 0.0679774284362793,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.3334578275680542,
|
|
"rewards/belief_accuracy/std": 0.022659126669168472,
|
|
"rewards/env_reward/mean": 0.17589999735355377,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 330
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 10.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 10.0,
|
|
"completions/max_terminated_length": 10.0,
|
|
"completions/mean_length": 10.0,
|
|
"completions/mean_terminated_length": 10.0,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.2758333333333333,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 2.232966899871826,
|
|
"kl": 0.5274177491664886,
|
|
"learning_rate": 9.722222222222223e-06,
|
|
"loss": 0.0211,
|
|
"num_tokens": 788062.0,
|
|
"reward": 1.4996254444122314,
|
|
"reward_std": 0.0008034308557398617,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.2752085030078888,
|
|
"rewards/belief_accuracy/std": 0.0002678185701370239,
|
|
"rewards/env_reward/mean": 0.41600000858306885,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 331
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.25,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 10.0,
|
|
"completions/mean_length": 15.5,
|
|
"completions/mean_terminated_length": 10.0,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.27666666666666667,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 6.217159271240234,
|
|
"kl": 0.5627310574054718,
|
|
"learning_rate": 9.583333333333334e-06,
|
|
"loss": 0.0225,
|
|
"num_tokens": 790524.0,
|
|
"reward": 2.103834390640259,
|
|
"reward_std": 0.06415006518363953,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.33671149611473083,
|
|
"rewards/belief_accuracy/std": 0.02138333022594452,
|
|
"rewards/env_reward/mean": 0.6958000063896179,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 332
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 10.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 10.0,
|
|
"completions/max_terminated_length": 10.0,
|
|
"completions/mean_length": 10.0,
|
|
"completions/mean_terminated_length": 10.0,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.2775,
|
|
"frac_reward_zero_std": 1.0,
|
|
"grad_norm": 0.021253760904073715,
|
|
"kl": 0.5807605981826782,
|
|
"learning_rate": 9.444444444444445e-06,
|
|
"loss": 0.0232,
|
|
"num_tokens": 792964.0,
|
|
"reward": 2.2022147178649902,
|
|
"reward_std": 0.0,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.4087548851966858,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": 0.6172999739646912,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 333
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.5,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 10.0,
|
|
"completions/mean_length": 21.0,
|
|
"completions/mean_terminated_length": 10.0,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.2783333333333333,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 21.313928604125977,
|
|
"kl": 0.4485930949449539,
|
|
"learning_rate": 9.305555555555555e-06,
|
|
"loss": 0.0179,
|
|
"num_tokens": 795448.0,
|
|
"reward": 1.7761070728302002,
|
|
"reward_std": 0.06414999067783356,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.27441903948783875,
|
|
"rewards/belief_accuracy/std": 0.02138333022594452,
|
|
"rewards/env_reward/mean": 0.6018999814987183,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 334
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 10.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 10.0,
|
|
"completions/max_terminated_length": 10.0,
|
|
"completions/mean_length": 10.0,
|
|
"completions/mean_terminated_length": 10.0,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.2791666666666667,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 1.7311512231826782,
|
|
"kl": 0.43019605800509453,
|
|
"learning_rate": 9.166666666666666e-06,
|
|
"loss": 0.0172,
|
|
"num_tokens": 797888.0,
|
|
"reward": 1.3519158363342285,
|
|
"reward_std": 0.047978997230529785,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.3684219717979431,
|
|
"rewards/belief_accuracy/std": 0.01599299907684326,
|
|
"rewards/env_reward/mean": 0.13109999895095825,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 335
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 10.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 10.0,
|
|
"completions/max_terminated_length": 10.0,
|
|
"completions/mean_length": 10.0,
|
|
"completions/mean_terminated_length": 10.0,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.28,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 2.104975461959839,
|
|
"kl": 0.35861632227897644,
|
|
"learning_rate": 9.027777777777777e-06,
|
|
"loss": 0.0143,
|
|
"num_tokens": 800328.0,
|
|
"reward": 1.887249231338501,
|
|
"reward_std": 0.06415006518363953,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.3800663948059082,
|
|
"rewards/belief_accuracy/std": 0.02138334885239601,
|
|
"rewards/env_reward/mean": 0.46470001339912415,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 336
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 10.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 10.0,
|
|
"completions/max_terminated_length": 10.0,
|
|
"completions/mean_length": 10.0,
|
|
"completions/mean_terminated_length": 10.0,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.2808333333333333,
|
|
"frac_reward_zero_std": 1.0,
|
|
"grad_norm": 0.03680419921875,
|
|
"kl": 0.4628690183162689,
|
|
"learning_rate": 8.88888888888889e-06,
|
|
"loss": 0.0185,
|
|
"num_tokens": 802768.0,
|
|
"reward": 1.1183212995529175,
|
|
"reward_std": 0.0,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.32095709443092346,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": 0.07029999792575836,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 337
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 10.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 10.0,
|
|
"completions/max_terminated_length": 10.0,
|
|
"completions/mean_length": 10.0,
|
|
"completions/mean_terminated_length": 10.0,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.2816666666666667,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 1.9502252340316772,
|
|
"kl": 0.3435991369187832,
|
|
"learning_rate": 8.75e-06,
|
|
"loss": 0.0137,
|
|
"num_tokens": 805208.0,
|
|
"reward": 2.2443795204162598,
|
|
"reward_std": 0.05555550381541252,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.2651597857475281,
|
|
"rewards/belief_accuracy/std": 0.018518507480621338,
|
|
"rewards/env_reward/mean": 0.9326000213623047,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 338
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.25,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 10.0,
|
|
"completions/mean_length": 15.5,
|
|
"completions/mean_terminated_length": 10.0,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.2825,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 48.597938537597656,
|
|
"kl": 0.8074730969965458,
|
|
"learning_rate": 8.611111111111112e-06,
|
|
"loss": 0.0323,
|
|
"num_tokens": 807670.0,
|
|
"reward": 1.3689297437667847,
|
|
"reward_std": 0.13127261400222778,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.3153432607650757,
|
|
"rewards/belief_accuracy/std": 0.04375755414366722,
|
|
"rewards/env_reward/mean": 0.24860000610351562,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 339
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 10.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 10.0,
|
|
"completions/max_terminated_length": 10.0,
|
|
"completions/mean_length": 10.0,
|
|
"completions/mean_terminated_length": 10.0,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.2833333333333333,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 3.296276092529297,
|
|
"kl": 0.34323835372924805,
|
|
"learning_rate": 8.472222222222223e-06,
|
|
"loss": 0.0137,
|
|
"num_tokens": 810110.0,
|
|
"reward": 1.5166881084442139,
|
|
"reward_std": 0.05555550381541252,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.39799603819847107,
|
|
"rewards/belief_accuracy/std": 0.018518507480621338,
|
|
"rewards/env_reward/mean": 0.1817999929189682,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 340
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 10.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 10.0,
|
|
"completions/max_terminated_length": 10.0,
|
|
"completions/mean_length": 10.0,
|
|
"completions/mean_terminated_length": 10.0,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.2841666666666667,
|
|
"frac_reward_zero_std": 1.0,
|
|
"grad_norm": 0.026873953640460968,
|
|
"kl": 0.5060014128684998,
|
|
"learning_rate": 8.333333333333334e-06,
|
|
"loss": 0.0202,
|
|
"num_tokens": 812550.0,
|
|
"reward": 1.6035584211349487,
|
|
"reward_std": 0.0,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.3939528167247772,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": 0.24779999256134033,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 341
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 10.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 10.0,
|
|
"completions/max_terminated_length": 10.0,
|
|
"completions/mean_length": 10.0,
|
|
"completions/mean_terminated_length": 10.0,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.285,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 1.4969699382781982,
|
|
"kl": 0.47678476572036743,
|
|
"learning_rate": 8.194444444444445e-06,
|
|
"loss": 0.0191,
|
|
"num_tokens": 814990.0,
|
|
"reward": 1.1913644075393677,
|
|
"reward_std": 0.055555541068315506,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.3531047999858856,
|
|
"rewards/belief_accuracy/std": 0.018518507480621338,
|
|
"rewards/env_reward/mean": 0.05469999834895134,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 342
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 10.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 10.0,
|
|
"completions/max_terminated_length": 10.0,
|
|
"completions/mean_length": 10.0,
|
|
"completions/mean_terminated_length": 10.0,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.28583333333333333,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 2.3507494926452637,
|
|
"kl": 0.3678537532687187,
|
|
"learning_rate": 8.055555555555557e-06,
|
|
"loss": 0.0147,
|
|
"num_tokens": 816918.0,
|
|
"reward": 2.290696620941162,
|
|
"reward_std": 0.0907217413187027,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.3493489623069763,
|
|
"rewards/belief_accuracy/std": 0.030240608379244804,
|
|
"rewards/env_reward/mean": 0.7950999736785889,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 343
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 10.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 10.0,
|
|
"completions/max_terminated_length": 10.0,
|
|
"completions/mean_length": 10.0,
|
|
"completions/mean_terminated_length": 10.0,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.2866666666666667,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 2.588015556335449,
|
|
"kl": 0.3631714880466461,
|
|
"learning_rate": 7.916666666666667e-06,
|
|
"loss": 0.0145,
|
|
"num_tokens": 819358.0,
|
|
"reward": 1.9951159954071045,
|
|
"reward_std": 0.06415006518363953,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.4264719784259796,
|
|
"rewards/belief_accuracy/std": 0.02138333022594452,
|
|
"rewards/env_reward/mean": 0.4438000023365021,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 344
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 31.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 31.0,
|
|
"completions/max_terminated_length": 31.0,
|
|
"completions/mean_length": 15.25,
|
|
"completions/mean_terminated_length": 15.25,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.2875,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 7.8612518310546875,
|
|
"kl": 0.43969496712088585,
|
|
"learning_rate": 7.777777777777777e-06,
|
|
"loss": 0.0176,
|
|
"num_tokens": 821819.0,
|
|
"reward": 1.826906681060791,
|
|
"reward_std": 0.05555546283721924,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.32745224237442017,
|
|
"rewards/belief_accuracy/std": 0.018518507480621338,
|
|
"rewards/env_reward/mean": 0.529699981212616,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 345
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 15.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 15.0,
|
|
"completions/max_terminated_length": 15.0,
|
|
"completions/mean_length": 11.25,
|
|
"completions/mean_terminated_length": 11.25,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.28833333333333333,
|
|
"frac_reward_zero_std": 1.0,
|
|
"grad_norm": 0.2849005162715912,
|
|
"kl": 0.3736693486571312,
|
|
"learning_rate": 7.63888888888889e-06,
|
|
"loss": 0.0149,
|
|
"num_tokens": 824264.0,
|
|
"reward": 1.1083310842514038,
|
|
"reward_std": 0.0,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.3253270387649536,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": 0.05490000173449516,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 346
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 12.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 10.5,
|
|
"completions/mean_terminated_length": 10.5,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.2891666666666667,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 2.451597213745117,
|
|
"kl": 0.4166445955634117,
|
|
"learning_rate": 7.5e-06,
|
|
"loss": 0.0167,
|
|
"num_tokens": 826706.0,
|
|
"reward": 1.4393980503082275,
|
|
"reward_std": 0.0824948102235794,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.2972326874732971,
|
|
"rewards/belief_accuracy/std": 0.0274982750415802,
|
|
"rewards/env_reward/mean": 0.33180001378059387,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 347
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 10.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 10.0,
|
|
"completions/max_terminated_length": 10.0,
|
|
"completions/mean_length": 10.0,
|
|
"completions/mean_terminated_length": 10.0,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.29,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 2.6531128883361816,
|
|
"kl": 0.3893243670463562,
|
|
"learning_rate": 7.361111111111112e-06,
|
|
"loss": 0.0156,
|
|
"num_tokens": 829146.0,
|
|
"reward": 1.5187160968780518,
|
|
"reward_std": 0.06414999067783356,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.35287201404571533,
|
|
"rewards/belief_accuracy/std": 0.02138334885239601,
|
|
"rewards/env_reward/mean": 0.2734000086784363,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 348
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 15.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 15.0,
|
|
"completions/max_terminated_length": 15.0,
|
|
"completions/mean_length": 11.25,
|
|
"completions/mean_terminated_length": 11.25,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.29083333333333333,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 6.793511867523193,
|
|
"kl": 0.36402036249637604,
|
|
"learning_rate": 7.222222222222222e-06,
|
|
"loss": 0.0146,
|
|
"num_tokens": 831591.0,
|
|
"reward": 1.270379662513733,
|
|
"reward_std": 0.13127264380455017,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.3153432607650757,
|
|
"rewards/belief_accuracy/std": 0.04375755414366722,
|
|
"rewards/env_reward/mean": 0.18289999663829803,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 349
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 15.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 15.0,
|
|
"completions/max_terminated_length": 15.0,
|
|
"completions/mean_length": 11.25,
|
|
"completions/mean_terminated_length": 11.25,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.2916666666666667,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 14.456671714782715,
|
|
"kl": 0.25079457089304924,
|
|
"learning_rate": 7.083333333333334e-06,
|
|
"loss": 0.01,
|
|
"num_tokens": 834036.0,
|
|
"reward": 2.6385445594787598,
|
|
"reward_std": 0.08333325386047363,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.3418981432914734,
|
|
"rewards/belief_accuracy/std": 0.0277777761220932,
|
|
"rewards/env_reward/mean": 1.0419000387191772,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 350
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 10.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 10.0,
|
|
"completions/max_terminated_length": 10.0,
|
|
"completions/mean_length": 10.0,
|
|
"completions/mean_terminated_length": 10.0,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.2925,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 2.2002947330474854,
|
|
"kl": 0.31729260832071304,
|
|
"learning_rate": 6.944444444444445e-06,
|
|
"loss": 0.0127,
|
|
"num_tokens": 836476.0,
|
|
"reward": 1.7575269937515259,
|
|
"reward_std": 0.05555558204650879,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.38932567834854126,
|
|
"rewards/belief_accuracy/std": 0.018518522381782532,
|
|
"rewards/env_reward/mean": 0.3596999943256378,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 351
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 10.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 10.0,
|
|
"completions/max_terminated_length": 10.0,
|
|
"completions/mean_length": 10.0,
|
|
"completions/mean_terminated_length": 10.0,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.29333333333333333,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 3.2136647701263428,
|
|
"kl": 0.7196188867092133,
|
|
"learning_rate": 6.805555555555556e-06,
|
|
"loss": 0.0288,
|
|
"num_tokens": 838916.0,
|
|
"reward": 0.9425899386405945,
|
|
"reward_std": 0.166666641831398,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.273129940032959,
|
|
"rewards/belief_accuracy/std": 0.055555544793605804,
|
|
"rewards/env_reward/mean": 0.04879999905824661,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 352
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 14.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 14.0,
|
|
"completions/max_terminated_length": 14.0,
|
|
"completions/mean_length": 11.0,
|
|
"completions/mean_terminated_length": 11.0,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.2941666666666667,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 2.321444034576416,
|
|
"kl": 0.44010917842388153,
|
|
"learning_rate": 6.666666666666667e-06,
|
|
"loss": 0.0176,
|
|
"num_tokens": 841360.0,
|
|
"reward": 1.9293063879013062,
|
|
"reward_std": 0.07850543409585953,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.39598548412323,
|
|
"rewards/belief_accuracy/std": 0.026168476790189743,
|
|
"rewards/env_reward/mean": 0.4609000086784363,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 353
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 10.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 10.0,
|
|
"completions/max_terminated_length": 10.0,
|
|
"completions/mean_length": 10.0,
|
|
"completions/mean_terminated_length": 10.0,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.295,
|
|
"frac_reward_zero_std": 1.0,
|
|
"grad_norm": 0.029545234516263008,
|
|
"kl": 0.4735727906227112,
|
|
"learning_rate": 6.5277777777777784e-06,
|
|
"loss": 0.0189,
|
|
"num_tokens": 843800.0,
|
|
"reward": 1.641128659248352,
|
|
"reward_std": 0.0,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.28782621026039124,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": 0.48510000109672546,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 354
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 10.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 10.0,
|
|
"completions/max_terminated_length": 10.0,
|
|
"completions/mean_length": 10.0,
|
|
"completions/mean_terminated_length": 10.0,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.29583333333333334,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 2.6239774227142334,
|
|
"kl": 0.25653597339987755,
|
|
"learning_rate": 6.3888888888888885e-06,
|
|
"loss": 0.0103,
|
|
"num_tokens": 846240.0,
|
|
"reward": 1.8896186351776123,
|
|
"reward_std": 0.04136645793914795,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.4032561779022217,
|
|
"rewards/belief_accuracy/std": 0.013788819313049316,
|
|
"rewards/env_reward/mean": 0.41990000009536743,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 355
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.25,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 10.0,
|
|
"completions/mean_length": 15.5,
|
|
"completions/mean_terminated_length": 10.0,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.2966666666666667,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 9.297626495361328,
|
|
"kl": 0.3872426562011242,
|
|
"learning_rate": 6.25e-06,
|
|
"loss": 0.0155,
|
|
"num_tokens": 848566.0,
|
|
"reward": 1.280221700668335,
|
|
"reward_std": 0.06414999067783356,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.38377392292022705,
|
|
"rewards/belief_accuracy/std": 0.02138334885239601,
|
|
"rewards/env_reward/mean": 0.05260000005364418,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 356
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 10.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 10.0,
|
|
"completions/max_terminated_length": 10.0,
|
|
"completions/mean_length": 10.0,
|
|
"completions/mean_terminated_length": 10.0,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.2975,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 1.9092109203338623,
|
|
"kl": 0.4036319889128208,
|
|
"learning_rate": 6.111111111111111e-06,
|
|
"loss": 0.0161,
|
|
"num_tokens": 851006.0,
|
|
"reward": 1.3718723058700562,
|
|
"reward_std": 0.05555558204650879,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.2835240662097931,
|
|
"rewards/belief_accuracy/std": 0.018518522381782532,
|
|
"rewards/env_reward/mean": 0.3142000138759613,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 357
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 10.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 10.0,
|
|
"completions/max_terminated_length": 10.0,
|
|
"completions/mean_length": 10.0,
|
|
"completions/mean_terminated_length": 10.0,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.29833333333333334,
|
|
"frac_reward_zero_std": 1.0,
|
|
"grad_norm": 0.07863418012857437,
|
|
"kl": 0.5231885313987732,
|
|
"learning_rate": 5.972222222222223e-06,
|
|
"loss": 0.0209,
|
|
"num_tokens": 853122.0,
|
|
"reward": 1.8864500522613525,
|
|
"reward_std": 0.0,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.3187500238418579,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": 0.5867999792098999,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 358
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 10.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 10.0,
|
|
"completions/max_terminated_length": 10.0,
|
|
"completions/mean_length": 10.0,
|
|
"completions/mean_terminated_length": 10.0,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.2991666666666667,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 2.9344420433044434,
|
|
"kl": 0.3685489371418953,
|
|
"learning_rate": 5.833333333333334e-06,
|
|
"loss": 0.0147,
|
|
"num_tokens": 855562.0,
|
|
"reward": 1.580292820930481,
|
|
"reward_std": 0.12299706041812897,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.2708725929260254,
|
|
"rewards/belief_accuracy/std": 0.02138334885239601,
|
|
"rewards/env_reward/mean": 0.4784500002861023,
|
|
"rewards/env_reward/std": 0.049500007182359695,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 359
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 10.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 10.0,
|
|
"completions/max_terminated_length": 10.0,
|
|
"completions/mean_length": 10.0,
|
|
"completions/mean_terminated_length": 10.0,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.3,
|
|
"frac_reward_zero_std": 1.0,
|
|
"grad_norm": 0.04218378663063049,
|
|
"kl": 0.30064401030540466,
|
|
"learning_rate": 5.694444444444445e-06,
|
|
"loss": 0.012,
|
|
"num_tokens": 858002.0,
|
|
"reward": 1.2759324312210083,
|
|
"reward_std": 0.0,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.3579941391944885,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": 0.10130000114440918,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 360
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 10.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 10.0,
|
|
"completions/max_terminated_length": 10.0,
|
|
"completions/mean_length": 10.0,
|
|
"completions/mean_terminated_length": 10.0,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.30083333333333334,
|
|
"frac_reward_zero_std": 1.0,
|
|
"grad_norm": 0.05267792195081711,
|
|
"kl": 0.6117653846740723,
|
|
"learning_rate": 5.555555555555556e-06,
|
|
"loss": 0.0245,
|
|
"num_tokens": 860442.0,
|
|
"reward": 0.7257359027862549,
|
|
"reward_std": 0.0,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.35569527745246887,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": -0.26089999079704285,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 361
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 10.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 10.0,
|
|
"completions/max_terminated_length": 10.0,
|
|
"completions/mean_length": 10.0,
|
|
"completions/mean_terminated_length": 10.0,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.3016666666666667,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 12.316222190856934,
|
|
"kl": 4.106600508093834,
|
|
"learning_rate": 5.416666666666667e-06,
|
|
"loss": 0.1643,
|
|
"num_tokens": 862554.0,
|
|
"reward": 1.7996174097061157,
|
|
"reward_std": 0.026703864336013794,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.3517557978630066,
|
|
"rewards/belief_accuracy/std": 0.00890129804611206,
|
|
"rewards/env_reward/mean": 0.4629000127315521,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 362
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 10.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 10.0,
|
|
"completions/max_terminated_length": 10.0,
|
|
"completions/mean_length": 10.0,
|
|
"completions/mean_terminated_length": 10.0,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.3025,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 1.9306823015213013,
|
|
"kl": 0.3761134594678879,
|
|
"learning_rate": 5.277777777777778e-06,
|
|
"loss": 0.015,
|
|
"num_tokens": 864994.0,
|
|
"reward": 1.980668544769287,
|
|
"reward_std": 0.06415006518363953,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.4363061785697937,
|
|
"rewards/belief_accuracy/std": 0.02138334885239601,
|
|
"rewards/env_reward/mean": 0.41449999809265137,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 363
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 10.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 10.0,
|
|
"completions/max_terminated_length": 10.0,
|
|
"completions/mean_length": 10.0,
|
|
"completions/mean_terminated_length": 10.0,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.30333333333333334,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 2.6175270080566406,
|
|
"kl": 0.523816742002964,
|
|
"learning_rate": 5.138888888888889e-06,
|
|
"loss": 0.021,
|
|
"num_tokens": 867434.0,
|
|
"reward": 1.060789942741394,
|
|
"reward_std": 0.10638077557086945,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.27312996983528137,
|
|
"rewards/belief_accuracy/std": 0.03546025604009628,
|
|
"rewards/env_reward/mean": 0.12759999930858612,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 364
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 10.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 10.0,
|
|
"completions/max_terminated_length": 10.0,
|
|
"completions/mean_length": 10.0,
|
|
"completions/mean_terminated_length": 10.0,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.30416666666666664,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 1.343258023262024,
|
|
"kl": 0.6741083934903145,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.027,
|
|
"num_tokens": 869874.0,
|
|
"reward": 1.2328159809112549,
|
|
"reward_std": 0.047978997230529785,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.3684219717979431,
|
|
"rewards/belief_accuracy/std": 0.01599299907684326,
|
|
"rewards/env_reward/mean": 0.05169999971985817,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 365
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 10.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 10.0,
|
|
"completions/max_terminated_length": 10.0,
|
|
"completions/mean_length": 10.0,
|
|
"completions/mean_terminated_length": 10.0,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.305,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 2.257349729537964,
|
|
"kl": 0.5208408161997795,
|
|
"learning_rate": 4.861111111111111e-06,
|
|
"loss": 0.0208,
|
|
"num_tokens": 872314.0,
|
|
"reward": 1.145273208618164,
|
|
"reward_std": 0.05555550381541252,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.39694103598594666,
|
|
"rewards/belief_accuracy/std": 0.018518522381782532,
|
|
"rewards/env_reward/mean": -0.06369999796152115,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 366
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 13.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 13.0,
|
|
"completions/max_terminated_length": 13.0,
|
|
"completions/mean_length": 10.75,
|
|
"completions/mean_terminated_length": 10.75,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.30583333333333335,
|
|
"frac_reward_zero_std": 1.0,
|
|
"grad_norm": 0.8866859674453735,
|
|
"kl": 0.26471298933029175,
|
|
"learning_rate": 4.722222222222222e-06,
|
|
"loss": 0.0106,
|
|
"num_tokens": 874757.0,
|
|
"reward": 1.2441325187683105,
|
|
"reward_std": 0.0,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.3579941391944885,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": 0.08009999990463257,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 367
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 11.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 11.0,
|
|
"completions/max_terminated_length": 11.0,
|
|
"completions/mean_length": 10.25,
|
|
"completions/mean_terminated_length": 10.25,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.30666666666666664,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 8.010123252868652,
|
|
"kl": 3.0728435292840004,
|
|
"learning_rate": 4.583333333333333e-06,
|
|
"loss": 0.1229,
|
|
"num_tokens": 877198.0,
|
|
"reward": -0.9120967388153076,
|
|
"reward_std": 2.8924098014831543,
|
|
"rewards/action_legal/mean": -0.25,
|
|
"rewards/action_legal/std": 0.5,
|
|
"rewards/belief_accuracy/mean": 0.1363677680492401,
|
|
"rewards/belief_accuracy/std": 0.22492384910583496,
|
|
"rewards/env_reward/mean": -0.8808000087738037,
|
|
"rewards/env_reward/std": 1.4128000736236572,
|
|
"rewards/format_valid/mean": 0.25,
|
|
"rewards/format_valid/std": 1.5,
|
|
"step": 368
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 15.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 15.0,
|
|
"completions/max_terminated_length": 15.0,
|
|
"completions/mean_length": 11.25,
|
|
"completions/mean_terminated_length": 11.25,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.3075,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 4.981584072113037,
|
|
"kl": 0.488948717713356,
|
|
"learning_rate": 4.444444444444445e-06,
|
|
"loss": 0.0196,
|
|
"num_tokens": 879643.0,
|
|
"reward": 1.5771369934082031,
|
|
"reward_std": 0.05555550381541252,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.290595680475235,
|
|
"rewards/belief_accuracy/std": 0.018518507480621338,
|
|
"rewards/env_reward/mean": 0.43689998984336853,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 369
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 10.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 10.0,
|
|
"completions/max_terminated_length": 10.0,
|
|
"completions/mean_length": 10.0,
|
|
"completions/mean_terminated_length": 10.0,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.30833333333333335,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 2.590527057647705,
|
|
"kl": 0.37513425201177597,
|
|
"learning_rate": 4.305555555555556e-06,
|
|
"loss": 0.015,
|
|
"num_tokens": 882083.0,
|
|
"reward": 1.8574588298797607,
|
|
"reward_std": 0.05555546283721924,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.3345862925052643,
|
|
"rewards/belief_accuracy/std": 0.018518507480621338,
|
|
"rewards/env_reward/mean": 0.5357999801635742,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 370
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 10.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 10.0,
|
|
"completions/max_terminated_length": 10.0,
|
|
"completions/mean_length": 10.0,
|
|
"completions/mean_terminated_length": 10.0,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.30916666666666665,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 2.3306689262390137,
|
|
"kl": 0.3898431584239006,
|
|
"learning_rate": 4.166666666666667e-06,
|
|
"loss": 0.0156,
|
|
"num_tokens": 884523.0,
|
|
"reward": 2.17402720451355,
|
|
"reward_std": 0.004569530487060547,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.4608090817928314,
|
|
"rewards/belief_accuracy/std": 0.0015231966972351074,
|
|
"rewards/env_reward/mean": 0.4943999946117401,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 371
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 29.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 29.0,
|
|
"completions/max_terminated_length": 29.0,
|
|
"completions/mean_length": 16.25,
|
|
"completions/mean_terminated_length": 16.25,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.31,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 18.126691818237305,
|
|
"kl": 0.5024303831160069,
|
|
"learning_rate": 4.027777777777779e-06,
|
|
"loss": 0.0201,
|
|
"num_tokens": 886988.0,
|
|
"reward": 2.5307817459106445,
|
|
"reward_std": 0.4320550858974457,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.32745224237442017,
|
|
"rewards/belief_accuracy/std": 0.018518507480621338,
|
|
"rewards/env_reward/mean": 0.9989500045776367,
|
|
"rewards/env_reward/std": 0.26506149768829346,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 372
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 10.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 10.0,
|
|
"completions/max_terminated_length": 10.0,
|
|
"completions/mean_length": 10.0,
|
|
"completions/mean_terminated_length": 10.0,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.31083333333333335,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 2.256619930267334,
|
|
"kl": 0.3448418974876404,
|
|
"learning_rate": 3.888888888888889e-06,
|
|
"loss": 0.0138,
|
|
"num_tokens": 889428.0,
|
|
"reward": 1.242105484008789,
|
|
"reward_std": 0.05555550381541252,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.38676851987838745,
|
|
"rewards/belief_accuracy/std": 0.018518522381782532,
|
|
"rewards/env_reward/mean": 0.021199999377131462,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 373
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 10.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 10.0,
|
|
"completions/max_terminated_length": 10.0,
|
|
"completions/mean_length": 10.0,
|
|
"completions/mean_terminated_length": 10.0,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.31166666666666665,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 3.003289222717285,
|
|
"kl": 0.2741120904684067,
|
|
"learning_rate": 3.75e-06,
|
|
"loss": 0.011,
|
|
"num_tokens": 891868.0,
|
|
"reward": 0.7584868669509888,
|
|
"reward_std": 0.06415002793073654,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.20506228506565094,
|
|
"rewards/belief_accuracy/std": 0.02138334885239601,
|
|
"rewards/env_reward/mean": 0.062199998646974564,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 374
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.25,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 10.0,
|
|
"completions/mean_length": 15.5,
|
|
"completions/mean_terminated_length": 10.0,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.3125,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 10.024889945983887,
|
|
"kl": 0.926845133304596,
|
|
"learning_rate": 3.611111111111111e-06,
|
|
"loss": 0.0371,
|
|
"num_tokens": 894330.0,
|
|
"reward": 1.3204090595245361,
|
|
"reward_std": 0.007576584815979004,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.3431697189807892,
|
|
"rewards/belief_accuracy/std": 0.00252552330493927,
|
|
"rewards/env_reward/mean": 0.16060000658035278,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 375
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 10.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 10.0,
|
|
"completions/max_terminated_length": 10.0,
|
|
"completions/mean_length": 10.0,
|
|
"completions/mean_terminated_length": 10.0,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.31333333333333335,
|
|
"frac_reward_zero_std": 1.0,
|
|
"grad_norm": 0.01764746382832527,
|
|
"kl": 0.43021830916404724,
|
|
"learning_rate": 3.4722222222222224e-06,
|
|
"loss": 0.0172,
|
|
"num_tokens": 896770.0,
|
|
"reward": 1.6580833196640015,
|
|
"reward_std": 0.0,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.3960277736186981,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": 0.2800000011920929,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 376
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.25,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 16.0,
|
|
"completions/mean_length": 17.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.31416666666666665,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 5.265389919281006,
|
|
"kl": 0.529280386865139,
|
|
"learning_rate": 3.3333333333333333e-06,
|
|
"loss": 0.0212,
|
|
"num_tokens": 899238.0,
|
|
"reward": 1.660062313079834,
|
|
"reward_std": 0.4098671078681946,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.4355999231338501,
|
|
"rewards/belief_accuracy/std": 0.018518522381782532,
|
|
"rewards/env_reward/mean": 0.20217500627040863,
|
|
"rewards/env_reward/std": 0.28334999084472656,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 377
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 25.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 25.0,
|
|
"completions/max_terminated_length": 25.0,
|
|
"completions/mean_length": 13.75,
|
|
"completions/mean_terminated_length": 13.75,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.315,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 1.52178955078125,
|
|
"kl": 0.6352234184741974,
|
|
"learning_rate": 3.1944444444444443e-06,
|
|
"loss": 0.0254,
|
|
"num_tokens": 901693.0,
|
|
"reward": 1.4409255981445312,
|
|
"reward_std": 0.046097397804260254,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.4471418261528015,
|
|
"rewards/belief_accuracy/std": 0.015365764498710632,
|
|
"rewards/env_reward/mean": 0.032999999821186066,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 378
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 26.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 26.0,
|
|
"completions/max_terminated_length": 26.0,
|
|
"completions/mean_length": 14.0,
|
|
"completions/mean_terminated_length": 14.0,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.31583333333333335,
|
|
"frac_reward_zero_std": 1.0,
|
|
"grad_norm": 0.8627490997314453,
|
|
"kl": 0.5881659425795078,
|
|
"learning_rate": 3.0555555555555556e-06,
|
|
"loss": 0.0235,
|
|
"num_tokens": 904149.0,
|
|
"reward": 1.5204272270202637,
|
|
"reward_std": 0.0,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.2753424048423767,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": 0.4296000003814697,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 379
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 10.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 10.0,
|
|
"completions/max_terminated_length": 10.0,
|
|
"completions/mean_length": 10.0,
|
|
"completions/mean_terminated_length": 10.0,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.31666666666666665,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 2.6827125549316406,
|
|
"kl": 0.35598936676979065,
|
|
"learning_rate": 2.916666666666667e-06,
|
|
"loss": 0.0142,
|
|
"num_tokens": 906589.0,
|
|
"reward": 1.3910578489303589,
|
|
"reward_std": 0.021128177642822266,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.4228193163871765,
|
|
"rewards/belief_accuracy/std": 0.007042735815048218,
|
|
"rewards/env_reward/mean": 0.04839999973773956,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 380
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 10.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 10.0,
|
|
"completions/max_terminated_length": 10.0,
|
|
"completions/mean_length": 10.0,
|
|
"completions/mean_terminated_length": 10.0,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.3175,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 2.160460948944092,
|
|
"kl": 0.578317403793335,
|
|
"learning_rate": 2.777777777777778e-06,
|
|
"loss": 0.0231,
|
|
"num_tokens": 909029.0,
|
|
"reward": 1.2475202083587646,
|
|
"reward_std": 0.031668663024902344,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.36097338795661926,
|
|
"rewards/belief_accuracy/std": 0.010556221008300781,
|
|
"rewards/env_reward/mean": 0.07639999687671661,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 381
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 14.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 14.0,
|
|
"completions/max_terminated_length": 14.0,
|
|
"completions/mean_length": 11.25,
|
|
"completions/mean_terminated_length": 11.25,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.31833333333333336,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 4.413422107696533,
|
|
"kl": 0.47100868076086044,
|
|
"learning_rate": 2.638888888888889e-06,
|
|
"loss": 0.0188,
|
|
"num_tokens": 911474.0,
|
|
"reward": 2.3091464042663574,
|
|
"reward_std": 0.31583333015441895,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.40389877557754517,
|
|
"rewards/belief_accuracy/std": 0.0277777761220932,
|
|
"rewards/env_reward/mean": 0.6983000040054321,
|
|
"rewards/env_reward/std": 0.1550000011920929,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 382
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 28.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 28.0,
|
|
"completions/max_terminated_length": 28.0,
|
|
"completions/mean_length": 14.5,
|
|
"completions/mean_terminated_length": 14.5,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.31916666666666665,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 5.446463584899902,
|
|
"kl": 0.4363628067076206,
|
|
"learning_rate": 2.5e-06,
|
|
"loss": 0.0175,
|
|
"num_tokens": 913932.0,
|
|
"reward": 1.9526245594024658,
|
|
"reward_std": 0.05555550381541252,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.2647581994533539,
|
|
"rewards/belief_accuracy/std": 0.018518514931201935,
|
|
"rewards/env_reward/mean": 0.7389000058174133,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 383
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 10.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 10.0,
|
|
"completions/max_terminated_length": 10.0,
|
|
"completions/mean_length": 10.0,
|
|
"completions/mean_terminated_length": 10.0,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.32,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 2.315631151199341,
|
|
"kl": 0.5301463380455971,
|
|
"learning_rate": 2.361111111111111e-06,
|
|
"loss": 0.0212,
|
|
"num_tokens": 916372.0,
|
|
"reward": 0.9895731210708618,
|
|
"reward_std": 0.10638080537319183,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.39694103598594666,
|
|
"rewards/belief_accuracy/std": 0.03546025976538658,
|
|
"rewards/env_reward/mean": -0.16750000417232513,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 384
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.25,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 10.0,
|
|
"completions/mean_length": 15.5,
|
|
"completions/mean_terminated_length": 10.0,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.32083333333333336,
|
|
"frac_reward_zero_std": 1.0,
|
|
"grad_norm": 2.120096445083618,
|
|
"kl": 0.45274005830287933,
|
|
"learning_rate": 2.2222222222222225e-06,
|
|
"loss": 0.0181,
|
|
"num_tokens": 918834.0,
|
|
"reward": 1.811830997467041,
|
|
"reward_std": 0.0,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.3253270387649536,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": 0.5238999724388123,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 385
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 10.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 10.0,
|
|
"completions/max_terminated_length": 10.0,
|
|
"completions/mean_length": 10.0,
|
|
"completions/mean_terminated_length": 10.0,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.32166666666666666,
|
|
"frac_reward_zero_std": 1.0,
|
|
"grad_norm": 0.03539842367172241,
|
|
"kl": 0.320296972990036,
|
|
"learning_rate": 2.0833333333333334e-06,
|
|
"loss": 0.0128,
|
|
"num_tokens": 921274.0,
|
|
"reward": 1.9864510297775269,
|
|
"reward_std": 0.0,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.40483367443084717,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": 0.4812999963760376,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 386
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 10.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 10.0,
|
|
"completions/max_terminated_length": 10.0,
|
|
"completions/mean_length": 10.0,
|
|
"completions/mean_terminated_length": 10.0,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.3225,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 2.4435579776763916,
|
|
"kl": 0.33051633834838867,
|
|
"learning_rate": 1.9444444444444444e-06,
|
|
"loss": 0.0132,
|
|
"num_tokens": 923714.0,
|
|
"reward": 1.564325213432312,
|
|
"reward_std": 0.055555541068315506,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.3661750555038452,
|
|
"rewards/belief_accuracy/std": 0.018518507480621338,
|
|
"rewards/env_reward/mean": 0.27720001339912415,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 387
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 10.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 10.0,
|
|
"completions/max_terminated_length": 10.0,
|
|
"completions/mean_length": 10.0,
|
|
"completions/mean_terminated_length": 10.0,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.3233333333333333,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 3.437168836593628,
|
|
"kl": 0.6133978962898254,
|
|
"learning_rate": 1.8055555555555555e-06,
|
|
"loss": 0.0245,
|
|
"num_tokens": 926154.0,
|
|
"reward": 1.0200436115264893,
|
|
"reward_std": 0.09072183817625046,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.36154788732528687,
|
|
"rewards/belief_accuracy/std": 0.03024062141776085,
|
|
"rewards/env_reward/mean": -0.07639999687671661,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 388
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 17.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 17.0,
|
|
"completions/max_terminated_length": 17.0,
|
|
"completions/mean_length": 11.75,
|
|
"completions/mean_terminated_length": 11.75,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.32416666666666666,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 10.590235710144043,
|
|
"kl": 0.4510222151875496,
|
|
"learning_rate": 1.6666666666666667e-06,
|
|
"loss": 0.018,
|
|
"num_tokens": 928601.0,
|
|
"reward": 1.4076029062271118,
|
|
"reward_std": 0.047661781311035156,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.4342842996120453,
|
|
"rewards/belief_accuracy/std": 0.01588726043701172,
|
|
"rewards/env_reward/mean": 0.0364999994635582,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 389
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 10.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 10.0,
|
|
"completions/max_terminated_length": 10.0,
|
|
"completions/mean_length": 10.0,
|
|
"completions/mean_terminated_length": 10.0,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.325,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 2.6361474990844727,
|
|
"kl": 0.3107151463627815,
|
|
"learning_rate": 1.5277777777777778e-06,
|
|
"loss": 0.0124,
|
|
"num_tokens": 931041.0,
|
|
"reward": 2.3909270763397217,
|
|
"reward_std": 0.004569530487060547,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.4608090817928314,
|
|
"rewards/belief_accuracy/std": 0.0015231966972351074,
|
|
"rewards/env_reward/mean": 0.6389999985694885,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 390
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 11.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 11.0,
|
|
"completions/max_terminated_length": 11.0,
|
|
"completions/mean_length": 10.25,
|
|
"completions/mean_terminated_length": 10.25,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.3258333333333333,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 3.5152220726013184,
|
|
"kl": 0.6350157707929611,
|
|
"learning_rate": 1.388888888888889e-06,
|
|
"loss": 0.0254,
|
|
"num_tokens": 933482.0,
|
|
"reward": 1.62190842628479,
|
|
"reward_std": 0.003773768898099661,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.44561946392059326,
|
|
"rewards/belief_accuracy/std": 0.0012579113245010376,
|
|
"rewards/env_reward/mean": 0.156700000166893,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 391
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 10.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 10.0,
|
|
"completions/max_terminated_length": 10.0,
|
|
"completions/mean_length": 10.0,
|
|
"completions/mean_terminated_length": 10.0,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.32666666666666666,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 1.8789347410202026,
|
|
"kl": 1.3899996429681778,
|
|
"learning_rate": 1.25e-06,
|
|
"loss": 0.0556,
|
|
"num_tokens": 935414.0,
|
|
"reward": 2.7954330444335938,
|
|
"reward_std": 0.8637241721153259,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.4274568557739258,
|
|
"rewards/belief_accuracy/std": 0.032788295298814774,
|
|
"rewards/env_reward/mean": 0.9753749966621399,
|
|
"rewards/env_reward/std": 0.5192500352859497,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 392
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 10.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 10.0,
|
|
"completions/max_terminated_length": 10.0,
|
|
"completions/mean_length": 10.0,
|
|
"completions/mean_terminated_length": 10.0,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.3275,
|
|
"frac_reward_zero_std": 1.0,
|
|
"grad_norm": 0.022810563445091248,
|
|
"kl": 0.275255411863327,
|
|
"learning_rate": 1.1111111111111112e-06,
|
|
"loss": 0.011,
|
|
"num_tokens": 937854.0,
|
|
"reward": 1.2254817485809326,
|
|
"reward_std": 0.0,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.2821272611618042,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": 0.21940000355243683,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 393
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 10.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 10.0,
|
|
"completions/max_terminated_length": 10.0,
|
|
"completions/mean_length": 10.0,
|
|
"completions/mean_terminated_length": 10.0,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.3283333333333333,
|
|
"frac_reward_zero_std": 1.0,
|
|
"grad_norm": 0.06492399424314499,
|
|
"kl": 0.2405586689710617,
|
|
"learning_rate": 9.722222222222222e-07,
|
|
"loss": 0.0096,
|
|
"num_tokens": 940294.0,
|
|
"reward": 1.2385733127593994,
|
|
"reward_std": 0.0,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.28939110040664673,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": 0.21359999477863312,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 394
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 10.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 10.0,
|
|
"completions/max_terminated_length": 10.0,
|
|
"completions/mean_length": 10.0,
|
|
"completions/mean_terminated_length": 10.0,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.32916666666666666,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 0.5544055104255676,
|
|
"kl": 1.3655546456575394,
|
|
"learning_rate": 8.333333333333333e-07,
|
|
"loss": 0.0546,
|
|
"num_tokens": 942734.0,
|
|
"reward": 1.3416773080825806,
|
|
"reward_std": 0.05555558204650879,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.3189590871334076,
|
|
"rewards/belief_accuracy/std": 0.018518522381782532,
|
|
"rewards/env_reward/mean": 0.2231999933719635,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 395
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 10.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 10.0,
|
|
"completions/max_terminated_length": 10.0,
|
|
"completions/mean_length": 10.0,
|
|
"completions/mean_terminated_length": 10.0,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.33,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 1.817050576210022,
|
|
"kl": 0.33038008213043213,
|
|
"learning_rate": 6.944444444444445e-07,
|
|
"loss": 0.0132,
|
|
"num_tokens": 945038.0,
|
|
"reward": 0.37761184573173523,
|
|
"reward_std": 0.13981175422668457,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.2454039305448532,
|
|
"rewards/belief_accuracy/std": 0.04660391807556152,
|
|
"rewards/env_reward/mean": -0.27239999175071716,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 396
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 10.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 10.0,
|
|
"completions/max_terminated_length": 10.0,
|
|
"completions/mean_length": 9.75,
|
|
"completions/mean_terminated_length": 9.75,
|
|
"completions/min_length": 9.0,
|
|
"completions/min_terminated_length": 9.0,
|
|
"epoch": 0.3308333333333333,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 4.133925437927246,
|
|
"kl": 0.7797681391239166,
|
|
"learning_rate": 5.555555555555556e-07,
|
|
"loss": 0.0312,
|
|
"num_tokens": 946969.0,
|
|
"reward": 2.6139159202575684,
|
|
"reward_std": 0.7964601516723633,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.31071361899375916,
|
|
"rewards/belief_accuracy/std": 0.0555555634200573,
|
|
"rewards/env_reward/mean": 1.0878499746322632,
|
|
"rewards/env_reward/std": 0.48350000381469727,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 397
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 14.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 14.0,
|
|
"completions/max_terminated_length": 14.0,
|
|
"completions/mean_length": 11.0,
|
|
"completions/mean_terminated_length": 11.0,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.33166666666666667,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 4.173895835876465,
|
|
"kl": 0.4529978558421135,
|
|
"learning_rate": 4.1666666666666667e-07,
|
|
"loss": 0.0181,
|
|
"num_tokens": 949413.0,
|
|
"reward": 1.1765426397323608,
|
|
"reward_std": 0.055555541068315506,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.3091141879558563,
|
|
"rewards/belief_accuracy/std": 0.018518522381782532,
|
|
"rewards/env_reward/mean": 0.13279999792575836,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 398
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 10.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 10.0,
|
|
"completions/max_terminated_length": 10.0,
|
|
"completions/mean_length": 10.0,
|
|
"completions/mean_terminated_length": 10.0,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.3325,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 0.8653059005737305,
|
|
"kl": 0.46213603019714355,
|
|
"learning_rate": 2.777777777777778e-07,
|
|
"loss": 0.0185,
|
|
"num_tokens": 951345.0,
|
|
"reward": 1.7938777208328247,
|
|
"reward_std": 0.05555558204650879,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.3020426034927368,
|
|
"rewards/belief_accuracy/std": 0.018518522381782532,
|
|
"rewards/env_reward/mean": 0.5584999918937683,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 399
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 10.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 10.0,
|
|
"completions/max_terminated_length": 10.0,
|
|
"completions/mean_length": 10.0,
|
|
"completions/mean_terminated_length": 10.0,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.3333333333333333,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 1.37034010887146,
|
|
"kl": 0.5379725992679596,
|
|
"learning_rate": 1.388888888888889e-07,
|
|
"loss": 0.0215,
|
|
"num_tokens": 953277.0,
|
|
"reward": 3.2419023513793945,
|
|
"reward_std": 0.06162475422024727,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.440767377614975,
|
|
"rewards/belief_accuracy/std": 0.020541606470942497,
|
|
"rewards/env_reward/mean": 1.246399998664856,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 400
|
|
}
|
|
],
|
|
"logging_steps": 1,
|
|
"max_steps": 400,
|
|
"num_input_tokens_seen": 953277,
|
|
"num_train_epochs": 1,
|
|
"save_steps": 250,
|
|
"stateful_callbacks": {
|
|
"TrainerControl": {
|
|
"args": {
|
|
"should_epoch_stop": false,
|
|
"should_evaluate": false,
|
|
"should_log": false,
|
|
"should_save": true,
|
|
"should_training_stop": true
|
|
},
|
|
"attributes": {}
|
|
}
|
|
},
|
|
"total_flos": 0.0,
|
|
"train_batch_size": 1,
|
|
"trial_name": null,
|
|
"trial_params": null
|
|
}
|