8285 lines
296 KiB
JSON
8285 lines
296 KiB
JSON
{
|
|
"best_global_step": null,
|
|
"best_metric": null,
|
|
"best_model_checkpoint": null,
|
|
"epoch": 0.125,
|
|
"eval_steps": 500,
|
|
"global_step": 250,
|
|
"is_hyper_param_search": false,
|
|
"is_local_process_zero": true,
|
|
"is_world_process_zero": true,
|
|
"log_history": [
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.75,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 20.0,
|
|
"completions/mean_length": 29.0,
|
|
"completions/mean_terminated_length": 20.0,
|
|
"completions/min_length": 20.0,
|
|
"completions/min_terminated_length": 20.0,
|
|
"epoch": 0.0005,
|
|
"frac_reward_zero_std": 1.0,
|
|
"grad_norm": 0.32243695855140686,
|
|
"kl": 0.016345822252333164,
|
|
"learning_rate": 0.0,
|
|
"loss": 0.0007,
|
|
"num_tokens": 2516.0,
|
|
"reward": -4.949999809265137,
|
|
"reward_std": 0.0,
|
|
"rewards/action_legal/mean": -1.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": -3.0,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": -2.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 1
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.75,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 23.0,
|
|
"completions/mean_length": 29.75,
|
|
"completions/mean_terminated_length": 23.0,
|
|
"completions/min_length": 23.0,
|
|
"completions/min_terminated_length": 23.0,
|
|
"epoch": 0.001,
|
|
"frac_reward_zero_std": 1.0,
|
|
"grad_norm": 0.1738930642604828,
|
|
"kl": 0.0056577762588858604,
|
|
"learning_rate": 1.0000000000000002e-06,
|
|
"loss": 0.0002,
|
|
"num_tokens": 5035.0,
|
|
"reward": -4.949999809265137,
|
|
"reward_std": 0.0,
|
|
"rewards/action_legal/mean": -1.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": -3.0,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": -2.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 2
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.75,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 14.0,
|
|
"completions/mean_length": 27.5,
|
|
"completions/mean_terminated_length": 14.0,
|
|
"completions/min_length": 14.0,
|
|
"completions/min_terminated_length": 14.0,
|
|
"epoch": 0.0015,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 53.453521728515625,
|
|
"kl": 0.027107596397399902,
|
|
"learning_rate": 2.0000000000000003e-06,
|
|
"loss": 0.0011,
|
|
"num_tokens": 7545.0,
|
|
"reward": -3.724677085876465,
|
|
"reward_std": 2.4506454467773438,
|
|
"rewards/action_legal/mean": -0.75,
|
|
"rewards/action_legal/std": 0.5,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": -2.212284803390503,
|
|
"rewards/env_reward/std": 1.5754303932189941,
|
|
"rewards/format_valid/mean": -1.375,
|
|
"rewards/format_valid/std": 1.25,
|
|
"step": 3
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.75,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 22.0,
|
|
"completions/mean_length": 29.5,
|
|
"completions/mean_terminated_length": 22.0,
|
|
"completions/min_length": 22.0,
|
|
"completions/min_terminated_length": 22.0,
|
|
"epoch": 0.002,
|
|
"frac_reward_zero_std": 1.0,
|
|
"grad_norm": 1.2984156608581543,
|
|
"kl": 0.013630361296236515,
|
|
"learning_rate": 3e-06,
|
|
"loss": 0.0005,
|
|
"num_tokens": 10063.0,
|
|
"reward": -4.949999809265137,
|
|
"reward_std": 0.0,
|
|
"rewards/action_legal/mean": -1.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": -3.0,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": -2.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 4
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.25,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 15.0,
|
|
"completions/mean_length": 18.25,
|
|
"completions/mean_terminated_length": 13.666666984558105,
|
|
"completions/min_length": 11.0,
|
|
"completions/min_terminated_length": 11.0,
|
|
"epoch": 0.0025,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 61.420711517333984,
|
|
"kl": 0.0825746851041913,
|
|
"learning_rate": 4.000000000000001e-06,
|
|
"loss": 0.0033,
|
|
"num_tokens": 12536.0,
|
|
"reward": -3.895512342453003,
|
|
"reward_std": 2.1089749336242676,
|
|
"rewards/action_legal/mean": -0.75,
|
|
"rewards/action_legal/std": 0.5,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": -2.3261749744415283,
|
|
"rewards/env_reward/std": 1.3476500511169434,
|
|
"rewards/format_valid/mean": -1.375,
|
|
"rewards/format_valid/std": 1.25,
|
|
"step": 5
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.25,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 32.0,
|
|
"completions/mean_length": 30.75,
|
|
"completions/mean_terminated_length": 30.33333396911621,
|
|
"completions/min_length": 29.0,
|
|
"completions/min_terminated_length": 29.0,
|
|
"epoch": 0.003,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 11.032736778259277,
|
|
"kl": 0.0035573970526456833,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.0001,
|
|
"num_tokens": 15059.0,
|
|
"reward": -2.084261417388916,
|
|
"reward_std": 3.3090696334838867,
|
|
"rewards/action_legal/mean": -0.5,
|
|
"rewards/action_legal/std": 0.5773502588272095,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": -1.147840976715088,
|
|
"rewards/env_reward/std": 2.1386890411376953,
|
|
"rewards/format_valid/mean": -0.75,
|
|
"rewards/format_valid/std": 1.4433757066726685,
|
|
"step": 6
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.75,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 13.0,
|
|
"completions/mean_length": 27.25,
|
|
"completions/mean_terminated_length": 13.0,
|
|
"completions/min_length": 13.0,
|
|
"completions/min_terminated_length": 13.0,
|
|
"epoch": 0.0035,
|
|
"frac_reward_zero_std": 1.0,
|
|
"grad_norm": 0.20612499117851257,
|
|
"kl": 0.007132542319595814,
|
|
"learning_rate": 6e-06,
|
|
"loss": 0.0003,
|
|
"num_tokens": 17568.0,
|
|
"reward": -4.949999809265137,
|
|
"reward_std": 0.0,
|
|
"rewards/action_legal/mean": -1.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": -3.0,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": -2.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 7
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 23.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 23.0,
|
|
"completions/max_terminated_length": 23.0,
|
|
"completions/mean_length": 20.25,
|
|
"completions/mean_terminated_length": 20.25,
|
|
"completions/min_length": 16.0,
|
|
"completions/min_terminated_length": 16.0,
|
|
"epoch": 0.004,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 10.10438346862793,
|
|
"kl": 0.05212839285377413,
|
|
"learning_rate": 7.000000000000001e-06,
|
|
"loss": 0.0021,
|
|
"num_tokens": 20049.0,
|
|
"reward": -3.4786999225616455,
|
|
"reward_std": 2.9425997734069824,
|
|
"rewards/action_legal/mean": -0.75,
|
|
"rewards/action_legal/std": 0.5,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": -2.04830002784729,
|
|
"rewards/env_reward/std": 1.90339994430542,
|
|
"rewards/format_valid/mean": -1.375,
|
|
"rewards/format_valid/std": 1.25,
|
|
"step": 8
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.5,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 29.0,
|
|
"completions/mean_length": 30.0,
|
|
"completions/mean_terminated_length": 28.0,
|
|
"completions/min_length": 27.0,
|
|
"completions/min_terminated_length": 27.0,
|
|
"epoch": 0.0045,
|
|
"frac_reward_zero_std": 1.0,
|
|
"grad_norm": 0.36320337653160095,
|
|
"kl": 0.005910599138587713,
|
|
"learning_rate": 8.000000000000001e-06,
|
|
"loss": 0.0002,
|
|
"num_tokens": 22569.0,
|
|
"reward": -4.949999809265137,
|
|
"reward_std": 0.0,
|
|
"rewards/action_legal/mean": -1.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": -3.0,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": -2.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 9
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 27.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 27.0,
|
|
"completions/max_terminated_length": 27.0,
|
|
"completions/mean_length": 20.75,
|
|
"completions/mean_terminated_length": 20.75,
|
|
"completions/min_length": 17.0,
|
|
"completions/min_terminated_length": 17.0,
|
|
"epoch": 0.005,
|
|
"frac_reward_zero_std": 1.0,
|
|
"grad_norm": 35.37952423095703,
|
|
"kl": 0.214208863559179,
|
|
"learning_rate": 9e-06,
|
|
"loss": 0.0086,
|
|
"num_tokens": 25052.0,
|
|
"reward": -4.949999809265137,
|
|
"reward_std": 0.0,
|
|
"rewards/action_legal/mean": -1.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": -3.0,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": -2.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 10
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.5,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 11.0,
|
|
"completions/mean_length": 21.0,
|
|
"completions/mean_terminated_length": 10.0,
|
|
"completions/min_length": 9.0,
|
|
"completions/min_terminated_length": 9.0,
|
|
"epoch": 0.0055,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 219.09710693359375,
|
|
"kl": 0.09427966503426433,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.0038,
|
|
"num_tokens": 27536.0,
|
|
"reward": -2.487870216369629,
|
|
"reward_std": 2.853968858718872,
|
|
"rewards/action_legal/mean": -0.5,
|
|
"rewards/action_legal/std": 0.5773502588272095,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": -1.4169135093688965,
|
|
"rewards/env_reward/std": 1.8355563879013062,
|
|
"rewards/format_valid/mean": -0.75,
|
|
"rewards/format_valid/std": 1.4433757066726685,
|
|
"step": 11
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.5,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 20.0,
|
|
"completions/mean_length": 26.0,
|
|
"completions/mean_terminated_length": 20.0,
|
|
"completions/min_length": 20.0,
|
|
"completions/min_terminated_length": 20.0,
|
|
"epoch": 0.006,
|
|
"frac_reward_zero_std": 1.0,
|
|
"grad_norm": 5.1641130447387695,
|
|
"kl": 0.02741223480552435,
|
|
"learning_rate": 1.1000000000000001e-05,
|
|
"loss": 0.0011,
|
|
"num_tokens": 30040.0,
|
|
"reward": -4.949999809265137,
|
|
"reward_std": 0.0,
|
|
"rewards/action_legal/mean": -1.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": -3.0,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": -2.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 12
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 25.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 25.0,
|
|
"completions/max_terminated_length": 25.0,
|
|
"completions/mean_length": 17.5,
|
|
"completions/mean_terminated_length": 17.5,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 0.0065,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 18.722599029541016,
|
|
"kl": 0.16925985834677704,
|
|
"learning_rate": 1.2e-05,
|
|
"loss": 0.0068,
|
|
"num_tokens": 32510.0,
|
|
"reward": -2.37943172454834,
|
|
"reward_std": 2.9682364463806152,
|
|
"rewards/action_legal/mean": -0.5,
|
|
"rewards/action_legal/std": 0.5773502588272095,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": -1.3446213006973267,
|
|
"rewards/env_reward/std": 1.9114667177200317,
|
|
"rewards/format_valid/mean": -0.75,
|
|
"rewards/format_valid/std": 1.4433757066726685,
|
|
"step": 13
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.75,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 22.0,
|
|
"completions/mean_length": 29.5,
|
|
"completions/mean_terminated_length": 22.0,
|
|
"completions/min_length": 22.0,
|
|
"completions/min_terminated_length": 22.0,
|
|
"epoch": 0.007,
|
|
"frac_reward_zero_std": 1.0,
|
|
"grad_norm": 0.1022278442978859,
|
|
"kl": 0.006297597661614418,
|
|
"learning_rate": 1.3000000000000001e-05,
|
|
"loss": 0.0003,
|
|
"num_tokens": 35028.0,
|
|
"reward": -4.949999809265137,
|
|
"reward_std": 0.0,
|
|
"rewards/action_legal/mean": -1.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": -3.0,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": -2.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 14
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 1.0,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 0.0,
|
|
"completions/mean_length": 32.0,
|
|
"completions/mean_terminated_length": 0.0,
|
|
"completions/min_length": 32.0,
|
|
"completions/min_terminated_length": 0.0,
|
|
"epoch": 0.0075,
|
|
"frac_reward_zero_std": 1.0,
|
|
"grad_norm": 0.032320525497198105,
|
|
"kl": 0.002568609546869993,
|
|
"learning_rate": 1.4000000000000001e-05,
|
|
"loss": 0.0001,
|
|
"num_tokens": 37556.0,
|
|
"reward": -4.949999809265137,
|
|
"reward_std": 0.0,
|
|
"rewards/action_legal/mean": -1.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": -3.0,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": -2.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 15
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.25,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 30.0,
|
|
"completions/mean_length": 24.75,
|
|
"completions/mean_terminated_length": 22.33333396911621,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 0.008,
|
|
"frac_reward_zero_std": 1.0,
|
|
"grad_norm": 8.214491844177246,
|
|
"kl": 0.041143732611089945,
|
|
"learning_rate": 1.5e-05,
|
|
"loss": 0.0016,
|
|
"num_tokens": 40055.0,
|
|
"reward": -4.949999809265137,
|
|
"reward_std": 0.0,
|
|
"rewards/action_legal/mean": -1.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": -3.0,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": -2.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 16
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 25.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 25.0,
|
|
"completions/max_terminated_length": 25.0,
|
|
"completions/mean_length": 19.0,
|
|
"completions/mean_terminated_length": 19.0,
|
|
"completions/min_length": 16.0,
|
|
"completions/min_terminated_length": 16.0,
|
|
"epoch": 0.0085,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 39.8804817199707,
|
|
"kl": 0.04003936113440432,
|
|
"learning_rate": 1.6000000000000003e-05,
|
|
"loss": 0.0016,
|
|
"num_tokens": 42531.0,
|
|
"reward": -2.5680184364318848,
|
|
"reward_std": 2.750802993774414,
|
|
"rewards/action_legal/mean": -0.5,
|
|
"rewards/action_legal/std": 0.5773502588272095,
|
|
"rewards/belief_accuracy/mean": -0.11806440353393555,
|
|
"rewards/belief_accuracy/std": 0.03612881526350975,
|
|
"rewards/env_reward/mean": -1.4383834600448608,
|
|
"rewards/env_reward/std": 1.8049958944320679,
|
|
"rewards/format_valid/mean": -0.625,
|
|
"rewards/format_valid/std": 1.6007810831069946,
|
|
"step": 17
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.75,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 17.0,
|
|
"completions/mean_length": 28.25,
|
|
"completions/mean_terminated_length": 17.0,
|
|
"completions/min_length": 17.0,
|
|
"completions/min_terminated_length": 17.0,
|
|
"epoch": 0.009,
|
|
"frac_reward_zero_std": 1.0,
|
|
"grad_norm": 1.495725393295288,
|
|
"kl": 0.019086187705397606,
|
|
"learning_rate": 1.7000000000000003e-05,
|
|
"loss": 0.0008,
|
|
"num_tokens": 45044.0,
|
|
"reward": -4.949999809265137,
|
|
"reward_std": 0.0,
|
|
"rewards/action_legal/mean": -1.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": -3.0,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": -2.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 18
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 32.0,
|
|
"completions/mean_length": 22.75,
|
|
"completions/mean_terminated_length": 22.75,
|
|
"completions/min_length": 14.0,
|
|
"completions/min_terminated_length": 14.0,
|
|
"epoch": 0.0095,
|
|
"frac_reward_zero_std": 1.0,
|
|
"grad_norm": 26.531925201416016,
|
|
"kl": 0.09912175685167313,
|
|
"learning_rate": 1.8e-05,
|
|
"loss": 0.004,
|
|
"num_tokens": 47535.0,
|
|
"reward": -4.949999809265137,
|
|
"reward_std": 0.0,
|
|
"rewards/action_legal/mean": -1.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": -3.0,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": -2.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 19
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 1.0,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 0.0,
|
|
"completions/mean_length": 32.0,
|
|
"completions/mean_terminated_length": 0.0,
|
|
"completions/min_length": 32.0,
|
|
"completions/min_terminated_length": 0.0,
|
|
"epoch": 0.01,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 8.011573791503906,
|
|
"kl": 0.0038544870913028717,
|
|
"learning_rate": 1.9e-05,
|
|
"loss": 0.0002,
|
|
"num_tokens": 50063.0,
|
|
"reward": -2.320432662963867,
|
|
"reward_std": 3.037968873977661,
|
|
"rewards/action_legal/mean": -0.5,
|
|
"rewards/action_legal/std": 0.5773502588272095,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": -1.305288553237915,
|
|
"rewards/env_reward/std": 1.9579919576644897,
|
|
"rewards/format_valid/mean": -0.75,
|
|
"rewards/format_valid/std": 1.4433757066726685,
|
|
"step": 20
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.5,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 21.0,
|
|
"completions/mean_length": 24.5,
|
|
"completions/mean_terminated_length": 17.0,
|
|
"completions/min_length": 13.0,
|
|
"completions/min_terminated_length": 13.0,
|
|
"epoch": 0.0105,
|
|
"frac_reward_zero_std": 1.0,
|
|
"grad_norm": 36.5081787109375,
|
|
"kl": 0.2546631218865514,
|
|
"learning_rate": 2e-05,
|
|
"loss": 0.0102,
|
|
"num_tokens": 52561.0,
|
|
"reward": -4.949999809265137,
|
|
"reward_std": 0.0,
|
|
"rewards/action_legal/mean": -1.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": -3.0,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": -2.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 21
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.75,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 26.0,
|
|
"completions/mean_length": 30.5,
|
|
"completions/mean_terminated_length": 26.0,
|
|
"completions/min_length": 26.0,
|
|
"completions/min_terminated_length": 26.0,
|
|
"epoch": 0.011,
|
|
"frac_reward_zero_std": 1.0,
|
|
"grad_norm": 1.0799552202224731,
|
|
"kl": 0.009861491620540619,
|
|
"learning_rate": 2.1e-05,
|
|
"loss": 0.0004,
|
|
"num_tokens": 55083.0,
|
|
"reward": -4.949999809265137,
|
|
"reward_std": 0.0,
|
|
"rewards/action_legal/mean": -1.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": -3.0,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": -2.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 22
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.5,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 28.0,
|
|
"completions/mean_length": 26.25,
|
|
"completions/mean_terminated_length": 20.5,
|
|
"completions/min_length": 13.0,
|
|
"completions/min_terminated_length": 13.0,
|
|
"epoch": 0.0115,
|
|
"frac_reward_zero_std": 1.0,
|
|
"grad_norm": 23.75230598449707,
|
|
"kl": 0.20189414219930768,
|
|
"learning_rate": 2.2000000000000003e-05,
|
|
"loss": 0.0081,
|
|
"num_tokens": 57588.0,
|
|
"reward": -4.949999809265137,
|
|
"reward_std": 0.0,
|
|
"rewards/action_legal/mean": -1.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": -3.0,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": -2.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 23
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.5,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 17.0,
|
|
"completions/mean_length": 23.0,
|
|
"completions/mean_terminated_length": 14.0,
|
|
"completions/min_length": 11.0,
|
|
"completions/min_terminated_length": 11.0,
|
|
"epoch": 0.012,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 102.0042953491211,
|
|
"kl": 0.1681511290371418,
|
|
"learning_rate": 2.3000000000000003e-05,
|
|
"loss": 0.0067,
|
|
"num_tokens": 60080.0,
|
|
"reward": -1.766066074371338,
|
|
"reward_std": 2.126420736312866,
|
|
"rewards/action_legal/mean": -0.25,
|
|
"rewards/action_legal/std": 0.5,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": -0.9648774862289429,
|
|
"rewards/env_reward/std": 1.3593891859054565,
|
|
"rewards/format_valid/mean": -0.125,
|
|
"rewards/format_valid/std": 1.25,
|
|
"step": 24
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.5,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 17.0,
|
|
"completions/mean_length": 22.75,
|
|
"completions/mean_terminated_length": 13.5,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.0125,
|
|
"frac_reward_zero_std": 1.0,
|
|
"grad_norm": 2.4423506259918213,
|
|
"kl": 0.0637103128246963,
|
|
"learning_rate": 2.4e-05,
|
|
"loss": 0.0025,
|
|
"num_tokens": 62571.0,
|
|
"reward": -4.949999809265137,
|
|
"reward_std": 0.0,
|
|
"rewards/action_legal/mean": -1.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": -3.0,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": -2.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 25
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.75,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 31.0,
|
|
"completions/mean_length": 31.75,
|
|
"completions/mean_terminated_length": 31.0,
|
|
"completions/min_length": 31.0,
|
|
"completions/min_terminated_length": 31.0,
|
|
"epoch": 0.013,
|
|
"frac_reward_zero_std": 1.0,
|
|
"grad_norm": 0.05712759494781494,
|
|
"kl": 0.005990173202008009,
|
|
"learning_rate": 2.5e-05,
|
|
"loss": 0.0002,
|
|
"num_tokens": 65098.0,
|
|
"reward": -4.949999809265137,
|
|
"reward_std": 0.0,
|
|
"rewards/action_legal/mean": -1.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": -3.0,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": -2.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 26
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 32.0,
|
|
"completions/mean_length": 28.25,
|
|
"completions/mean_terminated_length": 28.25,
|
|
"completions/min_length": 18.0,
|
|
"completions/min_terminated_length": 18.0,
|
|
"epoch": 0.0135,
|
|
"frac_reward_zero_std": 1.0,
|
|
"grad_norm": 0.1384836584329605,
|
|
"kl": 0.018408390693366528,
|
|
"learning_rate": 2.6000000000000002e-05,
|
|
"loss": 0.0007,
|
|
"num_tokens": 67611.0,
|
|
"reward": -4.949999809265137,
|
|
"reward_std": 0.0,
|
|
"rewards/action_legal/mean": -1.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": -3.0,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": -2.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 27
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 1.0,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 0.0,
|
|
"completions/mean_length": 32.0,
|
|
"completions/mean_terminated_length": 0.0,
|
|
"completions/min_length": 32.0,
|
|
"completions/min_terminated_length": 0.0,
|
|
"epoch": 0.014,
|
|
"frac_reward_zero_std": 1.0,
|
|
"grad_norm": 0.17841196060180664,
|
|
"kl": 0.008233492728322744,
|
|
"learning_rate": 2.7000000000000002e-05,
|
|
"loss": 0.0003,
|
|
"num_tokens": 70139.0,
|
|
"reward": -4.949999809265137,
|
|
"reward_std": 0.0,
|
|
"rewards/action_legal/mean": -1.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": -3.0,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": -2.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 28
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.75,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 27.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 0.0145,
|
|
"frac_reward_zero_std": 1.0,
|
|
"grad_norm": 14.524484634399414,
|
|
"kl": 0.07956769224256277,
|
|
"learning_rate": 2.8000000000000003e-05,
|
|
"loss": 0.0032,
|
|
"num_tokens": 72647.0,
|
|
"reward": -4.949999809265137,
|
|
"reward_std": 0.0,
|
|
"rewards/action_legal/mean": -1.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": -3.0,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": -2.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 29
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.5,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 20.0,
|
|
"completions/mean_length": 24.25,
|
|
"completions/mean_terminated_length": 16.5,
|
|
"completions/min_length": 13.0,
|
|
"completions/min_terminated_length": 13.0,
|
|
"epoch": 0.015,
|
|
"frac_reward_zero_std": 1.0,
|
|
"grad_norm": 0.6826711297035217,
|
|
"kl": 0.05026988545432687,
|
|
"learning_rate": 2.9e-05,
|
|
"loss": 0.002,
|
|
"num_tokens": 75144.0,
|
|
"reward": -4.949999809265137,
|
|
"reward_std": 0.0,
|
|
"rewards/action_legal/mean": -1.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": -3.0,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": -2.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 30
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.5,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 20.0,
|
|
"completions/mean_length": 23.25,
|
|
"completions/mean_terminated_length": 14.5,
|
|
"completions/min_length": 9.0,
|
|
"completions/min_terminated_length": 9.0,
|
|
"epoch": 0.0155,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 11050.3515625,
|
|
"kl": 36.80695866746828,
|
|
"learning_rate": 3e-05,
|
|
"loss": 1.4723,
|
|
"num_tokens": 77637.0,
|
|
"reward": -2.443718194961548,
|
|
"reward_std": 2.9238996505737305,
|
|
"rewards/action_legal/mean": -0.5,
|
|
"rewards/action_legal/std": 0.5773502588272095,
|
|
"rewards/belief_accuracy/mean": -0.0741666704416275,
|
|
"rewards/belief_accuracy/std": 0.05166666582226753,
|
|
"rewards/env_reward/mean": -1.443312168121338,
|
|
"rewards/env_reward/std": 1.8071939945220947,
|
|
"rewards/format_valid/mean": -0.625,
|
|
"rewards/format_valid/std": 1.6007810831069946,
|
|
"step": 31
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.5,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 22.0,
|
|
"completions/mean_length": 23.25,
|
|
"completions/mean_terminated_length": 14.5,
|
|
"completions/min_length": 7.0,
|
|
"completions/min_terminated_length": 7.0,
|
|
"epoch": 0.016,
|
|
"frac_reward_zero_std": 1.0,
|
|
"grad_norm": 5.319654941558838,
|
|
"kl": 0.1096202852204442,
|
|
"learning_rate": 3.1e-05,
|
|
"loss": 0.0044,
|
|
"num_tokens": 80130.0,
|
|
"reward": -4.949999809265137,
|
|
"reward_std": 0.0,
|
|
"rewards/action_legal/mean": -1.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": -3.0,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": -2.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 32
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.5,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 20.0,
|
|
"completions/mean_length": 24.25,
|
|
"completions/mean_terminated_length": 16.5,
|
|
"completions/min_length": 13.0,
|
|
"completions/min_terminated_length": 13.0,
|
|
"epoch": 0.0165,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 28.577579498291016,
|
|
"kl": 0.04399943072348833,
|
|
"learning_rate": 3.2000000000000005e-05,
|
|
"loss": 0.0018,
|
|
"num_tokens": 82627.0,
|
|
"reward": -3.7981131076812744,
|
|
"reward_std": 2.3037734031677246,
|
|
"rewards/action_legal/mean": -0.75,
|
|
"rewards/action_legal/std": 0.5,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": -2.261242151260376,
|
|
"rewards/env_reward/std": 1.477515697479248,
|
|
"rewards/format_valid/mean": -1.375,
|
|
"rewards/format_valid/std": 1.25,
|
|
"step": 33
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.25,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 21.0,
|
|
"completions/mean_length": 20.5,
|
|
"completions/mean_terminated_length": 16.666667938232422,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.017,
|
|
"frac_reward_zero_std": 1.0,
|
|
"grad_norm": 3.7505602836608887,
|
|
"kl": 0.04482424072921276,
|
|
"learning_rate": 3.3e-05,
|
|
"loss": 0.0018,
|
|
"num_tokens": 85109.0,
|
|
"reward": -4.949999809265137,
|
|
"reward_std": 0.0,
|
|
"rewards/action_legal/mean": -1.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": -3.0,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": -2.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 34
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 18.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 18.0,
|
|
"completions/max_terminated_length": 18.0,
|
|
"completions/mean_length": 16.25,
|
|
"completions/mean_terminated_length": 16.25,
|
|
"completions/min_length": 11.0,
|
|
"completions/min_terminated_length": 11.0,
|
|
"epoch": 0.0175,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 20.112499237060547,
|
|
"kl": 0.0021229138001217507,
|
|
"learning_rate": 3.4000000000000007e-05,
|
|
"loss": 0.0001,
|
|
"num_tokens": 87574.0,
|
|
"reward": 0.1572304666042328,
|
|
"reward_std": 0.04570581018924713,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.014735294505953789,
|
|
"rewards/belief_accuracy/std": 0.09565715491771698,
|
|
"rewards/env_reward/mean": 0.10095755755901337,
|
|
"rewards/env_reward/std": 0.20054571330547333,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 35
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.5,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 25.0,
|
|
"completions/mean_length": 24.5,
|
|
"completions/mean_terminated_length": 17.0,
|
|
"completions/min_length": 9.0,
|
|
"completions/min_terminated_length": 9.0,
|
|
"epoch": 0.018,
|
|
"frac_reward_zero_std": 1.0,
|
|
"grad_norm": 0.24909250438213348,
|
|
"kl": 0.024185666348785162,
|
|
"learning_rate": 3.5e-05,
|
|
"loss": 0.001,
|
|
"num_tokens": 90072.0,
|
|
"reward": -4.949999809265137,
|
|
"reward_std": 0.0,
|
|
"rewards/action_legal/mean": -1.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": -3.0,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": -2.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 36
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.75,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 27.0,
|
|
"completions/mean_length": 30.75,
|
|
"completions/mean_terminated_length": 27.0,
|
|
"completions/min_length": 27.0,
|
|
"completions/min_terminated_length": 27.0,
|
|
"epoch": 0.0185,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 11.509799003601074,
|
|
"kl": 0.01711271144449711,
|
|
"learning_rate": 3.6e-05,
|
|
"loss": 0.0007,
|
|
"num_tokens": 92595.0,
|
|
"reward": -3.6846251487731934,
|
|
"reward_std": 2.530749559402466,
|
|
"rewards/action_legal/mean": -0.75,
|
|
"rewards/action_legal/std": 0.5,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": -2.1855833530426025,
|
|
"rewards/env_reward/std": 1.6288331747055054,
|
|
"rewards/format_valid/mean": -1.375,
|
|
"rewards/format_valid/std": 1.25,
|
|
"step": 37
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 29.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 29.0,
|
|
"completions/max_terminated_length": 29.0,
|
|
"completions/mean_length": 22.5,
|
|
"completions/mean_terminated_length": 22.5,
|
|
"completions/min_length": 13.0,
|
|
"completions/min_terminated_length": 13.0,
|
|
"epoch": 0.019,
|
|
"frac_reward_zero_std": 1.0,
|
|
"grad_norm": 0.46738389134407043,
|
|
"kl": 0.012128827278502285,
|
|
"learning_rate": 3.7e-05,
|
|
"loss": 0.0005,
|
|
"num_tokens": 95085.0,
|
|
"reward": -4.949999809265137,
|
|
"reward_std": 0.0,
|
|
"rewards/action_legal/mean": -1.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": -3.0,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": -2.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 38
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.25,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 32.0,
|
|
"completions/mean_length": 27.5,
|
|
"completions/mean_terminated_length": 26.0,
|
|
"completions/min_length": 17.0,
|
|
"completions/min_terminated_length": 17.0,
|
|
"epoch": 0.0195,
|
|
"frac_reward_zero_std": 1.0,
|
|
"grad_norm": 2.194053888320923,
|
|
"kl": 0.039654724299907684,
|
|
"learning_rate": 3.8e-05,
|
|
"loss": 0.0016,
|
|
"num_tokens": 97595.0,
|
|
"reward": -4.949999809265137,
|
|
"reward_std": 0.0,
|
|
"rewards/action_legal/mean": -1.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": -3.0,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": -2.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 39
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.25,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 25.0,
|
|
"completions/mean_length": 24.5,
|
|
"completions/mean_terminated_length": 22.0,
|
|
"completions/min_length": 18.0,
|
|
"completions/min_terminated_length": 18.0,
|
|
"epoch": 0.02,
|
|
"frac_reward_zero_std": 1.0,
|
|
"grad_norm": 0.2551957964897156,
|
|
"kl": 0.02670608414337039,
|
|
"learning_rate": 3.9000000000000006e-05,
|
|
"loss": 0.0011,
|
|
"num_tokens": 100093.0,
|
|
"reward": -4.949999809265137,
|
|
"reward_std": 0.0,
|
|
"rewards/action_legal/mean": -1.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": -3.0,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": -2.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 40
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.25,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 32.0,
|
|
"completions/mean_length": 29.75,
|
|
"completions/mean_terminated_length": 29.0,
|
|
"completions/min_length": 26.0,
|
|
"completions/min_terminated_length": 26.0,
|
|
"epoch": 0.0205,
|
|
"frac_reward_zero_std": 1.0,
|
|
"grad_norm": 0.4345109760761261,
|
|
"kl": 0.010095613077282906,
|
|
"learning_rate": 4e-05,
|
|
"loss": 0.0004,
|
|
"num_tokens": 102612.0,
|
|
"reward": -4.949999809265137,
|
|
"reward_std": 0.0,
|
|
"rewards/action_legal/mean": -1.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": -3.0,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": -2.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 41
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.25,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 25.0,
|
|
"completions/mean_length": 21.75,
|
|
"completions/mean_terminated_length": 18.33333396911621,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.021,
|
|
"frac_reward_zero_std": 1.0,
|
|
"grad_norm": 0.6849669218063354,
|
|
"kl": 0.08905280428007245,
|
|
"learning_rate": 4.1e-05,
|
|
"loss": 0.0036,
|
|
"num_tokens": 105099.0,
|
|
"reward": -4.949999809265137,
|
|
"reward_std": 0.0,
|
|
"rewards/action_legal/mean": -1.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": -3.0,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": -2.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 42
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.5,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 20.0,
|
|
"completions/mean_length": 24.0,
|
|
"completions/mean_terminated_length": 16.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 0.0215,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 113.153564453125,
|
|
"kl": 0.12807448720559478,
|
|
"learning_rate": 4.2e-05,
|
|
"loss": 0.0051,
|
|
"num_tokens": 107595.0,
|
|
"reward": -2.9136834144592285,
|
|
"reward_std": 2.423197031021118,
|
|
"rewards/action_legal/mean": -0.5,
|
|
"rewards/action_legal/std": 0.5773502588272095,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": -1.700788974761963,
|
|
"rewards/env_reward/std": 1.5501903295516968,
|
|
"rewards/format_valid/mean": -0.75,
|
|
"rewards/format_valid/std": 1.4433757066726685,
|
|
"step": 43
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 1.0,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 0.0,
|
|
"completions/mean_length": 32.0,
|
|
"completions/mean_terminated_length": 0.0,
|
|
"completions/min_length": 32.0,
|
|
"completions/min_terminated_length": 0.0,
|
|
"epoch": 0.022,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 3.1417529582977295,
|
|
"kl": 0.05178070580586791,
|
|
"learning_rate": 4.3e-05,
|
|
"loss": 0.0021,
|
|
"num_tokens": 110123.0,
|
|
"reward": -3.766486167907715,
|
|
"reward_std": 2.3670270442962646,
|
|
"rewards/action_legal/mean": -0.75,
|
|
"rewards/action_legal/std": 0.5,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": -2.2401576042175293,
|
|
"rewards/env_reward/std": 1.5196847915649414,
|
|
"rewards/format_valid/mean": -1.375,
|
|
"rewards/format_valid/std": 1.25,
|
|
"step": 44
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.25,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 18.0,
|
|
"completions/mean_length": 19.0,
|
|
"completions/mean_terminated_length": 14.666666984558105,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.0225,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 31.787551879882812,
|
|
"kl": 0.1364445798099041,
|
|
"learning_rate": 4.4000000000000006e-05,
|
|
"loss": 0.0055,
|
|
"num_tokens": 112599.0,
|
|
"reward": -2.7055277824401855,
|
|
"reward_std": 2.6139395236968994,
|
|
"rewards/action_legal/mean": -0.5,
|
|
"rewards/action_legal/std": 0.5773502588272095,
|
|
"rewards/belief_accuracy/mean": -0.12398147583007812,
|
|
"rewards/belief_accuracy/std": 0.04796295985579491,
|
|
"rewards/env_reward/mean": -1.5182223320007324,
|
|
"rewards/env_reward/std": 1.736833095550537,
|
|
"rewards/format_valid/mean": -0.625,
|
|
"rewards/format_valid/std": 1.6007810831069946,
|
|
"step": 45
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.5,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 29.0,
|
|
"completions/mean_length": 26.25,
|
|
"completions/mean_terminated_length": 20.5,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 0.023,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 809.3003540039062,
|
|
"kl": 1.457309697754681,
|
|
"learning_rate": 4.5e-05,
|
|
"loss": 0.0583,
|
|
"num_tokens": 115104.0,
|
|
"reward": -2.267341136932373,
|
|
"reward_std": 3.0976674556732178,
|
|
"rewards/action_legal/mean": -0.5,
|
|
"rewards/action_legal/std": 0.5773502588272095,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": -1.2698941230773926,
|
|
"rewards/env_reward/std": 1.9977540969848633,
|
|
"rewards/format_valid/mean": -0.75,
|
|
"rewards/format_valid/std": 1.4433757066726685,
|
|
"step": 46
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.25,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 20.0,
|
|
"completions/mean_length": 19.0,
|
|
"completions/mean_terminated_length": 14.666666984558105,
|
|
"completions/min_length": 11.0,
|
|
"completions/min_terminated_length": 11.0,
|
|
"epoch": 0.0235,
|
|
"frac_reward_zero_std": 1.0,
|
|
"grad_norm": 17.53619384765625,
|
|
"kl": 1.0537898712791502,
|
|
"learning_rate": 4.600000000000001e-05,
|
|
"loss": 0.0422,
|
|
"num_tokens": 117580.0,
|
|
"reward": -4.949999809265137,
|
|
"reward_std": 0.0,
|
|
"rewards/action_legal/mean": -1.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": -3.0,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": -2.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 47
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.5,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 20.0,
|
|
"completions/mean_length": 23.25,
|
|
"completions/mean_terminated_length": 14.5,
|
|
"completions/min_length": 9.0,
|
|
"completions/min_terminated_length": 9.0,
|
|
"epoch": 0.024,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 187.8842010498047,
|
|
"kl": 0.3121867855079472,
|
|
"learning_rate": 4.7e-05,
|
|
"loss": 0.0125,
|
|
"num_tokens": 120073.0,
|
|
"reward": -3.6274335384368896,
|
|
"reward_std": 2.645132541656494,
|
|
"rewards/action_legal/mean": -0.75,
|
|
"rewards/action_legal/std": 0.5,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": -2.1474556922912598,
|
|
"rewards/env_reward/std": 1.705088496208191,
|
|
"rewards/format_valid/mean": -1.375,
|
|
"rewards/format_valid/std": 1.25,
|
|
"step": 48
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.5,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 27.0,
|
|
"completions/mean_length": 28.75,
|
|
"completions/mean_terminated_length": 25.5,
|
|
"completions/min_length": 24.0,
|
|
"completions/min_terminated_length": 24.0,
|
|
"epoch": 0.0245,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 18.377052307128906,
|
|
"kl": 0.10642453748732805,
|
|
"learning_rate": 4.8e-05,
|
|
"loss": 0.0043,
|
|
"num_tokens": 122588.0,
|
|
"reward": -3.7050957679748535,
|
|
"reward_std": 2.4898080825805664,
|
|
"rewards/action_legal/mean": -0.75,
|
|
"rewards/action_legal/std": 0.5,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": -2.199230432510376,
|
|
"rewards/env_reward/std": 1.6015390157699585,
|
|
"rewards/format_valid/mean": -1.375,
|
|
"rewards/format_valid/std": 1.25,
|
|
"step": 49
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.75,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 27.0,
|
|
"completions/mean_length": 30.75,
|
|
"completions/mean_terminated_length": 27.0,
|
|
"completions/min_length": 27.0,
|
|
"completions/min_terminated_length": 27.0,
|
|
"epoch": 0.025,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 8.019148826599121,
|
|
"kl": 0.056871576234698296,
|
|
"learning_rate": 4.9e-05,
|
|
"loss": 0.0023,
|
|
"num_tokens": 125111.0,
|
|
"reward": -3.6691508293151855,
|
|
"reward_std": 2.5616979598999023,
|
|
"rewards/action_legal/mean": -0.75,
|
|
"rewards/action_legal/std": 0.5,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": -2.175267219543457,
|
|
"rewards/env_reward/std": 1.6494653224945068,
|
|
"rewards/format_valid/mean": -1.375,
|
|
"rewards/format_valid/std": 1.25,
|
|
"step": 50
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.5,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 27.0,
|
|
"completions/mean_length": 29.25,
|
|
"completions/mean_terminated_length": 26.5,
|
|
"completions/min_length": 26.0,
|
|
"completions/min_terminated_length": 26.0,
|
|
"epoch": 0.0255,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 13.499008178710938,
|
|
"kl": 0.0715335039421916,
|
|
"learning_rate": 5e-05,
|
|
"loss": 0.0029,
|
|
"num_tokens": 127628.0,
|
|
"reward": -2.420839786529541,
|
|
"reward_std": 2.920422315597534,
|
|
"rewards/action_legal/mean": -0.5,
|
|
"rewards/action_legal/std": 0.5773502588272095,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": -1.372226595878601,
|
|
"rewards/env_reward/std": 1.8795907497406006,
|
|
"rewards/format_valid/mean": -0.75,
|
|
"rewards/format_valid/std": 1.4433757066726685,
|
|
"step": 51
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 1.0,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 0.0,
|
|
"completions/mean_length": 32.0,
|
|
"completions/mean_terminated_length": 0.0,
|
|
"completions/min_length": 32.0,
|
|
"completions/min_terminated_length": 0.0,
|
|
"epoch": 0.026,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 5.560655117034912,
|
|
"kl": 0.017654206603765488,
|
|
"learning_rate": 4.9888888888888894e-05,
|
|
"loss": 0.0007,
|
|
"num_tokens": 130156.0,
|
|
"reward": -4.051011085510254,
|
|
"reward_std": 1.7979769706726074,
|
|
"rewards/action_legal/mean": -0.75,
|
|
"rewards/action_legal/std": 0.5,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": -2.4298410415649414,
|
|
"rewards/env_reward/std": 1.1403180360794067,
|
|
"rewards/format_valid/mean": -1.375,
|
|
"rewards/format_valid/std": 1.25,
|
|
"step": 52
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.25,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 23.0,
|
|
"completions/mean_length": 23.75,
|
|
"completions/mean_terminated_length": 21.0,
|
|
"completions/min_length": 17.0,
|
|
"completions/min_terminated_length": 17.0,
|
|
"epoch": 0.0265,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 18.195871353149414,
|
|
"kl": 0.24096931191161275,
|
|
"learning_rate": 4.977777777777778e-05,
|
|
"loss": 0.0096,
|
|
"num_tokens": 132651.0,
|
|
"reward": -2.6885905265808105,
|
|
"reward_std": 2.6208035945892334,
|
|
"rewards/action_legal/mean": -0.5,
|
|
"rewards/action_legal/std": 0.5773502588272095,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": -1.5507268905639648,
|
|
"rewards/env_reward/std": 1.6801002025604248,
|
|
"rewards/format_valid/mean": -0.75,
|
|
"rewards/format_valid/std": 1.4433757066726685,
|
|
"step": 53
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.5,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 24.0,
|
|
"completions/mean_length": 24.75,
|
|
"completions/mean_terminated_length": 17.5,
|
|
"completions/min_length": 11.0,
|
|
"completions/min_terminated_length": 11.0,
|
|
"epoch": 0.027,
|
|
"frac_reward_zero_std": 1.0,
|
|
"grad_norm": 123.76203155517578,
|
|
"kl": 5.633732934948057,
|
|
"learning_rate": 4.966666666666667e-05,
|
|
"loss": 0.2253,
|
|
"num_tokens": 135150.0,
|
|
"reward": -4.949999809265137,
|
|
"reward_std": 0.0,
|
|
"rewards/action_legal/mean": -1.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": -3.0,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": -2.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 54
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.5,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 28.0,
|
|
"completions/mean_length": 29.75,
|
|
"completions/mean_terminated_length": 27.5,
|
|
"completions/min_length": 27.0,
|
|
"completions/min_terminated_length": 27.0,
|
|
"epoch": 0.0275,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 11.095720291137695,
|
|
"kl": 0.04261765070259571,
|
|
"learning_rate": 4.955555555555556e-05,
|
|
"loss": 0.0017,
|
|
"num_tokens": 137669.0,
|
|
"reward": -3.5939033031463623,
|
|
"reward_std": 2.712193012237549,
|
|
"rewards/action_legal/mean": -0.75,
|
|
"rewards/action_legal/std": 0.5,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": -2.1251022815704346,
|
|
"rewards/env_reward/std": 1.7497954368591309,
|
|
"rewards/format_valid/mean": -1.375,
|
|
"rewards/format_valid/std": 1.25,
|
|
"step": 55
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 7.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 7.0,
|
|
"completions/max_terminated_length": 7.0,
|
|
"completions/mean_length": 6.5,
|
|
"completions/mean_terminated_length": 6.5,
|
|
"completions/min_length": 6.0,
|
|
"completions/min_terminated_length": 6.0,
|
|
"epoch": 0.028,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 7.206400394439697,
|
|
"kl": 0.0018149956013076007,
|
|
"learning_rate": 4.9444444444444446e-05,
|
|
"loss": 0.0001,
|
|
"num_tokens": 140095.0,
|
|
"reward": -0.6511554718017578,
|
|
"reward_std": 0.4664153754711151,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": -0.2507702708244324,
|
|
"rewards/env_reward/std": 0.3109435439109802,
|
|
"rewards/format_valid/mean": 0.5,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 56
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.25,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 20.0,
|
|
"completions/mean_length": 21.25,
|
|
"completions/mean_terminated_length": 17.666667938232422,
|
|
"completions/min_length": 16.0,
|
|
"completions/min_terminated_length": 16.0,
|
|
"epoch": 0.0285,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 9.72509479522705,
|
|
"kl": 0.03556834487244487,
|
|
"learning_rate": 4.933333333333334e-05,
|
|
"loss": 0.0014,
|
|
"num_tokens": 142580.0,
|
|
"reward": -1.1361982822418213,
|
|
"reward_std": 2.5431196689605713,
|
|
"rewards/action_legal/mean": -0.25,
|
|
"rewards/action_legal/std": 0.5,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": -0.5449655055999756,
|
|
"rewards/env_reward/std": 1.6370937824249268,
|
|
"rewards/format_valid/mean": -0.125,
|
|
"rewards/format_valid/std": 1.25,
|
|
"step": 57
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.25,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 19.0,
|
|
"completions/mean_length": 18.25,
|
|
"completions/mean_terminated_length": 13.666666984558105,
|
|
"completions/min_length": 9.0,
|
|
"completions/min_terminated_length": 9.0,
|
|
"epoch": 0.029,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 69.71238708496094,
|
|
"kl": 0.12905889004468918,
|
|
"learning_rate": 4.922222222222222e-05,
|
|
"loss": 0.0052,
|
|
"num_tokens": 145053.0,
|
|
"reward": -3.697530746459961,
|
|
"reward_std": 2.5049378871917725,
|
|
"rewards/action_legal/mean": -0.75,
|
|
"rewards/action_legal/std": 0.5,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": -2.1941874027252197,
|
|
"rewards/env_reward/std": 1.61162531375885,
|
|
"rewards/format_valid/mean": -1.375,
|
|
"rewards/format_valid/std": 1.25,
|
|
"step": 58
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 18.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 18.0,
|
|
"completions/max_terminated_length": 18.0,
|
|
"completions/mean_length": 15.5,
|
|
"completions/mean_terminated_length": 15.5,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.0295,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 4.306075572967529,
|
|
"kl": 0.05293075350346044,
|
|
"learning_rate": 4.9111111111111114e-05,
|
|
"loss": 0.0021,
|
|
"num_tokens": 147515.0,
|
|
"reward": -0.8994538187980652,
|
|
"reward_std": 0.14630256593227386,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.165370374917984,
|
|
"rewards/belief_accuracy/std": 0.04776628687977791,
|
|
"rewards/env_reward/mean": -0.30222848057746887,
|
|
"rewards/env_reward/std": 0.13543139398097992,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 59
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 9.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 9.0,
|
|
"completions/max_terminated_length": 9.0,
|
|
"completions/mean_length": 8.75,
|
|
"completions/mean_terminated_length": 8.75,
|
|
"completions/min_length": 8.0,
|
|
"completions/min_terminated_length": 8.0,
|
|
"epoch": 0.03,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 5.006049633026123,
|
|
"kl": 0.024995889314595843,
|
|
"learning_rate": 4.9e-05,
|
|
"loss": 0.001,
|
|
"num_tokens": 149950.0,
|
|
"reward": -0.2717297375202179,
|
|
"reward_std": 0.27656516432762146,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.03703703731298447,
|
|
"rewards/belief_accuracy/std": 0.04781460762023926,
|
|
"rewards/env_reward/mean": -0.14041242003440857,
|
|
"rewards/env_reward/std": 0.21106119453907013,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 60
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.25,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 27.0,
|
|
"completions/mean_length": 20.75,
|
|
"completions/mean_terminated_length": 17.0,
|
|
"completions/min_length": 7.0,
|
|
"completions/min_terminated_length": 7.0,
|
|
"epoch": 0.0305,
|
|
"frac_reward_zero_std": 1.0,
|
|
"grad_norm": 1.1231383085250854,
|
|
"kl": 0.25201990082859993,
|
|
"learning_rate": 4.888888888888889e-05,
|
|
"loss": 0.0101,
|
|
"num_tokens": 152433.0,
|
|
"reward": -4.949999809265137,
|
|
"reward_std": 0.0,
|
|
"rewards/action_legal/mean": -1.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": -3.0,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": -2.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 61
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 30.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 30.0,
|
|
"completions/max_terminated_length": 30.0,
|
|
"completions/mean_length": 17.0,
|
|
"completions/mean_terminated_length": 17.0,
|
|
"completions/min_length": 9.0,
|
|
"completions/min_terminated_length": 9.0,
|
|
"epoch": 0.031,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 14.847359657287598,
|
|
"kl": 0.3126356555148959,
|
|
"learning_rate": 4.8777777777777775e-05,
|
|
"loss": 0.0125,
|
|
"num_tokens": 154901.0,
|
|
"reward": -1.7808257341384888,
|
|
"reward_std": 3.659447193145752,
|
|
"rewards/action_legal/mean": -0.5,
|
|
"rewards/action_legal/std": 0.5773502588272095,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": -0.9455505609512329,
|
|
"rewards/env_reward/std": 2.372274160385132,
|
|
"rewards/format_valid/mean": -0.75,
|
|
"rewards/format_valid/std": 1.4433757066726685,
|
|
"step": 62
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.5,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 19.0,
|
|
"completions/mean_length": 24.5,
|
|
"completions/mean_terminated_length": 17.0,
|
|
"completions/min_length": 15.0,
|
|
"completions/min_terminated_length": 15.0,
|
|
"epoch": 0.0315,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 11.686541557312012,
|
|
"kl": 0.153579062782228,
|
|
"learning_rate": 4.866666666666667e-05,
|
|
"loss": 0.0061,
|
|
"num_tokens": 157399.0,
|
|
"reward": -2.4691736698150635,
|
|
"reward_std": 2.869215488433838,
|
|
"rewards/action_legal/mean": -0.5,
|
|
"rewards/action_legal/std": 0.5773502588272095,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": -1.404449224472046,
|
|
"rewards/env_reward/std": 1.845564842224121,
|
|
"rewards/format_valid/mean": -0.75,
|
|
"rewards/format_valid/std": 1.4433757066726685,
|
|
"step": 63
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.25,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 23.0,
|
|
"completions/mean_length": 23.25,
|
|
"completions/mean_terminated_length": 20.33333396911621,
|
|
"completions/min_length": 17.0,
|
|
"completions/min_terminated_length": 17.0,
|
|
"epoch": 0.032,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 92.96546936035156,
|
|
"kl": 0.2878706678748131,
|
|
"learning_rate": 4.855555555555556e-05,
|
|
"loss": 0.0115,
|
|
"num_tokens": 159892.0,
|
|
"reward": -3.7632439136505127,
|
|
"reward_std": 2.373511791229248,
|
|
"rewards/action_legal/mean": -0.75,
|
|
"rewards/action_legal/std": 0.5,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": -2.2379961013793945,
|
|
"rewards/env_reward/std": 1.5240079164505005,
|
|
"rewards/format_valid/mean": -1.375,
|
|
"rewards/format_valid/std": 1.25,
|
|
"step": 64
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.25,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 32.0,
|
|
"completions/mean_length": 27.5,
|
|
"completions/mean_terminated_length": 26.0,
|
|
"completions/min_length": 19.0,
|
|
"completions/min_terminated_length": 19.0,
|
|
"epoch": 0.0325,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 36.191368103027344,
|
|
"kl": 0.35182441864162683,
|
|
"learning_rate": 4.844444444444445e-05,
|
|
"loss": 0.0141,
|
|
"num_tokens": 162402.0,
|
|
"reward": -2.2611498832702637,
|
|
"reward_std": 3.114436149597168,
|
|
"rewards/action_legal/mean": -0.5,
|
|
"rewards/action_legal/std": 0.5773502588272095,
|
|
"rewards/belief_accuracy/mean": -0.10736748576164246,
|
|
"rewards/belief_accuracy/std": 0.014734972268342972,
|
|
"rewards/env_reward/mean": -1.255198359489441,
|
|
"rewards/env_reward/std": 2.0199925899505615,
|
|
"rewards/format_valid/mean": -0.625,
|
|
"rewards/format_valid/std": 1.6007810831069946,
|
|
"step": 65
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.25,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 20.0,
|
|
"completions/mean_length": 16.75,
|
|
"completions/mean_terminated_length": 11.666666984558105,
|
|
"completions/min_length": 6.0,
|
|
"completions/min_terminated_length": 6.0,
|
|
"epoch": 0.033,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 228884.828125,
|
|
"kl": 468.09647609852254,
|
|
"learning_rate": 4.8333333333333334e-05,
|
|
"loss": 18.7239,
|
|
"num_tokens": 164869.0,
|
|
"reward": -3.877704381942749,
|
|
"reward_std": 2.1445908546447754,
|
|
"rewards/action_legal/mean": -0.75,
|
|
"rewards/action_legal/std": 0.5,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": -2.314302921295166,
|
|
"rewards/env_reward/std": 1.3713939189910889,
|
|
"rewards/format_valid/mean": -1.375,
|
|
"rewards/format_valid/std": 1.25,
|
|
"step": 66
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.75,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 22.0,
|
|
"completions/mean_length": 29.5,
|
|
"completions/mean_terminated_length": 22.0,
|
|
"completions/min_length": 22.0,
|
|
"completions/min_terminated_length": 22.0,
|
|
"epoch": 0.0335,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 4.94432258605957,
|
|
"kl": 0.21442949026823044,
|
|
"learning_rate": 4.8222222222222225e-05,
|
|
"loss": 0.0086,
|
|
"num_tokens": 167387.0,
|
|
"reward": -3.75144624710083,
|
|
"reward_std": 2.397106885910034,
|
|
"rewards/action_legal/mean": -0.75,
|
|
"rewards/action_legal/std": 0.5,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": -2.230130910873413,
|
|
"rewards/env_reward/std": 1.5397380590438843,
|
|
"rewards/format_valid/mean": -1.375,
|
|
"rewards/format_valid/std": 1.25,
|
|
"step": 67
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.25,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 9.0,
|
|
"completions/mean_length": 14.25,
|
|
"completions/mean_terminated_length": 8.333333969116211,
|
|
"completions/min_length": 8.0,
|
|
"completions/min_terminated_length": 8.0,
|
|
"epoch": 0.034,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 10.818193435668945,
|
|
"kl": 0.9112066635861993,
|
|
"learning_rate": 4.811111111111111e-05,
|
|
"loss": 0.0364,
|
|
"num_tokens": 169620.0,
|
|
"reward": 0.4229079484939575,
|
|
"reward_std": 0.2314292937517166,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.0,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": 0.24860529601573944,
|
|
"rewards/env_reward/std": 0.154286190867424,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 68
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.25,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 15.0,
|
|
"completions/mean_length": 18.0,
|
|
"completions/mean_terminated_length": 13.333333969116211,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 0.0345,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 25.480619430541992,
|
|
"kl": 2.5357193499803543,
|
|
"learning_rate": 4.8e-05,
|
|
"loss": 0.1014,
|
|
"num_tokens": 172092.0,
|
|
"reward": -2.201890468597412,
|
|
"reward_std": 3.173243284225464,
|
|
"rewards/action_legal/mean": -0.5,
|
|
"rewards/action_legal/std": 0.5773502588272095,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": -1.2262604236602783,
|
|
"rewards/env_reward/std": 2.04813814163208,
|
|
"rewards/format_valid/mean": -0.75,
|
|
"rewards/format_valid/std": 1.4433757066726685,
|
|
"step": 69
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.25,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 30.0,
|
|
"completions/mean_length": 25.0,
|
|
"completions/mean_terminated_length": 22.666667938232422,
|
|
"completions/min_length": 13.0,
|
|
"completions/min_terminated_length": 13.0,
|
|
"epoch": 0.035,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 8.468518257141113,
|
|
"kl": 0.5803861692547798,
|
|
"learning_rate": 4.7888888888888886e-05,
|
|
"loss": 0.0232,
|
|
"num_tokens": 174592.0,
|
|
"reward": -2.594465732574463,
|
|
"reward_std": 2.7201738357543945,
|
|
"rewards/action_legal/mean": -0.5,
|
|
"rewards/action_legal/std": 0.5773502588272095,
|
|
"rewards/belief_accuracy/mean": -0.0878773033618927,
|
|
"rewards/belief_accuracy/std": 0.024245386943221092,
|
|
"rewards/env_reward/mean": -1.5163891315460205,
|
|
"rewards/env_reward/std": 1.7145698070526123,
|
|
"rewards/format_valid/mean": -0.625,
|
|
"rewards/format_valid/std": 1.6007810831069946,
|
|
"step": 70
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.25,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 31.0,
|
|
"completions/mean_length": 25.0,
|
|
"completions/mean_terminated_length": 22.666667938232422,
|
|
"completions/min_length": 11.0,
|
|
"completions/min_terminated_length": 11.0,
|
|
"epoch": 0.0355,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 46.88726806640625,
|
|
"kl": 0.7163544222712517,
|
|
"learning_rate": 4.7777777777777784e-05,
|
|
"loss": 0.0287,
|
|
"num_tokens": 177092.0,
|
|
"reward": -2.0240089893341064,
|
|
"reward_std": 3.3790602684020996,
|
|
"rewards/action_legal/mean": -0.5,
|
|
"rewards/action_legal/std": 0.5773502588272095,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": -1.1076725721359253,
|
|
"rewards/env_reward/std": 2.1853580474853516,
|
|
"rewards/format_valid/mean": -0.75,
|
|
"rewards/format_valid/std": 1.4433757066726685,
|
|
"step": 71
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.25,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 26.0,
|
|
"completions/mean_length": 22.25,
|
|
"completions/mean_terminated_length": 19.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 0.036,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 4.72477388381958,
|
|
"kl": 0.7021452663466334,
|
|
"learning_rate": 4.766666666666667e-05,
|
|
"loss": 0.0281,
|
|
"num_tokens": 179581.0,
|
|
"reward": -1.4259536266326904,
|
|
"reward_std": 2.3681116104125977,
|
|
"rewards/action_legal/mean": -0.25,
|
|
"rewards/action_legal/std": 0.5,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": -0.7381357550621033,
|
|
"rewards/env_reward/std": 1.5208872556686401,
|
|
"rewards/format_valid/mean": -0.125,
|
|
"rewards/format_valid/std": 1.25,
|
|
"step": 72
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.5,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 16.0,
|
|
"completions/mean_length": 23.75,
|
|
"completions/mean_terminated_length": 15.5,
|
|
"completions/min_length": 15.0,
|
|
"completions/min_terminated_length": 15.0,
|
|
"epoch": 0.0365,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 26.196685791015625,
|
|
"kl": 0.6366847828030586,
|
|
"learning_rate": 4.755555555555556e-05,
|
|
"loss": 0.0255,
|
|
"num_tokens": 182076.0,
|
|
"reward": -1.664202332496643,
|
|
"reward_std": 2.311755418777466,
|
|
"rewards/action_legal/mean": -0.25,
|
|
"rewards/action_legal/std": 0.5,
|
|
"rewards/belief_accuracy/mean": -0.08500000089406967,
|
|
"rewards/belief_accuracy/std": 0.030000001192092896,
|
|
"rewards/env_reward/mean": -0.9311348795890808,
|
|
"rewards/env_reward/std": 1.4874457120895386,
|
|
"rewards/format_valid/mean": 0.0,
|
|
"rewards/format_valid/std": 1.3540064096450806,
|
|
"step": 73
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 32.0,
|
|
"completions/mean_length": 17.0,
|
|
"completions/mean_terminated_length": 17.0,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.037,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 38.14860916137695,
|
|
"kl": 1.0752212293446064,
|
|
"learning_rate": 4.7444444444444445e-05,
|
|
"loss": 0.043,
|
|
"num_tokens": 184544.0,
|
|
"reward": -1.49713134765625,
|
|
"reward_std": 2.301912307739258,
|
|
"rewards/action_legal/mean": -0.25,
|
|
"rewards/action_legal/std": 0.5,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": -0.7855876684188843,
|
|
"rewards/env_reward/std": 1.476274847984314,
|
|
"rewards/format_valid/mean": -0.125,
|
|
"rewards/format_valid/std": 1.25,
|
|
"step": 74
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 1.0,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 0.0,
|
|
"completions/mean_length": 32.0,
|
|
"completions/mean_terminated_length": 0.0,
|
|
"completions/min_length": 32.0,
|
|
"completions/min_terminated_length": 0.0,
|
|
"epoch": 0.0375,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 3.1271560192108154,
|
|
"kl": 0.22124752588570118,
|
|
"learning_rate": 4.7333333333333336e-05,
|
|
"loss": 0.0088,
|
|
"num_tokens": 187072.0,
|
|
"reward": -2.7508177757263184,
|
|
"reward_std": 2.5399067401885986,
|
|
"rewards/action_legal/mean": -0.5,
|
|
"rewards/action_legal/std": 0.5773502588272095,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": -1.5922119617462158,
|
|
"rewards/env_reward/std": 1.6259276866912842,
|
|
"rewards/format_valid/mean": -0.75,
|
|
"rewards/format_valid/std": 1.4433757066726685,
|
|
"step": 75
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 1.0,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 0.0,
|
|
"completions/mean_length": 32.0,
|
|
"completions/mean_terminated_length": 0.0,
|
|
"completions/min_length": 32.0,
|
|
"completions/min_terminated_length": 0.0,
|
|
"epoch": 0.038,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 5.755521774291992,
|
|
"kl": 0.16726691462099552,
|
|
"learning_rate": 4.722222222222222e-05,
|
|
"loss": 0.0067,
|
|
"num_tokens": 189600.0,
|
|
"reward": -3.8879446983337402,
|
|
"reward_std": 2.124109983444214,
|
|
"rewards/action_legal/mean": -0.75,
|
|
"rewards/action_legal/std": 0.5,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": -2.3211300373077393,
|
|
"rewards/env_reward/std": 1.357740044593811,
|
|
"rewards/format_valid/mean": -1.375,
|
|
"rewards/format_valid/std": 1.25,
|
|
"step": 76
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 30.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 30.0,
|
|
"completions/max_terminated_length": 30.0,
|
|
"completions/mean_length": 27.0,
|
|
"completions/mean_terminated_length": 27.0,
|
|
"completions/min_length": 23.0,
|
|
"completions/min_terminated_length": 23.0,
|
|
"epoch": 0.0385,
|
|
"frac_reward_zero_std": 1.0,
|
|
"grad_norm": 0.24942930042743683,
|
|
"kl": 0.789710771292448,
|
|
"learning_rate": 4.711111111111111e-05,
|
|
"loss": 0.0316,
|
|
"num_tokens": 192108.0,
|
|
"reward": -4.949999809265137,
|
|
"reward_std": 0.0,
|
|
"rewards/action_legal/mean": -1.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": -3.0,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": -2.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 77
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.25,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 18.0,
|
|
"completions/mean_length": 16.75,
|
|
"completions/mean_terminated_length": 11.666666984558105,
|
|
"completions/min_length": 8.0,
|
|
"completions/min_terminated_length": 8.0,
|
|
"epoch": 0.039,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 4.9244184494018555,
|
|
"kl": 1.1887651532888412,
|
|
"learning_rate": 4.7e-05,
|
|
"loss": 0.0476,
|
|
"num_tokens": 194575.0,
|
|
"reward": -1.6062259674072266,
|
|
"reward_std": 2.2595274448394775,
|
|
"rewards/action_legal/mean": -0.25,
|
|
"rewards/action_legal/std": 0.5,
|
|
"rewards/belief_accuracy/mean": -0.08573612570762634,
|
|
"rewards/belief_accuracy/std": 0.028527740389108658,
|
|
"rewards/env_reward/mean": -0.8910117149353027,
|
|
"rewards/env_reward/std": 1.4291024208068848,
|
|
"rewards/format_valid/mean": 0.0,
|
|
"rewards/format_valid/std": 1.3540064096450806,
|
|
"step": 78
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.5,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 30.0,
|
|
"completions/mean_length": 26.5,
|
|
"completions/mean_terminated_length": 21.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 0.0395,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 4.589632034301758,
|
|
"kl": 0.914489395916462,
|
|
"learning_rate": 4.6888888888888895e-05,
|
|
"loss": 0.0366,
|
|
"num_tokens": 197081.0,
|
|
"reward": -1.3840163946151733,
|
|
"reward_std": 2.4131362438201904,
|
|
"rewards/action_legal/mean": -0.25,
|
|
"rewards/action_legal/std": 0.5,
|
|
"rewards/belief_accuracy/mean": -0.11666667461395264,
|
|
"rewards/belief_accuracy/std": 0.03333333507180214,
|
|
"rewards/env_reward/mean": -0.681010901927948,
|
|
"rewards/env_reward/std": 1.562245488166809,
|
|
"rewards/format_valid/mean": 0.0,
|
|
"rewards/format_valid/std": 1.3540064096450806,
|
|
"step": 79
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.5,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 25.0,
|
|
"completions/mean_length": 25.75,
|
|
"completions/mean_terminated_length": 19.5,
|
|
"completions/min_length": 14.0,
|
|
"completions/min_terminated_length": 14.0,
|
|
"epoch": 0.04,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 23.94391441345215,
|
|
"kl": 0.7511968985199928,
|
|
"learning_rate": 4.677777777777778e-05,
|
|
"loss": 0.03,
|
|
"num_tokens": 199584.0,
|
|
"reward": -1.7008922100067139,
|
|
"reward_std": 2.489635705947876,
|
|
"rewards/action_legal/mean": -0.25,
|
|
"rewards/action_legal/std": 0.5,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": -0.9214280843734741,
|
|
"rewards/env_reward/std": 1.6092621088027954,
|
|
"rewards/format_valid/mean": -0.125,
|
|
"rewards/format_valid/std": 1.25,
|
|
"step": 80
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 28.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 28.0,
|
|
"completions/max_terminated_length": 28.0,
|
|
"completions/mean_length": 16.5,
|
|
"completions/mean_terminated_length": 16.5,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 0.0405,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 14.980483055114746,
|
|
"kl": 1.2917132005095482,
|
|
"learning_rate": 4.666666666666667e-05,
|
|
"loss": 0.0517,
|
|
"num_tokens": 202050.0,
|
|
"reward": -1.1099207401275635,
|
|
"reward_std": 2.573901891708374,
|
|
"rewards/action_legal/mean": -0.25,
|
|
"rewards/action_legal/std": 0.5,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": -0.5274472236633301,
|
|
"rewards/env_reward/std": 1.6579262018203735,
|
|
"rewards/format_valid/mean": -0.125,
|
|
"rewards/format_valid/std": 1.25,
|
|
"step": 81
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 27.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 27.0,
|
|
"completions/max_terminated_length": 27.0,
|
|
"completions/mean_length": 19.0,
|
|
"completions/mean_terminated_length": 19.0,
|
|
"completions/min_length": 13.0,
|
|
"completions/min_terminated_length": 13.0,
|
|
"epoch": 0.041,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 18.656517028808594,
|
|
"kl": 1.1007941216230392,
|
|
"learning_rate": 4.6555555555555556e-05,
|
|
"loss": 0.044,
|
|
"num_tokens": 204526.0,
|
|
"reward": -0.9679015278816223,
|
|
"reward_std": 2.6547322273254395,
|
|
"rewards/action_legal/mean": -0.25,
|
|
"rewards/action_legal/std": 0.5,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": -0.43276768922805786,
|
|
"rewards/env_reward/std": 1.7114882469177246,
|
|
"rewards/format_valid/mean": -0.125,
|
|
"rewards/format_valid/std": 1.25,
|
|
"step": 82
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.25,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 20.0,
|
|
"completions/mean_length": 19.75,
|
|
"completions/mean_terminated_length": 15.666666984558105,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.0415,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 6.2242937088012695,
|
|
"kl": 1.004544973373413,
|
|
"learning_rate": 4.644444444444445e-05,
|
|
"loss": 0.0402,
|
|
"num_tokens": 207005.0,
|
|
"reward": -2.1635308265686035,
|
|
"reward_std": 3.2175371646881104,
|
|
"rewards/action_legal/mean": -0.5,
|
|
"rewards/action_legal/std": 0.5773502588272095,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": -1.200687289237976,
|
|
"rewards/env_reward/std": 2.077667474746704,
|
|
"rewards/format_valid/mean": -0.75,
|
|
"rewards/format_valid/std": 1.4433757066726685,
|
|
"step": 83
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 24.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 24.0,
|
|
"completions/max_terminated_length": 24.0,
|
|
"completions/mean_length": 17.5,
|
|
"completions/mean_terminated_length": 17.5,
|
|
"completions/min_length": 14.0,
|
|
"completions/min_terminated_length": 14.0,
|
|
"epoch": 0.042,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 14.178193092346191,
|
|
"kl": 0.9078696174547076,
|
|
"learning_rate": 4.633333333333333e-05,
|
|
"loss": 0.0363,
|
|
"num_tokens": 209475.0,
|
|
"reward": 0.19402220845222473,
|
|
"reward_std": 0.2724432051181793,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.00833333283662796,
|
|
"rewards/belief_accuracy/std": 0.10671874135732651,
|
|
"rewards/env_reward/mean": 0.08351479470729828,
|
|
"rewards/env_reward/std": 0.14911670982837677,
|
|
"rewards/format_valid/mean": 0.875,
|
|
"rewards/format_valid/std": 0.25,
|
|
"step": 84
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 26.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 26.0,
|
|
"completions/max_terminated_length": 26.0,
|
|
"completions/mean_length": 20.25,
|
|
"completions/mean_terminated_length": 20.25,
|
|
"completions/min_length": 17.0,
|
|
"completions/min_terminated_length": 17.0,
|
|
"epoch": 0.0425,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 7.778254508972168,
|
|
"kl": 1.0725902691483498,
|
|
"learning_rate": 4.6222222222222224e-05,
|
|
"loss": 0.0429,
|
|
"num_tokens": 211956.0,
|
|
"reward": -0.8444531559944153,
|
|
"reward_std": 2.7435097694396973,
|
|
"rewards/action_legal/mean": -0.25,
|
|
"rewards/action_legal/std": 0.5,
|
|
"rewards/belief_accuracy/mean": -0.0925000011920929,
|
|
"rewards/belief_accuracy/std": 0.015000000596046448,
|
|
"rewards/env_reward/mean": -0.3696354031562805,
|
|
"rewards/env_reward/std": 1.7566334009170532,
|
|
"rewards/format_valid/mean": 0.0,
|
|
"rewards/format_valid/std": 1.3540064096450806,
|
|
"step": 85
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.5,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 20.0,
|
|
"completions/mean_length": 25.5,
|
|
"completions/mean_terminated_length": 19.0,
|
|
"completions/min_length": 18.0,
|
|
"completions/min_terminated_length": 18.0,
|
|
"epoch": 0.043,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 4.498073101043701,
|
|
"kl": 0.7945144101977348,
|
|
"learning_rate": 4.6111111111111115e-05,
|
|
"loss": 0.0318,
|
|
"num_tokens": 214458.0,
|
|
"reward": -0.4258846640586853,
|
|
"reward_std": 0.525246798992157,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.07500000298023224,
|
|
"rewards/belief_accuracy/std": 0.05000000074505806,
|
|
"rewards/env_reward/mean": -0.15475642681121826,
|
|
"rewards/env_reward/std": 0.276262104511261,
|
|
"rewards/format_valid/mean": 0.625,
|
|
"rewards/format_valid/std": 0.25,
|
|
"step": 86
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.5,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 21.0,
|
|
"completions/mean_length": 23.5,
|
|
"completions/mean_terminated_length": 15.0,
|
|
"completions/min_length": 9.0,
|
|
"completions/min_terminated_length": 9.0,
|
|
"epoch": 0.0435,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 5.651465892791748,
|
|
"kl": 1.144854974001646,
|
|
"learning_rate": 4.600000000000001e-05,
|
|
"loss": 0.0458,
|
|
"num_tokens": 216952.0,
|
|
"reward": -2.7960398197174072,
|
|
"reward_std": 2.4879508018493652,
|
|
"rewards/action_legal/mean": -0.5,
|
|
"rewards/action_legal/std": 0.5773502588272095,
|
|
"rewards/belief_accuracy/mean": -0.10818149149417877,
|
|
"rewards/belief_accuracy/std": 0.016362976282835007,
|
|
"rewards/env_reward/mean": -1.610163688659668,
|
|
"rewards/env_reward/std": 1.6060127019882202,
|
|
"rewards/format_valid/mean": -0.625,
|
|
"rewards/format_valid/std": 1.6007810831069946,
|
|
"step": 87
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.25,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 26.0,
|
|
"completions/mean_length": 21.25,
|
|
"completions/mean_terminated_length": 17.666667938232422,
|
|
"completions/min_length": 11.0,
|
|
"completions/min_terminated_length": 11.0,
|
|
"epoch": 0.044,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 4.523915767669678,
|
|
"kl": 1.0343455001711845,
|
|
"learning_rate": 4.588888888888889e-05,
|
|
"loss": 0.0414,
|
|
"num_tokens": 219437.0,
|
|
"reward": -2.3472089767456055,
|
|
"reward_std": 3.005443811416626,
|
|
"rewards/action_legal/mean": -0.5,
|
|
"rewards/action_legal/std": 0.5773502588272095,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": -1.3231394290924072,
|
|
"rewards/env_reward/std": 1.9362717866897583,
|
|
"rewards/format_valid/mean": -0.75,
|
|
"rewards/format_valid/std": 1.4433757066726685,
|
|
"step": 88
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 32.0,
|
|
"completions/mean_length": 16.75,
|
|
"completions/mean_terminated_length": 16.75,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.0445,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 3.352187395095825,
|
|
"kl": 1.272004920989275,
|
|
"learning_rate": 4.577777777777778e-05,
|
|
"loss": 0.0509,
|
|
"num_tokens": 221904.0,
|
|
"reward": -1.3286750316619873,
|
|
"reward_std": 2.4145350456237793,
|
|
"rewards/action_legal/mean": -0.25,
|
|
"rewards/action_legal/std": 0.5,
|
|
"rewards/belief_accuracy/mean": -0.11666667461395264,
|
|
"rewards/belief_accuracy/std": 0.03333333507180214,
|
|
"rewards/env_reward/mean": -0.6441167593002319,
|
|
"rewards/env_reward/std": 1.5708537101745605,
|
|
"rewards/format_valid/mean": 0.0,
|
|
"rewards/format_valid/std": 1.3540064096450806,
|
|
"step": 89
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.25,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 20.0,
|
|
"completions/mean_length": 22.25,
|
|
"completions/mean_terminated_length": 19.0,
|
|
"completions/min_length": 18.0,
|
|
"completions/min_terminated_length": 18.0,
|
|
"epoch": 0.045,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 3.992652654647827,
|
|
"kl": 1.1837435215711594,
|
|
"learning_rate": 4.566666666666667e-05,
|
|
"loss": 0.0473,
|
|
"num_tokens": 224393.0,
|
|
"reward": -0.807397723197937,
|
|
"reward_std": 2.8079702854156494,
|
|
"rewards/action_legal/mean": -0.25,
|
|
"rewards/action_legal/std": 0.5,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": -0.3299318850040436,
|
|
"rewards/env_reward/std": 1.8133907318115234,
|
|
"rewards/format_valid/mean": 0.0,
|
|
"rewards/format_valid/std": 1.3540064096450806,
|
|
"step": 90
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 26.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 26.0,
|
|
"completions/max_terminated_length": 26.0,
|
|
"completions/mean_length": 19.25,
|
|
"completions/mean_terminated_length": 19.25,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.0455,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 5.360654354095459,
|
|
"kl": 1.3510248363018036,
|
|
"learning_rate": 4.555555555555556e-05,
|
|
"loss": 0.054,
|
|
"num_tokens": 226870.0,
|
|
"reward": 0.35237032175064087,
|
|
"reward_std": 1.0852247476577759,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": 0.418246865272522,
|
|
"rewards/env_reward/std": 0.723483145236969,
|
|
"rewards/format_valid/mean": 0.5,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 91
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.25,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 28.0,
|
|
"completions/mean_length": 22.75,
|
|
"completions/mean_terminated_length": 19.666667938232422,
|
|
"completions/min_length": 7.0,
|
|
"completions/min_terminated_length": 7.0,
|
|
"epoch": 0.046,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 3.634124517440796,
|
|
"kl": 0.8390218988060951,
|
|
"learning_rate": 4.5444444444444444e-05,
|
|
"loss": 0.0336,
|
|
"num_tokens": 229361.0,
|
|
"reward": -2.2453417778015137,
|
|
"reward_std": 3.167144298553467,
|
|
"rewards/action_legal/mean": -0.5,
|
|
"rewards/action_legal/std": 0.5773502588272095,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": -1.25522780418396,
|
|
"rewards/env_reward/std": 2.0450401306152344,
|
|
"rewards/format_valid/mean": -0.75,
|
|
"rewards/format_valid/std": 1.4433757066726685,
|
|
"step": 92
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 18.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 18.0,
|
|
"completions/max_terminated_length": 18.0,
|
|
"completions/mean_length": 14.25,
|
|
"completions/mean_terminated_length": 14.25,
|
|
"completions/min_length": 11.0,
|
|
"completions/min_terminated_length": 11.0,
|
|
"epoch": 0.0465,
|
|
"frac_reward_zero_std": 1.0,
|
|
"grad_norm": 0.345480352640152,
|
|
"kl": 1.7472785264253616,
|
|
"learning_rate": 4.5333333333333335e-05,
|
|
"loss": 0.0699,
|
|
"num_tokens": 231818.0,
|
|
"reward": 0.20606237649917603,
|
|
"reward_std": 0.0,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": 0.3207082748413086,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 0.5,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 93
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 32.0,
|
|
"completions/mean_length": 21.75,
|
|
"completions/mean_terminated_length": 21.75,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.047,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 4.007046222686768,
|
|
"kl": 1.7106561437249184,
|
|
"learning_rate": 4.522222222222223e-05,
|
|
"loss": 0.0684,
|
|
"num_tokens": 234305.0,
|
|
"reward": -1.3136588335037231,
|
|
"reward_std": 2.4297609329223633,
|
|
"rewards/action_legal/mean": -0.25,
|
|
"rewards/action_legal/std": 0.5,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": -0.6632725596427917,
|
|
"rewards/env_reward/std": 1.5616451501846313,
|
|
"rewards/format_valid/mean": -0.125,
|
|
"rewards/format_valid/std": 1.25,
|
|
"step": 94
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 15.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 15.0,
|
|
"completions/max_terminated_length": 15.0,
|
|
"completions/mean_length": 10.25,
|
|
"completions/mean_terminated_length": 10.25,
|
|
"completions/min_length": 5.0,
|
|
"completions/min_terminated_length": 5.0,
|
|
"epoch": 0.0475,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 6.215799808502197,
|
|
"kl": 2.4182121604681015,
|
|
"learning_rate": 4.511111111111112e-05,
|
|
"loss": 0.0967,
|
|
"num_tokens": 236746.0,
|
|
"reward": -1.4073553085327148,
|
|
"reward_std": 2.4502437114715576,
|
|
"rewards/action_legal/mean": -0.25,
|
|
"rewards/action_legal/std": 0.5,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": -0.7257369160652161,
|
|
"rewards/env_reward/std": 1.5773454904556274,
|
|
"rewards/format_valid/mean": -0.125,
|
|
"rewards/format_valid/std": 1.25,
|
|
"step": 95
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.25,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 20.0,
|
|
"completions/mean_length": 18.5,
|
|
"completions/mean_terminated_length": 14.0,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.048,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 2.8741865158081055,
|
|
"kl": 1.375985711812973,
|
|
"learning_rate": 4.5e-05,
|
|
"loss": 0.055,
|
|
"num_tokens": 239220.0,
|
|
"reward": -1.8366073369979858,
|
|
"reward_std": 2.075605630874634,
|
|
"rewards/action_legal/mean": -0.25,
|
|
"rewards/action_legal/std": 0.5,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": -1.0119048357009888,
|
|
"rewards/env_reward/std": 1.325404167175293,
|
|
"rewards/format_valid/mean": -0.125,
|
|
"rewards/format_valid/std": 1.25,
|
|
"step": 96
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.25,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 19.0,
|
|
"completions/mean_length": 17.5,
|
|
"completions/mean_terminated_length": 12.666666984558105,
|
|
"completions/min_length": 7.0,
|
|
"completions/min_terminated_length": 7.0,
|
|
"epoch": 0.0485,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 2.7931768894195557,
|
|
"kl": 1.1252032294869423,
|
|
"learning_rate": 4.4888888888888894e-05,
|
|
"loss": 0.045,
|
|
"num_tokens": 241690.0,
|
|
"reward": -0.21447324752807617,
|
|
"reward_std": 0.08092716336250305,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": 0.040351178497076035,
|
|
"rewards/env_reward/std": 0.053951445966959,
|
|
"rewards/format_valid/mean": 0.5,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 97
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 8.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 8.0,
|
|
"completions/max_terminated_length": 8.0,
|
|
"completions/mean_length": 8.0,
|
|
"completions/mean_terminated_length": 8.0,
|
|
"completions/min_length": 8.0,
|
|
"completions/min_terminated_length": 8.0,
|
|
"epoch": 0.049,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 3.6995668411254883,
|
|
"kl": 0.2477953266352415,
|
|
"learning_rate": 4.477777777777778e-05,
|
|
"loss": 0.0099,
|
|
"num_tokens": 243898.0,
|
|
"reward": 0.10004599392414093,
|
|
"reward_std": 0.12990380823612213,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.0,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": 0.03336399421095848,
|
|
"rewards/env_reward/std": 0.08660253882408142,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 98
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.5,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 23.0,
|
|
"completions/mean_length": 24.0,
|
|
"completions/mean_terminated_length": 16.0,
|
|
"completions/min_length": 9.0,
|
|
"completions/min_terminated_length": 9.0,
|
|
"epoch": 0.0495,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 3.9689114093780518,
|
|
"kl": 1.3716598898172379,
|
|
"learning_rate": 4.466666666666667e-05,
|
|
"loss": 0.0549,
|
|
"num_tokens": 246394.0,
|
|
"reward": 0.05885888263583183,
|
|
"reward_std": 0.17086723446846008,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.11083333194255829,
|
|
"rewards/belief_accuracy/std": 0.021666666492819786,
|
|
"rewards/env_reward/mean": 0.2400725781917572,
|
|
"rewards/env_reward/std": 0.08001596480607986,
|
|
"rewards/format_valid/mean": 0.625,
|
|
"rewards/format_valid/std": 0.25,
|
|
"step": 99
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 17.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 17.0,
|
|
"completions/max_terminated_length": 17.0,
|
|
"completions/mean_length": 11.25,
|
|
"completions/mean_terminated_length": 11.25,
|
|
"completions/min_length": 6.0,
|
|
"completions/min_terminated_length": 6.0,
|
|
"epoch": 0.05,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 6.792644500732422,
|
|
"kl": 1.8348833322525024,
|
|
"learning_rate": 4.4555555555555555e-05,
|
|
"loss": 0.0734,
|
|
"num_tokens": 248839.0,
|
|
"reward": 0.2527257204055786,
|
|
"reward_std": 0.15090236067771912,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": 0.35181713104248047,
|
|
"rewards/env_reward/std": 0.10060158371925354,
|
|
"rewards/format_valid/mean": 0.5,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 100
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.25,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 15.0,
|
|
"completions/mean_length": 17.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 9.0,
|
|
"completions/min_terminated_length": 9.0,
|
|
"epoch": 0.0505,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 2.8620173931121826,
|
|
"kl": 1.4036446511745453,
|
|
"learning_rate": 4.4444444444444447e-05,
|
|
"loss": 0.0561,
|
|
"num_tokens": 251307.0,
|
|
"reward": -1.31059730052948,
|
|
"reward_std": 2.4275147914886475,
|
|
"rewards/action_legal/mean": -0.25,
|
|
"rewards/action_legal/std": 0.5,
|
|
"rewards/belief_accuracy/mean": -0.1158333271741867,
|
|
"rewards/belief_accuracy/std": 0.03166666254401207,
|
|
"rewards/env_reward/mean": -0.6337315440177917,
|
|
"rewards/env_reward/std": 1.577512264251709,
|
|
"rewards/format_valid/mean": 0.0,
|
|
"rewards/format_valid/std": 1.3540064096450806,
|
|
"step": 101
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 32.0,
|
|
"completions/mean_length": 15.0,
|
|
"completions/mean_terminated_length": 15.0,
|
|
"completions/min_length": 7.0,
|
|
"completions/min_terminated_length": 7.0,
|
|
"epoch": 0.051,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 2.1169137954711914,
|
|
"kl": 1.8475644141435623,
|
|
"learning_rate": 4.433333333333334e-05,
|
|
"loss": 0.0739,
|
|
"num_tokens": 253767.0,
|
|
"reward": -1.3670077323913574,
|
|
"reward_std": 2.399441719055176,
|
|
"rewards/action_legal/mean": -0.25,
|
|
"rewards/action_legal/std": 0.5,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": -0.6988385319709778,
|
|
"rewards/env_reward/std": 1.5415664911270142,
|
|
"rewards/format_valid/mean": -0.125,
|
|
"rewards/format_valid/std": 1.25,
|
|
"step": 102
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 10.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 10.0,
|
|
"completions/max_terminated_length": 10.0,
|
|
"completions/mean_length": 7.75,
|
|
"completions/mean_terminated_length": 7.75,
|
|
"completions/min_length": 6.0,
|
|
"completions/min_terminated_length": 6.0,
|
|
"epoch": 0.0515,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 3.684938430786133,
|
|
"kl": 2.059985037893057,
|
|
"learning_rate": 4.422222222222222e-05,
|
|
"loss": 0.0824,
|
|
"num_tokens": 256198.0,
|
|
"reward": -1.4205896854400635,
|
|
"reward_std": 2.3623111248016357,
|
|
"rewards/action_legal/mean": -0.25,
|
|
"rewards/action_legal/std": 0.5,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": -0.7345598340034485,
|
|
"rewards/env_reward/std": 1.516780972480774,
|
|
"rewards/format_valid/mean": -0.125,
|
|
"rewards/format_valid/std": 1.25,
|
|
"step": 103
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 9.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 9.0,
|
|
"completions/max_terminated_length": 9.0,
|
|
"completions/mean_length": 8.5,
|
|
"completions/mean_terminated_length": 8.5,
|
|
"completions/min_length": 8.0,
|
|
"completions/min_terminated_length": 8.0,
|
|
"epoch": 0.052,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 3.5143520832061768,
|
|
"kl": 1.3098777011036873,
|
|
"learning_rate": 4.4111111111111114e-05,
|
|
"loss": 0.0524,
|
|
"num_tokens": 258632.0,
|
|
"reward": -0.0835796445608139,
|
|
"reward_std": 0.2586938738822937,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.0363982617855072,
|
|
"rewards/belief_accuracy/std": 0.11159241199493408,
|
|
"rewards/env_reward/mean": -0.016256578266620636,
|
|
"rewards/env_reward/std": 0.26623615622520447,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 104
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 17.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 17.0,
|
|
"completions/max_terminated_length": 17.0,
|
|
"completions/mean_length": 14.5,
|
|
"completions/mean_terminated_length": 14.5,
|
|
"completions/min_length": 9.0,
|
|
"completions/min_terminated_length": 9.0,
|
|
"epoch": 0.0525,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 6.2300004959106445,
|
|
"kl": 1.785375103354454,
|
|
"learning_rate": 4.4000000000000006e-05,
|
|
"loss": 0.0714,
|
|
"num_tokens": 261090.0,
|
|
"reward": -0.08198876678943634,
|
|
"reward_std": 0.8859658241271973,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": 0.12867416441440582,
|
|
"rewards/env_reward/std": 0.5906438827514648,
|
|
"rewards/format_valid/mean": 0.5,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 105
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.25,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 13.0,
|
|
"completions/mean_length": 16.25,
|
|
"completions/mean_terminated_length": 11.0,
|
|
"completions/min_length": 9.0,
|
|
"completions/min_terminated_length": 9.0,
|
|
"epoch": 0.053,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 2.8233327865600586,
|
|
"kl": 1.1763433814048767,
|
|
"learning_rate": 4.388888888888889e-05,
|
|
"loss": 0.0471,
|
|
"num_tokens": 263555.0,
|
|
"reward": -2.6301207542419434,
|
|
"reward_std": 2.697817087173462,
|
|
"rewards/action_legal/mean": -0.5,
|
|
"rewards/action_legal/std": 0.5773502588272095,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": -1.511747121810913,
|
|
"rewards/env_reward/std": 1.7316814661026,
|
|
"rewards/format_valid/mean": -0.75,
|
|
"rewards/format_valid/std": 1.4433757066726685,
|
|
"step": 106
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 28.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 28.0,
|
|
"completions/max_terminated_length": 28.0,
|
|
"completions/mean_length": 16.5,
|
|
"completions/mean_terminated_length": 16.5,
|
|
"completions/min_length": 11.0,
|
|
"completions/min_terminated_length": 11.0,
|
|
"epoch": 0.0535,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 5.215397357940674,
|
|
"kl": 1.5616333931684494,
|
|
"learning_rate": 4.377777777777778e-05,
|
|
"loss": 0.0625,
|
|
"num_tokens": 266021.0,
|
|
"reward": -0.27485907077789307,
|
|
"reward_std": 0.8942175507545471,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": 9.395182132720947e-05,
|
|
"rewards/env_reward/std": 0.5961450934410095,
|
|
"rewards/format_valid/mean": 0.5,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 107
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 16.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 16.0,
|
|
"completions/max_terminated_length": 16.0,
|
|
"completions/mean_length": 10.0,
|
|
"completions/mean_terminated_length": 10.0,
|
|
"completions/min_length": 6.0,
|
|
"completions/min_terminated_length": 6.0,
|
|
"epoch": 0.054,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 4.469447612762451,
|
|
"kl": 2.5851728469133377,
|
|
"learning_rate": 4.3666666666666666e-05,
|
|
"loss": 0.1034,
|
|
"num_tokens": 268461.0,
|
|
"reward": -1.1452138423919678,
|
|
"reward_std": 2.582923650741577,
|
|
"rewards/action_legal/mean": -0.25,
|
|
"rewards/action_legal/std": 0.5,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": -0.5509759187698364,
|
|
"rewards/env_reward/std": 1.6647002696990967,
|
|
"rewards/format_valid/mean": -0.125,
|
|
"rewards/format_valid/std": 1.25,
|
|
"step": 108
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 10.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 10.0,
|
|
"completions/max_terminated_length": 10.0,
|
|
"completions/mean_length": 9.25,
|
|
"completions/mean_terminated_length": 9.25,
|
|
"completions/min_length": 9.0,
|
|
"completions/min_terminated_length": 9.0,
|
|
"epoch": 0.0545,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 7.712915420532227,
|
|
"kl": 2.6475657522678375,
|
|
"learning_rate": 4.355555555555556e-05,
|
|
"loss": 0.1059,
|
|
"num_tokens": 270898.0,
|
|
"reward": 0.12694786489009857,
|
|
"reward_std": 0.002898484468460083,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": 0.26796525716781616,
|
|
"rewards/env_reward/std": 0.0019323229789733887,
|
|
"rewards/format_valid/mean": 0.5,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 109
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.25,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 18.0,
|
|
"completions/mean_length": 18.5,
|
|
"completions/mean_terminated_length": 14.0,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.055,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 7.97365140914917,
|
|
"kl": 2.896424412727356,
|
|
"learning_rate": 4.344444444444445e-05,
|
|
"loss": 0.1159,
|
|
"num_tokens": 273372.0,
|
|
"reward": -0.7330765128135681,
|
|
"reward_std": 0.20495304465293884,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.06111111119389534,
|
|
"rewards/belief_accuracy/std": 0.07777778059244156,
|
|
"rewards/env_reward/mean": -0.38732877373695374,
|
|
"rewards/env_reward/std": 0.12224072217941284,
|
|
"rewards/format_valid/mean": 0.625,
|
|
"rewards/format_valid/std": 0.25,
|
|
"step": 110
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 17.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 17.0,
|
|
"completions/max_terminated_length": 17.0,
|
|
"completions/mean_length": 10.5,
|
|
"completions/mean_terminated_length": 10.5,
|
|
"completions/min_length": 7.0,
|
|
"completions/min_terminated_length": 7.0,
|
|
"epoch": 0.0555,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 4.209381580352783,
|
|
"kl": 1.7662896811962128,
|
|
"learning_rate": 4.3333333333333334e-05,
|
|
"loss": 0.0707,
|
|
"num_tokens": 275814.0,
|
|
"reward": 0.027455374598503113,
|
|
"reward_std": 0.4914630353450775,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": 0.2016369253396988,
|
|
"rewards/env_reward/std": 0.3276420533657074,
|
|
"rewards/format_valid/mean": 0.5,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 111
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.25,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 16.0,
|
|
"completions/mean_length": 17.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 7.0,
|
|
"completions/min_terminated_length": 7.0,
|
|
"epoch": 0.056,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 3.021775960922241,
|
|
"kl": 2.54108564555645,
|
|
"learning_rate": 4.3222222222222226e-05,
|
|
"loss": 0.1016,
|
|
"num_tokens": 278282.0,
|
|
"reward": -0.2888333201408386,
|
|
"reward_std": 0.15247361361980438,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": -0.009222209453582764,
|
|
"rewards/env_reward/std": 0.10164907574653625,
|
|
"rewards/format_valid/mean": 0.5,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 112
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.25,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 21.0,
|
|
"completions/mean_length": 17.5,
|
|
"completions/mean_terminated_length": 12.666666984558105,
|
|
"completions/min_length": 8.0,
|
|
"completions/min_terminated_length": 8.0,
|
|
"epoch": 0.0565,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 4.275335311889648,
|
|
"kl": 1.6538867950439453,
|
|
"learning_rate": 4.311111111111111e-05,
|
|
"loss": 0.0662,
|
|
"num_tokens": 280752.0,
|
|
"reward": 0.437593936920166,
|
|
"reward_std": 0.39731013774871826,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": 0.47506263852119446,
|
|
"rewards/env_reward/std": 0.2648734450340271,
|
|
"rewards/format_valid/mean": 0.5,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 113
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 24.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 24.0,
|
|
"completions/max_terminated_length": 24.0,
|
|
"completions/mean_length": 13.0,
|
|
"completions/mean_terminated_length": 13.0,
|
|
"completions/min_length": 6.0,
|
|
"completions/min_terminated_length": 6.0,
|
|
"epoch": 0.057,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 6.433754920959473,
|
|
"kl": 1.647656962275505,
|
|
"learning_rate": 4.3e-05,
|
|
"loss": 0.0659,
|
|
"num_tokens": 283204.0,
|
|
"reward": -1.3276481628417969,
|
|
"reward_std": 2.414963722229004,
|
|
"rewards/action_legal/mean": -0.25,
|
|
"rewards/action_legal/std": 0.5,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": -0.6725987792015076,
|
|
"rewards/env_reward/std": 1.551644206047058,
|
|
"rewards/format_valid/mean": -0.125,
|
|
"rewards/format_valid/std": 1.25,
|
|
"step": 114
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.25,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 15.0,
|
|
"completions/mean_length": 15.75,
|
|
"completions/mean_terminated_length": 10.333333969116211,
|
|
"completions/min_length": 8.0,
|
|
"completions/min_terminated_length": 8.0,
|
|
"epoch": 0.0575,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 2.6330208778381348,
|
|
"kl": 1.515267439186573,
|
|
"learning_rate": 4.2888888888888886e-05,
|
|
"loss": 0.0606,
|
|
"num_tokens": 285667.0,
|
|
"reward": -3.7085390090942383,
|
|
"reward_std": 2.482921838760376,
|
|
"rewards/action_legal/mean": -0.75,
|
|
"rewards/action_legal/std": 0.5,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": -2.201525926589966,
|
|
"rewards/env_reward/std": 1.5969480276107788,
|
|
"rewards/format_valid/mean": -1.375,
|
|
"rewards/format_valid/std": 1.25,
|
|
"step": 115
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 24.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 24.0,
|
|
"completions/max_terminated_length": 24.0,
|
|
"completions/mean_length": 11.75,
|
|
"completions/mean_terminated_length": 11.75,
|
|
"completions/min_length": 6.0,
|
|
"completions/min_terminated_length": 6.0,
|
|
"epoch": 0.058,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 2.3021349906921387,
|
|
"kl": 1.7087249606847763,
|
|
"learning_rate": 4.277777777777778e-05,
|
|
"loss": 0.0683,
|
|
"num_tokens": 288114.0,
|
|
"reward": 0.5330584049224854,
|
|
"reward_std": 0.35837167501449585,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": 0.5387057065963745,
|
|
"rewards/env_reward/std": 0.23891450464725494,
|
|
"rewards/format_valid/mean": 0.5,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 116
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 8.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 8.0,
|
|
"completions/max_terminated_length": 8.0,
|
|
"completions/mean_length": 6.5,
|
|
"completions/mean_terminated_length": 6.5,
|
|
"completions/min_length": 5.0,
|
|
"completions/min_terminated_length": 5.0,
|
|
"epoch": 0.0585,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 5.497950553894043,
|
|
"kl": 2.1271141320466995,
|
|
"learning_rate": 4.266666666666667e-05,
|
|
"loss": 0.0851,
|
|
"num_tokens": 290540.0,
|
|
"reward": 0.4702581763267517,
|
|
"reward_std": 0.44036781787872314,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": 0.4968388080596924,
|
|
"rewards/env_reward/std": 0.293578565120697,
|
|
"rewards/format_valid/mean": 0.5,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 117
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.5,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 11.0,
|
|
"completions/mean_length": 20.75,
|
|
"completions/mean_terminated_length": 9.5,
|
|
"completions/min_length": 8.0,
|
|
"completions/min_terminated_length": 8.0,
|
|
"epoch": 0.059,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 4.789796829223633,
|
|
"kl": 1.4464631527662277,
|
|
"learning_rate": 4.255555555555556e-05,
|
|
"loss": 0.0579,
|
|
"num_tokens": 293023.0,
|
|
"reward": -0.7475403547286987,
|
|
"reward_std": 2.8080575466156006,
|
|
"rewards/action_legal/mean": -0.25,
|
|
"rewards/action_legal/std": 0.5,
|
|
"rewards/belief_accuracy/mean": -0.0703703761100769,
|
|
"rewards/belief_accuracy/std": 0.059259265661239624,
|
|
"rewards/env_reward/mean": -0.3492862284183502,
|
|
"rewards/env_reward/std": 1.7720966339111328,
|
|
"rewards/format_valid/mean": 0.0,
|
|
"rewards/format_valid/std": 1.3540064096450806,
|
|
"step": 118
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 12.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 9.25,
|
|
"completions/mean_terminated_length": 9.25,
|
|
"completions/min_length": 6.0,
|
|
"completions/min_terminated_length": 6.0,
|
|
"epoch": 0.0595,
|
|
"frac_reward_zero_std": 1.0,
|
|
"grad_norm": 1.5602530241012573,
|
|
"kl": 1.6833766214549541,
|
|
"learning_rate": 4.2444444444444445e-05,
|
|
"loss": 0.0673,
|
|
"num_tokens": 295460.0,
|
|
"reward": -0.42602595686912537,
|
|
"reward_std": 0.0,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": -0.10068397223949432,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 0.5,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 119
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 24.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 24.0,
|
|
"completions/max_terminated_length": 24.0,
|
|
"completions/mean_length": 11.75,
|
|
"completions/mean_terminated_length": 11.75,
|
|
"completions/min_length": 6.0,
|
|
"completions/min_terminated_length": 6.0,
|
|
"epoch": 0.06,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 4.7347092628479,
|
|
"kl": 1.445710226893425,
|
|
"learning_rate": 4.233333333333334e-05,
|
|
"loss": 0.0578,
|
|
"num_tokens": 297907.0,
|
|
"reward": -1.0152814388275146,
|
|
"reward_std": 2.623145580291748,
|
|
"rewards/action_legal/mean": -0.25,
|
|
"rewards/action_legal/std": 0.5,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": -0.4643542170524597,
|
|
"rewards/env_reward/std": 1.6904305219650269,
|
|
"rewards/format_valid/mean": -0.125,
|
|
"rewards/format_valid/std": 1.25,
|
|
"step": 120
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 16.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 16.0,
|
|
"completions/max_terminated_length": 16.0,
|
|
"completions/mean_length": 11.25,
|
|
"completions/mean_terminated_length": 11.25,
|
|
"completions/min_length": 6.0,
|
|
"completions/min_terminated_length": 6.0,
|
|
"epoch": 0.0605,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 6.493462085723877,
|
|
"kl": 1.4399118982255459,
|
|
"learning_rate": 4.222222222222222e-05,
|
|
"loss": 0.0576,
|
|
"num_tokens": 300352.0,
|
|
"reward": -0.20259986817836761,
|
|
"reward_std": 0.06754998862743378,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": 0.048266757279634476,
|
|
"rewards/env_reward/std": 0.04503332078456879,
|
|
"rewards/format_valid/mean": 0.5,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 121
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.25,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 21.0,
|
|
"completions/mean_length": 18.25,
|
|
"completions/mean_terminated_length": 13.666666984558105,
|
|
"completions/min_length": 9.0,
|
|
"completions/min_terminated_length": 9.0,
|
|
"epoch": 0.061,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 2.4330313205718994,
|
|
"kl": 1.4334093481302261,
|
|
"learning_rate": 4.211111111111111e-05,
|
|
"loss": 0.0573,
|
|
"num_tokens": 302825.0,
|
|
"reward": -0.5075480937957764,
|
|
"reward_std": 0.6749432682991028,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": -0.15503208339214325,
|
|
"rewards/env_reward/std": 0.44996219873428345,
|
|
"rewards/format_valid/mean": 0.5,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 122
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 23.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 23.0,
|
|
"completions/max_terminated_length": 23.0,
|
|
"completions/mean_length": 16.5,
|
|
"completions/mean_terminated_length": 16.5,
|
|
"completions/min_length": 13.0,
|
|
"completions/min_terminated_length": 13.0,
|
|
"epoch": 0.0615,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 4.5208353996276855,
|
|
"kl": 1.7931447178125381,
|
|
"learning_rate": 4.2e-05,
|
|
"loss": 0.0717,
|
|
"num_tokens": 305291.0,
|
|
"reward": -2.456667900085449,
|
|
"reward_std": 2.895482063293457,
|
|
"rewards/action_legal/mean": -0.5,
|
|
"rewards/action_legal/std": 0.5773502588272095,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": -1.3961119651794434,
|
|
"rewards/env_reward/std": 1.863360047340393,
|
|
"rewards/format_valid/mean": -0.75,
|
|
"rewards/format_valid/std": 1.4433757066726685,
|
|
"step": 123
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.5,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 10.0,
|
|
"completions/mean_length": 21.0,
|
|
"completions/mean_terminated_length": 10.0,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.062,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 2.7689969539642334,
|
|
"kl": 0.8019402623176575,
|
|
"learning_rate": 4.188888888888889e-05,
|
|
"loss": 0.0321,
|
|
"num_tokens": 307775.0,
|
|
"reward": -1.4924118518829346,
|
|
"reward_std": 2.360426187515259,
|
|
"rewards/action_legal/mean": -0.25,
|
|
"rewards/action_legal/std": 0.5,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": -0.782441258430481,
|
|
"rewards/env_reward/std": 1.5167045593261719,
|
|
"rewards/format_valid/mean": -0.125,
|
|
"rewards/format_valid/std": 1.25,
|
|
"step": 124
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 8.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 8.0,
|
|
"completions/max_terminated_length": 8.0,
|
|
"completions/mean_length": 7.0,
|
|
"completions/mean_terminated_length": 7.0,
|
|
"completions/min_length": 6.0,
|
|
"completions/min_terminated_length": 6.0,
|
|
"epoch": 0.0625,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 3.6446032524108887,
|
|
"kl": 1.4752652198076248,
|
|
"learning_rate": 4.177777777777778e-05,
|
|
"loss": 0.059,
|
|
"num_tokens": 310203.0,
|
|
"reward": 0.8273366689682007,
|
|
"reward_std": 0.6383920311927795,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": 0.7348911762237549,
|
|
"rewards/env_reward/std": 0.4255947172641754,
|
|
"rewards/format_valid/mean": 0.5,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 125
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 21.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 21.0,
|
|
"completions/max_terminated_length": 21.0,
|
|
"completions/mean_length": 12.5,
|
|
"completions/mean_terminated_length": 12.5,
|
|
"completions/min_length": 5.0,
|
|
"completions/min_terminated_length": 5.0,
|
|
"epoch": 0.063,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 5.112602233886719,
|
|
"kl": 1.6532337069511414,
|
|
"learning_rate": 4.166666666666667e-05,
|
|
"loss": 0.0661,
|
|
"num_tokens": 312653.0,
|
|
"reward": -0.10410824418067932,
|
|
"reward_std": 0.01273045688867569,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": 0.11392784118652344,
|
|
"rewards/env_reward/std": 0.008486974984407425,
|
|
"rewards/format_valid/mean": 0.5,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 126
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 26.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 26.0,
|
|
"completions/max_terminated_length": 26.0,
|
|
"completions/mean_length": 14.25,
|
|
"completions/mean_terminated_length": 14.25,
|
|
"completions/min_length": 8.0,
|
|
"completions/min_terminated_length": 8.0,
|
|
"epoch": 0.0635,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 4.351461887359619,
|
|
"kl": 2.1480718851089478,
|
|
"learning_rate": 4.155555555555556e-05,
|
|
"loss": 0.0859,
|
|
"num_tokens": 315110.0,
|
|
"reward": -0.3739127516746521,
|
|
"reward_std": 0.14415279030799866,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": -0.06594181805849075,
|
|
"rewards/env_reward/std": 0.09610186517238617,
|
|
"rewards/format_valid/mean": 0.5,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 127
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 30.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 30.0,
|
|
"completions/max_terminated_length": 30.0,
|
|
"completions/mean_length": 18.5,
|
|
"completions/mean_terminated_length": 18.5,
|
|
"completions/min_length": 11.0,
|
|
"completions/min_terminated_length": 11.0,
|
|
"epoch": 0.064,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 3.8212032318115234,
|
|
"kl": 1.534121721982956,
|
|
"learning_rate": 4.144444444444445e-05,
|
|
"loss": 0.0614,
|
|
"num_tokens": 317584.0,
|
|
"reward": -0.19062533974647522,
|
|
"reward_std": 0.3150945007801056,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.0893678218126297,
|
|
"rewards/belief_accuracy/std": 0.021264348179101944,
|
|
"rewards/env_reward/mean": 0.03081876039505005,
|
|
"rewards/env_reward/std": 0.24891482293605804,
|
|
"rewards/format_valid/mean": 0.625,
|
|
"rewards/format_valid/std": 0.25,
|
|
"step": 128
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 16.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 16.0,
|
|
"completions/max_terminated_length": 16.0,
|
|
"completions/mean_length": 11.25,
|
|
"completions/mean_terminated_length": 11.25,
|
|
"completions/min_length": 8.0,
|
|
"completions/min_terminated_length": 8.0,
|
|
"epoch": 0.0645,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 4.938203811645508,
|
|
"kl": 2.1303387582302094,
|
|
"learning_rate": 4.133333333333333e-05,
|
|
"loss": 0.0852,
|
|
"num_tokens": 320029.0,
|
|
"reward": -1.1215572357177734,
|
|
"reward_std": 2.5629398822784424,
|
|
"rewards/action_legal/mean": -0.25,
|
|
"rewards/action_legal/std": 0.5,
|
|
"rewards/belief_accuracy/mean": -0.06083333492279053,
|
|
"rewards/belief_accuracy/std": 0.07833334058523178,
|
|
"rewards/env_reward/mean": -0.6177048683166504,
|
|
"rewards/env_reward/std": 1.588196873664856,
|
|
"rewards/format_valid/mean": 0.0,
|
|
"rewards/format_valid/std": 1.3540064096450806,
|
|
"step": 129
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 13.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 13.0,
|
|
"completions/max_terminated_length": 13.0,
|
|
"completions/mean_length": 10.5,
|
|
"completions/mean_terminated_length": 10.5,
|
|
"completions/min_length": 4.0,
|
|
"completions/min_terminated_length": 4.0,
|
|
"epoch": 0.065,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 5.400176048278809,
|
|
"kl": 1.8505046516656876,
|
|
"learning_rate": 4.1222222222222224e-05,
|
|
"loss": 0.074,
|
|
"num_tokens": 322471.0,
|
|
"reward": -0.12519629299640656,
|
|
"reward_std": 0.14092496037483215,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": 0.09986913949251175,
|
|
"rewards/env_reward/std": 0.09394997358322144,
|
|
"rewards/format_valid/mean": 0.5,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 130
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 31.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 31.0,
|
|
"completions/max_terminated_length": 31.0,
|
|
"completions/mean_length": 17.25,
|
|
"completions/mean_terminated_length": 17.25,
|
|
"completions/min_length": 5.0,
|
|
"completions/min_terminated_length": 5.0,
|
|
"epoch": 0.0655,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 3.194300413131714,
|
|
"kl": 1.2793779149651527,
|
|
"learning_rate": 4.111111111111111e-05,
|
|
"loss": 0.0512,
|
|
"num_tokens": 324940.0,
|
|
"reward": -1.0001001358032227,
|
|
"reward_std": 2.700178623199463,
|
|
"rewards/action_legal/mean": -0.25,
|
|
"rewards/action_legal/std": 0.5,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": -0.45423343777656555,
|
|
"rewards/env_reward/std": 1.743279218673706,
|
|
"rewards/format_valid/mean": -0.125,
|
|
"rewards/format_valid/std": 1.25,
|
|
"step": 131
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 22.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 22.0,
|
|
"completions/max_terminated_length": 22.0,
|
|
"completions/mean_length": 11.25,
|
|
"completions/mean_terminated_length": 11.25,
|
|
"completions/min_length": 5.0,
|
|
"completions/min_terminated_length": 5.0,
|
|
"epoch": 0.066,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 4.973412036895752,
|
|
"kl": 1.9231543093919754,
|
|
"learning_rate": 4.1e-05,
|
|
"loss": 0.0769,
|
|
"num_tokens": 327385.0,
|
|
"reward": -1.7763125896453857,
|
|
"reward_std": 2.129706382751465,
|
|
"rewards/action_legal/mean": -0.25,
|
|
"rewards/action_legal/std": 0.5,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": -0.9717084169387817,
|
|
"rewards/env_reward/std": 1.3618686199188232,
|
|
"rewards/format_valid/mean": -0.125,
|
|
"rewards/format_valid/std": 1.25,
|
|
"step": 132
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.25,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 8.0,
|
|
"completions/mean_length": 13.25,
|
|
"completions/mean_terminated_length": 7.0,
|
|
"completions/min_length": 6.0,
|
|
"completions/min_terminated_length": 6.0,
|
|
"epoch": 0.0665,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 2.070490598678589,
|
|
"kl": 1.150221362709999,
|
|
"learning_rate": 4.088888888888889e-05,
|
|
"loss": 0.046,
|
|
"num_tokens": 329838.0,
|
|
"reward": 0.4534025192260742,
|
|
"reward_std": 1.0941553115844727,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": 0.4856016933917999,
|
|
"rewards/env_reward/std": 0.7294369339942932,
|
|
"rewards/format_valid/mean": 0.5,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 133
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.25,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 18.0,
|
|
"completions/mean_length": 15.75,
|
|
"completions/mean_terminated_length": 10.333333969116211,
|
|
"completions/min_length": 6.0,
|
|
"completions/min_terminated_length": 6.0,
|
|
"epoch": 0.067,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 1.878156065940857,
|
|
"kl": 1.4795889034867287,
|
|
"learning_rate": 4.0777777777777783e-05,
|
|
"loss": 0.0592,
|
|
"num_tokens": 332301.0,
|
|
"reward": -2.0403780937194824,
|
|
"reward_std": 3.3793559074401855,
|
|
"rewards/action_legal/mean": -0.5,
|
|
"rewards/action_legal/std": 0.5773502588272095,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": -1.1185853481292725,
|
|
"rewards/env_reward/std": 2.1859495639801025,
|
|
"rewards/format_valid/mean": -0.75,
|
|
"rewards/format_valid/std": 1.4433757066726685,
|
|
"step": 134
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 22.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 22.0,
|
|
"completions/max_terminated_length": 22.0,
|
|
"completions/mean_length": 11.0,
|
|
"completions/mean_terminated_length": 11.0,
|
|
"completions/min_length": 6.0,
|
|
"completions/min_terminated_length": 6.0,
|
|
"epoch": 0.0675,
|
|
"frac_reward_zero_std": 1.0,
|
|
"grad_norm": 0.1908632516860962,
|
|
"kl": 1.9114599525928497,
|
|
"learning_rate": 4.066666666666667e-05,
|
|
"loss": 0.0765,
|
|
"num_tokens": 334745.0,
|
|
"reward": 0.5449367761611938,
|
|
"reward_std": 0.0,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": 0.5466245412826538,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 0.5,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 135
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.25,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 15.0,
|
|
"completions/mean_length": 16.0,
|
|
"completions/mean_terminated_length": 10.666666984558105,
|
|
"completions/min_length": 7.0,
|
|
"completions/min_terminated_length": 7.0,
|
|
"epoch": 0.068,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 3.5480082035064697,
|
|
"kl": 1.5377977713942528,
|
|
"learning_rate": 4.055555555555556e-05,
|
|
"loss": 0.0615,
|
|
"num_tokens": 337209.0,
|
|
"reward": -1.2076761722564697,
|
|
"reward_std": 2.4953103065490723,
|
|
"rewards/action_legal/mean": -0.25,
|
|
"rewards/action_legal/std": 0.5,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": -0.5926175117492676,
|
|
"rewards/env_reward/std": 1.6052173376083374,
|
|
"rewards/format_valid/mean": -0.125,
|
|
"rewards/format_valid/std": 1.25,
|
|
"step": 136
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.25,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 9.0,
|
|
"completions/mean_length": 14.0,
|
|
"completions/mean_terminated_length": 8.0,
|
|
"completions/min_length": 7.0,
|
|
"completions/min_terminated_length": 7.0,
|
|
"epoch": 0.0685,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 1.0419105291366577,
|
|
"kl": 1.1934361532330513,
|
|
"learning_rate": 4.0444444444444444e-05,
|
|
"loss": 0.0477,
|
|
"num_tokens": 339665.0,
|
|
"reward": -1.1166263818740845,
|
|
"reward_std": 2.606243848800659,
|
|
"rewards/action_legal/mean": -0.25,
|
|
"rewards/action_legal/std": 0.5,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": -0.5319175720214844,
|
|
"rewards/env_reward/std": 1.6803356409072876,
|
|
"rewards/format_valid/mean": -0.125,
|
|
"rewards/format_valid/std": 1.25,
|
|
"step": 137
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.25,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 17.0,
|
|
"completions/mean_length": 18.25,
|
|
"completions/mean_terminated_length": 13.666666984558105,
|
|
"completions/min_length": 9.0,
|
|
"completions/min_terminated_length": 9.0,
|
|
"epoch": 0.069,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 3.009521007537842,
|
|
"kl": 2.4323032796382904,
|
|
"learning_rate": 4.0333333333333336e-05,
|
|
"loss": 0.0973,
|
|
"num_tokens": 342138.0,
|
|
"reward": 0.6563852429389954,
|
|
"reward_std": 0.8735789656639099,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.057500001043081284,
|
|
"rewards/belief_accuracy/std": 0.08499999344348907,
|
|
"rewards/env_reward/mean": 0.5317568182945251,
|
|
"rewards/env_reward/std": 0.6170323491096497,
|
|
"rewards/format_valid/mean": 0.625,
|
|
"rewards/format_valid/std": 0.25,
|
|
"step": 138
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 15.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 15.0,
|
|
"completions/max_terminated_length": 15.0,
|
|
"completions/mean_length": 8.75,
|
|
"completions/mean_terminated_length": 8.75,
|
|
"completions/min_length": 3.0,
|
|
"completions/min_terminated_length": 3.0,
|
|
"epoch": 0.0695,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 3.7213194370269775,
|
|
"kl": 3.0655910074710846,
|
|
"learning_rate": 4.022222222222222e-05,
|
|
"loss": 0.1226,
|
|
"num_tokens": 344573.0,
|
|
"reward": 0.5249032378196716,
|
|
"reward_std": 0.1243140697479248,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": 0.5332688093185425,
|
|
"rewards/env_reward/std": 0.0828760415315628,
|
|
"rewards/format_valid/mean": 0.5,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 139
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 11.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 11.0,
|
|
"completions/max_terminated_length": 11.0,
|
|
"completions/mean_length": 8.25,
|
|
"completions/mean_terminated_length": 8.25,
|
|
"completions/min_length": 5.0,
|
|
"completions/min_terminated_length": 5.0,
|
|
"epoch": 0.07,
|
|
"frac_reward_zero_std": 1.0,
|
|
"grad_norm": 0.32563015818595886,
|
|
"kl": 2.0985984057188034,
|
|
"learning_rate": 4.011111111111111e-05,
|
|
"loss": 0.0839,
|
|
"num_tokens": 347006.0,
|
|
"reward": -0.07693907618522644,
|
|
"reward_std": 0.0,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": 0.1320406198501587,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 0.5,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 140
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 32.0,
|
|
"completions/mean_length": 14.75,
|
|
"completions/mean_terminated_length": 14.75,
|
|
"completions/min_length": 8.0,
|
|
"completions/min_terminated_length": 8.0,
|
|
"epoch": 0.0705,
|
|
"frac_reward_zero_std": 1.0,
|
|
"grad_norm": 0.0964287742972374,
|
|
"kl": 1.2693939208984375,
|
|
"learning_rate": 4e-05,
|
|
"loss": 0.0508,
|
|
"num_tokens": 349465.0,
|
|
"reward": -0.02015012502670288,
|
|
"reward_std": 0.0,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": 0.16989992558956146,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 0.5,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 141
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.25,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 25.0,
|
|
"completions/mean_length": 21.25,
|
|
"completions/mean_terminated_length": 17.666667938232422,
|
|
"completions/min_length": 11.0,
|
|
"completions/min_terminated_length": 11.0,
|
|
"epoch": 0.071,
|
|
"frac_reward_zero_std": 1.0,
|
|
"grad_norm": 0.09562604129314423,
|
|
"kl": 1.128716617822647,
|
|
"learning_rate": 3.9888888888888895e-05,
|
|
"loss": 0.0451,
|
|
"num_tokens": 351950.0,
|
|
"reward": 0.7584548592567444,
|
|
"reward_std": 0.0,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": 0.6889699697494507,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 0.5,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 142
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.75,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 9.0,
|
|
"completions/mean_length": 26.25,
|
|
"completions/mean_terminated_length": 9.0,
|
|
"completions/min_length": 9.0,
|
|
"completions/min_terminated_length": 9.0,
|
|
"epoch": 0.0715,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 4.745733261108398,
|
|
"kl": 0.7463721930980682,
|
|
"learning_rate": 3.977777777777778e-05,
|
|
"loss": 0.0299,
|
|
"num_tokens": 354455.0,
|
|
"reward": 0.3327590823173523,
|
|
"reward_std": 0.23017629981040955,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": 0.405172735452652,
|
|
"rewards/env_reward/std": 0.1534508764743805,
|
|
"rewards/format_valid/mean": 0.5,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 143
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.5,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 15.0,
|
|
"completions/mean_length": 22.75,
|
|
"completions/mean_terminated_length": 13.5,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 0.072,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 4.888156890869141,
|
|
"kl": 1.4033671617507935,
|
|
"learning_rate": 3.966666666666667e-05,
|
|
"loss": 0.0561,
|
|
"num_tokens": 356946.0,
|
|
"reward": -0.13394379615783691,
|
|
"reward_std": 0.41236963868141174,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": 0.09403747320175171,
|
|
"rewards/env_reward/std": 0.27491310238838196,
|
|
"rewards/format_valid/mean": 0.5,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 144
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 11.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 11.0,
|
|
"completions/max_terminated_length": 11.0,
|
|
"completions/mean_length": 8.75,
|
|
"completions/mean_terminated_length": 8.75,
|
|
"completions/min_length": 8.0,
|
|
"completions/min_terminated_length": 8.0,
|
|
"epoch": 0.0725,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 3.65702486038208,
|
|
"kl": 1.6139360815286636,
|
|
"learning_rate": 3.9555555555555556e-05,
|
|
"loss": 0.0646,
|
|
"num_tokens": 359381.0,
|
|
"reward": -1.7238547801971436,
|
|
"reward_std": 2.1609649658203125,
|
|
"rewards/action_legal/mean": -0.25,
|
|
"rewards/action_legal/std": 0.5,
|
|
"rewards/belief_accuracy/mean": -0.12072296440601349,
|
|
"rewards/belief_accuracy/std": 0.07126190513372421,
|
|
"rewards/env_reward/mean": -0.9077907204627991,
|
|
"rewards/env_reward/std": 1.3948062658309937,
|
|
"rewards/format_valid/mean": 0.25,
|
|
"rewards/format_valid/std": 1.5,
|
|
"step": 145
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.25,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 16.0,
|
|
"completions/mean_length": 18.5,
|
|
"completions/mean_terminated_length": 14.0,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.073,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 2.8972437381744385,
|
|
"kl": 1.160056695342064,
|
|
"learning_rate": 3.944444444444445e-05,
|
|
"loss": 0.0464,
|
|
"num_tokens": 361855.0,
|
|
"reward": 0.13612942397594452,
|
|
"reward_std": 0.02916666865348816,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.0972222238779068,
|
|
"rewards/belief_accuracy/std": 0.0055555556900799274,
|
|
"rewards/env_reward/mean": 0.2643640637397766,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 0.625,
|
|
"rewards/format_valid/std": 0.25,
|
|
"step": 146
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 24.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 24.0,
|
|
"completions/max_terminated_length": 24.0,
|
|
"completions/mean_length": 19.0,
|
|
"completions/mean_terminated_length": 19.0,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.0735,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 4.382909774780273,
|
|
"kl": 1.568796619772911,
|
|
"learning_rate": 3.933333333333333e-05,
|
|
"loss": 0.0628,
|
|
"num_tokens": 364331.0,
|
|
"reward": 0.800137996673584,
|
|
"reward_std": 0.3069959282875061,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": 0.7167587280273438,
|
|
"rewards/env_reward/std": 0.2046639323234558,
|
|
"rewards/format_valid/mean": 0.5,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 147
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.5,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 26.0,
|
|
"completions/mean_length": 25.5,
|
|
"completions/mean_terminated_length": 19.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 0.074,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 8.64746379852295,
|
|
"kl": 0.8096725344657898,
|
|
"learning_rate": 3.922222222222223e-05,
|
|
"loss": 0.0324,
|
|
"num_tokens": 366833.0,
|
|
"reward": -0.15847638249397278,
|
|
"reward_std": 1.3163249492645264,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": 0.07768243551254272,
|
|
"rewards/env_reward/std": 0.877549946308136,
|
|
"rewards/format_valid/mean": 0.5,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 148
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 18.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 18.0,
|
|
"completions/max_terminated_length": 18.0,
|
|
"completions/mean_length": 12.75,
|
|
"completions/mean_terminated_length": 12.75,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.0745,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 5.473668098449707,
|
|
"kl": 2.7600976526737213,
|
|
"learning_rate": 3.9111111111111115e-05,
|
|
"loss": 0.1104,
|
|
"num_tokens": 369284.0,
|
|
"reward": -2.9569857120513916,
|
|
"reward_std": 2.3198444843292236,
|
|
"rewards/action_legal/mean": -0.5,
|
|
"rewards/action_legal/std": 0.5773502588272095,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": -1.7296571731567383,
|
|
"rewards/env_reward/std": 1.4797673225402832,
|
|
"rewards/format_valid/mean": -0.75,
|
|
"rewards/format_valid/std": 1.4433757066726685,
|
|
"step": 149
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 31.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 31.0,
|
|
"completions/max_terminated_length": 31.0,
|
|
"completions/mean_length": 21.5,
|
|
"completions/mean_terminated_length": 21.5,
|
|
"completions/min_length": 14.0,
|
|
"completions/min_terminated_length": 14.0,
|
|
"epoch": 0.075,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 3.437213897705078,
|
|
"kl": 1.6173148602247238,
|
|
"learning_rate": 3.9000000000000006e-05,
|
|
"loss": 0.0647,
|
|
"num_tokens": 371770.0,
|
|
"reward": 0.5740416049957275,
|
|
"reward_std": 0.233365997672081,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.13980931043624878,
|
|
"rewards/belief_accuracy/std": 0.07961863279342651,
|
|
"rewards/env_reward/mean": 0.6414797306060791,
|
|
"rewards/env_reward/std": 0.1132500022649765,
|
|
"rewards/format_valid/mean": 0.625,
|
|
"rewards/format_valid/std": 0.25,
|
|
"step": 150
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.25,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 22.0,
|
|
"completions/mean_length": 20.75,
|
|
"completions/mean_terminated_length": 17.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 0.0755,
|
|
"frac_reward_zero_std": 1.0,
|
|
"grad_norm": 0.2341485172510147,
|
|
"kl": 1.3624602407217026,
|
|
"learning_rate": 3.888888888888889e-05,
|
|
"loss": 0.0545,
|
|
"num_tokens": 374253.0,
|
|
"reward": 0.2103109359741211,
|
|
"reward_std": 0.0,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": 0.3235406279563904,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 0.5,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 151
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.25,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 30.0,
|
|
"completions/mean_length": 25.5,
|
|
"completions/mean_terminated_length": 23.33333396911621,
|
|
"completions/min_length": 15.0,
|
|
"completions/min_terminated_length": 15.0,
|
|
"epoch": 0.076,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 3.359372615814209,
|
|
"kl": 1.1529072970151901,
|
|
"learning_rate": 3.877777777777778e-05,
|
|
"loss": 0.0461,
|
|
"num_tokens": 376755.0,
|
|
"reward": 0.4146992564201355,
|
|
"reward_std": 0.46390998363494873,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": 0.45979946851730347,
|
|
"rewards/env_reward/std": 0.3092733323574066,
|
|
"rewards/format_valid/mean": 0.5,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 152
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 17.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 17.0,
|
|
"completions/max_terminated_length": 17.0,
|
|
"completions/mean_length": 14.0,
|
|
"completions/mean_terminated_length": 14.0,
|
|
"completions/min_length": 11.0,
|
|
"completions/min_terminated_length": 11.0,
|
|
"epoch": 0.0765,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 5.655915260314941,
|
|
"kl": 1.3275744514539838,
|
|
"learning_rate": 3.866666666666667e-05,
|
|
"loss": 0.0531,
|
|
"num_tokens": 379211.0,
|
|
"reward": -0.0012441501021385193,
|
|
"reward_std": 0.24833057820796967,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.23028355836868286,
|
|
"rewards/belief_accuracy/std": 0.09518812596797943,
|
|
"rewards/env_reward/mean": 0.42640429735183716,
|
|
"rewards/env_reward/std": 0.10868140310049057,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 153
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 28.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 28.0,
|
|
"completions/max_terminated_length": 28.0,
|
|
"completions/mean_length": 22.0,
|
|
"completions/mean_terminated_length": 22.0,
|
|
"completions/min_length": 15.0,
|
|
"completions/min_terminated_length": 15.0,
|
|
"epoch": 0.077,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 3.993246555328369,
|
|
"kl": 1.3614933341741562,
|
|
"learning_rate": 3.855555555555556e-05,
|
|
"loss": 0.0545,
|
|
"num_tokens": 381699.0,
|
|
"reward": -0.3912268280982971,
|
|
"reward_std": 3.040301561355591,
|
|
"rewards/action_legal/mean": -0.25,
|
|
"rewards/action_legal/std": 0.5,
|
|
"rewards/belief_accuracy/mean": -0.11666667461395264,
|
|
"rewards/belief_accuracy/std": 0.03333333507180214,
|
|
"rewards/env_reward/mean": -0.01915118098258972,
|
|
"rewards/env_reward/std": 1.9872325658798218,
|
|
"rewards/format_valid/mean": 0.0,
|
|
"rewards/format_valid/std": 1.3540064096450806,
|
|
"step": 154
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.25,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 27.0,
|
|
"completions/mean_length": 20.75,
|
|
"completions/mean_terminated_length": 17.0,
|
|
"completions/min_length": 11.0,
|
|
"completions/min_terminated_length": 11.0,
|
|
"epoch": 0.0775,
|
|
"frac_reward_zero_std": 1.0,
|
|
"grad_norm": 0.0937976986169815,
|
|
"kl": 1.2733041644096375,
|
|
"learning_rate": 3.844444444444444e-05,
|
|
"loss": 0.0509,
|
|
"num_tokens": 384182.0,
|
|
"reward": -0.1396826207637787,
|
|
"reward_std": 0.0,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": 0.09021158516407013,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 0.5,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 155
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.25,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 27.0,
|
|
"completions/mean_length": 23.75,
|
|
"completions/mean_terminated_length": 21.0,
|
|
"completions/min_length": 13.0,
|
|
"completions/min_terminated_length": 13.0,
|
|
"epoch": 0.078,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 4.676441669464111,
|
|
"kl": 0.9261074624955654,
|
|
"learning_rate": 3.8333333333333334e-05,
|
|
"loss": 0.037,
|
|
"num_tokens": 386677.0,
|
|
"reward": -1.3367525339126587,
|
|
"reward_std": 2.411214828491211,
|
|
"rewards/action_legal/mean": -0.25,
|
|
"rewards/action_legal/std": 0.5,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": -0.6786683797836304,
|
|
"rewards/env_reward/std": 1.5492030382156372,
|
|
"rewards/format_valid/mean": -0.125,
|
|
"rewards/format_valid/std": 1.25,
|
|
"step": 156
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 8.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 8.0,
|
|
"completions/max_terminated_length": 8.0,
|
|
"completions/mean_length": 8.0,
|
|
"completions/mean_terminated_length": 8.0,
|
|
"completions/min_length": 8.0,
|
|
"completions/min_terminated_length": 8.0,
|
|
"epoch": 0.0785,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 2.1922104358673096,
|
|
"kl": 0.0721854604780674,
|
|
"learning_rate": 3.8222222222222226e-05,
|
|
"loss": 0.0029,
|
|
"num_tokens": 389109.0,
|
|
"reward": 0.8821967244148254,
|
|
"reward_std": 0.15713486075401306,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.31481480598449707,
|
|
"rewards/belief_accuracy/std": 0.052378278225660324,
|
|
"rewards/env_reward/mean": 1.1844274997711182,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 157
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 30.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 30.0,
|
|
"completions/max_terminated_length": 30.0,
|
|
"completions/mean_length": 16.5,
|
|
"completions/mean_terminated_length": 16.5,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.079,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 4.775229454040527,
|
|
"kl": 1.4617139548063278,
|
|
"learning_rate": 3.811111111111112e-05,
|
|
"loss": 0.0585,
|
|
"num_tokens": 391575.0,
|
|
"reward": -0.6259194612503052,
|
|
"reward_std": 0.5253891348838806,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.10740740597248077,
|
|
"rewards/belief_accuracy/std": 0.014814812690019608,
|
|
"rewards/env_reward/mean": -0.22329813241958618,
|
|
"rewards/env_reward/std": 0.3683049976825714,
|
|
"rewards/format_valid/mean": 0.625,
|
|
"rewards/format_valid/std": 0.25,
|
|
"step": 158
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.25,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 24.0,
|
|
"completions/mean_length": 21.75,
|
|
"completions/mean_terminated_length": 18.33333396911621,
|
|
"completions/min_length": 9.0,
|
|
"completions/min_terminated_length": 9.0,
|
|
"epoch": 0.0795,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 2.7022299766540527,
|
|
"kl": 1.3450734540820122,
|
|
"learning_rate": 3.8e-05,
|
|
"loss": 0.0538,
|
|
"num_tokens": 394062.0,
|
|
"reward": -1.487056851387024,
|
|
"reward_std": 2.3527774810791016,
|
|
"rewards/action_legal/mean": -0.25,
|
|
"rewards/action_legal/std": 0.5,
|
|
"rewards/belief_accuracy/mean": -0.0871676579117775,
|
|
"rewards/belief_accuracy/std": 0.025664685294032097,
|
|
"rewards/env_reward/mean": -0.8087027072906494,
|
|
"rewards/env_reward/std": 1.5039740800857544,
|
|
"rewards/format_valid/mean": 0.0,
|
|
"rewards/format_valid/std": 1.3540064096450806,
|
|
"step": 159
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.25,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 32.0,
|
|
"completions/mean_length": 23.75,
|
|
"completions/mean_terminated_length": 21.0,
|
|
"completions/min_length": 13.0,
|
|
"completions/min_terminated_length": 13.0,
|
|
"epoch": 0.08,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 4.860130310058594,
|
|
"kl": 1.0795547626912594,
|
|
"learning_rate": 3.7888888888888894e-05,
|
|
"loss": 0.0432,
|
|
"num_tokens": 396557.0,
|
|
"reward": -0.01648128777742386,
|
|
"reward_std": 0.3920603096485138,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": 0.1723458170890808,
|
|
"rewards/env_reward/std": 0.2613735496997833,
|
|
"rewards/format_valid/mean": 0.5,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 160
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.25,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 27.0,
|
|
"completions/mean_length": 22.0,
|
|
"completions/mean_terminated_length": 18.666667938232422,
|
|
"completions/min_length": 6.0,
|
|
"completions/min_terminated_length": 6.0,
|
|
"epoch": 0.0805,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 3.8319482803344727,
|
|
"kl": 1.2551886662840843,
|
|
"learning_rate": 3.777777777777778e-05,
|
|
"loss": 0.0502,
|
|
"num_tokens": 399045.0,
|
|
"reward": 0.17859038710594177,
|
|
"reward_std": 0.8184104561805725,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": 0.3023936152458191,
|
|
"rewards/env_reward/std": 0.5456069707870483,
|
|
"rewards/format_valid/mean": 0.5,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 161
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.75,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 18.0,
|
|
"completions/mean_length": 28.5,
|
|
"completions/mean_terminated_length": 18.0,
|
|
"completions/min_length": 18.0,
|
|
"completions/min_terminated_length": 18.0,
|
|
"epoch": 0.081,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 2.120696783065796,
|
|
"kl": 0.8586160615086555,
|
|
"learning_rate": 3.766666666666667e-05,
|
|
"loss": 0.0343,
|
|
"num_tokens": 401559.0,
|
|
"reward": 0.4954003691673279,
|
|
"reward_std": 0.3572309911251068,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.10083333402872086,
|
|
"rewards/belief_accuracy/std": 0.0016666651936247945,
|
|
"rewards/env_reward/mean": 0.5111002922058105,
|
|
"rewards/env_reward/std": 0.24086414277553558,
|
|
"rewards/format_valid/mean": 0.625,
|
|
"rewards/format_valid/std": 0.25,
|
|
"step": 162
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 21.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 21.0,
|
|
"completions/max_terminated_length": 21.0,
|
|
"completions/mean_length": 15.75,
|
|
"completions/mean_terminated_length": 15.75,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.0815,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 8.276159286499023,
|
|
"kl": 2.0177499651908875,
|
|
"learning_rate": 3.7555555555555554e-05,
|
|
"loss": 0.0807,
|
|
"num_tokens": 404022.0,
|
|
"reward": -0.14451055228710175,
|
|
"reward_std": 0.07916668057441711,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.08888889104127884,
|
|
"rewards/belief_accuracy/std": 0.02222222276031971,
|
|
"rewards/env_reward/mean": 0.06060408055782318,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 0.625,
|
|
"rewards/format_valid/std": 0.25,
|
|
"step": 163
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 22.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 22.0,
|
|
"completions/max_terminated_length": 22.0,
|
|
"completions/mean_length": 15.75,
|
|
"completions/mean_terminated_length": 15.75,
|
|
"completions/min_length": 8.0,
|
|
"completions/min_terminated_length": 8.0,
|
|
"epoch": 0.082,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 5.653500080108643,
|
|
"kl": 1.8464947640895844,
|
|
"learning_rate": 3.7444444444444446e-05,
|
|
"loss": 0.0739,
|
|
"num_tokens": 406485.0,
|
|
"reward": 0.27019041776657104,
|
|
"reward_std": 0.23719999194145203,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.11416666209697723,
|
|
"rewards/belief_accuracy/std": 0.028333332389593124,
|
|
"rewards/env_reward/mean": 0.3876269459724426,
|
|
"rewards/env_reward/std": 0.10980000346899033,
|
|
"rewards/format_valid/mean": 0.625,
|
|
"rewards/format_valid/std": 0.25,
|
|
"step": 164
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.25,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 25.0,
|
|
"completions/mean_length": 22.5,
|
|
"completions/mean_terminated_length": 19.33333396911621,
|
|
"completions/min_length": 15.0,
|
|
"completions/min_terminated_length": 15.0,
|
|
"epoch": 0.0825,
|
|
"frac_reward_zero_std": 1.0,
|
|
"grad_norm": 0.24756693840026855,
|
|
"kl": 1.6211326867341995,
|
|
"learning_rate": 3.733333333333334e-05,
|
|
"loss": 0.0648,
|
|
"num_tokens": 408975.0,
|
|
"reward": 0.03895732760429382,
|
|
"reward_std": 0.0,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": 0.20930489897727966,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 0.5,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 165
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.25,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 22.0,
|
|
"completions/mean_length": 22.25,
|
|
"completions/mean_terminated_length": 19.0,
|
|
"completions/min_length": 13.0,
|
|
"completions/min_terminated_length": 13.0,
|
|
"epoch": 0.083,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 3.162034034729004,
|
|
"kl": 1.216068983078003,
|
|
"learning_rate": 3.722222222222222e-05,
|
|
"loss": 0.0486,
|
|
"num_tokens": 411464.0,
|
|
"reward": -0.5613082051277161,
|
|
"reward_std": 0.08749997615814209,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.11666667461395264,
|
|
"rewards/belief_accuracy/std": 0.03333333507180214,
|
|
"rewards/env_reward/mean": -0.16170544922351837,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 0.625,
|
|
"rewards/format_valid/std": 0.25,
|
|
"step": 166
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 13.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 13.0,
|
|
"completions/max_terminated_length": 13.0,
|
|
"completions/mean_length": 10.75,
|
|
"completions/mean_terminated_length": 10.75,
|
|
"completions/min_length": 8.0,
|
|
"completions/min_terminated_length": 8.0,
|
|
"epoch": 0.0835,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 6.356938362121582,
|
|
"kl": 2.8624762892723083,
|
|
"learning_rate": 3.7111111111111113e-05,
|
|
"loss": 0.1145,
|
|
"num_tokens": 413907.0,
|
|
"reward": 0.14997538924217224,
|
|
"reward_std": 0.7540647387504578,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": 0.2833169400691986,
|
|
"rewards/env_reward/std": 0.5027098655700684,
|
|
"rewards/format_valid/mean": 0.5,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 167
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.25,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 23.0,
|
|
"completions/mean_length": 23.75,
|
|
"completions/mean_terminated_length": 21.0,
|
|
"completions/min_length": 19.0,
|
|
"completions/min_terminated_length": 19.0,
|
|
"epoch": 0.084,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 4.261693477630615,
|
|
"kl": 1.933813601732254,
|
|
"learning_rate": 3.7e-05,
|
|
"loss": 0.0774,
|
|
"num_tokens": 416402.0,
|
|
"reward": -0.05508837103843689,
|
|
"reward_std": 0.11999882757663727,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.09027226269245148,
|
|
"rewards/belief_accuracy/std": 0.019455470144748688,
|
|
"rewards/env_reward/mean": 0.12298562377691269,
|
|
"rewards/env_reward/std": 0.08219999819993973,
|
|
"rewards/format_valid/mean": 0.625,
|
|
"rewards/format_valid/std": 0.25,
|
|
"step": 168
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 30.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 30.0,
|
|
"completions/max_terminated_length": 30.0,
|
|
"completions/mean_length": 21.25,
|
|
"completions/mean_terminated_length": 21.25,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 0.0845,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 5.13167142868042,
|
|
"kl": 2.3502594381570816,
|
|
"learning_rate": 3.688888888888889e-05,
|
|
"loss": 0.094,
|
|
"num_tokens": 418887.0,
|
|
"reward": 0.03691243380308151,
|
|
"reward_std": 0.08749999105930328,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.11666667461395264,
|
|
"rewards/belief_accuracy/std": 0.03333333507180214,
|
|
"rewards/env_reward/mean": 0.2371082901954651,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 0.625,
|
|
"rewards/format_valid/std": 0.25,
|
|
"step": 169
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.25,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 20.0,
|
|
"completions/mean_length": 19.5,
|
|
"completions/mean_terminated_length": 15.333333969116211,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 0.085,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 3.1629037857055664,
|
|
"kl": 1.5842487215995789,
|
|
"learning_rate": 3.677777777777778e-05,
|
|
"loss": 0.0634,
|
|
"num_tokens": 421365.0,
|
|
"reward": 0.5254287719726562,
|
|
"reward_std": 0.11898240447044373,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": 0.533619225025177,
|
|
"rewards/env_reward/std": 0.07932159304618835,
|
|
"rewards/format_valid/mean": 0.5,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 170
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.5,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 13.0,
|
|
"completions/mean_length": 21.5,
|
|
"completions/mean_terminated_length": 11.0,
|
|
"completions/min_length": 9.0,
|
|
"completions/min_terminated_length": 9.0,
|
|
"epoch": 0.0855,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 3.2538228034973145,
|
|
"kl": 1.3759911209344864,
|
|
"learning_rate": 3.6666666666666666e-05,
|
|
"loss": 0.055,
|
|
"num_tokens": 423851.0,
|
|
"reward": 1.0300487279891968,
|
|
"reward_std": 0.04658208787441254,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.1066666692495346,
|
|
"rewards/belief_accuracy/std": 0.013333332724869251,
|
|
"rewards/env_reward/mean": 0.8791991472244263,
|
|
"rewards/env_reward/std": 0.01968872733414173,
|
|
"rewards/format_valid/mean": 0.625,
|
|
"rewards/format_valid/std": 0.25,
|
|
"step": 171
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.25,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 14.0,
|
|
"completions/mean_length": 17.75,
|
|
"completions/mean_terminated_length": 13.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 0.086,
|
|
"frac_reward_zero_std": 1.0,
|
|
"grad_norm": 0.15501928329467773,
|
|
"kl": 1.7516742050647736,
|
|
"learning_rate": 3.655555555555556e-05,
|
|
"loss": 0.0701,
|
|
"num_tokens": 426322.0,
|
|
"reward": 0.030182331800460815,
|
|
"reward_std": 0.0,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": 0.2034548968076706,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 0.5,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 172
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 15.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 15.0,
|
|
"completions/max_terminated_length": 15.0,
|
|
"completions/mean_length": 10.25,
|
|
"completions/mean_terminated_length": 10.25,
|
|
"completions/min_length": 8.0,
|
|
"completions/min_terminated_length": 8.0,
|
|
"epoch": 0.0865,
|
|
"frac_reward_zero_std": 1.0,
|
|
"grad_norm": 37.33258056640625,
|
|
"kl": 8.386772617697716,
|
|
"learning_rate": 3.644444444444445e-05,
|
|
"loss": 0.3355,
|
|
"num_tokens": 428763.0,
|
|
"reward": -0.9025059342384338,
|
|
"reward_std": 0.0,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": -0.41833725571632385,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 0.5,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 173
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.25,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 27.0,
|
|
"completions/mean_length": 18.5,
|
|
"completions/mean_terminated_length": 14.0,
|
|
"completions/min_length": 7.0,
|
|
"completions/min_terminated_length": 7.0,
|
|
"epoch": 0.087,
|
|
"frac_reward_zero_std": 1.0,
|
|
"grad_norm": 0.22313672304153442,
|
|
"kl": 1.8691215515136719,
|
|
"learning_rate": 3.633333333333333e-05,
|
|
"loss": 0.0748,
|
|
"num_tokens": 431237.0,
|
|
"reward": -0.16483666002750397,
|
|
"reward_std": 0.0,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": 0.07344222813844681,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 0.5,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 174
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 27.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 27.0,
|
|
"completions/max_terminated_length": 27.0,
|
|
"completions/mean_length": 18.25,
|
|
"completions/mean_terminated_length": 18.25,
|
|
"completions/min_length": 9.0,
|
|
"completions/min_terminated_length": 9.0,
|
|
"epoch": 0.0875,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 4.1793341636657715,
|
|
"kl": 2.593918561935425,
|
|
"learning_rate": 3.6222222222222225e-05,
|
|
"loss": 0.1038,
|
|
"num_tokens": 433710.0,
|
|
"reward": -0.9837551116943359,
|
|
"reward_std": 2.6553149223327637,
|
|
"rewards/action_legal/mean": -0.25,
|
|
"rewards/action_legal/std": 0.5,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": -0.44333672523498535,
|
|
"rewards/env_reward/std": 1.712130069732666,
|
|
"rewards/format_valid/mean": -0.125,
|
|
"rewards/format_valid/std": 1.25,
|
|
"step": 175
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.25,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 20.0,
|
|
"completions/mean_length": 18.5,
|
|
"completions/mean_terminated_length": 14.0,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.088,
|
|
"frac_reward_zero_std": 1.0,
|
|
"grad_norm": 0.10414294898509979,
|
|
"kl": 1.4019053727388382,
|
|
"learning_rate": 3.611111111111111e-05,
|
|
"loss": 0.0561,
|
|
"num_tokens": 436184.0,
|
|
"reward": -0.12919571995735168,
|
|
"reward_std": 0.0,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": 0.0972028523683548,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 0.5,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 176
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.25,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 20.0,
|
|
"completions/mean_length": 18.25,
|
|
"completions/mean_terminated_length": 13.666666984558105,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.0885,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 3.139967441558838,
|
|
"kl": 1.0769911333918571,
|
|
"learning_rate": 3.6e-05,
|
|
"loss": 0.0431,
|
|
"num_tokens": 438657.0,
|
|
"reward": 0.6967830657958984,
|
|
"reward_std": 0.08670443296432495,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": 0.6478554010391235,
|
|
"rewards/env_reward/std": 0.05780297517776489,
|
|
"rewards/format_valid/mean": 0.5,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 177
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.25,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 16.0,
|
|
"completions/mean_length": 16.0,
|
|
"completions/mean_terminated_length": 10.666666984558105,
|
|
"completions/min_length": 6.0,
|
|
"completions/min_terminated_length": 6.0,
|
|
"epoch": 0.089,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 3.0620081424713135,
|
|
"kl": 1.0732092261314392,
|
|
"learning_rate": 3.5888888888888886e-05,
|
|
"loss": 0.0429,
|
|
"num_tokens": 441121.0,
|
|
"reward": -1.2789283990859985,
|
|
"reward_std": 2.598663568496704,
|
|
"rewards/action_legal/mean": -0.25,
|
|
"rewards/action_legal/std": 0.5,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": -0.6401189565658569,
|
|
"rewards/env_reward/std": 1.6776195764541626,
|
|
"rewards/format_valid/mean": -0.125,
|
|
"rewards/format_valid/std": 1.25,
|
|
"step": 178
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.75,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 18.0,
|
|
"completions/mean_length": 28.5,
|
|
"completions/mean_terminated_length": 18.0,
|
|
"completions/min_length": 18.0,
|
|
"completions/min_terminated_length": 18.0,
|
|
"epoch": 0.0895,
|
|
"frac_reward_zero_std": 1.0,
|
|
"grad_norm": 0.18922147154808044,
|
|
"kl": 0.8863924369215965,
|
|
"learning_rate": 3.577777777777778e-05,
|
|
"loss": 0.0355,
|
|
"num_tokens": 443635.0,
|
|
"reward": 1.347588062286377,
|
|
"reward_std": 0.0,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": 1.0817253589630127,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 0.5,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 179
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 27.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 27.0,
|
|
"completions/max_terminated_length": 27.0,
|
|
"completions/mean_length": 16.5,
|
|
"completions/mean_terminated_length": 16.5,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.09,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 3.3570539951324463,
|
|
"kl": 1.9235362261533737,
|
|
"learning_rate": 3.566666666666667e-05,
|
|
"loss": 0.0769,
|
|
"num_tokens": 446101.0,
|
|
"reward": 0.05883501470088959,
|
|
"reward_std": 0.5488622784614563,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": 0.2225566804409027,
|
|
"rewards/env_reward/std": 0.3659081757068634,
|
|
"rewards/format_valid/mean": 0.5,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 180
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.25,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 25.0,
|
|
"completions/mean_length": 21.0,
|
|
"completions/mean_terminated_length": 17.33333396911621,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.0905,
|
|
"frac_reward_zero_std": 1.0,
|
|
"grad_norm": 0.08723417669534683,
|
|
"kl": 1.3284604251384735,
|
|
"learning_rate": 3.555555555555556e-05,
|
|
"loss": 0.0531,
|
|
"num_tokens": 448585.0,
|
|
"reward": 0.6743147373199463,
|
|
"reward_std": 0.0,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": 0.6328765153884888,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 0.5,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 181
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.25,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 13.0,
|
|
"completions/mean_length": 15.25,
|
|
"completions/mean_terminated_length": 9.666666984558105,
|
|
"completions/min_length": 7.0,
|
|
"completions/min_terminated_length": 7.0,
|
|
"epoch": 0.091,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 3.3371059894561768,
|
|
"kl": 1.4546705782413483,
|
|
"learning_rate": 3.5444444444444445e-05,
|
|
"loss": 0.0582,
|
|
"num_tokens": 451046.0,
|
|
"reward": -0.3032863438129425,
|
|
"reward_std": 0.33148258924484253,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": -0.018857555463910103,
|
|
"rewards/env_reward/std": 0.22098839282989502,
|
|
"rewards/format_valid/mean": 0.5,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 182
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.5,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 19.0,
|
|
"completions/mean_length": 24.5,
|
|
"completions/mean_terminated_length": 17.0,
|
|
"completions/min_length": 15.0,
|
|
"completions/min_terminated_length": 15.0,
|
|
"epoch": 0.0915,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 2.911696195602417,
|
|
"kl": 1.4696582406759262,
|
|
"learning_rate": 3.5333333333333336e-05,
|
|
"loss": 0.0588,
|
|
"num_tokens": 453544.0,
|
|
"reward": -2.4303359985351562,
|
|
"reward_std": 2.9513607025146484,
|
|
"rewards/action_legal/mean": -0.5,
|
|
"rewards/action_legal/std": 0.5773502588272095,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": -1.3785573244094849,
|
|
"rewards/env_reward/std": 1.9012062549591064,
|
|
"rewards/format_valid/mean": -0.75,
|
|
"rewards/format_valid/std": 1.4433757066726685,
|
|
"step": 183
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.25,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 15.0,
|
|
"completions/mean_length": 18.0,
|
|
"completions/mean_terminated_length": 13.333333969116211,
|
|
"completions/min_length": 11.0,
|
|
"completions/min_terminated_length": 11.0,
|
|
"epoch": 0.092,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 3.6932320594787598,
|
|
"kl": 1.533248096704483,
|
|
"learning_rate": 3.522222222222222e-05,
|
|
"loss": 0.0613,
|
|
"num_tokens": 456016.0,
|
|
"reward": -1.5931193828582764,
|
|
"reward_std": 3.8826847076416016,
|
|
"rewards/action_legal/mean": -0.5,
|
|
"rewards/action_legal/std": 0.5773502588272095,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": -0.8204129934310913,
|
|
"rewards/env_reward/std": 2.521214485168457,
|
|
"rewards/format_valid/mean": -0.75,
|
|
"rewards/format_valid/std": 1.4433757066726685,
|
|
"step": 184
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 17.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 17.0,
|
|
"completions/max_terminated_length": 17.0,
|
|
"completions/mean_length": 12.25,
|
|
"completions/mean_terminated_length": 12.25,
|
|
"completions/min_length": 8.0,
|
|
"completions/min_terminated_length": 8.0,
|
|
"epoch": 0.0925,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 4.075229167938232,
|
|
"kl": 1.968793198466301,
|
|
"learning_rate": 3.511111111111111e-05,
|
|
"loss": 0.0788,
|
|
"num_tokens": 458465.0,
|
|
"reward": -0.23590603470802307,
|
|
"reward_std": 0.2219301015138626,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": 0.026062656193971634,
|
|
"rewards/env_reward/std": 0.14795339107513428,
|
|
"rewards/format_valid/mean": 0.5,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 185
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 10.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 10.0,
|
|
"completions/max_terminated_length": 10.0,
|
|
"completions/mean_length": 8.25,
|
|
"completions/mean_terminated_length": 8.25,
|
|
"completions/min_length": 6.0,
|
|
"completions/min_terminated_length": 6.0,
|
|
"epoch": 0.093,
|
|
"frac_reward_zero_std": 1.0,
|
|
"grad_norm": 2.784778594970703,
|
|
"kl": 2.159162014722824,
|
|
"learning_rate": 3.5e-05,
|
|
"loss": 0.0864,
|
|
"num_tokens": 460898.0,
|
|
"reward": -0.3167000114917755,
|
|
"reward_std": 0.0,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": -0.027799999341368675,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 0.5,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 186
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.5,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 30.0,
|
|
"completions/mean_length": 29.25,
|
|
"completions/mean_terminated_length": 26.5,
|
|
"completions/min_length": 23.0,
|
|
"completions/min_terminated_length": 23.0,
|
|
"epoch": 0.0935,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 3.384214162826538,
|
|
"kl": 1.2219679579138756,
|
|
"learning_rate": 3.4888888888888895e-05,
|
|
"loss": 0.0489,
|
|
"num_tokens": 463415.0,
|
|
"reward": -1.0766644477844238,
|
|
"reward_std": 2.582223653793335,
|
|
"rewards/action_legal/mean": -0.25,
|
|
"rewards/action_legal/std": 0.5,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": -0.5052763223648071,
|
|
"rewards/env_reward/std": 1.6631492376327515,
|
|
"rewards/format_valid/mean": -0.125,
|
|
"rewards/format_valid/std": 1.25,
|
|
"step": 187
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 24.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 24.0,
|
|
"completions/max_terminated_length": 24.0,
|
|
"completions/mean_length": 14.75,
|
|
"completions/mean_terminated_length": 14.75,
|
|
"completions/min_length": 9.0,
|
|
"completions/min_terminated_length": 9.0,
|
|
"epoch": 0.094,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 3.3013856410980225,
|
|
"kl": 1.2444797977805138,
|
|
"learning_rate": 3.477777777777778e-05,
|
|
"loss": 0.0498,
|
|
"num_tokens": 465874.0,
|
|
"reward": -2.264209508895874,
|
|
"reward_std": 3.1193079948425293,
|
|
"rewards/action_legal/mean": -0.5,
|
|
"rewards/action_legal/std": 0.5773502588272095,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": -1.2678064107894897,
|
|
"rewards/env_reward/std": 2.0125834941864014,
|
|
"rewards/format_valid/mean": -0.75,
|
|
"rewards/format_valid/std": 1.4433757066726685,
|
|
"step": 188
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.25,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 19.0,
|
|
"completions/mean_length": 17.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 5.0,
|
|
"completions/min_terminated_length": 5.0,
|
|
"epoch": 0.0945,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 2.398057460784912,
|
|
"kl": 1.1671398282051086,
|
|
"learning_rate": 3.466666666666667e-05,
|
|
"loss": 0.0467,
|
|
"num_tokens": 468342.0,
|
|
"reward": -0.32055363059043884,
|
|
"reward_std": 0.08688756823539734,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": -0.03036908432841301,
|
|
"rewards/env_reward/std": 0.05792504921555519,
|
|
"rewards/format_valid/mean": 0.5,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 189
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.25,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 8.0,
|
|
"completions/mean_length": 13.5,
|
|
"completions/mean_terminated_length": 7.333333492279053,
|
|
"completions/min_length": 7.0,
|
|
"completions/min_terminated_length": 7.0,
|
|
"epoch": 0.095,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 2.1980197429656982,
|
|
"kl": 1.4500057846307755,
|
|
"learning_rate": 3.4555555555555556e-05,
|
|
"loss": 0.058,
|
|
"num_tokens": 470796.0,
|
|
"reward": 0.38532906770706177,
|
|
"reward_std": 0.15841148793697357,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": 0.4402194023132324,
|
|
"rewards/env_reward/std": 0.10560767352581024,
|
|
"rewards/format_valid/mean": 0.5,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 190
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.5,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 15.0,
|
|
"completions/mean_length": 23.25,
|
|
"completions/mean_terminated_length": 14.5,
|
|
"completions/min_length": 14.0,
|
|
"completions/min_terminated_length": 14.0,
|
|
"epoch": 0.0955,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 2.9431092739105225,
|
|
"kl": 1.4747809767723083,
|
|
"learning_rate": 3.444444444444445e-05,
|
|
"loss": 0.059,
|
|
"num_tokens": 473289.0,
|
|
"reward": -1.0166206359863281,
|
|
"reward_std": 0.02063235081732273,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": -0.49441370368003845,
|
|
"rewards/env_reward/std": 0.0137548903003335,
|
|
"rewards/format_valid/mean": 0.5,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 191
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.25,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 21.0,
|
|
"completions/mean_length": 17.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 6.0,
|
|
"completions/min_terminated_length": 6.0,
|
|
"epoch": 0.096,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 7.505349159240723,
|
|
"kl": 1.3912545293569565,
|
|
"learning_rate": 3.433333333333333e-05,
|
|
"loss": 0.0557,
|
|
"num_tokens": 475757.0,
|
|
"reward": -0.6259548664093018,
|
|
"reward_std": 2.8915553092956543,
|
|
"rewards/action_legal/mean": -0.25,
|
|
"rewards/action_legal/std": 0.5,
|
|
"rewards/belief_accuracy/mean": -0.08980958163738251,
|
|
"rewards/belief_accuracy/std": 0.02038082852959633,
|
|
"rewards/env_reward/mean": -0.22935077548027039,
|
|
"rewards/env_reward/std": 1.8575823307037354,
|
|
"rewards/format_valid/mean": 0.0,
|
|
"rewards/format_valid/std": 1.3540064096450806,
|
|
"step": 192
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.25,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 18.0,
|
|
"completions/mean_length": 20.5,
|
|
"completions/mean_terminated_length": 16.666667938232422,
|
|
"completions/min_length": 15.0,
|
|
"completions/min_terminated_length": 15.0,
|
|
"epoch": 0.0965,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 2.4287314414978027,
|
|
"kl": 1.0645422227680683,
|
|
"learning_rate": 3.4222222222222224e-05,
|
|
"loss": 0.0426,
|
|
"num_tokens": 478239.0,
|
|
"reward": -1.1361416578292847,
|
|
"reward_std": 2.5584716796875,
|
|
"rewards/action_legal/mean": -0.25,
|
|
"rewards/action_legal/std": 0.5,
|
|
"rewards/belief_accuracy/mean": -0.18546631932258606,
|
|
"rewards/belief_accuracy/std": 0.059523556381464005,
|
|
"rewards/env_reward/mean": -0.3864951729774475,
|
|
"rewards/env_reward/std": 1.7521181106567383,
|
|
"rewards/format_valid/mean": 0.25,
|
|
"rewards/format_valid/std": 1.5,
|
|
"step": 193
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.75,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 13.0,
|
|
"completions/mean_length": 27.25,
|
|
"completions/mean_terminated_length": 13.0,
|
|
"completions/min_length": 13.0,
|
|
"completions/min_terminated_length": 13.0,
|
|
"epoch": 0.097,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 2.376044511795044,
|
|
"kl": 0.6852857172489166,
|
|
"learning_rate": 3.411111111111111e-05,
|
|
"loss": 0.0274,
|
|
"num_tokens": 480748.0,
|
|
"reward": 0.5333235263824463,
|
|
"reward_std": 0.08749997615814209,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.11666667461395264,
|
|
"rewards/belief_accuracy/std": 0.03333333507180214,
|
|
"rewards/env_reward/mean": 0.5680490136146545,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 0.625,
|
|
"rewards/format_valid/std": 0.25,
|
|
"step": 194
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 23.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 23.0,
|
|
"completions/max_terminated_length": 23.0,
|
|
"completions/mean_length": 13.75,
|
|
"completions/mean_terminated_length": 13.75,
|
|
"completions/min_length": 9.0,
|
|
"completions/min_terminated_length": 9.0,
|
|
"epoch": 0.0975,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 3.4659929275512695,
|
|
"kl": 1.7528847455978394,
|
|
"learning_rate": 3.4000000000000007e-05,
|
|
"loss": 0.0701,
|
|
"num_tokens": 483203.0,
|
|
"reward": 1.018233299255371,
|
|
"reward_std": 0.009551048278808594,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": 0.862155556678772,
|
|
"rewards/env_reward/std": 0.006367385853081942,
|
|
"rewards/format_valid/mean": 0.5,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 195
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.75,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 27.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 0.098,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 7.004715442657471,
|
|
"kl": 0.813011210411787,
|
|
"learning_rate": 3.388888888888889e-05,
|
|
"loss": 0.0325,
|
|
"num_tokens": 485711.0,
|
|
"reward": -2.4635062217712402,
|
|
"reward_std": 2.8720784187316895,
|
|
"rewards/action_legal/mean": -0.5,
|
|
"rewards/action_legal/std": 0.5773502588272095,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": -1.4006710052490234,
|
|
"rewards/env_reward/std": 1.8473838567733765,
|
|
"rewards/format_valid/mean": -0.75,
|
|
"rewards/format_valid/std": 1.4433757066726685,
|
|
"step": 196
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.75,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 9.0,
|
|
"completions/mean_length": 26.25,
|
|
"completions/mean_terminated_length": 9.0,
|
|
"completions/min_length": 9.0,
|
|
"completions/min_terminated_length": 9.0,
|
|
"epoch": 0.0985,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 3.282201051712036,
|
|
"kl": 0.7132957000285387,
|
|
"learning_rate": 3.377777777777778e-05,
|
|
"loss": 0.0285,
|
|
"num_tokens": 488216.0,
|
|
"reward": -3.926431894302368,
|
|
"reward_std": 2.047135829925537,
|
|
"rewards/action_legal/mean": -0.75,
|
|
"rewards/action_legal/std": 0.5,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": -2.346787929534912,
|
|
"rewards/env_reward/std": 1.3064239025115967,
|
|
"rewards/format_valid/mean": -1.375,
|
|
"rewards/format_valid/std": 1.25,
|
|
"step": 197
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.5,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 13.0,
|
|
"completions/mean_length": 22.25,
|
|
"completions/mean_terminated_length": 12.5,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 0.099,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 1.8574674129486084,
|
|
"kl": 1.1133069694042206,
|
|
"learning_rate": 3.366666666666667e-05,
|
|
"loss": 0.0445,
|
|
"num_tokens": 490705.0,
|
|
"reward": 0.6392979621887207,
|
|
"reward_std": 0.2728678584098816,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.044744670391082764,
|
|
"rewards/belief_accuracy/std": 0.11051066219806671,
|
|
"rewards/env_reward/mean": 0.4948546886444092,
|
|
"rewards/env_reward/std": 0.41126659512519836,
|
|
"rewards/format_valid/mean": 0.625,
|
|
"rewards/format_valid/std": 0.25,
|
|
"step": 198
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.5,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 17.0,
|
|
"completions/mean_length": 22.25,
|
|
"completions/mean_terminated_length": 12.5,
|
|
"completions/min_length": 8.0,
|
|
"completions/min_terminated_length": 8.0,
|
|
"epoch": 0.0995,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 2.2241978645324707,
|
|
"kl": 1.400051310658455,
|
|
"learning_rate": 3.355555555555556e-05,
|
|
"loss": 0.056,
|
|
"num_tokens": 493194.0,
|
|
"reward": 0.16355225443840027,
|
|
"reward_std": 0.2303662747144699,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": 0.29236820340156555,
|
|
"rewards/env_reward/std": 0.153577521443367,
|
|
"rewards/format_valid/mean": 0.5,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 199
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.5,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 30.0,
|
|
"completions/mean_length": 29.75,
|
|
"completions/mean_terminated_length": 27.5,
|
|
"completions/min_length": 25.0,
|
|
"completions/min_terminated_length": 25.0,
|
|
"epoch": 0.1,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 3.1241228580474854,
|
|
"kl": 0.7815838046371937,
|
|
"learning_rate": 3.3444444444444443e-05,
|
|
"loss": 0.0313,
|
|
"num_tokens": 495713.0,
|
|
"reward": -1.4204142093658447,
|
|
"reward_std": 2.6858582496643066,
|
|
"rewards/action_legal/mean": -0.25,
|
|
"rewards/action_legal/std": 0.5,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": -0.7344428300857544,
|
|
"rewards/env_reward/std": 1.739694356918335,
|
|
"rewards/format_valid/mean": -0.125,
|
|
"rewards/format_valid/std": 1.25,
|
|
"step": 200
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.25,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 15.0,
|
|
"completions/mean_length": 18.25,
|
|
"completions/mean_terminated_length": 13.666666984558105,
|
|
"completions/min_length": 11.0,
|
|
"completions/min_terminated_length": 11.0,
|
|
"epoch": 0.1005,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 3.880967617034912,
|
|
"kl": 1.6194000542163849,
|
|
"learning_rate": 3.3333333333333335e-05,
|
|
"loss": 0.0648,
|
|
"num_tokens": 498186.0,
|
|
"reward": -1.1433579921722412,
|
|
"reward_std": 2.5394091606140137,
|
|
"rewards/action_legal/mean": -0.25,
|
|
"rewards/action_legal/std": 0.5,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": -0.5497387051582336,
|
|
"rewards/env_reward/std": 1.6346454620361328,
|
|
"rewards/format_valid/mean": -0.125,
|
|
"rewards/format_valid/std": 1.25,
|
|
"step": 201
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 23.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 23.0,
|
|
"completions/max_terminated_length": 23.0,
|
|
"completions/mean_length": 15.5,
|
|
"completions/mean_terminated_length": 15.5,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.101,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 3.6157476902008057,
|
|
"kl": 1.4809669330716133,
|
|
"learning_rate": 3.322222222222222e-05,
|
|
"loss": 0.0592,
|
|
"num_tokens": 500648.0,
|
|
"reward": -0.7693363428115845,
|
|
"reward_std": 2.7953262329101562,
|
|
"rewards/action_legal/mean": -0.25,
|
|
"rewards/action_legal/std": 0.5,
|
|
"rewards/belief_accuracy/mean": -0.0716666653752327,
|
|
"rewards/belief_accuracy/std": 0.05666666850447655,
|
|
"rewards/env_reward/mean": -0.3612242341041565,
|
|
"rewards/env_reward/std": 1.7597646713256836,
|
|
"rewards/format_valid/mean": 0.0,
|
|
"rewards/format_valid/std": 1.3540064096450806,
|
|
"step": 202
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 8.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 8.0,
|
|
"completions/max_terminated_length": 8.0,
|
|
"completions/mean_length": 7.75,
|
|
"completions/mean_terminated_length": 7.75,
|
|
"completions/min_length": 7.0,
|
|
"completions/min_terminated_length": 7.0,
|
|
"epoch": 0.1015,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 2.5044844150543213,
|
|
"kl": 0.5702618137001991,
|
|
"learning_rate": 3.311111111111112e-05,
|
|
"loss": 0.0228,
|
|
"num_tokens": 503079.0,
|
|
"reward": -0.09709322452545166,
|
|
"reward_std": 0.09302432835102081,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.1680680364370346,
|
|
"rewards/belief_accuracy/std": 0.03100811131298542,
|
|
"rewards/env_reward/mean": 0.2380739152431488,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 203
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.75,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 24.0,
|
|
"completions/mean_length": 30.0,
|
|
"completions/mean_terminated_length": 24.0,
|
|
"completions/min_length": 24.0,
|
|
"completions/min_terminated_length": 24.0,
|
|
"epoch": 0.102,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 2.225451707839966,
|
|
"kl": 0.6446680650115013,
|
|
"learning_rate": 3.3e-05,
|
|
"loss": 0.0258,
|
|
"num_tokens": 505599.0,
|
|
"reward": -1.0992940664291382,
|
|
"reward_std": 2.570491075515747,
|
|
"rewards/action_legal/mean": -0.25,
|
|
"rewards/action_legal/std": 0.5,
|
|
"rewards/belief_accuracy/mean": -0.11666667461395264,
|
|
"rewards/belief_accuracy/std": 0.03333333507180214,
|
|
"rewards/env_reward/mean": -0.49119603633880615,
|
|
"rewards/env_reward/std": 1.673166036605835,
|
|
"rewards/format_valid/mean": 0.0,
|
|
"rewards/format_valid/std": 1.3540064096450806,
|
|
"step": 204
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.5,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 16.0,
|
|
"completions/mean_length": 23.5,
|
|
"completions/mean_terminated_length": 15.0,
|
|
"completions/min_length": 14.0,
|
|
"completions/min_terminated_length": 14.0,
|
|
"epoch": 0.1025,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 11.69345760345459,
|
|
"kl": 0.9406535923480988,
|
|
"learning_rate": 3.2888888888888894e-05,
|
|
"loss": 0.0376,
|
|
"num_tokens": 508093.0,
|
|
"reward": -1.0868068933486938,
|
|
"reward_std": 2.6226813793182373,
|
|
"rewards/action_legal/mean": -0.25,
|
|
"rewards/action_legal/std": 0.5,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": -0.5120379328727722,
|
|
"rewards/env_reward/std": 1.6912070512771606,
|
|
"rewards/format_valid/mean": -0.125,
|
|
"rewards/format_valid/std": 1.25,
|
|
"step": 205
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 8.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 8.0,
|
|
"completions/max_terminated_length": 8.0,
|
|
"completions/mean_length": 8.0,
|
|
"completions/mean_terminated_length": 8.0,
|
|
"completions/min_length": 8.0,
|
|
"completions/min_terminated_length": 8.0,
|
|
"epoch": 0.103,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 3.8141772747039795,
|
|
"kl": 0.38117800280451775,
|
|
"learning_rate": 3.277777777777778e-05,
|
|
"loss": 0.0152,
|
|
"num_tokens": 510301.0,
|
|
"reward": 0.5716937780380249,
|
|
"reward_std": 0.2175557017326355,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.0,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": 0.34779584407806396,
|
|
"rewards/env_reward/std": 0.14503712952136993,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 206
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.25,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 21.0,
|
|
"completions/mean_length": 21.75,
|
|
"completions/mean_terminated_length": 18.33333396911621,
|
|
"completions/min_length": 17.0,
|
|
"completions/min_terminated_length": 17.0,
|
|
"epoch": 0.1035,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 2.4618332386016846,
|
|
"kl": 1.3801769241690636,
|
|
"learning_rate": 3.266666666666667e-05,
|
|
"loss": 0.0552,
|
|
"num_tokens": 512788.0,
|
|
"reward": -1.348587989807129,
|
|
"reward_std": 2.476418972015381,
|
|
"rewards/action_legal/mean": -0.25,
|
|
"rewards/action_legal/std": 0.5,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": -0.6865587830543518,
|
|
"rewards/env_reward/std": 1.5944546461105347,
|
|
"rewards/format_valid/mean": -0.125,
|
|
"rewards/format_valid/std": 1.25,
|
|
"step": 207
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.25,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 16.0,
|
|
"completions/mean_length": 17.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 8.0,
|
|
"completions/min_terminated_length": 8.0,
|
|
"epoch": 0.104,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 7.705834865570068,
|
|
"kl": 1.0085995495319366,
|
|
"learning_rate": 3.2555555555555555e-05,
|
|
"loss": 0.0403,
|
|
"num_tokens": 515256.0,
|
|
"reward": -1.0385560989379883,
|
|
"reward_std": 2.6225454807281494,
|
|
"rewards/action_legal/mean": -0.25,
|
|
"rewards/action_legal/std": 0.5,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": -0.4798707962036133,
|
|
"rewards/env_reward/std": 1.6903735399246216,
|
|
"rewards/format_valid/mean": -0.125,
|
|
"rewards/format_valid/std": 1.25,
|
|
"step": 208
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.25,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 15.0,
|
|
"completions/mean_length": 16.5,
|
|
"completions/mean_terminated_length": 11.333333969116211,
|
|
"completions/min_length": 8.0,
|
|
"completions/min_terminated_length": 8.0,
|
|
"epoch": 0.1045,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 5.924132823944092,
|
|
"kl": 1.7609535232186317,
|
|
"learning_rate": 3.2444444444444446e-05,
|
|
"loss": 0.0704,
|
|
"num_tokens": 517722.0,
|
|
"reward": -1.3413997888565063,
|
|
"reward_std": 2.4143919944763184,
|
|
"rewards/action_legal/mean": -0.25,
|
|
"rewards/action_legal/std": 0.5,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": -0.6817665696144104,
|
|
"rewards/env_reward/std": 1.5514785051345825,
|
|
"rewards/format_valid/mean": -0.125,
|
|
"rewards/format_valid/std": 1.25,
|
|
"step": 209
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.5,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 19.0,
|
|
"completions/mean_length": 25.5,
|
|
"completions/mean_terminated_length": 19.0,
|
|
"completions/min_length": 19.0,
|
|
"completions/min_terminated_length": 19.0,
|
|
"epoch": 0.105,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 7.369988918304443,
|
|
"kl": 1.1772667318582535,
|
|
"learning_rate": 3.233333333333333e-05,
|
|
"loss": 0.0471,
|
|
"num_tokens": 520224.0,
|
|
"reward": 0.25493913888931274,
|
|
"reward_std": 0.33257579803466797,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": 0.3532927930355072,
|
|
"rewards/env_reward/std": 0.2217172235250473,
|
|
"rewards/format_valid/mean": 0.5,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 210
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.75,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 18.0,
|
|
"completions/mean_length": 28.5,
|
|
"completions/mean_terminated_length": 18.0,
|
|
"completions/min_length": 18.0,
|
|
"completions/min_terminated_length": 18.0,
|
|
"epoch": 0.1055,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 2.5913333892822266,
|
|
"kl": 0.7529645264148712,
|
|
"learning_rate": 3.222222222222223e-05,
|
|
"loss": 0.0301,
|
|
"num_tokens": 522738.0,
|
|
"reward": -1.3859096765518188,
|
|
"reward_std": 2.4013755321502686,
|
|
"rewards/action_legal/mean": -0.25,
|
|
"rewards/action_legal/std": 0.5,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": -0.7114397883415222,
|
|
"rewards/env_reward/std": 1.5432217121124268,
|
|
"rewards/format_valid/mean": -0.125,
|
|
"rewards/format_valid/std": 1.25,
|
|
"step": 211
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.5,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 21.0,
|
|
"completions/mean_length": 24.5,
|
|
"completions/mean_terminated_length": 17.0,
|
|
"completions/min_length": 13.0,
|
|
"completions/min_terminated_length": 13.0,
|
|
"epoch": 0.106,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 4.363038539886475,
|
|
"kl": 1.2446223124861717,
|
|
"learning_rate": 3.2111111111111114e-05,
|
|
"loss": 0.0498,
|
|
"num_tokens": 525236.0,
|
|
"reward": -2.56288743019104,
|
|
"reward_std": 2.7684178352355957,
|
|
"rewards/action_legal/mean": -0.5,
|
|
"rewards/action_legal/std": 0.5773502588272095,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": -1.466925024986267,
|
|
"rewards/env_reward/std": 1.7785577774047852,
|
|
"rewards/format_valid/mean": -0.75,
|
|
"rewards/format_valid/std": 1.4433757066726685,
|
|
"step": 212
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 14.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 14.0,
|
|
"completions/max_terminated_length": 14.0,
|
|
"completions/mean_length": 9.5,
|
|
"completions/mean_terminated_length": 9.5,
|
|
"completions/min_length": 8.0,
|
|
"completions/min_terminated_length": 8.0,
|
|
"epoch": 0.1065,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 4.272139549255371,
|
|
"kl": 1.5829559713602066,
|
|
"learning_rate": 3.2000000000000005e-05,
|
|
"loss": 0.0633,
|
|
"num_tokens": 527450.0,
|
|
"reward": 1.3202344179153442,
|
|
"reward_std": 0.7838823199272156,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.0,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": 0.84682297706604,
|
|
"rewards/env_reward/std": 0.5225882530212402,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 213
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.25,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 21.0,
|
|
"completions/mean_length": 19.75,
|
|
"completions/mean_terminated_length": 15.666666984558105,
|
|
"completions/min_length": 13.0,
|
|
"completions/min_terminated_length": 13.0,
|
|
"epoch": 0.107,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 5.658717632293701,
|
|
"kl": 1.2298424392938614,
|
|
"learning_rate": 3.188888888888889e-05,
|
|
"loss": 0.0492,
|
|
"num_tokens": 529929.0,
|
|
"reward": -1.5011694431304932,
|
|
"reward_std": 2.299220323562622,
|
|
"rewards/action_legal/mean": -0.25,
|
|
"rewards/action_legal/std": 0.5,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": -0.78827965259552,
|
|
"rewards/env_reward/std": 1.474480390548706,
|
|
"rewards/format_valid/mean": -0.125,
|
|
"rewards/format_valid/std": 1.25,
|
|
"step": 214
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.5,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 27.0,
|
|
"completions/mean_length": 26.25,
|
|
"completions/mean_terminated_length": 20.5,
|
|
"completions/min_length": 14.0,
|
|
"completions/min_terminated_length": 14.0,
|
|
"epoch": 0.1075,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 3.059485912322998,
|
|
"kl": 0.9900188595056534,
|
|
"learning_rate": 3.177777777777778e-05,
|
|
"loss": 0.0396,
|
|
"num_tokens": 532434.0,
|
|
"reward": -2.2220005989074707,
|
|
"reward_std": 3.1529808044433594,
|
|
"rewards/action_legal/mean": -0.5,
|
|
"rewards/action_legal/std": 0.5773502588272095,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": -1.2396671772003174,
|
|
"rewards/env_reward/std": 2.0346951484680176,
|
|
"rewards/format_valid/mean": -0.75,
|
|
"rewards/format_valid/std": 1.4433757066726685,
|
|
"step": 215
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.75,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 16.0,
|
|
"completions/mean_length": 28.0,
|
|
"completions/mean_terminated_length": 16.0,
|
|
"completions/min_length": 16.0,
|
|
"completions/min_terminated_length": 16.0,
|
|
"epoch": 0.108,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 5.903228282928467,
|
|
"kl": 0.9180602729320526,
|
|
"learning_rate": 3.1666666666666666e-05,
|
|
"loss": 0.0367,
|
|
"num_tokens": 534946.0,
|
|
"reward": 0.06937577575445175,
|
|
"reward_std": 0.3579734265804291,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.05766364932060242,
|
|
"rewards/belief_accuracy/std": 0.084672711789608,
|
|
"rewards/env_reward/mean": 0.14074449241161346,
|
|
"rewards/env_reward/std": 0.25905489921569824,
|
|
"rewards/format_valid/mean": 0.625,
|
|
"rewards/format_valid/std": 0.25,
|
|
"step": 216
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.25,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 19.0,
|
|
"completions/mean_length": 20.5,
|
|
"completions/mean_terminated_length": 16.666667938232422,
|
|
"completions/min_length": 15.0,
|
|
"completions/min_terminated_length": 15.0,
|
|
"epoch": 0.1085,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 7.5056562423706055,
|
|
"kl": 1.428985133767128,
|
|
"learning_rate": 3.155555555555556e-05,
|
|
"loss": 0.0572,
|
|
"num_tokens": 537428.0,
|
|
"reward": -0.04300477355718613,
|
|
"reward_std": 0.1483583301305771,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": 0.15466348826885223,
|
|
"rewards/env_reward/std": 0.09890555590391159,
|
|
"rewards/format_valid/mean": 0.5,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 217
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.75,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 24.0,
|
|
"completions/mean_length": 30.0,
|
|
"completions/mean_terminated_length": 24.0,
|
|
"completions/min_length": 24.0,
|
|
"completions/min_terminated_length": 24.0,
|
|
"epoch": 0.109,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 2.7241830825805664,
|
|
"kl": 0.9578761979937553,
|
|
"learning_rate": 3.144444444444445e-05,
|
|
"loss": 0.0383,
|
|
"num_tokens": 539948.0,
|
|
"reward": -1.9331963062286377,
|
|
"reward_std": 2.0602900981903076,
|
|
"rewards/action_legal/mean": -0.25,
|
|
"rewards/action_legal/std": 0.5,
|
|
"rewards/belief_accuracy/mean": -0.0716666653752327,
|
|
"rewards/belief_accuracy/std": 0.05666666850447655,
|
|
"rewards/env_reward/mean": -1.137130856513977,
|
|
"rewards/env_reward/std": 1.2618883848190308,
|
|
"rewards/format_valid/mean": 0.0,
|
|
"rewards/format_valid/std": 1.3540064096450806,
|
|
"step": 218
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.5,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 32.0,
|
|
"completions/mean_length": 30.0,
|
|
"completions/mean_terminated_length": 28.0,
|
|
"completions/min_length": 24.0,
|
|
"completions/min_terminated_length": 24.0,
|
|
"epoch": 0.1095,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 4.32493782043457,
|
|
"kl": 0.58867571875453,
|
|
"learning_rate": 3.1333333333333334e-05,
|
|
"loss": 0.0235,
|
|
"num_tokens": 542468.0,
|
|
"reward": -1.2151740789413452,
|
|
"reward_std": 2.491729736328125,
|
|
"rewards/action_legal/mean": -0.25,
|
|
"rewards/action_legal/std": 0.5,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": -0.5976160764694214,
|
|
"rewards/env_reward/std": 1.6028647422790527,
|
|
"rewards/format_valid/mean": -0.125,
|
|
"rewards/format_valid/std": 1.25,
|
|
"step": 219
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 8.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 8.0,
|
|
"completions/max_terminated_length": 8.0,
|
|
"completions/mean_length": 8.0,
|
|
"completions/mean_terminated_length": 8.0,
|
|
"completions/min_length": 8.0,
|
|
"completions/min_terminated_length": 8.0,
|
|
"epoch": 0.11,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 1.791263461112976,
|
|
"kl": 0.3568975552916527,
|
|
"learning_rate": 3.1222222222222225e-05,
|
|
"loss": 0.0143,
|
|
"num_tokens": 544900.0,
|
|
"reward": 0.03496697545051575,
|
|
"reward_std": 0.06415002793073654,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.17913591861724854,
|
|
"rewards/belief_accuracy/std": 0.021383339539170265,
|
|
"rewards/env_reward/mean": 0.3482498526573181,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 220
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.75,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 18.0,
|
|
"completions/mean_length": 28.5,
|
|
"completions/mean_terminated_length": 18.0,
|
|
"completions/min_length": 18.0,
|
|
"completions/min_terminated_length": 18.0,
|
|
"epoch": 0.1105,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 2.383763313293457,
|
|
"kl": 0.8193067982792854,
|
|
"learning_rate": 3.111111111111111e-05,
|
|
"loss": 0.0328,
|
|
"num_tokens": 547414.0,
|
|
"reward": 1.203812599182129,
|
|
"reward_std": 0.6176812648773193,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": 0.9858750700950623,
|
|
"rewards/env_reward/std": 0.41178756952285767,
|
|
"rewards/format_valid/mean": 0.5,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 221
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.75,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 13.0,
|
|
"completions/mean_length": 27.25,
|
|
"completions/mean_terminated_length": 13.0,
|
|
"completions/min_length": 13.0,
|
|
"completions/min_terminated_length": 13.0,
|
|
"epoch": 0.111,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 2.43298602104187,
|
|
"kl": 1.0093542635440826,
|
|
"learning_rate": 3.1e-05,
|
|
"loss": 0.0404,
|
|
"num_tokens": 549923.0,
|
|
"reward": -0.0007572025060653687,
|
|
"reward_std": 0.016494423151016235,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.10740740597248077,
|
|
"rewards/belief_accuracy/std": 0.014814812690019608,
|
|
"rewards/env_reward/mean": 0.19347669184207916,
|
|
"rewards/env_reward/std": 0.010300002992153168,
|
|
"rewards/format_valid/mean": 0.625,
|
|
"rewards/format_valid/std": 0.25,
|
|
"step": 222
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.75,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 20.0,
|
|
"completions/mean_length": 29.0,
|
|
"completions/mean_terminated_length": 20.0,
|
|
"completions/min_length": 20.0,
|
|
"completions/min_terminated_length": 20.0,
|
|
"epoch": 0.1115,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 6.343346118927002,
|
|
"kl": 0.7701031491160393,
|
|
"learning_rate": 3.088888888888889e-05,
|
|
"loss": 0.0308,
|
|
"num_tokens": 552439.0,
|
|
"reward": -1.5342886447906494,
|
|
"reward_std": 2.2832674980163574,
|
|
"rewards/action_legal/mean": -0.25,
|
|
"rewards/action_legal/std": 0.5,
|
|
"rewards/belief_accuracy/mean": -0.08815178275108337,
|
|
"rewards/belief_accuracy/std": 0.023696430027484894,
|
|
"rewards/env_reward/mean": -0.8382222652435303,
|
|
"rewards/env_reward/std": 1.4423881769180298,
|
|
"rewards/format_valid/mean": 0.0,
|
|
"rewards/format_valid/std": 1.3540064096450806,
|
|
"step": 223
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.75,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 13.0,
|
|
"completions/mean_length": 27.25,
|
|
"completions/mean_terminated_length": 13.0,
|
|
"completions/min_length": 13.0,
|
|
"completions/min_terminated_length": 13.0,
|
|
"epoch": 0.112,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 3.0897319316864014,
|
|
"kl": 0.8843832314014435,
|
|
"learning_rate": 3.077777777777778e-05,
|
|
"loss": 0.0354,
|
|
"num_tokens": 554948.0,
|
|
"reward": 0.1540832221508026,
|
|
"reward_std": 0.3211406171321869,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.08613713085651398,
|
|
"rewards/belief_accuracy/std": 0.02772573195397854,
|
|
"rewards/env_reward/mean": 0.25416308641433716,
|
|
"rewards/env_reward/std": 0.18371644616127014,
|
|
"rewards/format_valid/mean": 0.625,
|
|
"rewards/format_valid/std": 0.25,
|
|
"step": 224
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.75,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 27.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 0.1125,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 14.237890243530273,
|
|
"kl": 0.8649509251117706,
|
|
"learning_rate": 3.066666666666667e-05,
|
|
"loss": 0.0346,
|
|
"num_tokens": 557456.0,
|
|
"reward": 1.5224132537841797,
|
|
"reward_std": 1.171297311782837,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.0333574041724205,
|
|
"rewards/belief_accuracy/std": 0.13328517973423004,
|
|
"rewards/env_reward/mean": 1.060823678970337,
|
|
"rewards/env_reward/std": 0.900728166103363,
|
|
"rewards/format_valid/mean": 0.625,
|
|
"rewards/format_valid/std": 0.25,
|
|
"step": 225
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.5,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 32.0,
|
|
"completions/mean_length": 29.25,
|
|
"completions/mean_terminated_length": 26.5,
|
|
"completions/min_length": 21.0,
|
|
"completions/min_terminated_length": 21.0,
|
|
"epoch": 0.113,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 2.398747205734253,
|
|
"kl": 0.5030911080539227,
|
|
"learning_rate": 3.055555555555556e-05,
|
|
"loss": 0.0201,
|
|
"num_tokens": 559973.0,
|
|
"reward": -0.10044729709625244,
|
|
"reward_std": 0.14265108108520508,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": 0.11636848002672195,
|
|
"rewards/env_reward/std": 0.09510072320699692,
|
|
"rewards/format_valid/mean": 0.5,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 226
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 1.0,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 0.0,
|
|
"completions/mean_length": 32.0,
|
|
"completions/mean_terminated_length": 0.0,
|
|
"completions/min_length": 32.0,
|
|
"completions/min_terminated_length": 0.0,
|
|
"epoch": 0.1135,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 3.1962196826934814,
|
|
"kl": 0.7515930682420731,
|
|
"learning_rate": 3.044444444444445e-05,
|
|
"loss": 0.0301,
|
|
"num_tokens": 562501.0,
|
|
"reward": 0.33509939908981323,
|
|
"reward_std": 0.4751393795013428,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": 0.406732976436615,
|
|
"rewards/env_reward/std": 0.3167595863342285,
|
|
"rewards/format_valid/mean": 0.5,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 227
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.75,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 21.0,
|
|
"completions/mean_length": 29.25,
|
|
"completions/mean_terminated_length": 21.0,
|
|
"completions/min_length": 21.0,
|
|
"completions/min_terminated_length": 21.0,
|
|
"epoch": 0.114,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 3.724390745162964,
|
|
"kl": 1.3262446075677872,
|
|
"learning_rate": 3.0333333333333337e-05,
|
|
"loss": 0.053,
|
|
"num_tokens": 565018.0,
|
|
"reward": -2.383183717727661,
|
|
"reward_std": 2.9639039039611816,
|
|
"rewards/action_legal/mean": -0.5,
|
|
"rewards/action_legal/std": 0.5773502588272095,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": -1.3471225500106812,
|
|
"rewards/env_reward/std": 1.9085785150527954,
|
|
"rewards/format_valid/mean": -0.75,
|
|
"rewards/format_valid/std": 1.4433757066726685,
|
|
"step": 228
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.5,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 17.0,
|
|
"completions/mean_length": 23.75,
|
|
"completions/mean_terminated_length": 15.5,
|
|
"completions/min_length": 14.0,
|
|
"completions/min_terminated_length": 14.0,
|
|
"epoch": 0.1145,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 3.0508358478546143,
|
|
"kl": 0.805017001926899,
|
|
"learning_rate": 3.0222222222222225e-05,
|
|
"loss": 0.0322,
|
|
"num_tokens": 567513.0,
|
|
"reward": -1.0678391456604004,
|
|
"reward_std": 2.5887842178344727,
|
|
"rewards/action_legal/mean": -0.25,
|
|
"rewards/action_legal/std": 0.5,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": -0.4993927776813507,
|
|
"rewards/env_reward/std": 1.6675386428833008,
|
|
"rewards/format_valid/mean": -0.125,
|
|
"rewards/format_valid/std": 1.25,
|
|
"step": 229
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.5,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 18.0,
|
|
"completions/mean_length": 24.75,
|
|
"completions/mean_terminated_length": 17.5,
|
|
"completions/min_length": 17.0,
|
|
"completions/min_terminated_length": 17.0,
|
|
"epoch": 0.115,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 2.8914108276367188,
|
|
"kl": 1.2341727763414383,
|
|
"learning_rate": 3.0111111111111113e-05,
|
|
"loss": 0.0494,
|
|
"num_tokens": 570012.0,
|
|
"reward": -1.0324312448501587,
|
|
"reward_std": 0.03943846374750137,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": -0.5049540996551514,
|
|
"rewards/env_reward/std": 0.026292279362678528,
|
|
"rewards/format_valid/mean": 0.5,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 230
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.75,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 13.0,
|
|
"completions/mean_length": 27.25,
|
|
"completions/mean_terminated_length": 13.0,
|
|
"completions/min_length": 13.0,
|
|
"completions/min_terminated_length": 13.0,
|
|
"epoch": 0.1155,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 3.649041175842285,
|
|
"kl": 0.9338645786046982,
|
|
"learning_rate": 3e-05,
|
|
"loss": 0.0374,
|
|
"num_tokens": 572521.0,
|
|
"reward": -2.383704423904419,
|
|
"reward_std": 2.9854514598846436,
|
|
"rewards/action_legal/mean": -0.5,
|
|
"rewards/action_legal/std": 0.5773502588272095,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": -1.347469687461853,
|
|
"rewards/env_reward/std": 1.9234607219696045,
|
|
"rewards/format_valid/mean": -0.75,
|
|
"rewards/format_valid/std": 1.4433757066726685,
|
|
"step": 231
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.75,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 11.0,
|
|
"completions/mean_length": 26.75,
|
|
"completions/mean_terminated_length": 11.0,
|
|
"completions/min_length": 11.0,
|
|
"completions/min_terminated_length": 11.0,
|
|
"epoch": 0.116,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 5.132113933563232,
|
|
"kl": 1.6651656776666641,
|
|
"learning_rate": 2.988888888888889e-05,
|
|
"loss": 0.0666,
|
|
"num_tokens": 575028.0,
|
|
"reward": -0.17734336853027344,
|
|
"reward_std": 0.43010595440864563,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.11666667461395264,
|
|
"rewards/belief_accuracy/std": 0.03333333507180214,
|
|
"rewards/env_reward/mean": 0.09427107125520706,
|
|
"rewards/env_reward/std": 0.236448734998703,
|
|
"rewards/format_valid/mean": 0.625,
|
|
"rewards/format_valid/std": 0.25,
|
|
"step": 232
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.5,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 15.0,
|
|
"completions/mean_length": 22.5,
|
|
"completions/mean_terminated_length": 13.0,
|
|
"completions/min_length": 11.0,
|
|
"completions/min_terminated_length": 11.0,
|
|
"epoch": 0.1165,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 2.701383590698242,
|
|
"kl": 1.4364068657159805,
|
|
"learning_rate": 2.9777777777777777e-05,
|
|
"loss": 0.0575,
|
|
"num_tokens": 577518.0,
|
|
"reward": 0.2805197834968567,
|
|
"reward_std": 0.16961893439292908,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": 0.3703465461730957,
|
|
"rewards/env_reward/std": 0.11307929456233978,
|
|
"rewards/format_valid/mean": 0.5,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 233
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 8.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 8.0,
|
|
"completions/max_terminated_length": 8.0,
|
|
"completions/mean_length": 8.0,
|
|
"completions/mean_terminated_length": 8.0,
|
|
"completions/min_length": 8.0,
|
|
"completions/min_terminated_length": 8.0,
|
|
"epoch": 0.117,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 4.781160354614258,
|
|
"kl": 0.3255625441670418,
|
|
"learning_rate": 2.9666666666666672e-05,
|
|
"loss": 0.013,
|
|
"num_tokens": 579950.0,
|
|
"reward": 0.2083221822977066,
|
|
"reward_std": 0.367115318775177,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.15008686482906342,
|
|
"rewards/belief_accuracy/std": 0.12237177044153214,
|
|
"rewards/env_reward/mean": 0.40572187304496765,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 234
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.75,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 22.0,
|
|
"completions/mean_length": 29.5,
|
|
"completions/mean_terminated_length": 22.0,
|
|
"completions/min_length": 22.0,
|
|
"completions/min_terminated_length": 22.0,
|
|
"epoch": 0.1175,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 3.328470468521118,
|
|
"kl": 1.0922381281852722,
|
|
"learning_rate": 2.955555555555556e-05,
|
|
"loss": 0.0437,
|
|
"num_tokens": 582468.0,
|
|
"reward": 0.41766709089279175,
|
|
"reward_std": 0.20472979545593262,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": 0.46177807450294495,
|
|
"rewards/env_reward/std": 0.13648654520511627,
|
|
"rewards/format_valid/mean": 0.5,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 235
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.25,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 25.0,
|
|
"completions/mean_length": 20.75,
|
|
"completions/mean_terminated_length": 17.0,
|
|
"completions/min_length": 13.0,
|
|
"completions/min_terminated_length": 13.0,
|
|
"epoch": 0.118,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 3.3024332523345947,
|
|
"kl": 1.729993849992752,
|
|
"learning_rate": 2.9444444444444448e-05,
|
|
"loss": 0.0692,
|
|
"num_tokens": 584951.0,
|
|
"reward": 0.5654071569442749,
|
|
"reward_std": 0.20379649102687836,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.13415177166461945,
|
|
"rewards/belief_accuracy/std": 0.06830354034900665,
|
|
"rewards/env_reward/mean": 0.6244083642959595,
|
|
"rewards/env_reward/std": 0.07622048258781433,
|
|
"rewards/format_valid/mean": 0.625,
|
|
"rewards/format_valid/std": 0.25,
|
|
"step": 236
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 32.0,
|
|
"completions/mean_length": 20.5,
|
|
"completions/mean_terminated_length": 20.5,
|
|
"completions/min_length": 16.0,
|
|
"completions/min_terminated_length": 16.0,
|
|
"epoch": 0.1185,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 3.8457822799682617,
|
|
"kl": 2.1087397560477257,
|
|
"learning_rate": 2.9333333333333336e-05,
|
|
"loss": 0.0843,
|
|
"num_tokens": 587433.0,
|
|
"reward": 1.2190449237823486,
|
|
"reward_std": 0.21189068257808685,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": 0.9960300326347351,
|
|
"rewards/env_reward/std": 0.1412605196237564,
|
|
"rewards/format_valid/mean": 0.5,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 237
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 8.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 8.0,
|
|
"completions/max_terminated_length": 8.0,
|
|
"completions/mean_length": 8.0,
|
|
"completions/mean_terminated_length": 8.0,
|
|
"completions/min_length": 8.0,
|
|
"completions/min_terminated_length": 8.0,
|
|
"epoch": 0.119,
|
|
"frac_reward_zero_std": 1.0,
|
|
"grad_norm": 0.029239589348435402,
|
|
"kl": 0.5205878019332886,
|
|
"learning_rate": 2.9222222222222224e-05,
|
|
"loss": 0.0208,
|
|
"num_tokens": 589641.0,
|
|
"reward": 0.6929494738578796,
|
|
"reward_std": 0.0,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.0,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": 0.4286329746246338,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 238
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 29.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 29.0,
|
|
"completions/max_terminated_length": 29.0,
|
|
"completions/mean_length": 18.75,
|
|
"completions/mean_terminated_length": 18.75,
|
|
"completions/min_length": 14.0,
|
|
"completions/min_terminated_length": 14.0,
|
|
"epoch": 0.1195,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 5.953148365020752,
|
|
"kl": 1.5010789930820465,
|
|
"learning_rate": 2.9111111111111112e-05,
|
|
"loss": 0.06,
|
|
"num_tokens": 592116.0,
|
|
"reward": -0.19302129745483398,
|
|
"reward_std": 0.11821135133504868,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": 0.05465248227119446,
|
|
"rewards/env_reward/std": 0.07880757749080658,
|
|
"rewards/format_valid/mean": 0.5,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 239
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 8.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 8.0,
|
|
"completions/max_terminated_length": 8.0,
|
|
"completions/mean_length": 8.0,
|
|
"completions/mean_terminated_length": 8.0,
|
|
"completions/min_length": 8.0,
|
|
"completions/min_terminated_length": 8.0,
|
|
"epoch": 0.12,
|
|
"frac_reward_zero_std": 1.0,
|
|
"grad_norm": 0.028769580647349358,
|
|
"kl": 0.5208476185798645,
|
|
"learning_rate": 2.9e-05,
|
|
"loss": 0.0208,
|
|
"num_tokens": 594324.0,
|
|
"reward": 0.7799785137176514,
|
|
"reward_std": 0.0,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.0,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": 0.48665234446525574,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 240
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.5,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 13.0,
|
|
"completions/mean_length": 22.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 11.0,
|
|
"completions/min_terminated_length": 11.0,
|
|
"epoch": 0.1205,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 3.084097146987915,
|
|
"kl": 1.6838389933109283,
|
|
"learning_rate": 2.8888888888888888e-05,
|
|
"loss": 0.0674,
|
|
"num_tokens": 596812.0,
|
|
"reward": -0.08287781476974487,
|
|
"reward_std": 3.244748115539551,
|
|
"rewards/action_legal/mean": -0.25,
|
|
"rewards/action_legal/std": 0.5,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": 0.15724816918373108,
|
|
"rewards/env_reward/std": 2.104832172393799,
|
|
"rewards/format_valid/mean": -0.125,
|
|
"rewards/format_valid/std": 1.25,
|
|
"step": 241
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.25,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 15.0,
|
|
"completions/mean_length": 17.25,
|
|
"completions/mean_terminated_length": 12.333333969116211,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.121,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 2.273799180984497,
|
|
"kl": 1.8537000715732574,
|
|
"learning_rate": 2.877777777777778e-05,
|
|
"loss": 0.0741,
|
|
"num_tokens": 599281.0,
|
|
"reward": 0.5991692543029785,
|
|
"reward_std": 0.3846488893032074,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": 0.5827795267105103,
|
|
"rewards/env_reward/std": 0.25643259286880493,
|
|
"rewards/format_valid/mean": 0.5,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 242
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.75,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 25.0,
|
|
"completions/mean_length": 30.25,
|
|
"completions/mean_terminated_length": 25.0,
|
|
"completions/min_length": 25.0,
|
|
"completions/min_terminated_length": 25.0,
|
|
"epoch": 0.1215,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 6.259014129638672,
|
|
"kl": 1.056531861424446,
|
|
"learning_rate": 2.8666666666666668e-05,
|
|
"loss": 0.0423,
|
|
"num_tokens": 601802.0,
|
|
"reward": 1.1337945461273193,
|
|
"reward_std": 0.15877185761928558,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.11666667461395264,
|
|
"rewards/belief_accuracy/std": 0.03333333507180214,
|
|
"rewards/env_reward/mean": 0.9683631062507629,
|
|
"rewards/env_reward/std": 0.1437581330537796,
|
|
"rewards/format_valid/mean": 0.625,
|
|
"rewards/format_valid/std": 0.25,
|
|
"step": 243
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.75,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 32.0,
|
|
"completions/mean_length": 32.0,
|
|
"completions/mean_terminated_length": 32.0,
|
|
"completions/min_length": 32.0,
|
|
"completions/min_terminated_length": 32.0,
|
|
"epoch": 0.122,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 3.7003579139709473,
|
|
"kl": 0.6380213499069214,
|
|
"learning_rate": 2.855555555555556e-05,
|
|
"loss": 0.0255,
|
|
"num_tokens": 604330.0,
|
|
"reward": 0.300573468208313,
|
|
"reward_std": 0.29817959666252136,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": 0.3837156891822815,
|
|
"rewards/env_reward/std": 0.1987864077091217,
|
|
"rewards/format_valid/mean": 0.5,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 244
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.75,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 14.0,
|
|
"completions/mean_length": 27.5,
|
|
"completions/mean_terminated_length": 14.0,
|
|
"completions/min_length": 14.0,
|
|
"completions/min_terminated_length": 14.0,
|
|
"epoch": 0.1225,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 6.614965438842773,
|
|
"kl": 1.0555044412612915,
|
|
"learning_rate": 2.8444444444444447e-05,
|
|
"loss": 0.0422,
|
|
"num_tokens": 606840.0,
|
|
"reward": 0.2127276510000229,
|
|
"reward_std": 0.07096138596534729,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": 0.32515180110931396,
|
|
"rewards/env_reward/std": 0.04730759561061859,
|
|
"rewards/format_valid/mean": 0.5,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 245
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 24.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 24.0,
|
|
"completions/max_terminated_length": 24.0,
|
|
"completions/mean_length": 18.5,
|
|
"completions/mean_terminated_length": 18.5,
|
|
"completions/min_length": 15.0,
|
|
"completions/min_terminated_length": 15.0,
|
|
"epoch": 0.123,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 4.199899673461914,
|
|
"kl": 1.0866071283817291,
|
|
"learning_rate": 2.8333333333333335e-05,
|
|
"loss": 0.0435,
|
|
"num_tokens": 609314.0,
|
|
"reward": 0.6166397929191589,
|
|
"reward_std": 0.013659524731338024,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": 0.5944265127182007,
|
|
"rewards/env_reward/std": 0.009106338024139404,
|
|
"rewards/format_valid/mean": 0.5,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 246
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.25,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 24.0,
|
|
"completions/mean_length": 21.0,
|
|
"completions/mean_terminated_length": 17.33333396911621,
|
|
"completions/min_length": 11.0,
|
|
"completions/min_terminated_length": 11.0,
|
|
"epoch": 0.1235,
|
|
"frac_reward_zero_std": 1.0,
|
|
"grad_norm": 0.5806828737258911,
|
|
"kl": 2.1766858994960785,
|
|
"learning_rate": 2.8222222222222223e-05,
|
|
"loss": 0.0871,
|
|
"num_tokens": 611798.0,
|
|
"reward": -0.33692148327827454,
|
|
"reward_std": 0.0,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": -0.04128097742795944,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 0.5,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 247
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.5,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 9.0,
|
|
"completions/mean_length": 20.25,
|
|
"completions/mean_terminated_length": 8.5,
|
|
"completions/min_length": 8.0,
|
|
"completions/min_terminated_length": 8.0,
|
|
"epoch": 0.124,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 2.1608145236968994,
|
|
"kl": 0.6925233453512192,
|
|
"learning_rate": 2.811111111111111e-05,
|
|
"loss": 0.0277,
|
|
"num_tokens": 614279.0,
|
|
"reward": -0.17361339926719666,
|
|
"reward_std": 0.17766423523426056,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": 0.06759107857942581,
|
|
"rewards/env_reward/std": 0.1184428334236145,
|
|
"rewards/format_valid/mean": 0.5,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 248
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 9.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 9.0,
|
|
"completions/max_terminated_length": 9.0,
|
|
"completions/mean_length": 8.5,
|
|
"completions/mean_terminated_length": 8.5,
|
|
"completions/min_length": 8.0,
|
|
"completions/min_terminated_length": 8.0,
|
|
"epoch": 0.1245,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 4.571046352386475,
|
|
"kl": 1.0061021000146866,
|
|
"learning_rate": 2.8000000000000003e-05,
|
|
"loss": 0.0402,
|
|
"num_tokens": 616713.0,
|
|
"reward": 0.13527683913707733,
|
|
"reward_std": 0.11952438950538635,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.1666666716337204,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": 0.39018458127975464,
|
|
"rewards/env_reward/std": 0.07968293130397797,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 249
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 15.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 15.0,
|
|
"completions/max_terminated_length": 15.0,
|
|
"completions/mean_length": 11.75,
|
|
"completions/mean_terminated_length": 11.75,
|
|
"completions/min_length": 9.0,
|
|
"completions/min_terminated_length": 9.0,
|
|
"epoch": 0.125,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 8.622580528259277,
|
|
"kl": 3.023313194513321,
|
|
"learning_rate": 2.788888888888889e-05,
|
|
"loss": 0.1209,
|
|
"num_tokens": 619160.0,
|
|
"reward": 1.0675362348556519,
|
|
"reward_std": 0.21132755279541016,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": 0.8950241804122925,
|
|
"rewards/env_reward/std": 0.14088504016399384,
|
|
"rewards/format_valid/mean": 0.5,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 250
|
|
}
|
|
],
|
|
"logging_steps": 1,
|
|
"max_steps": 500,
|
|
"num_input_tokens_seen": 619160,
|
|
"num_train_epochs": 1,
|
|
"save_steps": 250,
|
|
"stateful_callbacks": {
|
|
"TrainerControl": {
|
|
"args": {
|
|
"should_epoch_stop": false,
|
|
"should_evaluate": false,
|
|
"should_log": false,
|
|
"should_save": true,
|
|
"should_training_stop": false
|
|
},
|
|
"attributes": {}
|
|
}
|
|
},
|
|
"total_flos": 0.0,
|
|
"train_batch_size": 1,
|
|
"trial_name": null,
|
|
"trial_params": null
|
|
}
|