16535 lines
590 KiB
JSON
16535 lines
590 KiB
JSON
{
|
|
"best_global_step": null,
|
|
"best_metric": null,
|
|
"best_model_checkpoint": null,
|
|
"epoch": 0.25,
|
|
"eval_steps": 500,
|
|
"global_step": 500,
|
|
"is_hyper_param_search": false,
|
|
"is_local_process_zero": true,
|
|
"is_world_process_zero": true,
|
|
"log_history": [
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.75,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 20.0,
|
|
"completions/mean_length": 29.0,
|
|
"completions/mean_terminated_length": 20.0,
|
|
"completions/min_length": 20.0,
|
|
"completions/min_terminated_length": 20.0,
|
|
"epoch": 0.0005,
|
|
"frac_reward_zero_std": 1.0,
|
|
"grad_norm": 0.32243695855140686,
|
|
"kl": 0.016345822252333164,
|
|
"learning_rate": 0.0,
|
|
"loss": 0.0007,
|
|
"num_tokens": 2516.0,
|
|
"reward": -4.949999809265137,
|
|
"reward_std": 0.0,
|
|
"rewards/action_legal/mean": -1.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": -3.0,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": -2.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 1
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.75,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 23.0,
|
|
"completions/mean_length": 29.75,
|
|
"completions/mean_terminated_length": 23.0,
|
|
"completions/min_length": 23.0,
|
|
"completions/min_terminated_length": 23.0,
|
|
"epoch": 0.001,
|
|
"frac_reward_zero_std": 1.0,
|
|
"grad_norm": 0.1738930642604828,
|
|
"kl": 0.0056577762588858604,
|
|
"learning_rate": 1.0000000000000002e-06,
|
|
"loss": 0.0002,
|
|
"num_tokens": 5035.0,
|
|
"reward": -4.949999809265137,
|
|
"reward_std": 0.0,
|
|
"rewards/action_legal/mean": -1.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": -3.0,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": -2.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 2
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.75,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 14.0,
|
|
"completions/mean_length": 27.5,
|
|
"completions/mean_terminated_length": 14.0,
|
|
"completions/min_length": 14.0,
|
|
"completions/min_terminated_length": 14.0,
|
|
"epoch": 0.0015,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 53.453521728515625,
|
|
"kl": 0.027107596397399902,
|
|
"learning_rate": 2.0000000000000003e-06,
|
|
"loss": 0.0011,
|
|
"num_tokens": 7545.0,
|
|
"reward": -3.724677085876465,
|
|
"reward_std": 2.4506454467773438,
|
|
"rewards/action_legal/mean": -0.75,
|
|
"rewards/action_legal/std": 0.5,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": -2.212284803390503,
|
|
"rewards/env_reward/std": 1.5754303932189941,
|
|
"rewards/format_valid/mean": -1.375,
|
|
"rewards/format_valid/std": 1.25,
|
|
"step": 3
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.75,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 22.0,
|
|
"completions/mean_length": 29.5,
|
|
"completions/mean_terminated_length": 22.0,
|
|
"completions/min_length": 22.0,
|
|
"completions/min_terminated_length": 22.0,
|
|
"epoch": 0.002,
|
|
"frac_reward_zero_std": 1.0,
|
|
"grad_norm": 1.2984156608581543,
|
|
"kl": 0.013630361296236515,
|
|
"learning_rate": 3e-06,
|
|
"loss": 0.0005,
|
|
"num_tokens": 10063.0,
|
|
"reward": -4.949999809265137,
|
|
"reward_std": 0.0,
|
|
"rewards/action_legal/mean": -1.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": -3.0,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": -2.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 4
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.25,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 15.0,
|
|
"completions/mean_length": 18.25,
|
|
"completions/mean_terminated_length": 13.666666984558105,
|
|
"completions/min_length": 11.0,
|
|
"completions/min_terminated_length": 11.0,
|
|
"epoch": 0.0025,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 61.420711517333984,
|
|
"kl": 0.0825746851041913,
|
|
"learning_rate": 4.000000000000001e-06,
|
|
"loss": 0.0033,
|
|
"num_tokens": 12536.0,
|
|
"reward": -3.895512342453003,
|
|
"reward_std": 2.1089749336242676,
|
|
"rewards/action_legal/mean": -0.75,
|
|
"rewards/action_legal/std": 0.5,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": -2.3261749744415283,
|
|
"rewards/env_reward/std": 1.3476500511169434,
|
|
"rewards/format_valid/mean": -1.375,
|
|
"rewards/format_valid/std": 1.25,
|
|
"step": 5
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.25,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 32.0,
|
|
"completions/mean_length": 30.75,
|
|
"completions/mean_terminated_length": 30.33333396911621,
|
|
"completions/min_length": 29.0,
|
|
"completions/min_terminated_length": 29.0,
|
|
"epoch": 0.003,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 11.032736778259277,
|
|
"kl": 0.0035573970526456833,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.0001,
|
|
"num_tokens": 15059.0,
|
|
"reward": -2.084261417388916,
|
|
"reward_std": 3.3090696334838867,
|
|
"rewards/action_legal/mean": -0.5,
|
|
"rewards/action_legal/std": 0.5773502588272095,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": -1.147840976715088,
|
|
"rewards/env_reward/std": 2.1386890411376953,
|
|
"rewards/format_valid/mean": -0.75,
|
|
"rewards/format_valid/std": 1.4433757066726685,
|
|
"step": 6
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.75,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 13.0,
|
|
"completions/mean_length": 27.25,
|
|
"completions/mean_terminated_length": 13.0,
|
|
"completions/min_length": 13.0,
|
|
"completions/min_terminated_length": 13.0,
|
|
"epoch": 0.0035,
|
|
"frac_reward_zero_std": 1.0,
|
|
"grad_norm": 0.20612499117851257,
|
|
"kl": 0.007132542319595814,
|
|
"learning_rate": 6e-06,
|
|
"loss": 0.0003,
|
|
"num_tokens": 17568.0,
|
|
"reward": -4.949999809265137,
|
|
"reward_std": 0.0,
|
|
"rewards/action_legal/mean": -1.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": -3.0,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": -2.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 7
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 23.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 23.0,
|
|
"completions/max_terminated_length": 23.0,
|
|
"completions/mean_length": 20.25,
|
|
"completions/mean_terminated_length": 20.25,
|
|
"completions/min_length": 16.0,
|
|
"completions/min_terminated_length": 16.0,
|
|
"epoch": 0.004,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 10.10438346862793,
|
|
"kl": 0.05212839285377413,
|
|
"learning_rate": 7.000000000000001e-06,
|
|
"loss": 0.0021,
|
|
"num_tokens": 20049.0,
|
|
"reward": -3.4786999225616455,
|
|
"reward_std": 2.9425997734069824,
|
|
"rewards/action_legal/mean": -0.75,
|
|
"rewards/action_legal/std": 0.5,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": -2.04830002784729,
|
|
"rewards/env_reward/std": 1.90339994430542,
|
|
"rewards/format_valid/mean": -1.375,
|
|
"rewards/format_valid/std": 1.25,
|
|
"step": 8
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.5,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 29.0,
|
|
"completions/mean_length": 30.0,
|
|
"completions/mean_terminated_length": 28.0,
|
|
"completions/min_length": 27.0,
|
|
"completions/min_terminated_length": 27.0,
|
|
"epoch": 0.0045,
|
|
"frac_reward_zero_std": 1.0,
|
|
"grad_norm": 0.36320337653160095,
|
|
"kl": 0.005910599138587713,
|
|
"learning_rate": 8.000000000000001e-06,
|
|
"loss": 0.0002,
|
|
"num_tokens": 22569.0,
|
|
"reward": -4.949999809265137,
|
|
"reward_std": 0.0,
|
|
"rewards/action_legal/mean": -1.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": -3.0,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": -2.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 9
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 27.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 27.0,
|
|
"completions/max_terminated_length": 27.0,
|
|
"completions/mean_length": 20.75,
|
|
"completions/mean_terminated_length": 20.75,
|
|
"completions/min_length": 17.0,
|
|
"completions/min_terminated_length": 17.0,
|
|
"epoch": 0.005,
|
|
"frac_reward_zero_std": 1.0,
|
|
"grad_norm": 35.37952423095703,
|
|
"kl": 0.214208863559179,
|
|
"learning_rate": 9e-06,
|
|
"loss": 0.0086,
|
|
"num_tokens": 25052.0,
|
|
"reward": -4.949999809265137,
|
|
"reward_std": 0.0,
|
|
"rewards/action_legal/mean": -1.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": -3.0,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": -2.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 10
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.5,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 11.0,
|
|
"completions/mean_length": 21.0,
|
|
"completions/mean_terminated_length": 10.0,
|
|
"completions/min_length": 9.0,
|
|
"completions/min_terminated_length": 9.0,
|
|
"epoch": 0.0055,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 219.09710693359375,
|
|
"kl": 0.09427966503426433,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.0038,
|
|
"num_tokens": 27536.0,
|
|
"reward": -2.487870216369629,
|
|
"reward_std": 2.853968858718872,
|
|
"rewards/action_legal/mean": -0.5,
|
|
"rewards/action_legal/std": 0.5773502588272095,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": -1.4169135093688965,
|
|
"rewards/env_reward/std": 1.8355563879013062,
|
|
"rewards/format_valid/mean": -0.75,
|
|
"rewards/format_valid/std": 1.4433757066726685,
|
|
"step": 11
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.5,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 20.0,
|
|
"completions/mean_length": 26.0,
|
|
"completions/mean_terminated_length": 20.0,
|
|
"completions/min_length": 20.0,
|
|
"completions/min_terminated_length": 20.0,
|
|
"epoch": 0.006,
|
|
"frac_reward_zero_std": 1.0,
|
|
"grad_norm": 5.1641130447387695,
|
|
"kl": 0.02741223480552435,
|
|
"learning_rate": 1.1000000000000001e-05,
|
|
"loss": 0.0011,
|
|
"num_tokens": 30040.0,
|
|
"reward": -4.949999809265137,
|
|
"reward_std": 0.0,
|
|
"rewards/action_legal/mean": -1.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": -3.0,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": -2.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 12
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 25.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 25.0,
|
|
"completions/max_terminated_length": 25.0,
|
|
"completions/mean_length": 17.5,
|
|
"completions/mean_terminated_length": 17.5,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 0.0065,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 18.722599029541016,
|
|
"kl": 0.16925985834677704,
|
|
"learning_rate": 1.2e-05,
|
|
"loss": 0.0068,
|
|
"num_tokens": 32510.0,
|
|
"reward": -2.37943172454834,
|
|
"reward_std": 2.9682364463806152,
|
|
"rewards/action_legal/mean": -0.5,
|
|
"rewards/action_legal/std": 0.5773502588272095,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": -1.3446213006973267,
|
|
"rewards/env_reward/std": 1.9114667177200317,
|
|
"rewards/format_valid/mean": -0.75,
|
|
"rewards/format_valid/std": 1.4433757066726685,
|
|
"step": 13
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.75,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 22.0,
|
|
"completions/mean_length": 29.5,
|
|
"completions/mean_terminated_length": 22.0,
|
|
"completions/min_length": 22.0,
|
|
"completions/min_terminated_length": 22.0,
|
|
"epoch": 0.007,
|
|
"frac_reward_zero_std": 1.0,
|
|
"grad_norm": 0.1022278442978859,
|
|
"kl": 0.006297597661614418,
|
|
"learning_rate": 1.3000000000000001e-05,
|
|
"loss": 0.0003,
|
|
"num_tokens": 35028.0,
|
|
"reward": -4.949999809265137,
|
|
"reward_std": 0.0,
|
|
"rewards/action_legal/mean": -1.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": -3.0,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": -2.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 14
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 1.0,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 0.0,
|
|
"completions/mean_length": 32.0,
|
|
"completions/mean_terminated_length": 0.0,
|
|
"completions/min_length": 32.0,
|
|
"completions/min_terminated_length": 0.0,
|
|
"epoch": 0.0075,
|
|
"frac_reward_zero_std": 1.0,
|
|
"grad_norm": 0.032320525497198105,
|
|
"kl": 0.002568609546869993,
|
|
"learning_rate": 1.4000000000000001e-05,
|
|
"loss": 0.0001,
|
|
"num_tokens": 37556.0,
|
|
"reward": -4.949999809265137,
|
|
"reward_std": 0.0,
|
|
"rewards/action_legal/mean": -1.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": -3.0,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": -2.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 15
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.25,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 30.0,
|
|
"completions/mean_length": 24.75,
|
|
"completions/mean_terminated_length": 22.33333396911621,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 0.008,
|
|
"frac_reward_zero_std": 1.0,
|
|
"grad_norm": 8.214491844177246,
|
|
"kl": 0.041143732611089945,
|
|
"learning_rate": 1.5e-05,
|
|
"loss": 0.0016,
|
|
"num_tokens": 40055.0,
|
|
"reward": -4.949999809265137,
|
|
"reward_std": 0.0,
|
|
"rewards/action_legal/mean": -1.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": -3.0,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": -2.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 16
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 25.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 25.0,
|
|
"completions/max_terminated_length": 25.0,
|
|
"completions/mean_length": 19.0,
|
|
"completions/mean_terminated_length": 19.0,
|
|
"completions/min_length": 16.0,
|
|
"completions/min_terminated_length": 16.0,
|
|
"epoch": 0.0085,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 39.8804817199707,
|
|
"kl": 0.04003936113440432,
|
|
"learning_rate": 1.6000000000000003e-05,
|
|
"loss": 0.0016,
|
|
"num_tokens": 42531.0,
|
|
"reward": -2.5680184364318848,
|
|
"reward_std": 2.750802993774414,
|
|
"rewards/action_legal/mean": -0.5,
|
|
"rewards/action_legal/std": 0.5773502588272095,
|
|
"rewards/belief_accuracy/mean": -0.11806440353393555,
|
|
"rewards/belief_accuracy/std": 0.03612881526350975,
|
|
"rewards/env_reward/mean": -1.4383834600448608,
|
|
"rewards/env_reward/std": 1.8049958944320679,
|
|
"rewards/format_valid/mean": -0.625,
|
|
"rewards/format_valid/std": 1.6007810831069946,
|
|
"step": 17
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.75,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 17.0,
|
|
"completions/mean_length": 28.25,
|
|
"completions/mean_terminated_length": 17.0,
|
|
"completions/min_length": 17.0,
|
|
"completions/min_terminated_length": 17.0,
|
|
"epoch": 0.009,
|
|
"frac_reward_zero_std": 1.0,
|
|
"grad_norm": 1.495725393295288,
|
|
"kl": 0.019086187705397606,
|
|
"learning_rate": 1.7000000000000003e-05,
|
|
"loss": 0.0008,
|
|
"num_tokens": 45044.0,
|
|
"reward": -4.949999809265137,
|
|
"reward_std": 0.0,
|
|
"rewards/action_legal/mean": -1.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": -3.0,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": -2.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 18
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 32.0,
|
|
"completions/mean_length": 22.75,
|
|
"completions/mean_terminated_length": 22.75,
|
|
"completions/min_length": 14.0,
|
|
"completions/min_terminated_length": 14.0,
|
|
"epoch": 0.0095,
|
|
"frac_reward_zero_std": 1.0,
|
|
"grad_norm": 26.531925201416016,
|
|
"kl": 0.09912175685167313,
|
|
"learning_rate": 1.8e-05,
|
|
"loss": 0.004,
|
|
"num_tokens": 47535.0,
|
|
"reward": -4.949999809265137,
|
|
"reward_std": 0.0,
|
|
"rewards/action_legal/mean": -1.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": -3.0,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": -2.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 19
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 1.0,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 0.0,
|
|
"completions/mean_length": 32.0,
|
|
"completions/mean_terminated_length": 0.0,
|
|
"completions/min_length": 32.0,
|
|
"completions/min_terminated_length": 0.0,
|
|
"epoch": 0.01,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 8.011573791503906,
|
|
"kl": 0.0038544870913028717,
|
|
"learning_rate": 1.9e-05,
|
|
"loss": 0.0002,
|
|
"num_tokens": 50063.0,
|
|
"reward": -2.320432662963867,
|
|
"reward_std": 3.037968873977661,
|
|
"rewards/action_legal/mean": -0.5,
|
|
"rewards/action_legal/std": 0.5773502588272095,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": -1.305288553237915,
|
|
"rewards/env_reward/std": 1.9579919576644897,
|
|
"rewards/format_valid/mean": -0.75,
|
|
"rewards/format_valid/std": 1.4433757066726685,
|
|
"step": 20
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.5,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 21.0,
|
|
"completions/mean_length": 24.5,
|
|
"completions/mean_terminated_length": 17.0,
|
|
"completions/min_length": 13.0,
|
|
"completions/min_terminated_length": 13.0,
|
|
"epoch": 0.0105,
|
|
"frac_reward_zero_std": 1.0,
|
|
"grad_norm": 36.5081787109375,
|
|
"kl": 0.2546631218865514,
|
|
"learning_rate": 2e-05,
|
|
"loss": 0.0102,
|
|
"num_tokens": 52561.0,
|
|
"reward": -4.949999809265137,
|
|
"reward_std": 0.0,
|
|
"rewards/action_legal/mean": -1.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": -3.0,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": -2.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 21
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.75,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 26.0,
|
|
"completions/mean_length": 30.5,
|
|
"completions/mean_terminated_length": 26.0,
|
|
"completions/min_length": 26.0,
|
|
"completions/min_terminated_length": 26.0,
|
|
"epoch": 0.011,
|
|
"frac_reward_zero_std": 1.0,
|
|
"grad_norm": 1.0799552202224731,
|
|
"kl": 0.009861491620540619,
|
|
"learning_rate": 2.1e-05,
|
|
"loss": 0.0004,
|
|
"num_tokens": 55083.0,
|
|
"reward": -4.949999809265137,
|
|
"reward_std": 0.0,
|
|
"rewards/action_legal/mean": -1.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": -3.0,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": -2.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 22
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.5,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 28.0,
|
|
"completions/mean_length": 26.25,
|
|
"completions/mean_terminated_length": 20.5,
|
|
"completions/min_length": 13.0,
|
|
"completions/min_terminated_length": 13.0,
|
|
"epoch": 0.0115,
|
|
"frac_reward_zero_std": 1.0,
|
|
"grad_norm": 23.75230598449707,
|
|
"kl": 0.20189414219930768,
|
|
"learning_rate": 2.2000000000000003e-05,
|
|
"loss": 0.0081,
|
|
"num_tokens": 57588.0,
|
|
"reward": -4.949999809265137,
|
|
"reward_std": 0.0,
|
|
"rewards/action_legal/mean": -1.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": -3.0,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": -2.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 23
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.5,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 17.0,
|
|
"completions/mean_length": 23.0,
|
|
"completions/mean_terminated_length": 14.0,
|
|
"completions/min_length": 11.0,
|
|
"completions/min_terminated_length": 11.0,
|
|
"epoch": 0.012,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 102.0042953491211,
|
|
"kl": 0.1681511290371418,
|
|
"learning_rate": 2.3000000000000003e-05,
|
|
"loss": 0.0067,
|
|
"num_tokens": 60080.0,
|
|
"reward": -1.766066074371338,
|
|
"reward_std": 2.126420736312866,
|
|
"rewards/action_legal/mean": -0.25,
|
|
"rewards/action_legal/std": 0.5,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": -0.9648774862289429,
|
|
"rewards/env_reward/std": 1.3593891859054565,
|
|
"rewards/format_valid/mean": -0.125,
|
|
"rewards/format_valid/std": 1.25,
|
|
"step": 24
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.5,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 17.0,
|
|
"completions/mean_length": 22.75,
|
|
"completions/mean_terminated_length": 13.5,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.0125,
|
|
"frac_reward_zero_std": 1.0,
|
|
"grad_norm": 2.4423506259918213,
|
|
"kl": 0.0637103128246963,
|
|
"learning_rate": 2.4e-05,
|
|
"loss": 0.0025,
|
|
"num_tokens": 62571.0,
|
|
"reward": -4.949999809265137,
|
|
"reward_std": 0.0,
|
|
"rewards/action_legal/mean": -1.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": -3.0,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": -2.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 25
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.75,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 31.0,
|
|
"completions/mean_length": 31.75,
|
|
"completions/mean_terminated_length": 31.0,
|
|
"completions/min_length": 31.0,
|
|
"completions/min_terminated_length": 31.0,
|
|
"epoch": 0.013,
|
|
"frac_reward_zero_std": 1.0,
|
|
"grad_norm": 0.05712759494781494,
|
|
"kl": 0.005990173202008009,
|
|
"learning_rate": 2.5e-05,
|
|
"loss": 0.0002,
|
|
"num_tokens": 65098.0,
|
|
"reward": -4.949999809265137,
|
|
"reward_std": 0.0,
|
|
"rewards/action_legal/mean": -1.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": -3.0,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": -2.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 26
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 32.0,
|
|
"completions/mean_length": 28.25,
|
|
"completions/mean_terminated_length": 28.25,
|
|
"completions/min_length": 18.0,
|
|
"completions/min_terminated_length": 18.0,
|
|
"epoch": 0.0135,
|
|
"frac_reward_zero_std": 1.0,
|
|
"grad_norm": 0.1384836584329605,
|
|
"kl": 0.018408390693366528,
|
|
"learning_rate": 2.6000000000000002e-05,
|
|
"loss": 0.0007,
|
|
"num_tokens": 67611.0,
|
|
"reward": -4.949999809265137,
|
|
"reward_std": 0.0,
|
|
"rewards/action_legal/mean": -1.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": -3.0,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": -2.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 27
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 1.0,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 0.0,
|
|
"completions/mean_length": 32.0,
|
|
"completions/mean_terminated_length": 0.0,
|
|
"completions/min_length": 32.0,
|
|
"completions/min_terminated_length": 0.0,
|
|
"epoch": 0.014,
|
|
"frac_reward_zero_std": 1.0,
|
|
"grad_norm": 0.17841196060180664,
|
|
"kl": 0.008233492728322744,
|
|
"learning_rate": 2.7000000000000002e-05,
|
|
"loss": 0.0003,
|
|
"num_tokens": 70139.0,
|
|
"reward": -4.949999809265137,
|
|
"reward_std": 0.0,
|
|
"rewards/action_legal/mean": -1.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": -3.0,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": -2.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 28
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.75,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 27.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 0.0145,
|
|
"frac_reward_zero_std": 1.0,
|
|
"grad_norm": 14.524484634399414,
|
|
"kl": 0.07956769224256277,
|
|
"learning_rate": 2.8000000000000003e-05,
|
|
"loss": 0.0032,
|
|
"num_tokens": 72647.0,
|
|
"reward": -4.949999809265137,
|
|
"reward_std": 0.0,
|
|
"rewards/action_legal/mean": -1.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": -3.0,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": -2.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 29
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.5,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 20.0,
|
|
"completions/mean_length": 24.25,
|
|
"completions/mean_terminated_length": 16.5,
|
|
"completions/min_length": 13.0,
|
|
"completions/min_terminated_length": 13.0,
|
|
"epoch": 0.015,
|
|
"frac_reward_zero_std": 1.0,
|
|
"grad_norm": 0.6826711297035217,
|
|
"kl": 0.05026988545432687,
|
|
"learning_rate": 2.9e-05,
|
|
"loss": 0.002,
|
|
"num_tokens": 75144.0,
|
|
"reward": -4.949999809265137,
|
|
"reward_std": 0.0,
|
|
"rewards/action_legal/mean": -1.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": -3.0,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": -2.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 30
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.5,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 20.0,
|
|
"completions/mean_length": 23.25,
|
|
"completions/mean_terminated_length": 14.5,
|
|
"completions/min_length": 9.0,
|
|
"completions/min_terminated_length": 9.0,
|
|
"epoch": 0.0155,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 11050.3515625,
|
|
"kl": 36.80695866746828,
|
|
"learning_rate": 3e-05,
|
|
"loss": 1.4723,
|
|
"num_tokens": 77637.0,
|
|
"reward": -2.443718194961548,
|
|
"reward_std": 2.9238996505737305,
|
|
"rewards/action_legal/mean": -0.5,
|
|
"rewards/action_legal/std": 0.5773502588272095,
|
|
"rewards/belief_accuracy/mean": -0.0741666704416275,
|
|
"rewards/belief_accuracy/std": 0.05166666582226753,
|
|
"rewards/env_reward/mean": -1.443312168121338,
|
|
"rewards/env_reward/std": 1.8071939945220947,
|
|
"rewards/format_valid/mean": -0.625,
|
|
"rewards/format_valid/std": 1.6007810831069946,
|
|
"step": 31
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.5,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 22.0,
|
|
"completions/mean_length": 23.25,
|
|
"completions/mean_terminated_length": 14.5,
|
|
"completions/min_length": 7.0,
|
|
"completions/min_terminated_length": 7.0,
|
|
"epoch": 0.016,
|
|
"frac_reward_zero_std": 1.0,
|
|
"grad_norm": 5.319654941558838,
|
|
"kl": 0.1096202852204442,
|
|
"learning_rate": 3.1e-05,
|
|
"loss": 0.0044,
|
|
"num_tokens": 80130.0,
|
|
"reward": -4.949999809265137,
|
|
"reward_std": 0.0,
|
|
"rewards/action_legal/mean": -1.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": -3.0,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": -2.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 32
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.5,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 20.0,
|
|
"completions/mean_length": 24.25,
|
|
"completions/mean_terminated_length": 16.5,
|
|
"completions/min_length": 13.0,
|
|
"completions/min_terminated_length": 13.0,
|
|
"epoch": 0.0165,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 28.577579498291016,
|
|
"kl": 0.04399943072348833,
|
|
"learning_rate": 3.2000000000000005e-05,
|
|
"loss": 0.0018,
|
|
"num_tokens": 82627.0,
|
|
"reward": -3.7981131076812744,
|
|
"reward_std": 2.3037734031677246,
|
|
"rewards/action_legal/mean": -0.75,
|
|
"rewards/action_legal/std": 0.5,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": -2.261242151260376,
|
|
"rewards/env_reward/std": 1.477515697479248,
|
|
"rewards/format_valid/mean": -1.375,
|
|
"rewards/format_valid/std": 1.25,
|
|
"step": 33
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.25,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 21.0,
|
|
"completions/mean_length": 20.5,
|
|
"completions/mean_terminated_length": 16.666667938232422,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.017,
|
|
"frac_reward_zero_std": 1.0,
|
|
"grad_norm": 3.7505602836608887,
|
|
"kl": 0.04482424072921276,
|
|
"learning_rate": 3.3e-05,
|
|
"loss": 0.0018,
|
|
"num_tokens": 85109.0,
|
|
"reward": -4.949999809265137,
|
|
"reward_std": 0.0,
|
|
"rewards/action_legal/mean": -1.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": -3.0,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": -2.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 34
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 18.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 18.0,
|
|
"completions/max_terminated_length": 18.0,
|
|
"completions/mean_length": 16.25,
|
|
"completions/mean_terminated_length": 16.25,
|
|
"completions/min_length": 11.0,
|
|
"completions/min_terminated_length": 11.0,
|
|
"epoch": 0.0175,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 20.112499237060547,
|
|
"kl": 0.0021229138001217507,
|
|
"learning_rate": 3.4000000000000007e-05,
|
|
"loss": 0.0001,
|
|
"num_tokens": 87574.0,
|
|
"reward": 0.1572304666042328,
|
|
"reward_std": 0.04570581018924713,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.014735294505953789,
|
|
"rewards/belief_accuracy/std": 0.09565715491771698,
|
|
"rewards/env_reward/mean": 0.10095755755901337,
|
|
"rewards/env_reward/std": 0.20054571330547333,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 35
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.5,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 25.0,
|
|
"completions/mean_length": 24.5,
|
|
"completions/mean_terminated_length": 17.0,
|
|
"completions/min_length": 9.0,
|
|
"completions/min_terminated_length": 9.0,
|
|
"epoch": 0.018,
|
|
"frac_reward_zero_std": 1.0,
|
|
"grad_norm": 0.24909250438213348,
|
|
"kl": 0.024185666348785162,
|
|
"learning_rate": 3.5e-05,
|
|
"loss": 0.001,
|
|
"num_tokens": 90072.0,
|
|
"reward": -4.949999809265137,
|
|
"reward_std": 0.0,
|
|
"rewards/action_legal/mean": -1.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": -3.0,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": -2.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 36
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.75,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 27.0,
|
|
"completions/mean_length": 30.75,
|
|
"completions/mean_terminated_length": 27.0,
|
|
"completions/min_length": 27.0,
|
|
"completions/min_terminated_length": 27.0,
|
|
"epoch": 0.0185,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 11.509799003601074,
|
|
"kl": 0.01711271144449711,
|
|
"learning_rate": 3.6e-05,
|
|
"loss": 0.0007,
|
|
"num_tokens": 92595.0,
|
|
"reward": -3.6846251487731934,
|
|
"reward_std": 2.530749559402466,
|
|
"rewards/action_legal/mean": -0.75,
|
|
"rewards/action_legal/std": 0.5,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": -2.1855833530426025,
|
|
"rewards/env_reward/std": 1.6288331747055054,
|
|
"rewards/format_valid/mean": -1.375,
|
|
"rewards/format_valid/std": 1.25,
|
|
"step": 37
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 29.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 29.0,
|
|
"completions/max_terminated_length": 29.0,
|
|
"completions/mean_length": 22.5,
|
|
"completions/mean_terminated_length": 22.5,
|
|
"completions/min_length": 13.0,
|
|
"completions/min_terminated_length": 13.0,
|
|
"epoch": 0.019,
|
|
"frac_reward_zero_std": 1.0,
|
|
"grad_norm": 0.46738389134407043,
|
|
"kl": 0.012128827278502285,
|
|
"learning_rate": 3.7e-05,
|
|
"loss": 0.0005,
|
|
"num_tokens": 95085.0,
|
|
"reward": -4.949999809265137,
|
|
"reward_std": 0.0,
|
|
"rewards/action_legal/mean": -1.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": -3.0,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": -2.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 38
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.25,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 32.0,
|
|
"completions/mean_length": 27.5,
|
|
"completions/mean_terminated_length": 26.0,
|
|
"completions/min_length": 17.0,
|
|
"completions/min_terminated_length": 17.0,
|
|
"epoch": 0.0195,
|
|
"frac_reward_zero_std": 1.0,
|
|
"grad_norm": 2.194053888320923,
|
|
"kl": 0.039654724299907684,
|
|
"learning_rate": 3.8e-05,
|
|
"loss": 0.0016,
|
|
"num_tokens": 97595.0,
|
|
"reward": -4.949999809265137,
|
|
"reward_std": 0.0,
|
|
"rewards/action_legal/mean": -1.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": -3.0,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": -2.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 39
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.25,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 25.0,
|
|
"completions/mean_length": 24.5,
|
|
"completions/mean_terminated_length": 22.0,
|
|
"completions/min_length": 18.0,
|
|
"completions/min_terminated_length": 18.0,
|
|
"epoch": 0.02,
|
|
"frac_reward_zero_std": 1.0,
|
|
"grad_norm": 0.2551957964897156,
|
|
"kl": 0.02670608414337039,
|
|
"learning_rate": 3.9000000000000006e-05,
|
|
"loss": 0.0011,
|
|
"num_tokens": 100093.0,
|
|
"reward": -4.949999809265137,
|
|
"reward_std": 0.0,
|
|
"rewards/action_legal/mean": -1.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": -3.0,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": -2.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 40
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.25,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 32.0,
|
|
"completions/mean_length": 29.75,
|
|
"completions/mean_terminated_length": 29.0,
|
|
"completions/min_length": 26.0,
|
|
"completions/min_terminated_length": 26.0,
|
|
"epoch": 0.0205,
|
|
"frac_reward_zero_std": 1.0,
|
|
"grad_norm": 0.4345109760761261,
|
|
"kl": 0.010095613077282906,
|
|
"learning_rate": 4e-05,
|
|
"loss": 0.0004,
|
|
"num_tokens": 102612.0,
|
|
"reward": -4.949999809265137,
|
|
"reward_std": 0.0,
|
|
"rewards/action_legal/mean": -1.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": -3.0,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": -2.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 41
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.25,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 25.0,
|
|
"completions/mean_length": 21.75,
|
|
"completions/mean_terminated_length": 18.33333396911621,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.021,
|
|
"frac_reward_zero_std": 1.0,
|
|
"grad_norm": 0.6849669218063354,
|
|
"kl": 0.08905280428007245,
|
|
"learning_rate": 4.1e-05,
|
|
"loss": 0.0036,
|
|
"num_tokens": 105099.0,
|
|
"reward": -4.949999809265137,
|
|
"reward_std": 0.0,
|
|
"rewards/action_legal/mean": -1.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": -3.0,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": -2.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 42
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.5,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 20.0,
|
|
"completions/mean_length": 24.0,
|
|
"completions/mean_terminated_length": 16.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 0.0215,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 113.153564453125,
|
|
"kl": 0.12807448720559478,
|
|
"learning_rate": 4.2e-05,
|
|
"loss": 0.0051,
|
|
"num_tokens": 107595.0,
|
|
"reward": -2.9136834144592285,
|
|
"reward_std": 2.423197031021118,
|
|
"rewards/action_legal/mean": -0.5,
|
|
"rewards/action_legal/std": 0.5773502588272095,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": -1.700788974761963,
|
|
"rewards/env_reward/std": 1.5501903295516968,
|
|
"rewards/format_valid/mean": -0.75,
|
|
"rewards/format_valid/std": 1.4433757066726685,
|
|
"step": 43
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 1.0,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 0.0,
|
|
"completions/mean_length": 32.0,
|
|
"completions/mean_terminated_length": 0.0,
|
|
"completions/min_length": 32.0,
|
|
"completions/min_terminated_length": 0.0,
|
|
"epoch": 0.022,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 3.1417529582977295,
|
|
"kl": 0.05178070580586791,
|
|
"learning_rate": 4.3e-05,
|
|
"loss": 0.0021,
|
|
"num_tokens": 110123.0,
|
|
"reward": -3.766486167907715,
|
|
"reward_std": 2.3670270442962646,
|
|
"rewards/action_legal/mean": -0.75,
|
|
"rewards/action_legal/std": 0.5,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": -2.2401576042175293,
|
|
"rewards/env_reward/std": 1.5196847915649414,
|
|
"rewards/format_valid/mean": -1.375,
|
|
"rewards/format_valid/std": 1.25,
|
|
"step": 44
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.25,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 18.0,
|
|
"completions/mean_length": 19.0,
|
|
"completions/mean_terminated_length": 14.666666984558105,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.0225,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 31.787551879882812,
|
|
"kl": 0.1364445798099041,
|
|
"learning_rate": 4.4000000000000006e-05,
|
|
"loss": 0.0055,
|
|
"num_tokens": 112599.0,
|
|
"reward": -2.7055277824401855,
|
|
"reward_std": 2.6139395236968994,
|
|
"rewards/action_legal/mean": -0.5,
|
|
"rewards/action_legal/std": 0.5773502588272095,
|
|
"rewards/belief_accuracy/mean": -0.12398147583007812,
|
|
"rewards/belief_accuracy/std": 0.04796295985579491,
|
|
"rewards/env_reward/mean": -1.5182223320007324,
|
|
"rewards/env_reward/std": 1.736833095550537,
|
|
"rewards/format_valid/mean": -0.625,
|
|
"rewards/format_valid/std": 1.6007810831069946,
|
|
"step": 45
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.5,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 29.0,
|
|
"completions/mean_length": 26.25,
|
|
"completions/mean_terminated_length": 20.5,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 0.023,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 809.3003540039062,
|
|
"kl": 1.457309697754681,
|
|
"learning_rate": 4.5e-05,
|
|
"loss": 0.0583,
|
|
"num_tokens": 115104.0,
|
|
"reward": -2.267341136932373,
|
|
"reward_std": 3.0976674556732178,
|
|
"rewards/action_legal/mean": -0.5,
|
|
"rewards/action_legal/std": 0.5773502588272095,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": -1.2698941230773926,
|
|
"rewards/env_reward/std": 1.9977540969848633,
|
|
"rewards/format_valid/mean": -0.75,
|
|
"rewards/format_valid/std": 1.4433757066726685,
|
|
"step": 46
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.25,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 20.0,
|
|
"completions/mean_length": 19.0,
|
|
"completions/mean_terminated_length": 14.666666984558105,
|
|
"completions/min_length": 11.0,
|
|
"completions/min_terminated_length": 11.0,
|
|
"epoch": 0.0235,
|
|
"frac_reward_zero_std": 1.0,
|
|
"grad_norm": 17.53619384765625,
|
|
"kl": 1.0537898712791502,
|
|
"learning_rate": 4.600000000000001e-05,
|
|
"loss": 0.0422,
|
|
"num_tokens": 117580.0,
|
|
"reward": -4.949999809265137,
|
|
"reward_std": 0.0,
|
|
"rewards/action_legal/mean": -1.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": -3.0,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": -2.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 47
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.5,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 20.0,
|
|
"completions/mean_length": 23.25,
|
|
"completions/mean_terminated_length": 14.5,
|
|
"completions/min_length": 9.0,
|
|
"completions/min_terminated_length": 9.0,
|
|
"epoch": 0.024,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 187.8842010498047,
|
|
"kl": 0.3121867855079472,
|
|
"learning_rate": 4.7e-05,
|
|
"loss": 0.0125,
|
|
"num_tokens": 120073.0,
|
|
"reward": -3.6274335384368896,
|
|
"reward_std": 2.645132541656494,
|
|
"rewards/action_legal/mean": -0.75,
|
|
"rewards/action_legal/std": 0.5,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": -2.1474556922912598,
|
|
"rewards/env_reward/std": 1.705088496208191,
|
|
"rewards/format_valid/mean": -1.375,
|
|
"rewards/format_valid/std": 1.25,
|
|
"step": 48
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.5,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 27.0,
|
|
"completions/mean_length": 28.75,
|
|
"completions/mean_terminated_length": 25.5,
|
|
"completions/min_length": 24.0,
|
|
"completions/min_terminated_length": 24.0,
|
|
"epoch": 0.0245,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 18.377052307128906,
|
|
"kl": 0.10642453748732805,
|
|
"learning_rate": 4.8e-05,
|
|
"loss": 0.0043,
|
|
"num_tokens": 122588.0,
|
|
"reward": -3.7050957679748535,
|
|
"reward_std": 2.4898080825805664,
|
|
"rewards/action_legal/mean": -0.75,
|
|
"rewards/action_legal/std": 0.5,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": -2.199230432510376,
|
|
"rewards/env_reward/std": 1.6015390157699585,
|
|
"rewards/format_valid/mean": -1.375,
|
|
"rewards/format_valid/std": 1.25,
|
|
"step": 49
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.75,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 27.0,
|
|
"completions/mean_length": 30.75,
|
|
"completions/mean_terminated_length": 27.0,
|
|
"completions/min_length": 27.0,
|
|
"completions/min_terminated_length": 27.0,
|
|
"epoch": 0.025,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 8.019148826599121,
|
|
"kl": 0.056871576234698296,
|
|
"learning_rate": 4.9e-05,
|
|
"loss": 0.0023,
|
|
"num_tokens": 125111.0,
|
|
"reward": -3.6691508293151855,
|
|
"reward_std": 2.5616979598999023,
|
|
"rewards/action_legal/mean": -0.75,
|
|
"rewards/action_legal/std": 0.5,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": -2.175267219543457,
|
|
"rewards/env_reward/std": 1.6494653224945068,
|
|
"rewards/format_valid/mean": -1.375,
|
|
"rewards/format_valid/std": 1.25,
|
|
"step": 50
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.5,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 27.0,
|
|
"completions/mean_length": 29.25,
|
|
"completions/mean_terminated_length": 26.5,
|
|
"completions/min_length": 26.0,
|
|
"completions/min_terminated_length": 26.0,
|
|
"epoch": 0.0255,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 13.499008178710938,
|
|
"kl": 0.0715335039421916,
|
|
"learning_rate": 5e-05,
|
|
"loss": 0.0029,
|
|
"num_tokens": 127628.0,
|
|
"reward": -2.420839786529541,
|
|
"reward_std": 2.920422315597534,
|
|
"rewards/action_legal/mean": -0.5,
|
|
"rewards/action_legal/std": 0.5773502588272095,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": -1.372226595878601,
|
|
"rewards/env_reward/std": 1.8795907497406006,
|
|
"rewards/format_valid/mean": -0.75,
|
|
"rewards/format_valid/std": 1.4433757066726685,
|
|
"step": 51
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 1.0,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 0.0,
|
|
"completions/mean_length": 32.0,
|
|
"completions/mean_terminated_length": 0.0,
|
|
"completions/min_length": 32.0,
|
|
"completions/min_terminated_length": 0.0,
|
|
"epoch": 0.026,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 5.560655117034912,
|
|
"kl": 0.017654206603765488,
|
|
"learning_rate": 4.9888888888888894e-05,
|
|
"loss": 0.0007,
|
|
"num_tokens": 130156.0,
|
|
"reward": -4.051011085510254,
|
|
"reward_std": 1.7979769706726074,
|
|
"rewards/action_legal/mean": -0.75,
|
|
"rewards/action_legal/std": 0.5,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": -2.4298410415649414,
|
|
"rewards/env_reward/std": 1.1403180360794067,
|
|
"rewards/format_valid/mean": -1.375,
|
|
"rewards/format_valid/std": 1.25,
|
|
"step": 52
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.25,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 23.0,
|
|
"completions/mean_length": 23.75,
|
|
"completions/mean_terminated_length": 21.0,
|
|
"completions/min_length": 17.0,
|
|
"completions/min_terminated_length": 17.0,
|
|
"epoch": 0.0265,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 18.195871353149414,
|
|
"kl": 0.24096931191161275,
|
|
"learning_rate": 4.977777777777778e-05,
|
|
"loss": 0.0096,
|
|
"num_tokens": 132651.0,
|
|
"reward": -2.6885905265808105,
|
|
"reward_std": 2.6208035945892334,
|
|
"rewards/action_legal/mean": -0.5,
|
|
"rewards/action_legal/std": 0.5773502588272095,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": -1.5507268905639648,
|
|
"rewards/env_reward/std": 1.6801002025604248,
|
|
"rewards/format_valid/mean": -0.75,
|
|
"rewards/format_valid/std": 1.4433757066726685,
|
|
"step": 53
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.5,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 24.0,
|
|
"completions/mean_length": 24.75,
|
|
"completions/mean_terminated_length": 17.5,
|
|
"completions/min_length": 11.0,
|
|
"completions/min_terminated_length": 11.0,
|
|
"epoch": 0.027,
|
|
"frac_reward_zero_std": 1.0,
|
|
"grad_norm": 123.76203155517578,
|
|
"kl": 5.633732934948057,
|
|
"learning_rate": 4.966666666666667e-05,
|
|
"loss": 0.2253,
|
|
"num_tokens": 135150.0,
|
|
"reward": -4.949999809265137,
|
|
"reward_std": 0.0,
|
|
"rewards/action_legal/mean": -1.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": -3.0,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": -2.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 54
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.5,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 28.0,
|
|
"completions/mean_length": 29.75,
|
|
"completions/mean_terminated_length": 27.5,
|
|
"completions/min_length": 27.0,
|
|
"completions/min_terminated_length": 27.0,
|
|
"epoch": 0.0275,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 11.095720291137695,
|
|
"kl": 0.04261765070259571,
|
|
"learning_rate": 4.955555555555556e-05,
|
|
"loss": 0.0017,
|
|
"num_tokens": 137669.0,
|
|
"reward": -3.5939033031463623,
|
|
"reward_std": 2.712193012237549,
|
|
"rewards/action_legal/mean": -0.75,
|
|
"rewards/action_legal/std": 0.5,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": -2.1251022815704346,
|
|
"rewards/env_reward/std": 1.7497954368591309,
|
|
"rewards/format_valid/mean": -1.375,
|
|
"rewards/format_valid/std": 1.25,
|
|
"step": 55
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 7.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 7.0,
|
|
"completions/max_terminated_length": 7.0,
|
|
"completions/mean_length": 6.5,
|
|
"completions/mean_terminated_length": 6.5,
|
|
"completions/min_length": 6.0,
|
|
"completions/min_terminated_length": 6.0,
|
|
"epoch": 0.028,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 7.206400394439697,
|
|
"kl": 0.0018149956013076007,
|
|
"learning_rate": 4.9444444444444446e-05,
|
|
"loss": 0.0001,
|
|
"num_tokens": 140095.0,
|
|
"reward": -0.6511554718017578,
|
|
"reward_std": 0.4664153754711151,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": -0.2507702708244324,
|
|
"rewards/env_reward/std": 0.3109435439109802,
|
|
"rewards/format_valid/mean": 0.5,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 56
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.25,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 20.0,
|
|
"completions/mean_length": 21.25,
|
|
"completions/mean_terminated_length": 17.666667938232422,
|
|
"completions/min_length": 16.0,
|
|
"completions/min_terminated_length": 16.0,
|
|
"epoch": 0.0285,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 9.72509479522705,
|
|
"kl": 0.03556834487244487,
|
|
"learning_rate": 4.933333333333334e-05,
|
|
"loss": 0.0014,
|
|
"num_tokens": 142580.0,
|
|
"reward": -1.1361982822418213,
|
|
"reward_std": 2.5431196689605713,
|
|
"rewards/action_legal/mean": -0.25,
|
|
"rewards/action_legal/std": 0.5,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": -0.5449655055999756,
|
|
"rewards/env_reward/std": 1.6370937824249268,
|
|
"rewards/format_valid/mean": -0.125,
|
|
"rewards/format_valid/std": 1.25,
|
|
"step": 57
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.25,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 19.0,
|
|
"completions/mean_length": 18.25,
|
|
"completions/mean_terminated_length": 13.666666984558105,
|
|
"completions/min_length": 9.0,
|
|
"completions/min_terminated_length": 9.0,
|
|
"epoch": 0.029,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 69.71238708496094,
|
|
"kl": 0.12905889004468918,
|
|
"learning_rate": 4.922222222222222e-05,
|
|
"loss": 0.0052,
|
|
"num_tokens": 145053.0,
|
|
"reward": -3.697530746459961,
|
|
"reward_std": 2.5049378871917725,
|
|
"rewards/action_legal/mean": -0.75,
|
|
"rewards/action_legal/std": 0.5,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": -2.1941874027252197,
|
|
"rewards/env_reward/std": 1.61162531375885,
|
|
"rewards/format_valid/mean": -1.375,
|
|
"rewards/format_valid/std": 1.25,
|
|
"step": 58
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 18.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 18.0,
|
|
"completions/max_terminated_length": 18.0,
|
|
"completions/mean_length": 15.5,
|
|
"completions/mean_terminated_length": 15.5,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.0295,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 4.306075572967529,
|
|
"kl": 0.05293075350346044,
|
|
"learning_rate": 4.9111111111111114e-05,
|
|
"loss": 0.0021,
|
|
"num_tokens": 147515.0,
|
|
"reward": -0.8994538187980652,
|
|
"reward_std": 0.14630256593227386,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.165370374917984,
|
|
"rewards/belief_accuracy/std": 0.04776628687977791,
|
|
"rewards/env_reward/mean": -0.30222848057746887,
|
|
"rewards/env_reward/std": 0.13543139398097992,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 59
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 9.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 9.0,
|
|
"completions/max_terminated_length": 9.0,
|
|
"completions/mean_length": 8.75,
|
|
"completions/mean_terminated_length": 8.75,
|
|
"completions/min_length": 8.0,
|
|
"completions/min_terminated_length": 8.0,
|
|
"epoch": 0.03,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 5.006049633026123,
|
|
"kl": 0.024995889314595843,
|
|
"learning_rate": 4.9e-05,
|
|
"loss": 0.001,
|
|
"num_tokens": 149950.0,
|
|
"reward": -0.2717297375202179,
|
|
"reward_std": 0.27656516432762146,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.03703703731298447,
|
|
"rewards/belief_accuracy/std": 0.04781460762023926,
|
|
"rewards/env_reward/mean": -0.14041242003440857,
|
|
"rewards/env_reward/std": 0.21106119453907013,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 60
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.25,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 27.0,
|
|
"completions/mean_length": 20.75,
|
|
"completions/mean_terminated_length": 17.0,
|
|
"completions/min_length": 7.0,
|
|
"completions/min_terminated_length": 7.0,
|
|
"epoch": 0.0305,
|
|
"frac_reward_zero_std": 1.0,
|
|
"grad_norm": 1.1231383085250854,
|
|
"kl": 0.25201990082859993,
|
|
"learning_rate": 4.888888888888889e-05,
|
|
"loss": 0.0101,
|
|
"num_tokens": 152433.0,
|
|
"reward": -4.949999809265137,
|
|
"reward_std": 0.0,
|
|
"rewards/action_legal/mean": -1.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": -3.0,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": -2.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 61
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 30.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 30.0,
|
|
"completions/max_terminated_length": 30.0,
|
|
"completions/mean_length": 17.0,
|
|
"completions/mean_terminated_length": 17.0,
|
|
"completions/min_length": 9.0,
|
|
"completions/min_terminated_length": 9.0,
|
|
"epoch": 0.031,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 14.847359657287598,
|
|
"kl": 0.3126356555148959,
|
|
"learning_rate": 4.8777777777777775e-05,
|
|
"loss": 0.0125,
|
|
"num_tokens": 154901.0,
|
|
"reward": -1.7808257341384888,
|
|
"reward_std": 3.659447193145752,
|
|
"rewards/action_legal/mean": -0.5,
|
|
"rewards/action_legal/std": 0.5773502588272095,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": -0.9455505609512329,
|
|
"rewards/env_reward/std": 2.372274160385132,
|
|
"rewards/format_valid/mean": -0.75,
|
|
"rewards/format_valid/std": 1.4433757066726685,
|
|
"step": 62
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.5,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 19.0,
|
|
"completions/mean_length": 24.5,
|
|
"completions/mean_terminated_length": 17.0,
|
|
"completions/min_length": 15.0,
|
|
"completions/min_terminated_length": 15.0,
|
|
"epoch": 0.0315,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 11.686541557312012,
|
|
"kl": 0.153579062782228,
|
|
"learning_rate": 4.866666666666667e-05,
|
|
"loss": 0.0061,
|
|
"num_tokens": 157399.0,
|
|
"reward": -2.4691736698150635,
|
|
"reward_std": 2.869215488433838,
|
|
"rewards/action_legal/mean": -0.5,
|
|
"rewards/action_legal/std": 0.5773502588272095,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": -1.404449224472046,
|
|
"rewards/env_reward/std": 1.845564842224121,
|
|
"rewards/format_valid/mean": -0.75,
|
|
"rewards/format_valid/std": 1.4433757066726685,
|
|
"step": 63
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.25,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 23.0,
|
|
"completions/mean_length": 23.25,
|
|
"completions/mean_terminated_length": 20.33333396911621,
|
|
"completions/min_length": 17.0,
|
|
"completions/min_terminated_length": 17.0,
|
|
"epoch": 0.032,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 92.96546936035156,
|
|
"kl": 0.2878706678748131,
|
|
"learning_rate": 4.855555555555556e-05,
|
|
"loss": 0.0115,
|
|
"num_tokens": 159892.0,
|
|
"reward": -3.7632439136505127,
|
|
"reward_std": 2.373511791229248,
|
|
"rewards/action_legal/mean": -0.75,
|
|
"rewards/action_legal/std": 0.5,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": -2.2379961013793945,
|
|
"rewards/env_reward/std": 1.5240079164505005,
|
|
"rewards/format_valid/mean": -1.375,
|
|
"rewards/format_valid/std": 1.25,
|
|
"step": 64
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.25,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 32.0,
|
|
"completions/mean_length": 27.5,
|
|
"completions/mean_terminated_length": 26.0,
|
|
"completions/min_length": 19.0,
|
|
"completions/min_terminated_length": 19.0,
|
|
"epoch": 0.0325,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 36.191368103027344,
|
|
"kl": 0.35182441864162683,
|
|
"learning_rate": 4.844444444444445e-05,
|
|
"loss": 0.0141,
|
|
"num_tokens": 162402.0,
|
|
"reward": -2.2611498832702637,
|
|
"reward_std": 3.114436149597168,
|
|
"rewards/action_legal/mean": -0.5,
|
|
"rewards/action_legal/std": 0.5773502588272095,
|
|
"rewards/belief_accuracy/mean": -0.10736748576164246,
|
|
"rewards/belief_accuracy/std": 0.014734972268342972,
|
|
"rewards/env_reward/mean": -1.255198359489441,
|
|
"rewards/env_reward/std": 2.0199925899505615,
|
|
"rewards/format_valid/mean": -0.625,
|
|
"rewards/format_valid/std": 1.6007810831069946,
|
|
"step": 65
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.25,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 20.0,
|
|
"completions/mean_length": 16.75,
|
|
"completions/mean_terminated_length": 11.666666984558105,
|
|
"completions/min_length": 6.0,
|
|
"completions/min_terminated_length": 6.0,
|
|
"epoch": 0.033,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 228884.828125,
|
|
"kl": 468.09647609852254,
|
|
"learning_rate": 4.8333333333333334e-05,
|
|
"loss": 18.7239,
|
|
"num_tokens": 164869.0,
|
|
"reward": -3.877704381942749,
|
|
"reward_std": 2.1445908546447754,
|
|
"rewards/action_legal/mean": -0.75,
|
|
"rewards/action_legal/std": 0.5,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": -2.314302921295166,
|
|
"rewards/env_reward/std": 1.3713939189910889,
|
|
"rewards/format_valid/mean": -1.375,
|
|
"rewards/format_valid/std": 1.25,
|
|
"step": 66
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.75,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 22.0,
|
|
"completions/mean_length": 29.5,
|
|
"completions/mean_terminated_length": 22.0,
|
|
"completions/min_length": 22.0,
|
|
"completions/min_terminated_length": 22.0,
|
|
"epoch": 0.0335,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 4.94432258605957,
|
|
"kl": 0.21442949026823044,
|
|
"learning_rate": 4.8222222222222225e-05,
|
|
"loss": 0.0086,
|
|
"num_tokens": 167387.0,
|
|
"reward": -3.75144624710083,
|
|
"reward_std": 2.397106885910034,
|
|
"rewards/action_legal/mean": -0.75,
|
|
"rewards/action_legal/std": 0.5,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": -2.230130910873413,
|
|
"rewards/env_reward/std": 1.5397380590438843,
|
|
"rewards/format_valid/mean": -1.375,
|
|
"rewards/format_valid/std": 1.25,
|
|
"step": 67
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.25,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 9.0,
|
|
"completions/mean_length": 14.25,
|
|
"completions/mean_terminated_length": 8.333333969116211,
|
|
"completions/min_length": 8.0,
|
|
"completions/min_terminated_length": 8.0,
|
|
"epoch": 0.034,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 10.818193435668945,
|
|
"kl": 0.9112066635861993,
|
|
"learning_rate": 4.811111111111111e-05,
|
|
"loss": 0.0364,
|
|
"num_tokens": 169620.0,
|
|
"reward": 0.4229079484939575,
|
|
"reward_std": 0.2314292937517166,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.0,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": 0.24860529601573944,
|
|
"rewards/env_reward/std": 0.154286190867424,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 68
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.25,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 15.0,
|
|
"completions/mean_length": 18.0,
|
|
"completions/mean_terminated_length": 13.333333969116211,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 0.0345,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 25.480619430541992,
|
|
"kl": 2.5357193499803543,
|
|
"learning_rate": 4.8e-05,
|
|
"loss": 0.1014,
|
|
"num_tokens": 172092.0,
|
|
"reward": -2.201890468597412,
|
|
"reward_std": 3.173243284225464,
|
|
"rewards/action_legal/mean": -0.5,
|
|
"rewards/action_legal/std": 0.5773502588272095,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": -1.2262604236602783,
|
|
"rewards/env_reward/std": 2.04813814163208,
|
|
"rewards/format_valid/mean": -0.75,
|
|
"rewards/format_valid/std": 1.4433757066726685,
|
|
"step": 69
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.25,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 30.0,
|
|
"completions/mean_length": 25.0,
|
|
"completions/mean_terminated_length": 22.666667938232422,
|
|
"completions/min_length": 13.0,
|
|
"completions/min_terminated_length": 13.0,
|
|
"epoch": 0.035,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 8.468518257141113,
|
|
"kl": 0.5803861692547798,
|
|
"learning_rate": 4.7888888888888886e-05,
|
|
"loss": 0.0232,
|
|
"num_tokens": 174592.0,
|
|
"reward": -2.594465732574463,
|
|
"reward_std": 2.7201738357543945,
|
|
"rewards/action_legal/mean": -0.5,
|
|
"rewards/action_legal/std": 0.5773502588272095,
|
|
"rewards/belief_accuracy/mean": -0.0878773033618927,
|
|
"rewards/belief_accuracy/std": 0.024245386943221092,
|
|
"rewards/env_reward/mean": -1.5163891315460205,
|
|
"rewards/env_reward/std": 1.7145698070526123,
|
|
"rewards/format_valid/mean": -0.625,
|
|
"rewards/format_valid/std": 1.6007810831069946,
|
|
"step": 70
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.25,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 31.0,
|
|
"completions/mean_length": 25.0,
|
|
"completions/mean_terminated_length": 22.666667938232422,
|
|
"completions/min_length": 11.0,
|
|
"completions/min_terminated_length": 11.0,
|
|
"epoch": 0.0355,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 46.88726806640625,
|
|
"kl": 0.7163544222712517,
|
|
"learning_rate": 4.7777777777777784e-05,
|
|
"loss": 0.0287,
|
|
"num_tokens": 177092.0,
|
|
"reward": -2.0240089893341064,
|
|
"reward_std": 3.3790602684020996,
|
|
"rewards/action_legal/mean": -0.5,
|
|
"rewards/action_legal/std": 0.5773502588272095,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": -1.1076725721359253,
|
|
"rewards/env_reward/std": 2.1853580474853516,
|
|
"rewards/format_valid/mean": -0.75,
|
|
"rewards/format_valid/std": 1.4433757066726685,
|
|
"step": 71
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.25,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 26.0,
|
|
"completions/mean_length": 22.25,
|
|
"completions/mean_terminated_length": 19.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 0.036,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 4.72477388381958,
|
|
"kl": 0.7021452663466334,
|
|
"learning_rate": 4.766666666666667e-05,
|
|
"loss": 0.0281,
|
|
"num_tokens": 179581.0,
|
|
"reward": -1.4259536266326904,
|
|
"reward_std": 2.3681116104125977,
|
|
"rewards/action_legal/mean": -0.25,
|
|
"rewards/action_legal/std": 0.5,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": -0.7381357550621033,
|
|
"rewards/env_reward/std": 1.5208872556686401,
|
|
"rewards/format_valid/mean": -0.125,
|
|
"rewards/format_valid/std": 1.25,
|
|
"step": 72
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.5,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 16.0,
|
|
"completions/mean_length": 23.75,
|
|
"completions/mean_terminated_length": 15.5,
|
|
"completions/min_length": 15.0,
|
|
"completions/min_terminated_length": 15.0,
|
|
"epoch": 0.0365,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 26.196685791015625,
|
|
"kl": 0.6366847828030586,
|
|
"learning_rate": 4.755555555555556e-05,
|
|
"loss": 0.0255,
|
|
"num_tokens": 182076.0,
|
|
"reward": -1.664202332496643,
|
|
"reward_std": 2.311755418777466,
|
|
"rewards/action_legal/mean": -0.25,
|
|
"rewards/action_legal/std": 0.5,
|
|
"rewards/belief_accuracy/mean": -0.08500000089406967,
|
|
"rewards/belief_accuracy/std": 0.030000001192092896,
|
|
"rewards/env_reward/mean": -0.9311348795890808,
|
|
"rewards/env_reward/std": 1.4874457120895386,
|
|
"rewards/format_valid/mean": 0.0,
|
|
"rewards/format_valid/std": 1.3540064096450806,
|
|
"step": 73
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 32.0,
|
|
"completions/mean_length": 17.0,
|
|
"completions/mean_terminated_length": 17.0,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.037,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 38.14860916137695,
|
|
"kl": 1.0752212293446064,
|
|
"learning_rate": 4.7444444444444445e-05,
|
|
"loss": 0.043,
|
|
"num_tokens": 184544.0,
|
|
"reward": -1.49713134765625,
|
|
"reward_std": 2.301912307739258,
|
|
"rewards/action_legal/mean": -0.25,
|
|
"rewards/action_legal/std": 0.5,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": -0.7855876684188843,
|
|
"rewards/env_reward/std": 1.476274847984314,
|
|
"rewards/format_valid/mean": -0.125,
|
|
"rewards/format_valid/std": 1.25,
|
|
"step": 74
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 1.0,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 0.0,
|
|
"completions/mean_length": 32.0,
|
|
"completions/mean_terminated_length": 0.0,
|
|
"completions/min_length": 32.0,
|
|
"completions/min_terminated_length": 0.0,
|
|
"epoch": 0.0375,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 3.1271560192108154,
|
|
"kl": 0.22124752588570118,
|
|
"learning_rate": 4.7333333333333336e-05,
|
|
"loss": 0.0088,
|
|
"num_tokens": 187072.0,
|
|
"reward": -2.7508177757263184,
|
|
"reward_std": 2.5399067401885986,
|
|
"rewards/action_legal/mean": -0.5,
|
|
"rewards/action_legal/std": 0.5773502588272095,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": -1.5922119617462158,
|
|
"rewards/env_reward/std": 1.6259276866912842,
|
|
"rewards/format_valid/mean": -0.75,
|
|
"rewards/format_valid/std": 1.4433757066726685,
|
|
"step": 75
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 1.0,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 0.0,
|
|
"completions/mean_length": 32.0,
|
|
"completions/mean_terminated_length": 0.0,
|
|
"completions/min_length": 32.0,
|
|
"completions/min_terminated_length": 0.0,
|
|
"epoch": 0.038,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 5.755521774291992,
|
|
"kl": 0.16726691462099552,
|
|
"learning_rate": 4.722222222222222e-05,
|
|
"loss": 0.0067,
|
|
"num_tokens": 189600.0,
|
|
"reward": -3.8879446983337402,
|
|
"reward_std": 2.124109983444214,
|
|
"rewards/action_legal/mean": -0.75,
|
|
"rewards/action_legal/std": 0.5,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": -2.3211300373077393,
|
|
"rewards/env_reward/std": 1.357740044593811,
|
|
"rewards/format_valid/mean": -1.375,
|
|
"rewards/format_valid/std": 1.25,
|
|
"step": 76
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 30.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 30.0,
|
|
"completions/max_terminated_length": 30.0,
|
|
"completions/mean_length": 27.0,
|
|
"completions/mean_terminated_length": 27.0,
|
|
"completions/min_length": 23.0,
|
|
"completions/min_terminated_length": 23.0,
|
|
"epoch": 0.0385,
|
|
"frac_reward_zero_std": 1.0,
|
|
"grad_norm": 0.24942930042743683,
|
|
"kl": 0.789710771292448,
|
|
"learning_rate": 4.711111111111111e-05,
|
|
"loss": 0.0316,
|
|
"num_tokens": 192108.0,
|
|
"reward": -4.949999809265137,
|
|
"reward_std": 0.0,
|
|
"rewards/action_legal/mean": -1.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": -3.0,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": -2.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 77
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.25,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 18.0,
|
|
"completions/mean_length": 16.75,
|
|
"completions/mean_terminated_length": 11.666666984558105,
|
|
"completions/min_length": 8.0,
|
|
"completions/min_terminated_length": 8.0,
|
|
"epoch": 0.039,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 4.9244184494018555,
|
|
"kl": 1.1887651532888412,
|
|
"learning_rate": 4.7e-05,
|
|
"loss": 0.0476,
|
|
"num_tokens": 194575.0,
|
|
"reward": -1.6062259674072266,
|
|
"reward_std": 2.2595274448394775,
|
|
"rewards/action_legal/mean": -0.25,
|
|
"rewards/action_legal/std": 0.5,
|
|
"rewards/belief_accuracy/mean": -0.08573612570762634,
|
|
"rewards/belief_accuracy/std": 0.028527740389108658,
|
|
"rewards/env_reward/mean": -0.8910117149353027,
|
|
"rewards/env_reward/std": 1.4291024208068848,
|
|
"rewards/format_valid/mean": 0.0,
|
|
"rewards/format_valid/std": 1.3540064096450806,
|
|
"step": 78
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.5,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 30.0,
|
|
"completions/mean_length": 26.5,
|
|
"completions/mean_terminated_length": 21.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 0.0395,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 4.589632034301758,
|
|
"kl": 0.914489395916462,
|
|
"learning_rate": 4.6888888888888895e-05,
|
|
"loss": 0.0366,
|
|
"num_tokens": 197081.0,
|
|
"reward": -1.3840163946151733,
|
|
"reward_std": 2.4131362438201904,
|
|
"rewards/action_legal/mean": -0.25,
|
|
"rewards/action_legal/std": 0.5,
|
|
"rewards/belief_accuracy/mean": -0.11666667461395264,
|
|
"rewards/belief_accuracy/std": 0.03333333507180214,
|
|
"rewards/env_reward/mean": -0.681010901927948,
|
|
"rewards/env_reward/std": 1.562245488166809,
|
|
"rewards/format_valid/mean": 0.0,
|
|
"rewards/format_valid/std": 1.3540064096450806,
|
|
"step": 79
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.5,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 25.0,
|
|
"completions/mean_length": 25.75,
|
|
"completions/mean_terminated_length": 19.5,
|
|
"completions/min_length": 14.0,
|
|
"completions/min_terminated_length": 14.0,
|
|
"epoch": 0.04,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 23.94391441345215,
|
|
"kl": 0.7511968985199928,
|
|
"learning_rate": 4.677777777777778e-05,
|
|
"loss": 0.03,
|
|
"num_tokens": 199584.0,
|
|
"reward": -1.7008922100067139,
|
|
"reward_std": 2.489635705947876,
|
|
"rewards/action_legal/mean": -0.25,
|
|
"rewards/action_legal/std": 0.5,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": -0.9214280843734741,
|
|
"rewards/env_reward/std": 1.6092621088027954,
|
|
"rewards/format_valid/mean": -0.125,
|
|
"rewards/format_valid/std": 1.25,
|
|
"step": 80
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 28.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 28.0,
|
|
"completions/max_terminated_length": 28.0,
|
|
"completions/mean_length": 16.5,
|
|
"completions/mean_terminated_length": 16.5,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 0.0405,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 14.980483055114746,
|
|
"kl": 1.2917132005095482,
|
|
"learning_rate": 4.666666666666667e-05,
|
|
"loss": 0.0517,
|
|
"num_tokens": 202050.0,
|
|
"reward": -1.1099207401275635,
|
|
"reward_std": 2.573901891708374,
|
|
"rewards/action_legal/mean": -0.25,
|
|
"rewards/action_legal/std": 0.5,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": -0.5274472236633301,
|
|
"rewards/env_reward/std": 1.6579262018203735,
|
|
"rewards/format_valid/mean": -0.125,
|
|
"rewards/format_valid/std": 1.25,
|
|
"step": 81
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 27.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 27.0,
|
|
"completions/max_terminated_length": 27.0,
|
|
"completions/mean_length": 19.0,
|
|
"completions/mean_terminated_length": 19.0,
|
|
"completions/min_length": 13.0,
|
|
"completions/min_terminated_length": 13.0,
|
|
"epoch": 0.041,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 18.656517028808594,
|
|
"kl": 1.1007941216230392,
|
|
"learning_rate": 4.6555555555555556e-05,
|
|
"loss": 0.044,
|
|
"num_tokens": 204526.0,
|
|
"reward": -0.9679015278816223,
|
|
"reward_std": 2.6547322273254395,
|
|
"rewards/action_legal/mean": -0.25,
|
|
"rewards/action_legal/std": 0.5,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": -0.43276768922805786,
|
|
"rewards/env_reward/std": 1.7114882469177246,
|
|
"rewards/format_valid/mean": -0.125,
|
|
"rewards/format_valid/std": 1.25,
|
|
"step": 82
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.25,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 20.0,
|
|
"completions/mean_length": 19.75,
|
|
"completions/mean_terminated_length": 15.666666984558105,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.0415,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 6.2242937088012695,
|
|
"kl": 1.004544973373413,
|
|
"learning_rate": 4.644444444444445e-05,
|
|
"loss": 0.0402,
|
|
"num_tokens": 207005.0,
|
|
"reward": -2.1635308265686035,
|
|
"reward_std": 3.2175371646881104,
|
|
"rewards/action_legal/mean": -0.5,
|
|
"rewards/action_legal/std": 0.5773502588272095,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": -1.200687289237976,
|
|
"rewards/env_reward/std": 2.077667474746704,
|
|
"rewards/format_valid/mean": -0.75,
|
|
"rewards/format_valid/std": 1.4433757066726685,
|
|
"step": 83
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 24.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 24.0,
|
|
"completions/max_terminated_length": 24.0,
|
|
"completions/mean_length": 17.5,
|
|
"completions/mean_terminated_length": 17.5,
|
|
"completions/min_length": 14.0,
|
|
"completions/min_terminated_length": 14.0,
|
|
"epoch": 0.042,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 14.178193092346191,
|
|
"kl": 0.9078696174547076,
|
|
"learning_rate": 4.633333333333333e-05,
|
|
"loss": 0.0363,
|
|
"num_tokens": 209475.0,
|
|
"reward": 0.19402220845222473,
|
|
"reward_std": 0.2724432051181793,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.00833333283662796,
|
|
"rewards/belief_accuracy/std": 0.10671874135732651,
|
|
"rewards/env_reward/mean": 0.08351479470729828,
|
|
"rewards/env_reward/std": 0.14911670982837677,
|
|
"rewards/format_valid/mean": 0.875,
|
|
"rewards/format_valid/std": 0.25,
|
|
"step": 84
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 26.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 26.0,
|
|
"completions/max_terminated_length": 26.0,
|
|
"completions/mean_length": 20.25,
|
|
"completions/mean_terminated_length": 20.25,
|
|
"completions/min_length": 17.0,
|
|
"completions/min_terminated_length": 17.0,
|
|
"epoch": 0.0425,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 7.778254508972168,
|
|
"kl": 1.0725902691483498,
|
|
"learning_rate": 4.6222222222222224e-05,
|
|
"loss": 0.0429,
|
|
"num_tokens": 211956.0,
|
|
"reward": -0.8444531559944153,
|
|
"reward_std": 2.7435097694396973,
|
|
"rewards/action_legal/mean": -0.25,
|
|
"rewards/action_legal/std": 0.5,
|
|
"rewards/belief_accuracy/mean": -0.0925000011920929,
|
|
"rewards/belief_accuracy/std": 0.015000000596046448,
|
|
"rewards/env_reward/mean": -0.3696354031562805,
|
|
"rewards/env_reward/std": 1.7566334009170532,
|
|
"rewards/format_valid/mean": 0.0,
|
|
"rewards/format_valid/std": 1.3540064096450806,
|
|
"step": 85
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.5,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 20.0,
|
|
"completions/mean_length": 25.5,
|
|
"completions/mean_terminated_length": 19.0,
|
|
"completions/min_length": 18.0,
|
|
"completions/min_terminated_length": 18.0,
|
|
"epoch": 0.043,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 4.498073101043701,
|
|
"kl": 0.7945144101977348,
|
|
"learning_rate": 4.6111111111111115e-05,
|
|
"loss": 0.0318,
|
|
"num_tokens": 214458.0,
|
|
"reward": -0.4258846640586853,
|
|
"reward_std": 0.525246798992157,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.07500000298023224,
|
|
"rewards/belief_accuracy/std": 0.05000000074505806,
|
|
"rewards/env_reward/mean": -0.15475642681121826,
|
|
"rewards/env_reward/std": 0.276262104511261,
|
|
"rewards/format_valid/mean": 0.625,
|
|
"rewards/format_valid/std": 0.25,
|
|
"step": 86
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.5,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 21.0,
|
|
"completions/mean_length": 23.5,
|
|
"completions/mean_terminated_length": 15.0,
|
|
"completions/min_length": 9.0,
|
|
"completions/min_terminated_length": 9.0,
|
|
"epoch": 0.0435,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 5.651465892791748,
|
|
"kl": 1.144854974001646,
|
|
"learning_rate": 4.600000000000001e-05,
|
|
"loss": 0.0458,
|
|
"num_tokens": 216952.0,
|
|
"reward": -2.7960398197174072,
|
|
"reward_std": 2.4879508018493652,
|
|
"rewards/action_legal/mean": -0.5,
|
|
"rewards/action_legal/std": 0.5773502588272095,
|
|
"rewards/belief_accuracy/mean": -0.10818149149417877,
|
|
"rewards/belief_accuracy/std": 0.016362976282835007,
|
|
"rewards/env_reward/mean": -1.610163688659668,
|
|
"rewards/env_reward/std": 1.6060127019882202,
|
|
"rewards/format_valid/mean": -0.625,
|
|
"rewards/format_valid/std": 1.6007810831069946,
|
|
"step": 87
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.25,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 26.0,
|
|
"completions/mean_length": 21.25,
|
|
"completions/mean_terminated_length": 17.666667938232422,
|
|
"completions/min_length": 11.0,
|
|
"completions/min_terminated_length": 11.0,
|
|
"epoch": 0.044,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 4.523915767669678,
|
|
"kl": 1.0343455001711845,
|
|
"learning_rate": 4.588888888888889e-05,
|
|
"loss": 0.0414,
|
|
"num_tokens": 219437.0,
|
|
"reward": -2.3472089767456055,
|
|
"reward_std": 3.005443811416626,
|
|
"rewards/action_legal/mean": -0.5,
|
|
"rewards/action_legal/std": 0.5773502588272095,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": -1.3231394290924072,
|
|
"rewards/env_reward/std": 1.9362717866897583,
|
|
"rewards/format_valid/mean": -0.75,
|
|
"rewards/format_valid/std": 1.4433757066726685,
|
|
"step": 88
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 32.0,
|
|
"completions/mean_length": 16.75,
|
|
"completions/mean_terminated_length": 16.75,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.0445,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 3.352187395095825,
|
|
"kl": 1.272004920989275,
|
|
"learning_rate": 4.577777777777778e-05,
|
|
"loss": 0.0509,
|
|
"num_tokens": 221904.0,
|
|
"reward": -1.3286750316619873,
|
|
"reward_std": 2.4145350456237793,
|
|
"rewards/action_legal/mean": -0.25,
|
|
"rewards/action_legal/std": 0.5,
|
|
"rewards/belief_accuracy/mean": -0.11666667461395264,
|
|
"rewards/belief_accuracy/std": 0.03333333507180214,
|
|
"rewards/env_reward/mean": -0.6441167593002319,
|
|
"rewards/env_reward/std": 1.5708537101745605,
|
|
"rewards/format_valid/mean": 0.0,
|
|
"rewards/format_valid/std": 1.3540064096450806,
|
|
"step": 89
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.25,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 20.0,
|
|
"completions/mean_length": 22.25,
|
|
"completions/mean_terminated_length": 19.0,
|
|
"completions/min_length": 18.0,
|
|
"completions/min_terminated_length": 18.0,
|
|
"epoch": 0.045,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 3.992652654647827,
|
|
"kl": 1.1837435215711594,
|
|
"learning_rate": 4.566666666666667e-05,
|
|
"loss": 0.0473,
|
|
"num_tokens": 224393.0,
|
|
"reward": -0.807397723197937,
|
|
"reward_std": 2.8079702854156494,
|
|
"rewards/action_legal/mean": -0.25,
|
|
"rewards/action_legal/std": 0.5,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": -0.3299318850040436,
|
|
"rewards/env_reward/std": 1.8133907318115234,
|
|
"rewards/format_valid/mean": 0.0,
|
|
"rewards/format_valid/std": 1.3540064096450806,
|
|
"step": 90
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 26.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 26.0,
|
|
"completions/max_terminated_length": 26.0,
|
|
"completions/mean_length": 19.25,
|
|
"completions/mean_terminated_length": 19.25,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.0455,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 5.360654354095459,
|
|
"kl": 1.3510248363018036,
|
|
"learning_rate": 4.555555555555556e-05,
|
|
"loss": 0.054,
|
|
"num_tokens": 226870.0,
|
|
"reward": 0.35237032175064087,
|
|
"reward_std": 1.0852247476577759,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": 0.418246865272522,
|
|
"rewards/env_reward/std": 0.723483145236969,
|
|
"rewards/format_valid/mean": 0.5,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 91
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.25,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 28.0,
|
|
"completions/mean_length": 22.75,
|
|
"completions/mean_terminated_length": 19.666667938232422,
|
|
"completions/min_length": 7.0,
|
|
"completions/min_terminated_length": 7.0,
|
|
"epoch": 0.046,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 3.634124517440796,
|
|
"kl": 0.8390218988060951,
|
|
"learning_rate": 4.5444444444444444e-05,
|
|
"loss": 0.0336,
|
|
"num_tokens": 229361.0,
|
|
"reward": -2.2453417778015137,
|
|
"reward_std": 3.167144298553467,
|
|
"rewards/action_legal/mean": -0.5,
|
|
"rewards/action_legal/std": 0.5773502588272095,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": -1.25522780418396,
|
|
"rewards/env_reward/std": 2.0450401306152344,
|
|
"rewards/format_valid/mean": -0.75,
|
|
"rewards/format_valid/std": 1.4433757066726685,
|
|
"step": 92
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 18.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 18.0,
|
|
"completions/max_terminated_length": 18.0,
|
|
"completions/mean_length": 14.25,
|
|
"completions/mean_terminated_length": 14.25,
|
|
"completions/min_length": 11.0,
|
|
"completions/min_terminated_length": 11.0,
|
|
"epoch": 0.0465,
|
|
"frac_reward_zero_std": 1.0,
|
|
"grad_norm": 0.345480352640152,
|
|
"kl": 1.7472785264253616,
|
|
"learning_rate": 4.5333333333333335e-05,
|
|
"loss": 0.0699,
|
|
"num_tokens": 231818.0,
|
|
"reward": 0.20606237649917603,
|
|
"reward_std": 0.0,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": 0.3207082748413086,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 0.5,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 93
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 32.0,
|
|
"completions/mean_length": 21.75,
|
|
"completions/mean_terminated_length": 21.75,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.047,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 4.007046222686768,
|
|
"kl": 1.7106561437249184,
|
|
"learning_rate": 4.522222222222223e-05,
|
|
"loss": 0.0684,
|
|
"num_tokens": 234305.0,
|
|
"reward": -1.3136588335037231,
|
|
"reward_std": 2.4297609329223633,
|
|
"rewards/action_legal/mean": -0.25,
|
|
"rewards/action_legal/std": 0.5,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": -0.6632725596427917,
|
|
"rewards/env_reward/std": 1.5616451501846313,
|
|
"rewards/format_valid/mean": -0.125,
|
|
"rewards/format_valid/std": 1.25,
|
|
"step": 94
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 15.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 15.0,
|
|
"completions/max_terminated_length": 15.0,
|
|
"completions/mean_length": 10.25,
|
|
"completions/mean_terminated_length": 10.25,
|
|
"completions/min_length": 5.0,
|
|
"completions/min_terminated_length": 5.0,
|
|
"epoch": 0.0475,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 6.215799808502197,
|
|
"kl": 2.4182121604681015,
|
|
"learning_rate": 4.511111111111112e-05,
|
|
"loss": 0.0967,
|
|
"num_tokens": 236746.0,
|
|
"reward": -1.4073553085327148,
|
|
"reward_std": 2.4502437114715576,
|
|
"rewards/action_legal/mean": -0.25,
|
|
"rewards/action_legal/std": 0.5,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": -0.7257369160652161,
|
|
"rewards/env_reward/std": 1.5773454904556274,
|
|
"rewards/format_valid/mean": -0.125,
|
|
"rewards/format_valid/std": 1.25,
|
|
"step": 95
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.25,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 20.0,
|
|
"completions/mean_length": 18.5,
|
|
"completions/mean_terminated_length": 14.0,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.048,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 2.8741865158081055,
|
|
"kl": 1.375985711812973,
|
|
"learning_rate": 4.5e-05,
|
|
"loss": 0.055,
|
|
"num_tokens": 239220.0,
|
|
"reward": -1.8366073369979858,
|
|
"reward_std": 2.075605630874634,
|
|
"rewards/action_legal/mean": -0.25,
|
|
"rewards/action_legal/std": 0.5,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": -1.0119048357009888,
|
|
"rewards/env_reward/std": 1.325404167175293,
|
|
"rewards/format_valid/mean": -0.125,
|
|
"rewards/format_valid/std": 1.25,
|
|
"step": 96
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.25,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 19.0,
|
|
"completions/mean_length": 17.5,
|
|
"completions/mean_terminated_length": 12.666666984558105,
|
|
"completions/min_length": 7.0,
|
|
"completions/min_terminated_length": 7.0,
|
|
"epoch": 0.0485,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 2.7931768894195557,
|
|
"kl": 1.1252032294869423,
|
|
"learning_rate": 4.4888888888888894e-05,
|
|
"loss": 0.045,
|
|
"num_tokens": 241690.0,
|
|
"reward": -0.21447324752807617,
|
|
"reward_std": 0.08092716336250305,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": 0.040351178497076035,
|
|
"rewards/env_reward/std": 0.053951445966959,
|
|
"rewards/format_valid/mean": 0.5,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 97
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 8.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 8.0,
|
|
"completions/max_terminated_length": 8.0,
|
|
"completions/mean_length": 8.0,
|
|
"completions/mean_terminated_length": 8.0,
|
|
"completions/min_length": 8.0,
|
|
"completions/min_terminated_length": 8.0,
|
|
"epoch": 0.049,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 3.6995668411254883,
|
|
"kl": 0.2477953266352415,
|
|
"learning_rate": 4.477777777777778e-05,
|
|
"loss": 0.0099,
|
|
"num_tokens": 243898.0,
|
|
"reward": 0.10004599392414093,
|
|
"reward_std": 0.12990380823612213,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.0,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": 0.03336399421095848,
|
|
"rewards/env_reward/std": 0.08660253882408142,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 98
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.5,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 23.0,
|
|
"completions/mean_length": 24.0,
|
|
"completions/mean_terminated_length": 16.0,
|
|
"completions/min_length": 9.0,
|
|
"completions/min_terminated_length": 9.0,
|
|
"epoch": 0.0495,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 3.9689114093780518,
|
|
"kl": 1.3716598898172379,
|
|
"learning_rate": 4.466666666666667e-05,
|
|
"loss": 0.0549,
|
|
"num_tokens": 246394.0,
|
|
"reward": 0.05885888263583183,
|
|
"reward_std": 0.17086723446846008,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.11083333194255829,
|
|
"rewards/belief_accuracy/std": 0.021666666492819786,
|
|
"rewards/env_reward/mean": 0.2400725781917572,
|
|
"rewards/env_reward/std": 0.08001596480607986,
|
|
"rewards/format_valid/mean": 0.625,
|
|
"rewards/format_valid/std": 0.25,
|
|
"step": 99
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 17.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 17.0,
|
|
"completions/max_terminated_length": 17.0,
|
|
"completions/mean_length": 11.25,
|
|
"completions/mean_terminated_length": 11.25,
|
|
"completions/min_length": 6.0,
|
|
"completions/min_terminated_length": 6.0,
|
|
"epoch": 0.05,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 6.792644500732422,
|
|
"kl": 1.8348833322525024,
|
|
"learning_rate": 4.4555555555555555e-05,
|
|
"loss": 0.0734,
|
|
"num_tokens": 248839.0,
|
|
"reward": 0.2527257204055786,
|
|
"reward_std": 0.15090236067771912,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": 0.35181713104248047,
|
|
"rewards/env_reward/std": 0.10060158371925354,
|
|
"rewards/format_valid/mean": 0.5,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 100
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.25,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 15.0,
|
|
"completions/mean_length": 17.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 9.0,
|
|
"completions/min_terminated_length": 9.0,
|
|
"epoch": 0.0505,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 2.8620173931121826,
|
|
"kl": 1.4036446511745453,
|
|
"learning_rate": 4.4444444444444447e-05,
|
|
"loss": 0.0561,
|
|
"num_tokens": 251307.0,
|
|
"reward": -1.31059730052948,
|
|
"reward_std": 2.4275147914886475,
|
|
"rewards/action_legal/mean": -0.25,
|
|
"rewards/action_legal/std": 0.5,
|
|
"rewards/belief_accuracy/mean": -0.1158333271741867,
|
|
"rewards/belief_accuracy/std": 0.03166666254401207,
|
|
"rewards/env_reward/mean": -0.6337315440177917,
|
|
"rewards/env_reward/std": 1.577512264251709,
|
|
"rewards/format_valid/mean": 0.0,
|
|
"rewards/format_valid/std": 1.3540064096450806,
|
|
"step": 101
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 32.0,
|
|
"completions/mean_length": 15.0,
|
|
"completions/mean_terminated_length": 15.0,
|
|
"completions/min_length": 7.0,
|
|
"completions/min_terminated_length": 7.0,
|
|
"epoch": 0.051,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 2.1169137954711914,
|
|
"kl": 1.8475644141435623,
|
|
"learning_rate": 4.433333333333334e-05,
|
|
"loss": 0.0739,
|
|
"num_tokens": 253767.0,
|
|
"reward": -1.3670077323913574,
|
|
"reward_std": 2.399441719055176,
|
|
"rewards/action_legal/mean": -0.25,
|
|
"rewards/action_legal/std": 0.5,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": -0.6988385319709778,
|
|
"rewards/env_reward/std": 1.5415664911270142,
|
|
"rewards/format_valid/mean": -0.125,
|
|
"rewards/format_valid/std": 1.25,
|
|
"step": 102
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 10.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 10.0,
|
|
"completions/max_terminated_length": 10.0,
|
|
"completions/mean_length": 7.75,
|
|
"completions/mean_terminated_length": 7.75,
|
|
"completions/min_length": 6.0,
|
|
"completions/min_terminated_length": 6.0,
|
|
"epoch": 0.0515,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 3.684938430786133,
|
|
"kl": 2.059985037893057,
|
|
"learning_rate": 4.422222222222222e-05,
|
|
"loss": 0.0824,
|
|
"num_tokens": 256198.0,
|
|
"reward": -1.4205896854400635,
|
|
"reward_std": 2.3623111248016357,
|
|
"rewards/action_legal/mean": -0.25,
|
|
"rewards/action_legal/std": 0.5,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": -0.7345598340034485,
|
|
"rewards/env_reward/std": 1.516780972480774,
|
|
"rewards/format_valid/mean": -0.125,
|
|
"rewards/format_valid/std": 1.25,
|
|
"step": 103
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 9.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 9.0,
|
|
"completions/max_terminated_length": 9.0,
|
|
"completions/mean_length": 8.5,
|
|
"completions/mean_terminated_length": 8.5,
|
|
"completions/min_length": 8.0,
|
|
"completions/min_terminated_length": 8.0,
|
|
"epoch": 0.052,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 3.5143520832061768,
|
|
"kl": 1.3098777011036873,
|
|
"learning_rate": 4.4111111111111114e-05,
|
|
"loss": 0.0524,
|
|
"num_tokens": 258632.0,
|
|
"reward": -0.0835796445608139,
|
|
"reward_std": 0.2586938738822937,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.0363982617855072,
|
|
"rewards/belief_accuracy/std": 0.11159241199493408,
|
|
"rewards/env_reward/mean": -0.016256578266620636,
|
|
"rewards/env_reward/std": 0.26623615622520447,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 104
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 17.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 17.0,
|
|
"completions/max_terminated_length": 17.0,
|
|
"completions/mean_length": 14.5,
|
|
"completions/mean_terminated_length": 14.5,
|
|
"completions/min_length": 9.0,
|
|
"completions/min_terminated_length": 9.0,
|
|
"epoch": 0.0525,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 6.2300004959106445,
|
|
"kl": 1.785375103354454,
|
|
"learning_rate": 4.4000000000000006e-05,
|
|
"loss": 0.0714,
|
|
"num_tokens": 261090.0,
|
|
"reward": -0.08198876678943634,
|
|
"reward_std": 0.8859658241271973,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": 0.12867416441440582,
|
|
"rewards/env_reward/std": 0.5906438827514648,
|
|
"rewards/format_valid/mean": 0.5,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 105
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.25,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 13.0,
|
|
"completions/mean_length": 16.25,
|
|
"completions/mean_terminated_length": 11.0,
|
|
"completions/min_length": 9.0,
|
|
"completions/min_terminated_length": 9.0,
|
|
"epoch": 0.053,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 2.8233327865600586,
|
|
"kl": 1.1763433814048767,
|
|
"learning_rate": 4.388888888888889e-05,
|
|
"loss": 0.0471,
|
|
"num_tokens": 263555.0,
|
|
"reward": -2.6301207542419434,
|
|
"reward_std": 2.697817087173462,
|
|
"rewards/action_legal/mean": -0.5,
|
|
"rewards/action_legal/std": 0.5773502588272095,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": -1.511747121810913,
|
|
"rewards/env_reward/std": 1.7316814661026,
|
|
"rewards/format_valid/mean": -0.75,
|
|
"rewards/format_valid/std": 1.4433757066726685,
|
|
"step": 106
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 28.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 28.0,
|
|
"completions/max_terminated_length": 28.0,
|
|
"completions/mean_length": 16.5,
|
|
"completions/mean_terminated_length": 16.5,
|
|
"completions/min_length": 11.0,
|
|
"completions/min_terminated_length": 11.0,
|
|
"epoch": 0.0535,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 5.215397357940674,
|
|
"kl": 1.5616333931684494,
|
|
"learning_rate": 4.377777777777778e-05,
|
|
"loss": 0.0625,
|
|
"num_tokens": 266021.0,
|
|
"reward": -0.27485907077789307,
|
|
"reward_std": 0.8942175507545471,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": 9.395182132720947e-05,
|
|
"rewards/env_reward/std": 0.5961450934410095,
|
|
"rewards/format_valid/mean": 0.5,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 107
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 16.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 16.0,
|
|
"completions/max_terminated_length": 16.0,
|
|
"completions/mean_length": 10.0,
|
|
"completions/mean_terminated_length": 10.0,
|
|
"completions/min_length": 6.0,
|
|
"completions/min_terminated_length": 6.0,
|
|
"epoch": 0.054,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 4.469447612762451,
|
|
"kl": 2.5851728469133377,
|
|
"learning_rate": 4.3666666666666666e-05,
|
|
"loss": 0.1034,
|
|
"num_tokens": 268461.0,
|
|
"reward": -1.1452138423919678,
|
|
"reward_std": 2.582923650741577,
|
|
"rewards/action_legal/mean": -0.25,
|
|
"rewards/action_legal/std": 0.5,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": -0.5509759187698364,
|
|
"rewards/env_reward/std": 1.6647002696990967,
|
|
"rewards/format_valid/mean": -0.125,
|
|
"rewards/format_valid/std": 1.25,
|
|
"step": 108
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 10.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 10.0,
|
|
"completions/max_terminated_length": 10.0,
|
|
"completions/mean_length": 9.25,
|
|
"completions/mean_terminated_length": 9.25,
|
|
"completions/min_length": 9.0,
|
|
"completions/min_terminated_length": 9.0,
|
|
"epoch": 0.0545,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 7.712915420532227,
|
|
"kl": 2.6475657522678375,
|
|
"learning_rate": 4.355555555555556e-05,
|
|
"loss": 0.1059,
|
|
"num_tokens": 270898.0,
|
|
"reward": 0.12694786489009857,
|
|
"reward_std": 0.002898484468460083,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": 0.26796525716781616,
|
|
"rewards/env_reward/std": 0.0019323229789733887,
|
|
"rewards/format_valid/mean": 0.5,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 109
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.25,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 18.0,
|
|
"completions/mean_length": 18.5,
|
|
"completions/mean_terminated_length": 14.0,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.055,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 7.97365140914917,
|
|
"kl": 2.896424412727356,
|
|
"learning_rate": 4.344444444444445e-05,
|
|
"loss": 0.1159,
|
|
"num_tokens": 273372.0,
|
|
"reward": -0.7330765128135681,
|
|
"reward_std": 0.20495304465293884,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.06111111119389534,
|
|
"rewards/belief_accuracy/std": 0.07777778059244156,
|
|
"rewards/env_reward/mean": -0.38732877373695374,
|
|
"rewards/env_reward/std": 0.12224072217941284,
|
|
"rewards/format_valid/mean": 0.625,
|
|
"rewards/format_valid/std": 0.25,
|
|
"step": 110
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 17.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 17.0,
|
|
"completions/max_terminated_length": 17.0,
|
|
"completions/mean_length": 10.5,
|
|
"completions/mean_terminated_length": 10.5,
|
|
"completions/min_length": 7.0,
|
|
"completions/min_terminated_length": 7.0,
|
|
"epoch": 0.0555,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 4.209381580352783,
|
|
"kl": 1.7662896811962128,
|
|
"learning_rate": 4.3333333333333334e-05,
|
|
"loss": 0.0707,
|
|
"num_tokens": 275814.0,
|
|
"reward": 0.027455374598503113,
|
|
"reward_std": 0.4914630353450775,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": 0.2016369253396988,
|
|
"rewards/env_reward/std": 0.3276420533657074,
|
|
"rewards/format_valid/mean": 0.5,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 111
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.25,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 16.0,
|
|
"completions/mean_length": 17.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 7.0,
|
|
"completions/min_terminated_length": 7.0,
|
|
"epoch": 0.056,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 3.021775960922241,
|
|
"kl": 2.54108564555645,
|
|
"learning_rate": 4.3222222222222226e-05,
|
|
"loss": 0.1016,
|
|
"num_tokens": 278282.0,
|
|
"reward": -0.2888333201408386,
|
|
"reward_std": 0.15247361361980438,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": -0.009222209453582764,
|
|
"rewards/env_reward/std": 0.10164907574653625,
|
|
"rewards/format_valid/mean": 0.5,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 112
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.25,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 21.0,
|
|
"completions/mean_length": 17.5,
|
|
"completions/mean_terminated_length": 12.666666984558105,
|
|
"completions/min_length": 8.0,
|
|
"completions/min_terminated_length": 8.0,
|
|
"epoch": 0.0565,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 4.275335311889648,
|
|
"kl": 1.6538867950439453,
|
|
"learning_rate": 4.311111111111111e-05,
|
|
"loss": 0.0662,
|
|
"num_tokens": 280752.0,
|
|
"reward": 0.437593936920166,
|
|
"reward_std": 0.39731013774871826,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": 0.47506263852119446,
|
|
"rewards/env_reward/std": 0.2648734450340271,
|
|
"rewards/format_valid/mean": 0.5,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 113
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 24.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 24.0,
|
|
"completions/max_terminated_length": 24.0,
|
|
"completions/mean_length": 13.0,
|
|
"completions/mean_terminated_length": 13.0,
|
|
"completions/min_length": 6.0,
|
|
"completions/min_terminated_length": 6.0,
|
|
"epoch": 0.057,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 6.433754920959473,
|
|
"kl": 1.647656962275505,
|
|
"learning_rate": 4.3e-05,
|
|
"loss": 0.0659,
|
|
"num_tokens": 283204.0,
|
|
"reward": -1.3276481628417969,
|
|
"reward_std": 2.414963722229004,
|
|
"rewards/action_legal/mean": -0.25,
|
|
"rewards/action_legal/std": 0.5,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": -0.6725987792015076,
|
|
"rewards/env_reward/std": 1.551644206047058,
|
|
"rewards/format_valid/mean": -0.125,
|
|
"rewards/format_valid/std": 1.25,
|
|
"step": 114
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.25,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 15.0,
|
|
"completions/mean_length": 15.75,
|
|
"completions/mean_terminated_length": 10.333333969116211,
|
|
"completions/min_length": 8.0,
|
|
"completions/min_terminated_length": 8.0,
|
|
"epoch": 0.0575,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 2.6330208778381348,
|
|
"kl": 1.515267439186573,
|
|
"learning_rate": 4.2888888888888886e-05,
|
|
"loss": 0.0606,
|
|
"num_tokens": 285667.0,
|
|
"reward": -3.7085390090942383,
|
|
"reward_std": 2.482921838760376,
|
|
"rewards/action_legal/mean": -0.75,
|
|
"rewards/action_legal/std": 0.5,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": -2.201525926589966,
|
|
"rewards/env_reward/std": 1.5969480276107788,
|
|
"rewards/format_valid/mean": -1.375,
|
|
"rewards/format_valid/std": 1.25,
|
|
"step": 115
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 24.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 24.0,
|
|
"completions/max_terminated_length": 24.0,
|
|
"completions/mean_length": 11.75,
|
|
"completions/mean_terminated_length": 11.75,
|
|
"completions/min_length": 6.0,
|
|
"completions/min_terminated_length": 6.0,
|
|
"epoch": 0.058,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 2.3021349906921387,
|
|
"kl": 1.7087249606847763,
|
|
"learning_rate": 4.277777777777778e-05,
|
|
"loss": 0.0683,
|
|
"num_tokens": 288114.0,
|
|
"reward": 0.5330584049224854,
|
|
"reward_std": 0.35837167501449585,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": 0.5387057065963745,
|
|
"rewards/env_reward/std": 0.23891450464725494,
|
|
"rewards/format_valid/mean": 0.5,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 116
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 8.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 8.0,
|
|
"completions/max_terminated_length": 8.0,
|
|
"completions/mean_length": 6.5,
|
|
"completions/mean_terminated_length": 6.5,
|
|
"completions/min_length": 5.0,
|
|
"completions/min_terminated_length": 5.0,
|
|
"epoch": 0.0585,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 5.497950553894043,
|
|
"kl": 2.1271141320466995,
|
|
"learning_rate": 4.266666666666667e-05,
|
|
"loss": 0.0851,
|
|
"num_tokens": 290540.0,
|
|
"reward": 0.4702581763267517,
|
|
"reward_std": 0.44036781787872314,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": 0.4968388080596924,
|
|
"rewards/env_reward/std": 0.293578565120697,
|
|
"rewards/format_valid/mean": 0.5,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 117
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.5,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 11.0,
|
|
"completions/mean_length": 20.75,
|
|
"completions/mean_terminated_length": 9.5,
|
|
"completions/min_length": 8.0,
|
|
"completions/min_terminated_length": 8.0,
|
|
"epoch": 0.059,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 4.789796829223633,
|
|
"kl": 1.4464631527662277,
|
|
"learning_rate": 4.255555555555556e-05,
|
|
"loss": 0.0579,
|
|
"num_tokens": 293023.0,
|
|
"reward": -0.7475403547286987,
|
|
"reward_std": 2.8080575466156006,
|
|
"rewards/action_legal/mean": -0.25,
|
|
"rewards/action_legal/std": 0.5,
|
|
"rewards/belief_accuracy/mean": -0.0703703761100769,
|
|
"rewards/belief_accuracy/std": 0.059259265661239624,
|
|
"rewards/env_reward/mean": -0.3492862284183502,
|
|
"rewards/env_reward/std": 1.7720966339111328,
|
|
"rewards/format_valid/mean": 0.0,
|
|
"rewards/format_valid/std": 1.3540064096450806,
|
|
"step": 118
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 12.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 9.25,
|
|
"completions/mean_terminated_length": 9.25,
|
|
"completions/min_length": 6.0,
|
|
"completions/min_terminated_length": 6.0,
|
|
"epoch": 0.0595,
|
|
"frac_reward_zero_std": 1.0,
|
|
"grad_norm": 1.5602530241012573,
|
|
"kl": 1.6833766214549541,
|
|
"learning_rate": 4.2444444444444445e-05,
|
|
"loss": 0.0673,
|
|
"num_tokens": 295460.0,
|
|
"reward": -0.42602595686912537,
|
|
"reward_std": 0.0,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": -0.10068397223949432,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 0.5,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 119
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 24.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 24.0,
|
|
"completions/max_terminated_length": 24.0,
|
|
"completions/mean_length": 11.75,
|
|
"completions/mean_terminated_length": 11.75,
|
|
"completions/min_length": 6.0,
|
|
"completions/min_terminated_length": 6.0,
|
|
"epoch": 0.06,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 4.7347092628479,
|
|
"kl": 1.445710226893425,
|
|
"learning_rate": 4.233333333333334e-05,
|
|
"loss": 0.0578,
|
|
"num_tokens": 297907.0,
|
|
"reward": -1.0152814388275146,
|
|
"reward_std": 2.623145580291748,
|
|
"rewards/action_legal/mean": -0.25,
|
|
"rewards/action_legal/std": 0.5,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": -0.4643542170524597,
|
|
"rewards/env_reward/std": 1.6904305219650269,
|
|
"rewards/format_valid/mean": -0.125,
|
|
"rewards/format_valid/std": 1.25,
|
|
"step": 120
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 16.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 16.0,
|
|
"completions/max_terminated_length": 16.0,
|
|
"completions/mean_length": 11.25,
|
|
"completions/mean_terminated_length": 11.25,
|
|
"completions/min_length": 6.0,
|
|
"completions/min_terminated_length": 6.0,
|
|
"epoch": 0.0605,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 6.493462085723877,
|
|
"kl": 1.4399118982255459,
|
|
"learning_rate": 4.222222222222222e-05,
|
|
"loss": 0.0576,
|
|
"num_tokens": 300352.0,
|
|
"reward": -0.20259986817836761,
|
|
"reward_std": 0.06754998862743378,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": 0.048266757279634476,
|
|
"rewards/env_reward/std": 0.04503332078456879,
|
|
"rewards/format_valid/mean": 0.5,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 121
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.25,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 21.0,
|
|
"completions/mean_length": 18.25,
|
|
"completions/mean_terminated_length": 13.666666984558105,
|
|
"completions/min_length": 9.0,
|
|
"completions/min_terminated_length": 9.0,
|
|
"epoch": 0.061,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 2.4330313205718994,
|
|
"kl": 1.4334093481302261,
|
|
"learning_rate": 4.211111111111111e-05,
|
|
"loss": 0.0573,
|
|
"num_tokens": 302825.0,
|
|
"reward": -0.5075480937957764,
|
|
"reward_std": 0.6749432682991028,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": -0.15503208339214325,
|
|
"rewards/env_reward/std": 0.44996219873428345,
|
|
"rewards/format_valid/mean": 0.5,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 122
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 23.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 23.0,
|
|
"completions/max_terminated_length": 23.0,
|
|
"completions/mean_length": 16.5,
|
|
"completions/mean_terminated_length": 16.5,
|
|
"completions/min_length": 13.0,
|
|
"completions/min_terminated_length": 13.0,
|
|
"epoch": 0.0615,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 4.5208353996276855,
|
|
"kl": 1.7931447178125381,
|
|
"learning_rate": 4.2e-05,
|
|
"loss": 0.0717,
|
|
"num_tokens": 305291.0,
|
|
"reward": -2.456667900085449,
|
|
"reward_std": 2.895482063293457,
|
|
"rewards/action_legal/mean": -0.5,
|
|
"rewards/action_legal/std": 0.5773502588272095,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": -1.3961119651794434,
|
|
"rewards/env_reward/std": 1.863360047340393,
|
|
"rewards/format_valid/mean": -0.75,
|
|
"rewards/format_valid/std": 1.4433757066726685,
|
|
"step": 123
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.5,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 10.0,
|
|
"completions/mean_length": 21.0,
|
|
"completions/mean_terminated_length": 10.0,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.062,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 2.7689969539642334,
|
|
"kl": 0.8019402623176575,
|
|
"learning_rate": 4.188888888888889e-05,
|
|
"loss": 0.0321,
|
|
"num_tokens": 307775.0,
|
|
"reward": -1.4924118518829346,
|
|
"reward_std": 2.360426187515259,
|
|
"rewards/action_legal/mean": -0.25,
|
|
"rewards/action_legal/std": 0.5,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": -0.782441258430481,
|
|
"rewards/env_reward/std": 1.5167045593261719,
|
|
"rewards/format_valid/mean": -0.125,
|
|
"rewards/format_valid/std": 1.25,
|
|
"step": 124
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 8.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 8.0,
|
|
"completions/max_terminated_length": 8.0,
|
|
"completions/mean_length": 7.0,
|
|
"completions/mean_terminated_length": 7.0,
|
|
"completions/min_length": 6.0,
|
|
"completions/min_terminated_length": 6.0,
|
|
"epoch": 0.0625,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 3.6446032524108887,
|
|
"kl": 1.4752652198076248,
|
|
"learning_rate": 4.177777777777778e-05,
|
|
"loss": 0.059,
|
|
"num_tokens": 310203.0,
|
|
"reward": 0.8273366689682007,
|
|
"reward_std": 0.6383920311927795,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": 0.7348911762237549,
|
|
"rewards/env_reward/std": 0.4255947172641754,
|
|
"rewards/format_valid/mean": 0.5,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 125
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 21.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 21.0,
|
|
"completions/max_terminated_length": 21.0,
|
|
"completions/mean_length": 12.5,
|
|
"completions/mean_terminated_length": 12.5,
|
|
"completions/min_length": 5.0,
|
|
"completions/min_terminated_length": 5.0,
|
|
"epoch": 0.063,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 5.112602233886719,
|
|
"kl": 1.6532337069511414,
|
|
"learning_rate": 4.166666666666667e-05,
|
|
"loss": 0.0661,
|
|
"num_tokens": 312653.0,
|
|
"reward": -0.10410824418067932,
|
|
"reward_std": 0.01273045688867569,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": 0.11392784118652344,
|
|
"rewards/env_reward/std": 0.008486974984407425,
|
|
"rewards/format_valid/mean": 0.5,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 126
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 26.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 26.0,
|
|
"completions/max_terminated_length": 26.0,
|
|
"completions/mean_length": 14.25,
|
|
"completions/mean_terminated_length": 14.25,
|
|
"completions/min_length": 8.0,
|
|
"completions/min_terminated_length": 8.0,
|
|
"epoch": 0.0635,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 4.351461887359619,
|
|
"kl": 2.1480718851089478,
|
|
"learning_rate": 4.155555555555556e-05,
|
|
"loss": 0.0859,
|
|
"num_tokens": 315110.0,
|
|
"reward": -0.3739127516746521,
|
|
"reward_std": 0.14415279030799866,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": -0.06594181805849075,
|
|
"rewards/env_reward/std": 0.09610186517238617,
|
|
"rewards/format_valid/mean": 0.5,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 127
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 30.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 30.0,
|
|
"completions/max_terminated_length": 30.0,
|
|
"completions/mean_length": 18.5,
|
|
"completions/mean_terminated_length": 18.5,
|
|
"completions/min_length": 11.0,
|
|
"completions/min_terminated_length": 11.0,
|
|
"epoch": 0.064,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 3.8212032318115234,
|
|
"kl": 1.534121721982956,
|
|
"learning_rate": 4.144444444444445e-05,
|
|
"loss": 0.0614,
|
|
"num_tokens": 317584.0,
|
|
"reward": -0.19062533974647522,
|
|
"reward_std": 0.3150945007801056,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.0893678218126297,
|
|
"rewards/belief_accuracy/std": 0.021264348179101944,
|
|
"rewards/env_reward/mean": 0.03081876039505005,
|
|
"rewards/env_reward/std": 0.24891482293605804,
|
|
"rewards/format_valid/mean": 0.625,
|
|
"rewards/format_valid/std": 0.25,
|
|
"step": 128
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 16.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 16.0,
|
|
"completions/max_terminated_length": 16.0,
|
|
"completions/mean_length": 11.25,
|
|
"completions/mean_terminated_length": 11.25,
|
|
"completions/min_length": 8.0,
|
|
"completions/min_terminated_length": 8.0,
|
|
"epoch": 0.0645,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 4.938203811645508,
|
|
"kl": 2.1303387582302094,
|
|
"learning_rate": 4.133333333333333e-05,
|
|
"loss": 0.0852,
|
|
"num_tokens": 320029.0,
|
|
"reward": -1.1215572357177734,
|
|
"reward_std": 2.5629398822784424,
|
|
"rewards/action_legal/mean": -0.25,
|
|
"rewards/action_legal/std": 0.5,
|
|
"rewards/belief_accuracy/mean": -0.06083333492279053,
|
|
"rewards/belief_accuracy/std": 0.07833334058523178,
|
|
"rewards/env_reward/mean": -0.6177048683166504,
|
|
"rewards/env_reward/std": 1.588196873664856,
|
|
"rewards/format_valid/mean": 0.0,
|
|
"rewards/format_valid/std": 1.3540064096450806,
|
|
"step": 129
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 13.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 13.0,
|
|
"completions/max_terminated_length": 13.0,
|
|
"completions/mean_length": 10.5,
|
|
"completions/mean_terminated_length": 10.5,
|
|
"completions/min_length": 4.0,
|
|
"completions/min_terminated_length": 4.0,
|
|
"epoch": 0.065,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 5.400176048278809,
|
|
"kl": 1.8505046516656876,
|
|
"learning_rate": 4.1222222222222224e-05,
|
|
"loss": 0.074,
|
|
"num_tokens": 322471.0,
|
|
"reward": -0.12519629299640656,
|
|
"reward_std": 0.14092496037483215,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": 0.09986913949251175,
|
|
"rewards/env_reward/std": 0.09394997358322144,
|
|
"rewards/format_valid/mean": 0.5,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 130
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 31.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 31.0,
|
|
"completions/max_terminated_length": 31.0,
|
|
"completions/mean_length": 17.25,
|
|
"completions/mean_terminated_length": 17.25,
|
|
"completions/min_length": 5.0,
|
|
"completions/min_terminated_length": 5.0,
|
|
"epoch": 0.0655,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 3.194300413131714,
|
|
"kl": 1.2793779149651527,
|
|
"learning_rate": 4.111111111111111e-05,
|
|
"loss": 0.0512,
|
|
"num_tokens": 324940.0,
|
|
"reward": -1.0001001358032227,
|
|
"reward_std": 2.700178623199463,
|
|
"rewards/action_legal/mean": -0.25,
|
|
"rewards/action_legal/std": 0.5,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": -0.45423343777656555,
|
|
"rewards/env_reward/std": 1.743279218673706,
|
|
"rewards/format_valid/mean": -0.125,
|
|
"rewards/format_valid/std": 1.25,
|
|
"step": 131
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 22.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 22.0,
|
|
"completions/max_terminated_length": 22.0,
|
|
"completions/mean_length": 11.25,
|
|
"completions/mean_terminated_length": 11.25,
|
|
"completions/min_length": 5.0,
|
|
"completions/min_terminated_length": 5.0,
|
|
"epoch": 0.066,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 4.973412036895752,
|
|
"kl": 1.9231543093919754,
|
|
"learning_rate": 4.1e-05,
|
|
"loss": 0.0769,
|
|
"num_tokens": 327385.0,
|
|
"reward": -1.7763125896453857,
|
|
"reward_std": 2.129706382751465,
|
|
"rewards/action_legal/mean": -0.25,
|
|
"rewards/action_legal/std": 0.5,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": -0.9717084169387817,
|
|
"rewards/env_reward/std": 1.3618686199188232,
|
|
"rewards/format_valid/mean": -0.125,
|
|
"rewards/format_valid/std": 1.25,
|
|
"step": 132
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.25,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 8.0,
|
|
"completions/mean_length": 13.25,
|
|
"completions/mean_terminated_length": 7.0,
|
|
"completions/min_length": 6.0,
|
|
"completions/min_terminated_length": 6.0,
|
|
"epoch": 0.0665,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 2.070490598678589,
|
|
"kl": 1.150221362709999,
|
|
"learning_rate": 4.088888888888889e-05,
|
|
"loss": 0.046,
|
|
"num_tokens": 329838.0,
|
|
"reward": 0.4534025192260742,
|
|
"reward_std": 1.0941553115844727,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": 0.4856016933917999,
|
|
"rewards/env_reward/std": 0.7294369339942932,
|
|
"rewards/format_valid/mean": 0.5,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 133
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.25,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 18.0,
|
|
"completions/mean_length": 15.75,
|
|
"completions/mean_terminated_length": 10.333333969116211,
|
|
"completions/min_length": 6.0,
|
|
"completions/min_terminated_length": 6.0,
|
|
"epoch": 0.067,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 1.878156065940857,
|
|
"kl": 1.4795889034867287,
|
|
"learning_rate": 4.0777777777777783e-05,
|
|
"loss": 0.0592,
|
|
"num_tokens": 332301.0,
|
|
"reward": -2.0403780937194824,
|
|
"reward_std": 3.3793559074401855,
|
|
"rewards/action_legal/mean": -0.5,
|
|
"rewards/action_legal/std": 0.5773502588272095,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": -1.1185853481292725,
|
|
"rewards/env_reward/std": 2.1859495639801025,
|
|
"rewards/format_valid/mean": -0.75,
|
|
"rewards/format_valid/std": 1.4433757066726685,
|
|
"step": 134
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 22.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 22.0,
|
|
"completions/max_terminated_length": 22.0,
|
|
"completions/mean_length": 11.0,
|
|
"completions/mean_terminated_length": 11.0,
|
|
"completions/min_length": 6.0,
|
|
"completions/min_terminated_length": 6.0,
|
|
"epoch": 0.0675,
|
|
"frac_reward_zero_std": 1.0,
|
|
"grad_norm": 0.1908632516860962,
|
|
"kl": 1.9114599525928497,
|
|
"learning_rate": 4.066666666666667e-05,
|
|
"loss": 0.0765,
|
|
"num_tokens": 334745.0,
|
|
"reward": 0.5449367761611938,
|
|
"reward_std": 0.0,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": 0.5466245412826538,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 0.5,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 135
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.25,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 15.0,
|
|
"completions/mean_length": 16.0,
|
|
"completions/mean_terminated_length": 10.666666984558105,
|
|
"completions/min_length": 7.0,
|
|
"completions/min_terminated_length": 7.0,
|
|
"epoch": 0.068,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 3.5480082035064697,
|
|
"kl": 1.5377977713942528,
|
|
"learning_rate": 4.055555555555556e-05,
|
|
"loss": 0.0615,
|
|
"num_tokens": 337209.0,
|
|
"reward": -1.2076761722564697,
|
|
"reward_std": 2.4953103065490723,
|
|
"rewards/action_legal/mean": -0.25,
|
|
"rewards/action_legal/std": 0.5,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": -0.5926175117492676,
|
|
"rewards/env_reward/std": 1.6052173376083374,
|
|
"rewards/format_valid/mean": -0.125,
|
|
"rewards/format_valid/std": 1.25,
|
|
"step": 136
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.25,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 9.0,
|
|
"completions/mean_length": 14.0,
|
|
"completions/mean_terminated_length": 8.0,
|
|
"completions/min_length": 7.0,
|
|
"completions/min_terminated_length": 7.0,
|
|
"epoch": 0.0685,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 1.0419105291366577,
|
|
"kl": 1.1934361532330513,
|
|
"learning_rate": 4.0444444444444444e-05,
|
|
"loss": 0.0477,
|
|
"num_tokens": 339665.0,
|
|
"reward": -1.1166263818740845,
|
|
"reward_std": 2.606243848800659,
|
|
"rewards/action_legal/mean": -0.25,
|
|
"rewards/action_legal/std": 0.5,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": -0.5319175720214844,
|
|
"rewards/env_reward/std": 1.6803356409072876,
|
|
"rewards/format_valid/mean": -0.125,
|
|
"rewards/format_valid/std": 1.25,
|
|
"step": 137
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.25,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 17.0,
|
|
"completions/mean_length": 18.25,
|
|
"completions/mean_terminated_length": 13.666666984558105,
|
|
"completions/min_length": 9.0,
|
|
"completions/min_terminated_length": 9.0,
|
|
"epoch": 0.069,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 3.009521007537842,
|
|
"kl": 2.4323032796382904,
|
|
"learning_rate": 4.0333333333333336e-05,
|
|
"loss": 0.0973,
|
|
"num_tokens": 342138.0,
|
|
"reward": 0.6563852429389954,
|
|
"reward_std": 0.8735789656639099,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.057500001043081284,
|
|
"rewards/belief_accuracy/std": 0.08499999344348907,
|
|
"rewards/env_reward/mean": 0.5317568182945251,
|
|
"rewards/env_reward/std": 0.6170323491096497,
|
|
"rewards/format_valid/mean": 0.625,
|
|
"rewards/format_valid/std": 0.25,
|
|
"step": 138
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 15.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 15.0,
|
|
"completions/max_terminated_length": 15.0,
|
|
"completions/mean_length": 8.75,
|
|
"completions/mean_terminated_length": 8.75,
|
|
"completions/min_length": 3.0,
|
|
"completions/min_terminated_length": 3.0,
|
|
"epoch": 0.0695,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 3.7213194370269775,
|
|
"kl": 3.0655910074710846,
|
|
"learning_rate": 4.022222222222222e-05,
|
|
"loss": 0.1226,
|
|
"num_tokens": 344573.0,
|
|
"reward": 0.5249032378196716,
|
|
"reward_std": 0.1243140697479248,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": 0.5332688093185425,
|
|
"rewards/env_reward/std": 0.0828760415315628,
|
|
"rewards/format_valid/mean": 0.5,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 139
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 11.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 11.0,
|
|
"completions/max_terminated_length": 11.0,
|
|
"completions/mean_length": 8.25,
|
|
"completions/mean_terminated_length": 8.25,
|
|
"completions/min_length": 5.0,
|
|
"completions/min_terminated_length": 5.0,
|
|
"epoch": 0.07,
|
|
"frac_reward_zero_std": 1.0,
|
|
"grad_norm": 0.32563015818595886,
|
|
"kl": 2.0985984057188034,
|
|
"learning_rate": 4.011111111111111e-05,
|
|
"loss": 0.0839,
|
|
"num_tokens": 347006.0,
|
|
"reward": -0.07693907618522644,
|
|
"reward_std": 0.0,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": 0.1320406198501587,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 0.5,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 140
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 32.0,
|
|
"completions/mean_length": 14.75,
|
|
"completions/mean_terminated_length": 14.75,
|
|
"completions/min_length": 8.0,
|
|
"completions/min_terminated_length": 8.0,
|
|
"epoch": 0.0705,
|
|
"frac_reward_zero_std": 1.0,
|
|
"grad_norm": 0.0964287742972374,
|
|
"kl": 1.2693939208984375,
|
|
"learning_rate": 4e-05,
|
|
"loss": 0.0508,
|
|
"num_tokens": 349465.0,
|
|
"reward": -0.02015012502670288,
|
|
"reward_std": 0.0,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": 0.16989992558956146,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 0.5,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 141
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.25,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 25.0,
|
|
"completions/mean_length": 21.25,
|
|
"completions/mean_terminated_length": 17.666667938232422,
|
|
"completions/min_length": 11.0,
|
|
"completions/min_terminated_length": 11.0,
|
|
"epoch": 0.071,
|
|
"frac_reward_zero_std": 1.0,
|
|
"grad_norm": 0.09562604129314423,
|
|
"kl": 1.128716617822647,
|
|
"learning_rate": 3.9888888888888895e-05,
|
|
"loss": 0.0451,
|
|
"num_tokens": 351950.0,
|
|
"reward": 0.7584548592567444,
|
|
"reward_std": 0.0,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": 0.6889699697494507,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 0.5,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 142
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.75,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 9.0,
|
|
"completions/mean_length": 26.25,
|
|
"completions/mean_terminated_length": 9.0,
|
|
"completions/min_length": 9.0,
|
|
"completions/min_terminated_length": 9.0,
|
|
"epoch": 0.0715,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 4.745733261108398,
|
|
"kl": 0.7463721930980682,
|
|
"learning_rate": 3.977777777777778e-05,
|
|
"loss": 0.0299,
|
|
"num_tokens": 354455.0,
|
|
"reward": 0.3327590823173523,
|
|
"reward_std": 0.23017629981040955,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": 0.405172735452652,
|
|
"rewards/env_reward/std": 0.1534508764743805,
|
|
"rewards/format_valid/mean": 0.5,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 143
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.5,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 15.0,
|
|
"completions/mean_length": 22.75,
|
|
"completions/mean_terminated_length": 13.5,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 0.072,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 4.888156890869141,
|
|
"kl": 1.4033671617507935,
|
|
"learning_rate": 3.966666666666667e-05,
|
|
"loss": 0.0561,
|
|
"num_tokens": 356946.0,
|
|
"reward": -0.13394379615783691,
|
|
"reward_std": 0.41236963868141174,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": 0.09403747320175171,
|
|
"rewards/env_reward/std": 0.27491310238838196,
|
|
"rewards/format_valid/mean": 0.5,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 144
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 11.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 11.0,
|
|
"completions/max_terminated_length": 11.0,
|
|
"completions/mean_length": 8.75,
|
|
"completions/mean_terminated_length": 8.75,
|
|
"completions/min_length": 8.0,
|
|
"completions/min_terminated_length": 8.0,
|
|
"epoch": 0.0725,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 3.65702486038208,
|
|
"kl": 1.6139360815286636,
|
|
"learning_rate": 3.9555555555555556e-05,
|
|
"loss": 0.0646,
|
|
"num_tokens": 359381.0,
|
|
"reward": -1.7238547801971436,
|
|
"reward_std": 2.1609649658203125,
|
|
"rewards/action_legal/mean": -0.25,
|
|
"rewards/action_legal/std": 0.5,
|
|
"rewards/belief_accuracy/mean": -0.12072296440601349,
|
|
"rewards/belief_accuracy/std": 0.07126190513372421,
|
|
"rewards/env_reward/mean": -0.9077907204627991,
|
|
"rewards/env_reward/std": 1.3948062658309937,
|
|
"rewards/format_valid/mean": 0.25,
|
|
"rewards/format_valid/std": 1.5,
|
|
"step": 145
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.25,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 16.0,
|
|
"completions/mean_length": 18.5,
|
|
"completions/mean_terminated_length": 14.0,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.073,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 2.8972437381744385,
|
|
"kl": 1.160056695342064,
|
|
"learning_rate": 3.944444444444445e-05,
|
|
"loss": 0.0464,
|
|
"num_tokens": 361855.0,
|
|
"reward": 0.13612942397594452,
|
|
"reward_std": 0.02916666865348816,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.0972222238779068,
|
|
"rewards/belief_accuracy/std": 0.0055555556900799274,
|
|
"rewards/env_reward/mean": 0.2643640637397766,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 0.625,
|
|
"rewards/format_valid/std": 0.25,
|
|
"step": 146
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 24.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 24.0,
|
|
"completions/max_terminated_length": 24.0,
|
|
"completions/mean_length": 19.0,
|
|
"completions/mean_terminated_length": 19.0,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.0735,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 4.382909774780273,
|
|
"kl": 1.568796619772911,
|
|
"learning_rate": 3.933333333333333e-05,
|
|
"loss": 0.0628,
|
|
"num_tokens": 364331.0,
|
|
"reward": 0.800137996673584,
|
|
"reward_std": 0.3069959282875061,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": 0.7167587280273438,
|
|
"rewards/env_reward/std": 0.2046639323234558,
|
|
"rewards/format_valid/mean": 0.5,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 147
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.5,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 26.0,
|
|
"completions/mean_length": 25.5,
|
|
"completions/mean_terminated_length": 19.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 0.074,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 8.64746379852295,
|
|
"kl": 0.8096725344657898,
|
|
"learning_rate": 3.922222222222223e-05,
|
|
"loss": 0.0324,
|
|
"num_tokens": 366833.0,
|
|
"reward": -0.15847638249397278,
|
|
"reward_std": 1.3163249492645264,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": 0.07768243551254272,
|
|
"rewards/env_reward/std": 0.877549946308136,
|
|
"rewards/format_valid/mean": 0.5,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 148
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 18.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 18.0,
|
|
"completions/max_terminated_length": 18.0,
|
|
"completions/mean_length": 12.75,
|
|
"completions/mean_terminated_length": 12.75,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.0745,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 5.473668098449707,
|
|
"kl": 2.7600976526737213,
|
|
"learning_rate": 3.9111111111111115e-05,
|
|
"loss": 0.1104,
|
|
"num_tokens": 369284.0,
|
|
"reward": -2.9569857120513916,
|
|
"reward_std": 2.3198444843292236,
|
|
"rewards/action_legal/mean": -0.5,
|
|
"rewards/action_legal/std": 0.5773502588272095,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": -1.7296571731567383,
|
|
"rewards/env_reward/std": 1.4797673225402832,
|
|
"rewards/format_valid/mean": -0.75,
|
|
"rewards/format_valid/std": 1.4433757066726685,
|
|
"step": 149
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 31.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 31.0,
|
|
"completions/max_terminated_length": 31.0,
|
|
"completions/mean_length": 21.5,
|
|
"completions/mean_terminated_length": 21.5,
|
|
"completions/min_length": 14.0,
|
|
"completions/min_terminated_length": 14.0,
|
|
"epoch": 0.075,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 3.437213897705078,
|
|
"kl": 1.6173148602247238,
|
|
"learning_rate": 3.9000000000000006e-05,
|
|
"loss": 0.0647,
|
|
"num_tokens": 371770.0,
|
|
"reward": 0.5740416049957275,
|
|
"reward_std": 0.233365997672081,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.13980931043624878,
|
|
"rewards/belief_accuracy/std": 0.07961863279342651,
|
|
"rewards/env_reward/mean": 0.6414797306060791,
|
|
"rewards/env_reward/std": 0.1132500022649765,
|
|
"rewards/format_valid/mean": 0.625,
|
|
"rewards/format_valid/std": 0.25,
|
|
"step": 150
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.25,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 22.0,
|
|
"completions/mean_length": 20.75,
|
|
"completions/mean_terminated_length": 17.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 0.0755,
|
|
"frac_reward_zero_std": 1.0,
|
|
"grad_norm": 0.2341485172510147,
|
|
"kl": 1.3624602407217026,
|
|
"learning_rate": 3.888888888888889e-05,
|
|
"loss": 0.0545,
|
|
"num_tokens": 374253.0,
|
|
"reward": 0.2103109359741211,
|
|
"reward_std": 0.0,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": 0.3235406279563904,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 0.5,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 151
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.25,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 30.0,
|
|
"completions/mean_length": 25.5,
|
|
"completions/mean_terminated_length": 23.33333396911621,
|
|
"completions/min_length": 15.0,
|
|
"completions/min_terminated_length": 15.0,
|
|
"epoch": 0.076,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 3.359372615814209,
|
|
"kl": 1.1529072970151901,
|
|
"learning_rate": 3.877777777777778e-05,
|
|
"loss": 0.0461,
|
|
"num_tokens": 376755.0,
|
|
"reward": 0.4146992564201355,
|
|
"reward_std": 0.46390998363494873,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": 0.45979946851730347,
|
|
"rewards/env_reward/std": 0.3092733323574066,
|
|
"rewards/format_valid/mean": 0.5,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 152
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 17.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 17.0,
|
|
"completions/max_terminated_length": 17.0,
|
|
"completions/mean_length": 14.0,
|
|
"completions/mean_terminated_length": 14.0,
|
|
"completions/min_length": 11.0,
|
|
"completions/min_terminated_length": 11.0,
|
|
"epoch": 0.0765,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 5.655915260314941,
|
|
"kl": 1.3275744514539838,
|
|
"learning_rate": 3.866666666666667e-05,
|
|
"loss": 0.0531,
|
|
"num_tokens": 379211.0,
|
|
"reward": -0.0012441501021385193,
|
|
"reward_std": 0.24833057820796967,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.23028355836868286,
|
|
"rewards/belief_accuracy/std": 0.09518812596797943,
|
|
"rewards/env_reward/mean": 0.42640429735183716,
|
|
"rewards/env_reward/std": 0.10868140310049057,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 153
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 28.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 28.0,
|
|
"completions/max_terminated_length": 28.0,
|
|
"completions/mean_length": 22.0,
|
|
"completions/mean_terminated_length": 22.0,
|
|
"completions/min_length": 15.0,
|
|
"completions/min_terminated_length": 15.0,
|
|
"epoch": 0.077,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 3.993246555328369,
|
|
"kl": 1.3614933341741562,
|
|
"learning_rate": 3.855555555555556e-05,
|
|
"loss": 0.0545,
|
|
"num_tokens": 381699.0,
|
|
"reward": -0.3912268280982971,
|
|
"reward_std": 3.040301561355591,
|
|
"rewards/action_legal/mean": -0.25,
|
|
"rewards/action_legal/std": 0.5,
|
|
"rewards/belief_accuracy/mean": -0.11666667461395264,
|
|
"rewards/belief_accuracy/std": 0.03333333507180214,
|
|
"rewards/env_reward/mean": -0.01915118098258972,
|
|
"rewards/env_reward/std": 1.9872325658798218,
|
|
"rewards/format_valid/mean": 0.0,
|
|
"rewards/format_valid/std": 1.3540064096450806,
|
|
"step": 154
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.25,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 27.0,
|
|
"completions/mean_length": 20.75,
|
|
"completions/mean_terminated_length": 17.0,
|
|
"completions/min_length": 11.0,
|
|
"completions/min_terminated_length": 11.0,
|
|
"epoch": 0.0775,
|
|
"frac_reward_zero_std": 1.0,
|
|
"grad_norm": 0.0937976986169815,
|
|
"kl": 1.2733041644096375,
|
|
"learning_rate": 3.844444444444444e-05,
|
|
"loss": 0.0509,
|
|
"num_tokens": 384182.0,
|
|
"reward": -0.1396826207637787,
|
|
"reward_std": 0.0,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": 0.09021158516407013,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 0.5,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 155
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.25,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 27.0,
|
|
"completions/mean_length": 23.75,
|
|
"completions/mean_terminated_length": 21.0,
|
|
"completions/min_length": 13.0,
|
|
"completions/min_terminated_length": 13.0,
|
|
"epoch": 0.078,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 4.676441669464111,
|
|
"kl": 0.9261074624955654,
|
|
"learning_rate": 3.8333333333333334e-05,
|
|
"loss": 0.037,
|
|
"num_tokens": 386677.0,
|
|
"reward": -1.3367525339126587,
|
|
"reward_std": 2.411214828491211,
|
|
"rewards/action_legal/mean": -0.25,
|
|
"rewards/action_legal/std": 0.5,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": -0.6786683797836304,
|
|
"rewards/env_reward/std": 1.5492030382156372,
|
|
"rewards/format_valid/mean": -0.125,
|
|
"rewards/format_valid/std": 1.25,
|
|
"step": 156
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 8.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 8.0,
|
|
"completions/max_terminated_length": 8.0,
|
|
"completions/mean_length": 8.0,
|
|
"completions/mean_terminated_length": 8.0,
|
|
"completions/min_length": 8.0,
|
|
"completions/min_terminated_length": 8.0,
|
|
"epoch": 0.0785,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 2.1922104358673096,
|
|
"kl": 0.0721854604780674,
|
|
"learning_rate": 3.8222222222222226e-05,
|
|
"loss": 0.0029,
|
|
"num_tokens": 389109.0,
|
|
"reward": 0.8821967244148254,
|
|
"reward_std": 0.15713486075401306,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.31481480598449707,
|
|
"rewards/belief_accuracy/std": 0.052378278225660324,
|
|
"rewards/env_reward/mean": 1.1844274997711182,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 157
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 30.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 30.0,
|
|
"completions/max_terminated_length": 30.0,
|
|
"completions/mean_length": 16.5,
|
|
"completions/mean_terminated_length": 16.5,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.079,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 4.775229454040527,
|
|
"kl": 1.4617139548063278,
|
|
"learning_rate": 3.811111111111112e-05,
|
|
"loss": 0.0585,
|
|
"num_tokens": 391575.0,
|
|
"reward": -0.6259194612503052,
|
|
"reward_std": 0.5253891348838806,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.10740740597248077,
|
|
"rewards/belief_accuracy/std": 0.014814812690019608,
|
|
"rewards/env_reward/mean": -0.22329813241958618,
|
|
"rewards/env_reward/std": 0.3683049976825714,
|
|
"rewards/format_valid/mean": 0.625,
|
|
"rewards/format_valid/std": 0.25,
|
|
"step": 158
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.25,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 24.0,
|
|
"completions/mean_length": 21.75,
|
|
"completions/mean_terminated_length": 18.33333396911621,
|
|
"completions/min_length": 9.0,
|
|
"completions/min_terminated_length": 9.0,
|
|
"epoch": 0.0795,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 2.7022299766540527,
|
|
"kl": 1.3450734540820122,
|
|
"learning_rate": 3.8e-05,
|
|
"loss": 0.0538,
|
|
"num_tokens": 394062.0,
|
|
"reward": -1.487056851387024,
|
|
"reward_std": 2.3527774810791016,
|
|
"rewards/action_legal/mean": -0.25,
|
|
"rewards/action_legal/std": 0.5,
|
|
"rewards/belief_accuracy/mean": -0.0871676579117775,
|
|
"rewards/belief_accuracy/std": 0.025664685294032097,
|
|
"rewards/env_reward/mean": -0.8087027072906494,
|
|
"rewards/env_reward/std": 1.5039740800857544,
|
|
"rewards/format_valid/mean": 0.0,
|
|
"rewards/format_valid/std": 1.3540064096450806,
|
|
"step": 159
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.25,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 32.0,
|
|
"completions/mean_length": 23.75,
|
|
"completions/mean_terminated_length": 21.0,
|
|
"completions/min_length": 13.0,
|
|
"completions/min_terminated_length": 13.0,
|
|
"epoch": 0.08,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 4.860130310058594,
|
|
"kl": 1.0795547626912594,
|
|
"learning_rate": 3.7888888888888894e-05,
|
|
"loss": 0.0432,
|
|
"num_tokens": 396557.0,
|
|
"reward": -0.01648128777742386,
|
|
"reward_std": 0.3920603096485138,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": 0.1723458170890808,
|
|
"rewards/env_reward/std": 0.2613735496997833,
|
|
"rewards/format_valid/mean": 0.5,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 160
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.25,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 27.0,
|
|
"completions/mean_length": 22.0,
|
|
"completions/mean_terminated_length": 18.666667938232422,
|
|
"completions/min_length": 6.0,
|
|
"completions/min_terminated_length": 6.0,
|
|
"epoch": 0.0805,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 3.8319482803344727,
|
|
"kl": 1.2551886662840843,
|
|
"learning_rate": 3.777777777777778e-05,
|
|
"loss": 0.0502,
|
|
"num_tokens": 399045.0,
|
|
"reward": 0.17859038710594177,
|
|
"reward_std": 0.8184104561805725,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": 0.3023936152458191,
|
|
"rewards/env_reward/std": 0.5456069707870483,
|
|
"rewards/format_valid/mean": 0.5,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 161
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.75,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 18.0,
|
|
"completions/mean_length": 28.5,
|
|
"completions/mean_terminated_length": 18.0,
|
|
"completions/min_length": 18.0,
|
|
"completions/min_terminated_length": 18.0,
|
|
"epoch": 0.081,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 2.120696783065796,
|
|
"kl": 0.8586160615086555,
|
|
"learning_rate": 3.766666666666667e-05,
|
|
"loss": 0.0343,
|
|
"num_tokens": 401559.0,
|
|
"reward": 0.4954003691673279,
|
|
"reward_std": 0.3572309911251068,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.10083333402872086,
|
|
"rewards/belief_accuracy/std": 0.0016666651936247945,
|
|
"rewards/env_reward/mean": 0.5111002922058105,
|
|
"rewards/env_reward/std": 0.24086414277553558,
|
|
"rewards/format_valid/mean": 0.625,
|
|
"rewards/format_valid/std": 0.25,
|
|
"step": 162
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 21.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 21.0,
|
|
"completions/max_terminated_length": 21.0,
|
|
"completions/mean_length": 15.75,
|
|
"completions/mean_terminated_length": 15.75,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.0815,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 8.276159286499023,
|
|
"kl": 2.0177499651908875,
|
|
"learning_rate": 3.7555555555555554e-05,
|
|
"loss": 0.0807,
|
|
"num_tokens": 404022.0,
|
|
"reward": -0.14451055228710175,
|
|
"reward_std": 0.07916668057441711,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.08888889104127884,
|
|
"rewards/belief_accuracy/std": 0.02222222276031971,
|
|
"rewards/env_reward/mean": 0.06060408055782318,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 0.625,
|
|
"rewards/format_valid/std": 0.25,
|
|
"step": 163
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 22.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 22.0,
|
|
"completions/max_terminated_length": 22.0,
|
|
"completions/mean_length": 15.75,
|
|
"completions/mean_terminated_length": 15.75,
|
|
"completions/min_length": 8.0,
|
|
"completions/min_terminated_length": 8.0,
|
|
"epoch": 0.082,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 5.653500080108643,
|
|
"kl": 1.8464947640895844,
|
|
"learning_rate": 3.7444444444444446e-05,
|
|
"loss": 0.0739,
|
|
"num_tokens": 406485.0,
|
|
"reward": 0.27019041776657104,
|
|
"reward_std": 0.23719999194145203,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.11416666209697723,
|
|
"rewards/belief_accuracy/std": 0.028333332389593124,
|
|
"rewards/env_reward/mean": 0.3876269459724426,
|
|
"rewards/env_reward/std": 0.10980000346899033,
|
|
"rewards/format_valid/mean": 0.625,
|
|
"rewards/format_valid/std": 0.25,
|
|
"step": 164
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.25,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 25.0,
|
|
"completions/mean_length": 22.5,
|
|
"completions/mean_terminated_length": 19.33333396911621,
|
|
"completions/min_length": 15.0,
|
|
"completions/min_terminated_length": 15.0,
|
|
"epoch": 0.0825,
|
|
"frac_reward_zero_std": 1.0,
|
|
"grad_norm": 0.24756693840026855,
|
|
"kl": 1.6211326867341995,
|
|
"learning_rate": 3.733333333333334e-05,
|
|
"loss": 0.0648,
|
|
"num_tokens": 408975.0,
|
|
"reward": 0.03895732760429382,
|
|
"reward_std": 0.0,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": 0.20930489897727966,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 0.5,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 165
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.25,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 22.0,
|
|
"completions/mean_length": 22.25,
|
|
"completions/mean_terminated_length": 19.0,
|
|
"completions/min_length": 13.0,
|
|
"completions/min_terminated_length": 13.0,
|
|
"epoch": 0.083,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 3.162034034729004,
|
|
"kl": 1.216068983078003,
|
|
"learning_rate": 3.722222222222222e-05,
|
|
"loss": 0.0486,
|
|
"num_tokens": 411464.0,
|
|
"reward": -0.5613082051277161,
|
|
"reward_std": 0.08749997615814209,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.11666667461395264,
|
|
"rewards/belief_accuracy/std": 0.03333333507180214,
|
|
"rewards/env_reward/mean": -0.16170544922351837,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 0.625,
|
|
"rewards/format_valid/std": 0.25,
|
|
"step": 166
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 13.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 13.0,
|
|
"completions/max_terminated_length": 13.0,
|
|
"completions/mean_length": 10.75,
|
|
"completions/mean_terminated_length": 10.75,
|
|
"completions/min_length": 8.0,
|
|
"completions/min_terminated_length": 8.0,
|
|
"epoch": 0.0835,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 6.356938362121582,
|
|
"kl": 2.8624762892723083,
|
|
"learning_rate": 3.7111111111111113e-05,
|
|
"loss": 0.1145,
|
|
"num_tokens": 413907.0,
|
|
"reward": 0.14997538924217224,
|
|
"reward_std": 0.7540647387504578,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": 0.2833169400691986,
|
|
"rewards/env_reward/std": 0.5027098655700684,
|
|
"rewards/format_valid/mean": 0.5,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 167
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.25,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 23.0,
|
|
"completions/mean_length": 23.75,
|
|
"completions/mean_terminated_length": 21.0,
|
|
"completions/min_length": 19.0,
|
|
"completions/min_terminated_length": 19.0,
|
|
"epoch": 0.084,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 4.261693477630615,
|
|
"kl": 1.933813601732254,
|
|
"learning_rate": 3.7e-05,
|
|
"loss": 0.0774,
|
|
"num_tokens": 416402.0,
|
|
"reward": -0.05508837103843689,
|
|
"reward_std": 0.11999882757663727,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.09027226269245148,
|
|
"rewards/belief_accuracy/std": 0.019455470144748688,
|
|
"rewards/env_reward/mean": 0.12298562377691269,
|
|
"rewards/env_reward/std": 0.08219999819993973,
|
|
"rewards/format_valid/mean": 0.625,
|
|
"rewards/format_valid/std": 0.25,
|
|
"step": 168
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 30.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 30.0,
|
|
"completions/max_terminated_length": 30.0,
|
|
"completions/mean_length": 21.25,
|
|
"completions/mean_terminated_length": 21.25,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 0.0845,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 5.13167142868042,
|
|
"kl": 2.3502594381570816,
|
|
"learning_rate": 3.688888888888889e-05,
|
|
"loss": 0.094,
|
|
"num_tokens": 418887.0,
|
|
"reward": 0.03691243380308151,
|
|
"reward_std": 0.08749999105930328,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.11666667461395264,
|
|
"rewards/belief_accuracy/std": 0.03333333507180214,
|
|
"rewards/env_reward/mean": 0.2371082901954651,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 0.625,
|
|
"rewards/format_valid/std": 0.25,
|
|
"step": 169
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.25,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 20.0,
|
|
"completions/mean_length": 19.5,
|
|
"completions/mean_terminated_length": 15.333333969116211,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 0.085,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 3.1629037857055664,
|
|
"kl": 1.5842487215995789,
|
|
"learning_rate": 3.677777777777778e-05,
|
|
"loss": 0.0634,
|
|
"num_tokens": 421365.0,
|
|
"reward": 0.5254287719726562,
|
|
"reward_std": 0.11898240447044373,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": 0.533619225025177,
|
|
"rewards/env_reward/std": 0.07932159304618835,
|
|
"rewards/format_valid/mean": 0.5,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 170
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.5,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 13.0,
|
|
"completions/mean_length": 21.5,
|
|
"completions/mean_terminated_length": 11.0,
|
|
"completions/min_length": 9.0,
|
|
"completions/min_terminated_length": 9.0,
|
|
"epoch": 0.0855,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 3.2538228034973145,
|
|
"kl": 1.3759911209344864,
|
|
"learning_rate": 3.6666666666666666e-05,
|
|
"loss": 0.055,
|
|
"num_tokens": 423851.0,
|
|
"reward": 1.0300487279891968,
|
|
"reward_std": 0.04658208787441254,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.1066666692495346,
|
|
"rewards/belief_accuracy/std": 0.013333332724869251,
|
|
"rewards/env_reward/mean": 0.8791991472244263,
|
|
"rewards/env_reward/std": 0.01968872733414173,
|
|
"rewards/format_valid/mean": 0.625,
|
|
"rewards/format_valid/std": 0.25,
|
|
"step": 171
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.25,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 14.0,
|
|
"completions/mean_length": 17.75,
|
|
"completions/mean_terminated_length": 13.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 0.086,
|
|
"frac_reward_zero_std": 1.0,
|
|
"grad_norm": 0.15501928329467773,
|
|
"kl": 1.7516742050647736,
|
|
"learning_rate": 3.655555555555556e-05,
|
|
"loss": 0.0701,
|
|
"num_tokens": 426322.0,
|
|
"reward": 0.030182331800460815,
|
|
"reward_std": 0.0,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": 0.2034548968076706,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 0.5,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 172
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 15.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 15.0,
|
|
"completions/max_terminated_length": 15.0,
|
|
"completions/mean_length": 10.25,
|
|
"completions/mean_terminated_length": 10.25,
|
|
"completions/min_length": 8.0,
|
|
"completions/min_terminated_length": 8.0,
|
|
"epoch": 0.0865,
|
|
"frac_reward_zero_std": 1.0,
|
|
"grad_norm": 37.33258056640625,
|
|
"kl": 8.386772617697716,
|
|
"learning_rate": 3.644444444444445e-05,
|
|
"loss": 0.3355,
|
|
"num_tokens": 428763.0,
|
|
"reward": -0.9025059342384338,
|
|
"reward_std": 0.0,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": -0.41833725571632385,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 0.5,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 173
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.25,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 27.0,
|
|
"completions/mean_length": 18.5,
|
|
"completions/mean_terminated_length": 14.0,
|
|
"completions/min_length": 7.0,
|
|
"completions/min_terminated_length": 7.0,
|
|
"epoch": 0.087,
|
|
"frac_reward_zero_std": 1.0,
|
|
"grad_norm": 0.22313672304153442,
|
|
"kl": 1.8691215515136719,
|
|
"learning_rate": 3.633333333333333e-05,
|
|
"loss": 0.0748,
|
|
"num_tokens": 431237.0,
|
|
"reward": -0.16483666002750397,
|
|
"reward_std": 0.0,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": 0.07344222813844681,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 0.5,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 174
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 27.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 27.0,
|
|
"completions/max_terminated_length": 27.0,
|
|
"completions/mean_length": 18.25,
|
|
"completions/mean_terminated_length": 18.25,
|
|
"completions/min_length": 9.0,
|
|
"completions/min_terminated_length": 9.0,
|
|
"epoch": 0.0875,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 4.1793341636657715,
|
|
"kl": 2.593918561935425,
|
|
"learning_rate": 3.6222222222222225e-05,
|
|
"loss": 0.1038,
|
|
"num_tokens": 433710.0,
|
|
"reward": -0.9837551116943359,
|
|
"reward_std": 2.6553149223327637,
|
|
"rewards/action_legal/mean": -0.25,
|
|
"rewards/action_legal/std": 0.5,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": -0.44333672523498535,
|
|
"rewards/env_reward/std": 1.712130069732666,
|
|
"rewards/format_valid/mean": -0.125,
|
|
"rewards/format_valid/std": 1.25,
|
|
"step": 175
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.25,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 20.0,
|
|
"completions/mean_length": 18.5,
|
|
"completions/mean_terminated_length": 14.0,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.088,
|
|
"frac_reward_zero_std": 1.0,
|
|
"grad_norm": 0.10414294898509979,
|
|
"kl": 1.4019053727388382,
|
|
"learning_rate": 3.611111111111111e-05,
|
|
"loss": 0.0561,
|
|
"num_tokens": 436184.0,
|
|
"reward": -0.12919571995735168,
|
|
"reward_std": 0.0,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": 0.0972028523683548,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 0.5,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 176
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.25,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 20.0,
|
|
"completions/mean_length": 18.25,
|
|
"completions/mean_terminated_length": 13.666666984558105,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.0885,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 3.139967441558838,
|
|
"kl": 1.0769911333918571,
|
|
"learning_rate": 3.6e-05,
|
|
"loss": 0.0431,
|
|
"num_tokens": 438657.0,
|
|
"reward": 0.6967830657958984,
|
|
"reward_std": 0.08670443296432495,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": 0.6478554010391235,
|
|
"rewards/env_reward/std": 0.05780297517776489,
|
|
"rewards/format_valid/mean": 0.5,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 177
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.25,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 16.0,
|
|
"completions/mean_length": 16.0,
|
|
"completions/mean_terminated_length": 10.666666984558105,
|
|
"completions/min_length": 6.0,
|
|
"completions/min_terminated_length": 6.0,
|
|
"epoch": 0.089,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 3.0620081424713135,
|
|
"kl": 1.0732092261314392,
|
|
"learning_rate": 3.5888888888888886e-05,
|
|
"loss": 0.0429,
|
|
"num_tokens": 441121.0,
|
|
"reward": -1.2789283990859985,
|
|
"reward_std": 2.598663568496704,
|
|
"rewards/action_legal/mean": -0.25,
|
|
"rewards/action_legal/std": 0.5,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": -0.6401189565658569,
|
|
"rewards/env_reward/std": 1.6776195764541626,
|
|
"rewards/format_valid/mean": -0.125,
|
|
"rewards/format_valid/std": 1.25,
|
|
"step": 178
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.75,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 18.0,
|
|
"completions/mean_length": 28.5,
|
|
"completions/mean_terminated_length": 18.0,
|
|
"completions/min_length": 18.0,
|
|
"completions/min_terminated_length": 18.0,
|
|
"epoch": 0.0895,
|
|
"frac_reward_zero_std": 1.0,
|
|
"grad_norm": 0.18922147154808044,
|
|
"kl": 0.8863924369215965,
|
|
"learning_rate": 3.577777777777778e-05,
|
|
"loss": 0.0355,
|
|
"num_tokens": 443635.0,
|
|
"reward": 1.347588062286377,
|
|
"reward_std": 0.0,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": 1.0817253589630127,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 0.5,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 179
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 27.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 27.0,
|
|
"completions/max_terminated_length": 27.0,
|
|
"completions/mean_length": 16.5,
|
|
"completions/mean_terminated_length": 16.5,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.09,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 3.3570539951324463,
|
|
"kl": 1.9235362261533737,
|
|
"learning_rate": 3.566666666666667e-05,
|
|
"loss": 0.0769,
|
|
"num_tokens": 446101.0,
|
|
"reward": 0.05883501470088959,
|
|
"reward_std": 0.5488622784614563,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": 0.2225566804409027,
|
|
"rewards/env_reward/std": 0.3659081757068634,
|
|
"rewards/format_valid/mean": 0.5,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 180
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.25,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 25.0,
|
|
"completions/mean_length": 21.0,
|
|
"completions/mean_terminated_length": 17.33333396911621,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.0905,
|
|
"frac_reward_zero_std": 1.0,
|
|
"grad_norm": 0.08723417669534683,
|
|
"kl": 1.3284604251384735,
|
|
"learning_rate": 3.555555555555556e-05,
|
|
"loss": 0.0531,
|
|
"num_tokens": 448585.0,
|
|
"reward": 0.6743147373199463,
|
|
"reward_std": 0.0,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": 0.6328765153884888,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 0.5,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 181
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.25,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 13.0,
|
|
"completions/mean_length": 15.25,
|
|
"completions/mean_terminated_length": 9.666666984558105,
|
|
"completions/min_length": 7.0,
|
|
"completions/min_terminated_length": 7.0,
|
|
"epoch": 0.091,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 3.3371059894561768,
|
|
"kl": 1.4546705782413483,
|
|
"learning_rate": 3.5444444444444445e-05,
|
|
"loss": 0.0582,
|
|
"num_tokens": 451046.0,
|
|
"reward": -0.3032863438129425,
|
|
"reward_std": 0.33148258924484253,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": -0.018857555463910103,
|
|
"rewards/env_reward/std": 0.22098839282989502,
|
|
"rewards/format_valid/mean": 0.5,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 182
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.5,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 19.0,
|
|
"completions/mean_length": 24.5,
|
|
"completions/mean_terminated_length": 17.0,
|
|
"completions/min_length": 15.0,
|
|
"completions/min_terminated_length": 15.0,
|
|
"epoch": 0.0915,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 2.911696195602417,
|
|
"kl": 1.4696582406759262,
|
|
"learning_rate": 3.5333333333333336e-05,
|
|
"loss": 0.0588,
|
|
"num_tokens": 453544.0,
|
|
"reward": -2.4303359985351562,
|
|
"reward_std": 2.9513607025146484,
|
|
"rewards/action_legal/mean": -0.5,
|
|
"rewards/action_legal/std": 0.5773502588272095,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": -1.3785573244094849,
|
|
"rewards/env_reward/std": 1.9012062549591064,
|
|
"rewards/format_valid/mean": -0.75,
|
|
"rewards/format_valid/std": 1.4433757066726685,
|
|
"step": 183
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.25,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 15.0,
|
|
"completions/mean_length": 18.0,
|
|
"completions/mean_terminated_length": 13.333333969116211,
|
|
"completions/min_length": 11.0,
|
|
"completions/min_terminated_length": 11.0,
|
|
"epoch": 0.092,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 3.6932320594787598,
|
|
"kl": 1.533248096704483,
|
|
"learning_rate": 3.522222222222222e-05,
|
|
"loss": 0.0613,
|
|
"num_tokens": 456016.0,
|
|
"reward": -1.5931193828582764,
|
|
"reward_std": 3.8826847076416016,
|
|
"rewards/action_legal/mean": -0.5,
|
|
"rewards/action_legal/std": 0.5773502588272095,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": -0.8204129934310913,
|
|
"rewards/env_reward/std": 2.521214485168457,
|
|
"rewards/format_valid/mean": -0.75,
|
|
"rewards/format_valid/std": 1.4433757066726685,
|
|
"step": 184
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 17.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 17.0,
|
|
"completions/max_terminated_length": 17.0,
|
|
"completions/mean_length": 12.25,
|
|
"completions/mean_terminated_length": 12.25,
|
|
"completions/min_length": 8.0,
|
|
"completions/min_terminated_length": 8.0,
|
|
"epoch": 0.0925,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 4.075229167938232,
|
|
"kl": 1.968793198466301,
|
|
"learning_rate": 3.511111111111111e-05,
|
|
"loss": 0.0788,
|
|
"num_tokens": 458465.0,
|
|
"reward": -0.23590603470802307,
|
|
"reward_std": 0.2219301015138626,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": 0.026062656193971634,
|
|
"rewards/env_reward/std": 0.14795339107513428,
|
|
"rewards/format_valid/mean": 0.5,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 185
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 10.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 10.0,
|
|
"completions/max_terminated_length": 10.0,
|
|
"completions/mean_length": 8.25,
|
|
"completions/mean_terminated_length": 8.25,
|
|
"completions/min_length": 6.0,
|
|
"completions/min_terminated_length": 6.0,
|
|
"epoch": 0.093,
|
|
"frac_reward_zero_std": 1.0,
|
|
"grad_norm": 2.784778594970703,
|
|
"kl": 2.159162014722824,
|
|
"learning_rate": 3.5e-05,
|
|
"loss": 0.0864,
|
|
"num_tokens": 460898.0,
|
|
"reward": -0.3167000114917755,
|
|
"reward_std": 0.0,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": -0.027799999341368675,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 0.5,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 186
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.5,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 30.0,
|
|
"completions/mean_length": 29.25,
|
|
"completions/mean_terminated_length": 26.5,
|
|
"completions/min_length": 23.0,
|
|
"completions/min_terminated_length": 23.0,
|
|
"epoch": 0.0935,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 3.384214162826538,
|
|
"kl": 1.2219679579138756,
|
|
"learning_rate": 3.4888888888888895e-05,
|
|
"loss": 0.0489,
|
|
"num_tokens": 463415.0,
|
|
"reward": -1.0766644477844238,
|
|
"reward_std": 2.582223653793335,
|
|
"rewards/action_legal/mean": -0.25,
|
|
"rewards/action_legal/std": 0.5,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": -0.5052763223648071,
|
|
"rewards/env_reward/std": 1.6631492376327515,
|
|
"rewards/format_valid/mean": -0.125,
|
|
"rewards/format_valid/std": 1.25,
|
|
"step": 187
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 24.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 24.0,
|
|
"completions/max_terminated_length": 24.0,
|
|
"completions/mean_length": 14.75,
|
|
"completions/mean_terminated_length": 14.75,
|
|
"completions/min_length": 9.0,
|
|
"completions/min_terminated_length": 9.0,
|
|
"epoch": 0.094,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 3.3013856410980225,
|
|
"kl": 1.2444797977805138,
|
|
"learning_rate": 3.477777777777778e-05,
|
|
"loss": 0.0498,
|
|
"num_tokens": 465874.0,
|
|
"reward": -2.264209508895874,
|
|
"reward_std": 3.1193079948425293,
|
|
"rewards/action_legal/mean": -0.5,
|
|
"rewards/action_legal/std": 0.5773502588272095,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": -1.2678064107894897,
|
|
"rewards/env_reward/std": 2.0125834941864014,
|
|
"rewards/format_valid/mean": -0.75,
|
|
"rewards/format_valid/std": 1.4433757066726685,
|
|
"step": 188
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.25,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 19.0,
|
|
"completions/mean_length": 17.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 5.0,
|
|
"completions/min_terminated_length": 5.0,
|
|
"epoch": 0.0945,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 2.398057460784912,
|
|
"kl": 1.1671398282051086,
|
|
"learning_rate": 3.466666666666667e-05,
|
|
"loss": 0.0467,
|
|
"num_tokens": 468342.0,
|
|
"reward": -0.32055363059043884,
|
|
"reward_std": 0.08688756823539734,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": -0.03036908432841301,
|
|
"rewards/env_reward/std": 0.05792504921555519,
|
|
"rewards/format_valid/mean": 0.5,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 189
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.25,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 8.0,
|
|
"completions/mean_length": 13.5,
|
|
"completions/mean_terminated_length": 7.333333492279053,
|
|
"completions/min_length": 7.0,
|
|
"completions/min_terminated_length": 7.0,
|
|
"epoch": 0.095,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 2.1980197429656982,
|
|
"kl": 1.4500057846307755,
|
|
"learning_rate": 3.4555555555555556e-05,
|
|
"loss": 0.058,
|
|
"num_tokens": 470796.0,
|
|
"reward": 0.38532906770706177,
|
|
"reward_std": 0.15841148793697357,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": 0.4402194023132324,
|
|
"rewards/env_reward/std": 0.10560767352581024,
|
|
"rewards/format_valid/mean": 0.5,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 190
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.5,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 15.0,
|
|
"completions/mean_length": 23.25,
|
|
"completions/mean_terminated_length": 14.5,
|
|
"completions/min_length": 14.0,
|
|
"completions/min_terminated_length": 14.0,
|
|
"epoch": 0.0955,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 2.9431092739105225,
|
|
"kl": 1.4747809767723083,
|
|
"learning_rate": 3.444444444444445e-05,
|
|
"loss": 0.059,
|
|
"num_tokens": 473289.0,
|
|
"reward": -1.0166206359863281,
|
|
"reward_std": 0.02063235081732273,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": -0.49441370368003845,
|
|
"rewards/env_reward/std": 0.0137548903003335,
|
|
"rewards/format_valid/mean": 0.5,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 191
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.25,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 21.0,
|
|
"completions/mean_length": 17.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 6.0,
|
|
"completions/min_terminated_length": 6.0,
|
|
"epoch": 0.096,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 7.505349159240723,
|
|
"kl": 1.3912545293569565,
|
|
"learning_rate": 3.433333333333333e-05,
|
|
"loss": 0.0557,
|
|
"num_tokens": 475757.0,
|
|
"reward": -0.6259548664093018,
|
|
"reward_std": 2.8915553092956543,
|
|
"rewards/action_legal/mean": -0.25,
|
|
"rewards/action_legal/std": 0.5,
|
|
"rewards/belief_accuracy/mean": -0.08980958163738251,
|
|
"rewards/belief_accuracy/std": 0.02038082852959633,
|
|
"rewards/env_reward/mean": -0.22935077548027039,
|
|
"rewards/env_reward/std": 1.8575823307037354,
|
|
"rewards/format_valid/mean": 0.0,
|
|
"rewards/format_valid/std": 1.3540064096450806,
|
|
"step": 192
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.25,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 18.0,
|
|
"completions/mean_length": 20.5,
|
|
"completions/mean_terminated_length": 16.666667938232422,
|
|
"completions/min_length": 15.0,
|
|
"completions/min_terminated_length": 15.0,
|
|
"epoch": 0.0965,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 2.4287314414978027,
|
|
"kl": 1.0645422227680683,
|
|
"learning_rate": 3.4222222222222224e-05,
|
|
"loss": 0.0426,
|
|
"num_tokens": 478239.0,
|
|
"reward": -1.1361416578292847,
|
|
"reward_std": 2.5584716796875,
|
|
"rewards/action_legal/mean": -0.25,
|
|
"rewards/action_legal/std": 0.5,
|
|
"rewards/belief_accuracy/mean": -0.18546631932258606,
|
|
"rewards/belief_accuracy/std": 0.059523556381464005,
|
|
"rewards/env_reward/mean": -0.3864951729774475,
|
|
"rewards/env_reward/std": 1.7521181106567383,
|
|
"rewards/format_valid/mean": 0.25,
|
|
"rewards/format_valid/std": 1.5,
|
|
"step": 193
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.75,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 13.0,
|
|
"completions/mean_length": 27.25,
|
|
"completions/mean_terminated_length": 13.0,
|
|
"completions/min_length": 13.0,
|
|
"completions/min_terminated_length": 13.0,
|
|
"epoch": 0.097,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 2.376044511795044,
|
|
"kl": 0.6852857172489166,
|
|
"learning_rate": 3.411111111111111e-05,
|
|
"loss": 0.0274,
|
|
"num_tokens": 480748.0,
|
|
"reward": 0.5333235263824463,
|
|
"reward_std": 0.08749997615814209,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.11666667461395264,
|
|
"rewards/belief_accuracy/std": 0.03333333507180214,
|
|
"rewards/env_reward/mean": 0.5680490136146545,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 0.625,
|
|
"rewards/format_valid/std": 0.25,
|
|
"step": 194
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 23.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 23.0,
|
|
"completions/max_terminated_length": 23.0,
|
|
"completions/mean_length": 13.75,
|
|
"completions/mean_terminated_length": 13.75,
|
|
"completions/min_length": 9.0,
|
|
"completions/min_terminated_length": 9.0,
|
|
"epoch": 0.0975,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 3.4659929275512695,
|
|
"kl": 1.7528847455978394,
|
|
"learning_rate": 3.4000000000000007e-05,
|
|
"loss": 0.0701,
|
|
"num_tokens": 483203.0,
|
|
"reward": 1.018233299255371,
|
|
"reward_std": 0.009551048278808594,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": 0.862155556678772,
|
|
"rewards/env_reward/std": 0.006367385853081942,
|
|
"rewards/format_valid/mean": 0.5,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 195
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.75,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 27.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 0.098,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 7.004715442657471,
|
|
"kl": 0.813011210411787,
|
|
"learning_rate": 3.388888888888889e-05,
|
|
"loss": 0.0325,
|
|
"num_tokens": 485711.0,
|
|
"reward": -2.4635062217712402,
|
|
"reward_std": 2.8720784187316895,
|
|
"rewards/action_legal/mean": -0.5,
|
|
"rewards/action_legal/std": 0.5773502588272095,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": -1.4006710052490234,
|
|
"rewards/env_reward/std": 1.8473838567733765,
|
|
"rewards/format_valid/mean": -0.75,
|
|
"rewards/format_valid/std": 1.4433757066726685,
|
|
"step": 196
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.75,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 9.0,
|
|
"completions/mean_length": 26.25,
|
|
"completions/mean_terminated_length": 9.0,
|
|
"completions/min_length": 9.0,
|
|
"completions/min_terminated_length": 9.0,
|
|
"epoch": 0.0985,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 3.282201051712036,
|
|
"kl": 0.7132957000285387,
|
|
"learning_rate": 3.377777777777778e-05,
|
|
"loss": 0.0285,
|
|
"num_tokens": 488216.0,
|
|
"reward": -3.926431894302368,
|
|
"reward_std": 2.047135829925537,
|
|
"rewards/action_legal/mean": -0.75,
|
|
"rewards/action_legal/std": 0.5,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": -2.346787929534912,
|
|
"rewards/env_reward/std": 1.3064239025115967,
|
|
"rewards/format_valid/mean": -1.375,
|
|
"rewards/format_valid/std": 1.25,
|
|
"step": 197
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.5,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 13.0,
|
|
"completions/mean_length": 22.25,
|
|
"completions/mean_terminated_length": 12.5,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 0.099,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 1.8574674129486084,
|
|
"kl": 1.1133069694042206,
|
|
"learning_rate": 3.366666666666667e-05,
|
|
"loss": 0.0445,
|
|
"num_tokens": 490705.0,
|
|
"reward": 0.6392979621887207,
|
|
"reward_std": 0.2728678584098816,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.044744670391082764,
|
|
"rewards/belief_accuracy/std": 0.11051066219806671,
|
|
"rewards/env_reward/mean": 0.4948546886444092,
|
|
"rewards/env_reward/std": 0.41126659512519836,
|
|
"rewards/format_valid/mean": 0.625,
|
|
"rewards/format_valid/std": 0.25,
|
|
"step": 198
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.5,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 17.0,
|
|
"completions/mean_length": 22.25,
|
|
"completions/mean_terminated_length": 12.5,
|
|
"completions/min_length": 8.0,
|
|
"completions/min_terminated_length": 8.0,
|
|
"epoch": 0.0995,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 2.2241978645324707,
|
|
"kl": 1.400051310658455,
|
|
"learning_rate": 3.355555555555556e-05,
|
|
"loss": 0.056,
|
|
"num_tokens": 493194.0,
|
|
"reward": 0.16355225443840027,
|
|
"reward_std": 0.2303662747144699,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": 0.29236820340156555,
|
|
"rewards/env_reward/std": 0.153577521443367,
|
|
"rewards/format_valid/mean": 0.5,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 199
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.5,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 30.0,
|
|
"completions/mean_length": 29.75,
|
|
"completions/mean_terminated_length": 27.5,
|
|
"completions/min_length": 25.0,
|
|
"completions/min_terminated_length": 25.0,
|
|
"epoch": 0.1,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 3.1241228580474854,
|
|
"kl": 0.7815838046371937,
|
|
"learning_rate": 3.3444444444444443e-05,
|
|
"loss": 0.0313,
|
|
"num_tokens": 495713.0,
|
|
"reward": -1.4204142093658447,
|
|
"reward_std": 2.6858582496643066,
|
|
"rewards/action_legal/mean": -0.25,
|
|
"rewards/action_legal/std": 0.5,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": -0.7344428300857544,
|
|
"rewards/env_reward/std": 1.739694356918335,
|
|
"rewards/format_valid/mean": -0.125,
|
|
"rewards/format_valid/std": 1.25,
|
|
"step": 200
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.25,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 15.0,
|
|
"completions/mean_length": 18.25,
|
|
"completions/mean_terminated_length": 13.666666984558105,
|
|
"completions/min_length": 11.0,
|
|
"completions/min_terminated_length": 11.0,
|
|
"epoch": 0.1005,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 3.880967617034912,
|
|
"kl": 1.6194000542163849,
|
|
"learning_rate": 3.3333333333333335e-05,
|
|
"loss": 0.0648,
|
|
"num_tokens": 498186.0,
|
|
"reward": -1.1433579921722412,
|
|
"reward_std": 2.5394091606140137,
|
|
"rewards/action_legal/mean": -0.25,
|
|
"rewards/action_legal/std": 0.5,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": -0.5497387051582336,
|
|
"rewards/env_reward/std": 1.6346454620361328,
|
|
"rewards/format_valid/mean": -0.125,
|
|
"rewards/format_valid/std": 1.25,
|
|
"step": 201
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 23.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 23.0,
|
|
"completions/max_terminated_length": 23.0,
|
|
"completions/mean_length": 15.5,
|
|
"completions/mean_terminated_length": 15.5,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.101,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 3.6157476902008057,
|
|
"kl": 1.4809669330716133,
|
|
"learning_rate": 3.322222222222222e-05,
|
|
"loss": 0.0592,
|
|
"num_tokens": 500648.0,
|
|
"reward": -0.7693363428115845,
|
|
"reward_std": 2.7953262329101562,
|
|
"rewards/action_legal/mean": -0.25,
|
|
"rewards/action_legal/std": 0.5,
|
|
"rewards/belief_accuracy/mean": -0.0716666653752327,
|
|
"rewards/belief_accuracy/std": 0.05666666850447655,
|
|
"rewards/env_reward/mean": -0.3612242341041565,
|
|
"rewards/env_reward/std": 1.7597646713256836,
|
|
"rewards/format_valid/mean": 0.0,
|
|
"rewards/format_valid/std": 1.3540064096450806,
|
|
"step": 202
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 8.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 8.0,
|
|
"completions/max_terminated_length": 8.0,
|
|
"completions/mean_length": 7.75,
|
|
"completions/mean_terminated_length": 7.75,
|
|
"completions/min_length": 7.0,
|
|
"completions/min_terminated_length": 7.0,
|
|
"epoch": 0.1015,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 2.5044844150543213,
|
|
"kl": 0.5702618137001991,
|
|
"learning_rate": 3.311111111111112e-05,
|
|
"loss": 0.0228,
|
|
"num_tokens": 503079.0,
|
|
"reward": -0.09709322452545166,
|
|
"reward_std": 0.09302432835102081,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.1680680364370346,
|
|
"rewards/belief_accuracy/std": 0.03100811131298542,
|
|
"rewards/env_reward/mean": 0.2380739152431488,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 203
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.75,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 24.0,
|
|
"completions/mean_length": 30.0,
|
|
"completions/mean_terminated_length": 24.0,
|
|
"completions/min_length": 24.0,
|
|
"completions/min_terminated_length": 24.0,
|
|
"epoch": 0.102,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 2.225451707839966,
|
|
"kl": 0.6446680650115013,
|
|
"learning_rate": 3.3e-05,
|
|
"loss": 0.0258,
|
|
"num_tokens": 505599.0,
|
|
"reward": -1.0992940664291382,
|
|
"reward_std": 2.570491075515747,
|
|
"rewards/action_legal/mean": -0.25,
|
|
"rewards/action_legal/std": 0.5,
|
|
"rewards/belief_accuracy/mean": -0.11666667461395264,
|
|
"rewards/belief_accuracy/std": 0.03333333507180214,
|
|
"rewards/env_reward/mean": -0.49119603633880615,
|
|
"rewards/env_reward/std": 1.673166036605835,
|
|
"rewards/format_valid/mean": 0.0,
|
|
"rewards/format_valid/std": 1.3540064096450806,
|
|
"step": 204
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.5,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 16.0,
|
|
"completions/mean_length": 23.5,
|
|
"completions/mean_terminated_length": 15.0,
|
|
"completions/min_length": 14.0,
|
|
"completions/min_terminated_length": 14.0,
|
|
"epoch": 0.1025,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 11.69345760345459,
|
|
"kl": 0.9406535923480988,
|
|
"learning_rate": 3.2888888888888894e-05,
|
|
"loss": 0.0376,
|
|
"num_tokens": 508093.0,
|
|
"reward": -1.0868068933486938,
|
|
"reward_std": 2.6226813793182373,
|
|
"rewards/action_legal/mean": -0.25,
|
|
"rewards/action_legal/std": 0.5,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": -0.5120379328727722,
|
|
"rewards/env_reward/std": 1.6912070512771606,
|
|
"rewards/format_valid/mean": -0.125,
|
|
"rewards/format_valid/std": 1.25,
|
|
"step": 205
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 8.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 8.0,
|
|
"completions/max_terminated_length": 8.0,
|
|
"completions/mean_length": 8.0,
|
|
"completions/mean_terminated_length": 8.0,
|
|
"completions/min_length": 8.0,
|
|
"completions/min_terminated_length": 8.0,
|
|
"epoch": 0.103,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 3.8141772747039795,
|
|
"kl": 0.38117800280451775,
|
|
"learning_rate": 3.277777777777778e-05,
|
|
"loss": 0.0152,
|
|
"num_tokens": 510301.0,
|
|
"reward": 0.5716937780380249,
|
|
"reward_std": 0.2175557017326355,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.0,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": 0.34779584407806396,
|
|
"rewards/env_reward/std": 0.14503712952136993,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 206
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.25,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 21.0,
|
|
"completions/mean_length": 21.75,
|
|
"completions/mean_terminated_length": 18.33333396911621,
|
|
"completions/min_length": 17.0,
|
|
"completions/min_terminated_length": 17.0,
|
|
"epoch": 0.1035,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 2.4618332386016846,
|
|
"kl": 1.3801769241690636,
|
|
"learning_rate": 3.266666666666667e-05,
|
|
"loss": 0.0552,
|
|
"num_tokens": 512788.0,
|
|
"reward": -1.348587989807129,
|
|
"reward_std": 2.476418972015381,
|
|
"rewards/action_legal/mean": -0.25,
|
|
"rewards/action_legal/std": 0.5,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": -0.6865587830543518,
|
|
"rewards/env_reward/std": 1.5944546461105347,
|
|
"rewards/format_valid/mean": -0.125,
|
|
"rewards/format_valid/std": 1.25,
|
|
"step": 207
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.25,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 16.0,
|
|
"completions/mean_length": 17.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 8.0,
|
|
"completions/min_terminated_length": 8.0,
|
|
"epoch": 0.104,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 7.705834865570068,
|
|
"kl": 1.0085995495319366,
|
|
"learning_rate": 3.2555555555555555e-05,
|
|
"loss": 0.0403,
|
|
"num_tokens": 515256.0,
|
|
"reward": -1.0385560989379883,
|
|
"reward_std": 2.6225454807281494,
|
|
"rewards/action_legal/mean": -0.25,
|
|
"rewards/action_legal/std": 0.5,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": -0.4798707962036133,
|
|
"rewards/env_reward/std": 1.6903735399246216,
|
|
"rewards/format_valid/mean": -0.125,
|
|
"rewards/format_valid/std": 1.25,
|
|
"step": 208
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.25,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 15.0,
|
|
"completions/mean_length": 16.5,
|
|
"completions/mean_terminated_length": 11.333333969116211,
|
|
"completions/min_length": 8.0,
|
|
"completions/min_terminated_length": 8.0,
|
|
"epoch": 0.1045,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 5.924132823944092,
|
|
"kl": 1.7609535232186317,
|
|
"learning_rate": 3.2444444444444446e-05,
|
|
"loss": 0.0704,
|
|
"num_tokens": 517722.0,
|
|
"reward": -1.3413997888565063,
|
|
"reward_std": 2.4143919944763184,
|
|
"rewards/action_legal/mean": -0.25,
|
|
"rewards/action_legal/std": 0.5,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": -0.6817665696144104,
|
|
"rewards/env_reward/std": 1.5514785051345825,
|
|
"rewards/format_valid/mean": -0.125,
|
|
"rewards/format_valid/std": 1.25,
|
|
"step": 209
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.5,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 19.0,
|
|
"completions/mean_length": 25.5,
|
|
"completions/mean_terminated_length": 19.0,
|
|
"completions/min_length": 19.0,
|
|
"completions/min_terminated_length": 19.0,
|
|
"epoch": 0.105,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 7.369988918304443,
|
|
"kl": 1.1772667318582535,
|
|
"learning_rate": 3.233333333333333e-05,
|
|
"loss": 0.0471,
|
|
"num_tokens": 520224.0,
|
|
"reward": 0.25493913888931274,
|
|
"reward_std": 0.33257579803466797,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": 0.3532927930355072,
|
|
"rewards/env_reward/std": 0.2217172235250473,
|
|
"rewards/format_valid/mean": 0.5,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 210
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.75,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 18.0,
|
|
"completions/mean_length": 28.5,
|
|
"completions/mean_terminated_length": 18.0,
|
|
"completions/min_length": 18.0,
|
|
"completions/min_terminated_length": 18.0,
|
|
"epoch": 0.1055,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 2.5913333892822266,
|
|
"kl": 0.7529645264148712,
|
|
"learning_rate": 3.222222222222223e-05,
|
|
"loss": 0.0301,
|
|
"num_tokens": 522738.0,
|
|
"reward": -1.3859096765518188,
|
|
"reward_std": 2.4013755321502686,
|
|
"rewards/action_legal/mean": -0.25,
|
|
"rewards/action_legal/std": 0.5,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": -0.7114397883415222,
|
|
"rewards/env_reward/std": 1.5432217121124268,
|
|
"rewards/format_valid/mean": -0.125,
|
|
"rewards/format_valid/std": 1.25,
|
|
"step": 211
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.5,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 21.0,
|
|
"completions/mean_length": 24.5,
|
|
"completions/mean_terminated_length": 17.0,
|
|
"completions/min_length": 13.0,
|
|
"completions/min_terminated_length": 13.0,
|
|
"epoch": 0.106,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 4.363038539886475,
|
|
"kl": 1.2446223124861717,
|
|
"learning_rate": 3.2111111111111114e-05,
|
|
"loss": 0.0498,
|
|
"num_tokens": 525236.0,
|
|
"reward": -2.56288743019104,
|
|
"reward_std": 2.7684178352355957,
|
|
"rewards/action_legal/mean": -0.5,
|
|
"rewards/action_legal/std": 0.5773502588272095,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": -1.466925024986267,
|
|
"rewards/env_reward/std": 1.7785577774047852,
|
|
"rewards/format_valid/mean": -0.75,
|
|
"rewards/format_valid/std": 1.4433757066726685,
|
|
"step": 212
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 14.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 14.0,
|
|
"completions/max_terminated_length": 14.0,
|
|
"completions/mean_length": 9.5,
|
|
"completions/mean_terminated_length": 9.5,
|
|
"completions/min_length": 8.0,
|
|
"completions/min_terminated_length": 8.0,
|
|
"epoch": 0.1065,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 4.272139549255371,
|
|
"kl": 1.5829559713602066,
|
|
"learning_rate": 3.2000000000000005e-05,
|
|
"loss": 0.0633,
|
|
"num_tokens": 527450.0,
|
|
"reward": 1.3202344179153442,
|
|
"reward_std": 0.7838823199272156,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.0,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": 0.84682297706604,
|
|
"rewards/env_reward/std": 0.5225882530212402,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 213
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.25,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 21.0,
|
|
"completions/mean_length": 19.75,
|
|
"completions/mean_terminated_length": 15.666666984558105,
|
|
"completions/min_length": 13.0,
|
|
"completions/min_terminated_length": 13.0,
|
|
"epoch": 0.107,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 5.658717632293701,
|
|
"kl": 1.2298424392938614,
|
|
"learning_rate": 3.188888888888889e-05,
|
|
"loss": 0.0492,
|
|
"num_tokens": 529929.0,
|
|
"reward": -1.5011694431304932,
|
|
"reward_std": 2.299220323562622,
|
|
"rewards/action_legal/mean": -0.25,
|
|
"rewards/action_legal/std": 0.5,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": -0.78827965259552,
|
|
"rewards/env_reward/std": 1.474480390548706,
|
|
"rewards/format_valid/mean": -0.125,
|
|
"rewards/format_valid/std": 1.25,
|
|
"step": 214
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.5,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 27.0,
|
|
"completions/mean_length": 26.25,
|
|
"completions/mean_terminated_length": 20.5,
|
|
"completions/min_length": 14.0,
|
|
"completions/min_terminated_length": 14.0,
|
|
"epoch": 0.1075,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 3.059485912322998,
|
|
"kl": 0.9900188595056534,
|
|
"learning_rate": 3.177777777777778e-05,
|
|
"loss": 0.0396,
|
|
"num_tokens": 532434.0,
|
|
"reward": -2.2220005989074707,
|
|
"reward_std": 3.1529808044433594,
|
|
"rewards/action_legal/mean": -0.5,
|
|
"rewards/action_legal/std": 0.5773502588272095,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": -1.2396671772003174,
|
|
"rewards/env_reward/std": 2.0346951484680176,
|
|
"rewards/format_valid/mean": -0.75,
|
|
"rewards/format_valid/std": 1.4433757066726685,
|
|
"step": 215
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.75,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 16.0,
|
|
"completions/mean_length": 28.0,
|
|
"completions/mean_terminated_length": 16.0,
|
|
"completions/min_length": 16.0,
|
|
"completions/min_terminated_length": 16.0,
|
|
"epoch": 0.108,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 5.903228282928467,
|
|
"kl": 0.9180602729320526,
|
|
"learning_rate": 3.1666666666666666e-05,
|
|
"loss": 0.0367,
|
|
"num_tokens": 534946.0,
|
|
"reward": 0.06937577575445175,
|
|
"reward_std": 0.3579734265804291,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.05766364932060242,
|
|
"rewards/belief_accuracy/std": 0.084672711789608,
|
|
"rewards/env_reward/mean": 0.14074449241161346,
|
|
"rewards/env_reward/std": 0.25905489921569824,
|
|
"rewards/format_valid/mean": 0.625,
|
|
"rewards/format_valid/std": 0.25,
|
|
"step": 216
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.25,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 19.0,
|
|
"completions/mean_length": 20.5,
|
|
"completions/mean_terminated_length": 16.666667938232422,
|
|
"completions/min_length": 15.0,
|
|
"completions/min_terminated_length": 15.0,
|
|
"epoch": 0.1085,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 7.5056562423706055,
|
|
"kl": 1.428985133767128,
|
|
"learning_rate": 3.155555555555556e-05,
|
|
"loss": 0.0572,
|
|
"num_tokens": 537428.0,
|
|
"reward": -0.04300477355718613,
|
|
"reward_std": 0.1483583301305771,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": 0.15466348826885223,
|
|
"rewards/env_reward/std": 0.09890555590391159,
|
|
"rewards/format_valid/mean": 0.5,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 217
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.75,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 24.0,
|
|
"completions/mean_length": 30.0,
|
|
"completions/mean_terminated_length": 24.0,
|
|
"completions/min_length": 24.0,
|
|
"completions/min_terminated_length": 24.0,
|
|
"epoch": 0.109,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 2.7241830825805664,
|
|
"kl": 0.9578761979937553,
|
|
"learning_rate": 3.144444444444445e-05,
|
|
"loss": 0.0383,
|
|
"num_tokens": 539948.0,
|
|
"reward": -1.9331963062286377,
|
|
"reward_std": 2.0602900981903076,
|
|
"rewards/action_legal/mean": -0.25,
|
|
"rewards/action_legal/std": 0.5,
|
|
"rewards/belief_accuracy/mean": -0.0716666653752327,
|
|
"rewards/belief_accuracy/std": 0.05666666850447655,
|
|
"rewards/env_reward/mean": -1.137130856513977,
|
|
"rewards/env_reward/std": 1.2618883848190308,
|
|
"rewards/format_valid/mean": 0.0,
|
|
"rewards/format_valid/std": 1.3540064096450806,
|
|
"step": 218
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.5,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 32.0,
|
|
"completions/mean_length": 30.0,
|
|
"completions/mean_terminated_length": 28.0,
|
|
"completions/min_length": 24.0,
|
|
"completions/min_terminated_length": 24.0,
|
|
"epoch": 0.1095,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 4.32493782043457,
|
|
"kl": 0.58867571875453,
|
|
"learning_rate": 3.1333333333333334e-05,
|
|
"loss": 0.0235,
|
|
"num_tokens": 542468.0,
|
|
"reward": -1.2151740789413452,
|
|
"reward_std": 2.491729736328125,
|
|
"rewards/action_legal/mean": -0.25,
|
|
"rewards/action_legal/std": 0.5,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": -0.5976160764694214,
|
|
"rewards/env_reward/std": 1.6028647422790527,
|
|
"rewards/format_valid/mean": -0.125,
|
|
"rewards/format_valid/std": 1.25,
|
|
"step": 219
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 8.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 8.0,
|
|
"completions/max_terminated_length": 8.0,
|
|
"completions/mean_length": 8.0,
|
|
"completions/mean_terminated_length": 8.0,
|
|
"completions/min_length": 8.0,
|
|
"completions/min_terminated_length": 8.0,
|
|
"epoch": 0.11,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 1.791263461112976,
|
|
"kl": 0.3568975552916527,
|
|
"learning_rate": 3.1222222222222225e-05,
|
|
"loss": 0.0143,
|
|
"num_tokens": 544900.0,
|
|
"reward": 0.03496697545051575,
|
|
"reward_std": 0.06415002793073654,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.17913591861724854,
|
|
"rewards/belief_accuracy/std": 0.021383339539170265,
|
|
"rewards/env_reward/mean": 0.3482498526573181,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 220
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.75,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 18.0,
|
|
"completions/mean_length": 28.5,
|
|
"completions/mean_terminated_length": 18.0,
|
|
"completions/min_length": 18.0,
|
|
"completions/min_terminated_length": 18.0,
|
|
"epoch": 0.1105,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 2.383763313293457,
|
|
"kl": 0.8193067982792854,
|
|
"learning_rate": 3.111111111111111e-05,
|
|
"loss": 0.0328,
|
|
"num_tokens": 547414.0,
|
|
"reward": 1.203812599182129,
|
|
"reward_std": 0.6176812648773193,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": 0.9858750700950623,
|
|
"rewards/env_reward/std": 0.41178756952285767,
|
|
"rewards/format_valid/mean": 0.5,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 221
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.75,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 13.0,
|
|
"completions/mean_length": 27.25,
|
|
"completions/mean_terminated_length": 13.0,
|
|
"completions/min_length": 13.0,
|
|
"completions/min_terminated_length": 13.0,
|
|
"epoch": 0.111,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 2.43298602104187,
|
|
"kl": 1.0093542635440826,
|
|
"learning_rate": 3.1e-05,
|
|
"loss": 0.0404,
|
|
"num_tokens": 549923.0,
|
|
"reward": -0.0007572025060653687,
|
|
"reward_std": 0.016494423151016235,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.10740740597248077,
|
|
"rewards/belief_accuracy/std": 0.014814812690019608,
|
|
"rewards/env_reward/mean": 0.19347669184207916,
|
|
"rewards/env_reward/std": 0.010300002992153168,
|
|
"rewards/format_valid/mean": 0.625,
|
|
"rewards/format_valid/std": 0.25,
|
|
"step": 222
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.75,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 20.0,
|
|
"completions/mean_length": 29.0,
|
|
"completions/mean_terminated_length": 20.0,
|
|
"completions/min_length": 20.0,
|
|
"completions/min_terminated_length": 20.0,
|
|
"epoch": 0.1115,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 6.343346118927002,
|
|
"kl": 0.7701031491160393,
|
|
"learning_rate": 3.088888888888889e-05,
|
|
"loss": 0.0308,
|
|
"num_tokens": 552439.0,
|
|
"reward": -1.5342886447906494,
|
|
"reward_std": 2.2832674980163574,
|
|
"rewards/action_legal/mean": -0.25,
|
|
"rewards/action_legal/std": 0.5,
|
|
"rewards/belief_accuracy/mean": -0.08815178275108337,
|
|
"rewards/belief_accuracy/std": 0.023696430027484894,
|
|
"rewards/env_reward/mean": -0.8382222652435303,
|
|
"rewards/env_reward/std": 1.4423881769180298,
|
|
"rewards/format_valid/mean": 0.0,
|
|
"rewards/format_valid/std": 1.3540064096450806,
|
|
"step": 223
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.75,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 13.0,
|
|
"completions/mean_length": 27.25,
|
|
"completions/mean_terminated_length": 13.0,
|
|
"completions/min_length": 13.0,
|
|
"completions/min_terminated_length": 13.0,
|
|
"epoch": 0.112,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 3.0897319316864014,
|
|
"kl": 0.8843832314014435,
|
|
"learning_rate": 3.077777777777778e-05,
|
|
"loss": 0.0354,
|
|
"num_tokens": 554948.0,
|
|
"reward": 0.1540832221508026,
|
|
"reward_std": 0.3211406171321869,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.08613713085651398,
|
|
"rewards/belief_accuracy/std": 0.02772573195397854,
|
|
"rewards/env_reward/mean": 0.25416308641433716,
|
|
"rewards/env_reward/std": 0.18371644616127014,
|
|
"rewards/format_valid/mean": 0.625,
|
|
"rewards/format_valid/std": 0.25,
|
|
"step": 224
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.75,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 27.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 0.1125,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 14.237890243530273,
|
|
"kl": 0.8649509251117706,
|
|
"learning_rate": 3.066666666666667e-05,
|
|
"loss": 0.0346,
|
|
"num_tokens": 557456.0,
|
|
"reward": 1.5224132537841797,
|
|
"reward_std": 1.171297311782837,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.0333574041724205,
|
|
"rewards/belief_accuracy/std": 0.13328517973423004,
|
|
"rewards/env_reward/mean": 1.060823678970337,
|
|
"rewards/env_reward/std": 0.900728166103363,
|
|
"rewards/format_valid/mean": 0.625,
|
|
"rewards/format_valid/std": 0.25,
|
|
"step": 225
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.5,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 32.0,
|
|
"completions/mean_length": 29.25,
|
|
"completions/mean_terminated_length": 26.5,
|
|
"completions/min_length": 21.0,
|
|
"completions/min_terminated_length": 21.0,
|
|
"epoch": 0.113,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 2.398747205734253,
|
|
"kl": 0.5030911080539227,
|
|
"learning_rate": 3.055555555555556e-05,
|
|
"loss": 0.0201,
|
|
"num_tokens": 559973.0,
|
|
"reward": -0.10044729709625244,
|
|
"reward_std": 0.14265108108520508,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": 0.11636848002672195,
|
|
"rewards/env_reward/std": 0.09510072320699692,
|
|
"rewards/format_valid/mean": 0.5,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 226
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 1.0,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 0.0,
|
|
"completions/mean_length": 32.0,
|
|
"completions/mean_terminated_length": 0.0,
|
|
"completions/min_length": 32.0,
|
|
"completions/min_terminated_length": 0.0,
|
|
"epoch": 0.1135,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 3.1962196826934814,
|
|
"kl": 0.7515930682420731,
|
|
"learning_rate": 3.044444444444445e-05,
|
|
"loss": 0.0301,
|
|
"num_tokens": 562501.0,
|
|
"reward": 0.33509939908981323,
|
|
"reward_std": 0.4751393795013428,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": 0.406732976436615,
|
|
"rewards/env_reward/std": 0.3167595863342285,
|
|
"rewards/format_valid/mean": 0.5,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 227
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.75,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 21.0,
|
|
"completions/mean_length": 29.25,
|
|
"completions/mean_terminated_length": 21.0,
|
|
"completions/min_length": 21.0,
|
|
"completions/min_terminated_length": 21.0,
|
|
"epoch": 0.114,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 3.724390745162964,
|
|
"kl": 1.3262446075677872,
|
|
"learning_rate": 3.0333333333333337e-05,
|
|
"loss": 0.053,
|
|
"num_tokens": 565018.0,
|
|
"reward": -2.383183717727661,
|
|
"reward_std": 2.9639039039611816,
|
|
"rewards/action_legal/mean": -0.5,
|
|
"rewards/action_legal/std": 0.5773502588272095,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": -1.3471225500106812,
|
|
"rewards/env_reward/std": 1.9085785150527954,
|
|
"rewards/format_valid/mean": -0.75,
|
|
"rewards/format_valid/std": 1.4433757066726685,
|
|
"step": 228
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.5,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 17.0,
|
|
"completions/mean_length": 23.75,
|
|
"completions/mean_terminated_length": 15.5,
|
|
"completions/min_length": 14.0,
|
|
"completions/min_terminated_length": 14.0,
|
|
"epoch": 0.1145,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 3.0508358478546143,
|
|
"kl": 0.805017001926899,
|
|
"learning_rate": 3.0222222222222225e-05,
|
|
"loss": 0.0322,
|
|
"num_tokens": 567513.0,
|
|
"reward": -1.0678391456604004,
|
|
"reward_std": 2.5887842178344727,
|
|
"rewards/action_legal/mean": -0.25,
|
|
"rewards/action_legal/std": 0.5,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": -0.4993927776813507,
|
|
"rewards/env_reward/std": 1.6675386428833008,
|
|
"rewards/format_valid/mean": -0.125,
|
|
"rewards/format_valid/std": 1.25,
|
|
"step": 229
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.5,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 18.0,
|
|
"completions/mean_length": 24.75,
|
|
"completions/mean_terminated_length": 17.5,
|
|
"completions/min_length": 17.0,
|
|
"completions/min_terminated_length": 17.0,
|
|
"epoch": 0.115,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 2.8914108276367188,
|
|
"kl": 1.2341727763414383,
|
|
"learning_rate": 3.0111111111111113e-05,
|
|
"loss": 0.0494,
|
|
"num_tokens": 570012.0,
|
|
"reward": -1.0324312448501587,
|
|
"reward_std": 0.03943846374750137,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": -0.5049540996551514,
|
|
"rewards/env_reward/std": 0.026292279362678528,
|
|
"rewards/format_valid/mean": 0.5,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 230
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.75,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 13.0,
|
|
"completions/mean_length": 27.25,
|
|
"completions/mean_terminated_length": 13.0,
|
|
"completions/min_length": 13.0,
|
|
"completions/min_terminated_length": 13.0,
|
|
"epoch": 0.1155,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 3.649041175842285,
|
|
"kl": 0.9338645786046982,
|
|
"learning_rate": 3e-05,
|
|
"loss": 0.0374,
|
|
"num_tokens": 572521.0,
|
|
"reward": -2.383704423904419,
|
|
"reward_std": 2.9854514598846436,
|
|
"rewards/action_legal/mean": -0.5,
|
|
"rewards/action_legal/std": 0.5773502588272095,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": -1.347469687461853,
|
|
"rewards/env_reward/std": 1.9234607219696045,
|
|
"rewards/format_valid/mean": -0.75,
|
|
"rewards/format_valid/std": 1.4433757066726685,
|
|
"step": 231
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.75,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 11.0,
|
|
"completions/mean_length": 26.75,
|
|
"completions/mean_terminated_length": 11.0,
|
|
"completions/min_length": 11.0,
|
|
"completions/min_terminated_length": 11.0,
|
|
"epoch": 0.116,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 5.132113933563232,
|
|
"kl": 1.6651656776666641,
|
|
"learning_rate": 2.988888888888889e-05,
|
|
"loss": 0.0666,
|
|
"num_tokens": 575028.0,
|
|
"reward": -0.17734336853027344,
|
|
"reward_std": 0.43010595440864563,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.11666667461395264,
|
|
"rewards/belief_accuracy/std": 0.03333333507180214,
|
|
"rewards/env_reward/mean": 0.09427107125520706,
|
|
"rewards/env_reward/std": 0.236448734998703,
|
|
"rewards/format_valid/mean": 0.625,
|
|
"rewards/format_valid/std": 0.25,
|
|
"step": 232
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.5,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 15.0,
|
|
"completions/mean_length": 22.5,
|
|
"completions/mean_terminated_length": 13.0,
|
|
"completions/min_length": 11.0,
|
|
"completions/min_terminated_length": 11.0,
|
|
"epoch": 0.1165,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 2.701383590698242,
|
|
"kl": 1.4364068657159805,
|
|
"learning_rate": 2.9777777777777777e-05,
|
|
"loss": 0.0575,
|
|
"num_tokens": 577518.0,
|
|
"reward": 0.2805197834968567,
|
|
"reward_std": 0.16961893439292908,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": 0.3703465461730957,
|
|
"rewards/env_reward/std": 0.11307929456233978,
|
|
"rewards/format_valid/mean": 0.5,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 233
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 8.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 8.0,
|
|
"completions/max_terminated_length": 8.0,
|
|
"completions/mean_length": 8.0,
|
|
"completions/mean_terminated_length": 8.0,
|
|
"completions/min_length": 8.0,
|
|
"completions/min_terminated_length": 8.0,
|
|
"epoch": 0.117,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 4.781160354614258,
|
|
"kl": 0.3255625441670418,
|
|
"learning_rate": 2.9666666666666672e-05,
|
|
"loss": 0.013,
|
|
"num_tokens": 579950.0,
|
|
"reward": 0.2083221822977066,
|
|
"reward_std": 0.367115318775177,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.15008686482906342,
|
|
"rewards/belief_accuracy/std": 0.12237177044153214,
|
|
"rewards/env_reward/mean": 0.40572187304496765,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 234
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.75,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 22.0,
|
|
"completions/mean_length": 29.5,
|
|
"completions/mean_terminated_length": 22.0,
|
|
"completions/min_length": 22.0,
|
|
"completions/min_terminated_length": 22.0,
|
|
"epoch": 0.1175,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 3.328470468521118,
|
|
"kl": 1.0922381281852722,
|
|
"learning_rate": 2.955555555555556e-05,
|
|
"loss": 0.0437,
|
|
"num_tokens": 582468.0,
|
|
"reward": 0.41766709089279175,
|
|
"reward_std": 0.20472979545593262,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": 0.46177807450294495,
|
|
"rewards/env_reward/std": 0.13648654520511627,
|
|
"rewards/format_valid/mean": 0.5,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 235
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.25,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 25.0,
|
|
"completions/mean_length": 20.75,
|
|
"completions/mean_terminated_length": 17.0,
|
|
"completions/min_length": 13.0,
|
|
"completions/min_terminated_length": 13.0,
|
|
"epoch": 0.118,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 3.3024332523345947,
|
|
"kl": 1.729993849992752,
|
|
"learning_rate": 2.9444444444444448e-05,
|
|
"loss": 0.0692,
|
|
"num_tokens": 584951.0,
|
|
"reward": 0.5654071569442749,
|
|
"reward_std": 0.20379649102687836,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.13415177166461945,
|
|
"rewards/belief_accuracy/std": 0.06830354034900665,
|
|
"rewards/env_reward/mean": 0.6244083642959595,
|
|
"rewards/env_reward/std": 0.07622048258781433,
|
|
"rewards/format_valid/mean": 0.625,
|
|
"rewards/format_valid/std": 0.25,
|
|
"step": 236
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 32.0,
|
|
"completions/mean_length": 20.5,
|
|
"completions/mean_terminated_length": 20.5,
|
|
"completions/min_length": 16.0,
|
|
"completions/min_terminated_length": 16.0,
|
|
"epoch": 0.1185,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 3.8457822799682617,
|
|
"kl": 2.1087397560477257,
|
|
"learning_rate": 2.9333333333333336e-05,
|
|
"loss": 0.0843,
|
|
"num_tokens": 587433.0,
|
|
"reward": 1.2190449237823486,
|
|
"reward_std": 0.21189068257808685,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": 0.9960300326347351,
|
|
"rewards/env_reward/std": 0.1412605196237564,
|
|
"rewards/format_valid/mean": 0.5,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 237
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 8.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 8.0,
|
|
"completions/max_terminated_length": 8.0,
|
|
"completions/mean_length": 8.0,
|
|
"completions/mean_terminated_length": 8.0,
|
|
"completions/min_length": 8.0,
|
|
"completions/min_terminated_length": 8.0,
|
|
"epoch": 0.119,
|
|
"frac_reward_zero_std": 1.0,
|
|
"grad_norm": 0.029239589348435402,
|
|
"kl": 0.5205878019332886,
|
|
"learning_rate": 2.9222222222222224e-05,
|
|
"loss": 0.0208,
|
|
"num_tokens": 589641.0,
|
|
"reward": 0.6929494738578796,
|
|
"reward_std": 0.0,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.0,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": 0.4286329746246338,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 238
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 29.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 29.0,
|
|
"completions/max_terminated_length": 29.0,
|
|
"completions/mean_length": 18.75,
|
|
"completions/mean_terminated_length": 18.75,
|
|
"completions/min_length": 14.0,
|
|
"completions/min_terminated_length": 14.0,
|
|
"epoch": 0.1195,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 5.953148365020752,
|
|
"kl": 1.5010789930820465,
|
|
"learning_rate": 2.9111111111111112e-05,
|
|
"loss": 0.06,
|
|
"num_tokens": 592116.0,
|
|
"reward": -0.19302129745483398,
|
|
"reward_std": 0.11821135133504868,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": 0.05465248227119446,
|
|
"rewards/env_reward/std": 0.07880757749080658,
|
|
"rewards/format_valid/mean": 0.5,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 239
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 8.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 8.0,
|
|
"completions/max_terminated_length": 8.0,
|
|
"completions/mean_length": 8.0,
|
|
"completions/mean_terminated_length": 8.0,
|
|
"completions/min_length": 8.0,
|
|
"completions/min_terminated_length": 8.0,
|
|
"epoch": 0.12,
|
|
"frac_reward_zero_std": 1.0,
|
|
"grad_norm": 0.028769580647349358,
|
|
"kl": 0.5208476185798645,
|
|
"learning_rate": 2.9e-05,
|
|
"loss": 0.0208,
|
|
"num_tokens": 594324.0,
|
|
"reward": 0.7799785137176514,
|
|
"reward_std": 0.0,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.0,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": 0.48665234446525574,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 240
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.5,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 13.0,
|
|
"completions/mean_length": 22.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 11.0,
|
|
"completions/min_terminated_length": 11.0,
|
|
"epoch": 0.1205,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 3.084097146987915,
|
|
"kl": 1.6838389933109283,
|
|
"learning_rate": 2.8888888888888888e-05,
|
|
"loss": 0.0674,
|
|
"num_tokens": 596812.0,
|
|
"reward": -0.08287781476974487,
|
|
"reward_std": 3.244748115539551,
|
|
"rewards/action_legal/mean": -0.25,
|
|
"rewards/action_legal/std": 0.5,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": 0.15724816918373108,
|
|
"rewards/env_reward/std": 2.104832172393799,
|
|
"rewards/format_valid/mean": -0.125,
|
|
"rewards/format_valid/std": 1.25,
|
|
"step": 241
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.25,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 15.0,
|
|
"completions/mean_length": 17.25,
|
|
"completions/mean_terminated_length": 12.333333969116211,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.121,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 2.273799180984497,
|
|
"kl": 1.8537000715732574,
|
|
"learning_rate": 2.877777777777778e-05,
|
|
"loss": 0.0741,
|
|
"num_tokens": 599281.0,
|
|
"reward": 0.5991692543029785,
|
|
"reward_std": 0.3846488893032074,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": 0.5827795267105103,
|
|
"rewards/env_reward/std": 0.25643259286880493,
|
|
"rewards/format_valid/mean": 0.5,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 242
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.75,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 25.0,
|
|
"completions/mean_length": 30.25,
|
|
"completions/mean_terminated_length": 25.0,
|
|
"completions/min_length": 25.0,
|
|
"completions/min_terminated_length": 25.0,
|
|
"epoch": 0.1215,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 6.259014129638672,
|
|
"kl": 1.056531861424446,
|
|
"learning_rate": 2.8666666666666668e-05,
|
|
"loss": 0.0423,
|
|
"num_tokens": 601802.0,
|
|
"reward": 1.1337945461273193,
|
|
"reward_std": 0.15877185761928558,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.11666667461395264,
|
|
"rewards/belief_accuracy/std": 0.03333333507180214,
|
|
"rewards/env_reward/mean": 0.9683631062507629,
|
|
"rewards/env_reward/std": 0.1437581330537796,
|
|
"rewards/format_valid/mean": 0.625,
|
|
"rewards/format_valid/std": 0.25,
|
|
"step": 243
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.75,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 32.0,
|
|
"completions/mean_length": 32.0,
|
|
"completions/mean_terminated_length": 32.0,
|
|
"completions/min_length": 32.0,
|
|
"completions/min_terminated_length": 32.0,
|
|
"epoch": 0.122,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 3.7003579139709473,
|
|
"kl": 0.6380213499069214,
|
|
"learning_rate": 2.855555555555556e-05,
|
|
"loss": 0.0255,
|
|
"num_tokens": 604330.0,
|
|
"reward": 0.300573468208313,
|
|
"reward_std": 0.29817959666252136,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": 0.3837156891822815,
|
|
"rewards/env_reward/std": 0.1987864077091217,
|
|
"rewards/format_valid/mean": 0.5,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 244
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.75,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 14.0,
|
|
"completions/mean_length": 27.5,
|
|
"completions/mean_terminated_length": 14.0,
|
|
"completions/min_length": 14.0,
|
|
"completions/min_terminated_length": 14.0,
|
|
"epoch": 0.1225,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 6.614965438842773,
|
|
"kl": 1.0555044412612915,
|
|
"learning_rate": 2.8444444444444447e-05,
|
|
"loss": 0.0422,
|
|
"num_tokens": 606840.0,
|
|
"reward": 0.2127276510000229,
|
|
"reward_std": 0.07096138596534729,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": 0.32515180110931396,
|
|
"rewards/env_reward/std": 0.04730759561061859,
|
|
"rewards/format_valid/mean": 0.5,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 245
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 24.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 24.0,
|
|
"completions/max_terminated_length": 24.0,
|
|
"completions/mean_length": 18.5,
|
|
"completions/mean_terminated_length": 18.5,
|
|
"completions/min_length": 15.0,
|
|
"completions/min_terminated_length": 15.0,
|
|
"epoch": 0.123,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 4.199899673461914,
|
|
"kl": 1.0866071283817291,
|
|
"learning_rate": 2.8333333333333335e-05,
|
|
"loss": 0.0435,
|
|
"num_tokens": 609314.0,
|
|
"reward": 0.6166397929191589,
|
|
"reward_std": 0.013659524731338024,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": 0.5944265127182007,
|
|
"rewards/env_reward/std": 0.009106338024139404,
|
|
"rewards/format_valid/mean": 0.5,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 246
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.25,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 24.0,
|
|
"completions/mean_length": 21.0,
|
|
"completions/mean_terminated_length": 17.33333396911621,
|
|
"completions/min_length": 11.0,
|
|
"completions/min_terminated_length": 11.0,
|
|
"epoch": 0.1235,
|
|
"frac_reward_zero_std": 1.0,
|
|
"grad_norm": 0.5806828737258911,
|
|
"kl": 2.1766858994960785,
|
|
"learning_rate": 2.8222222222222223e-05,
|
|
"loss": 0.0871,
|
|
"num_tokens": 611798.0,
|
|
"reward": -0.33692148327827454,
|
|
"reward_std": 0.0,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": -0.04128097742795944,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 0.5,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 247
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.5,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 9.0,
|
|
"completions/mean_length": 20.25,
|
|
"completions/mean_terminated_length": 8.5,
|
|
"completions/min_length": 8.0,
|
|
"completions/min_terminated_length": 8.0,
|
|
"epoch": 0.124,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 2.1608145236968994,
|
|
"kl": 0.6925233453512192,
|
|
"learning_rate": 2.811111111111111e-05,
|
|
"loss": 0.0277,
|
|
"num_tokens": 614279.0,
|
|
"reward": -0.17361339926719666,
|
|
"reward_std": 0.17766423523426056,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": 0.06759107857942581,
|
|
"rewards/env_reward/std": 0.1184428334236145,
|
|
"rewards/format_valid/mean": 0.5,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 248
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 9.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 9.0,
|
|
"completions/max_terminated_length": 9.0,
|
|
"completions/mean_length": 8.5,
|
|
"completions/mean_terminated_length": 8.5,
|
|
"completions/min_length": 8.0,
|
|
"completions/min_terminated_length": 8.0,
|
|
"epoch": 0.1245,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 4.571046352386475,
|
|
"kl": 1.0061021000146866,
|
|
"learning_rate": 2.8000000000000003e-05,
|
|
"loss": 0.0402,
|
|
"num_tokens": 616713.0,
|
|
"reward": 0.13527683913707733,
|
|
"reward_std": 0.11952438950538635,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.1666666716337204,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": 0.39018458127975464,
|
|
"rewards/env_reward/std": 0.07968293130397797,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 249
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 15.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 15.0,
|
|
"completions/max_terminated_length": 15.0,
|
|
"completions/mean_length": 11.75,
|
|
"completions/mean_terminated_length": 11.75,
|
|
"completions/min_length": 9.0,
|
|
"completions/min_terminated_length": 9.0,
|
|
"epoch": 0.125,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 8.622580528259277,
|
|
"kl": 3.023313194513321,
|
|
"learning_rate": 2.788888888888889e-05,
|
|
"loss": 0.1209,
|
|
"num_tokens": 619160.0,
|
|
"reward": 1.0675362348556519,
|
|
"reward_std": 0.21132755279541016,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": 0.8950241804122925,
|
|
"rewards/env_reward/std": 0.14088504016399384,
|
|
"rewards/format_valid/mean": 0.5,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 250
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.75,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 27.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 0.1255,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 3.349266767501831,
|
|
"kl": 1.0463040620088577,
|
|
"learning_rate": 2.777777777777778e-05,
|
|
"loss": 0.0419,
|
|
"num_tokens": 621668.0,
|
|
"reward": 0.7296957969665527,
|
|
"reward_std": 0.5778632760047913,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": 0.6697972416877747,
|
|
"rewards/env_reward/std": 0.385242223739624,
|
|
"rewards/format_valid/mean": 0.5,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 251
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.5,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 31.0,
|
|
"completions/mean_length": 28.0,
|
|
"completions/mean_terminated_length": 24.0,
|
|
"completions/min_length": 17.0,
|
|
"completions/min_terminated_length": 17.0,
|
|
"epoch": 0.126,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 2.983954668045044,
|
|
"kl": 1.3255593031644821,
|
|
"learning_rate": 2.7666666666666667e-05,
|
|
"loss": 0.053,
|
|
"num_tokens": 624180.0,
|
|
"reward": 0.7902753949165344,
|
|
"reward_std": 0.17459960281848907,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": 0.7101836204528809,
|
|
"rewards/env_reward/std": 0.11639970541000366,
|
|
"rewards/format_valid/mean": 0.5,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 252
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.5,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 20.0,
|
|
"completions/mean_length": 23.75,
|
|
"completions/mean_terminated_length": 15.5,
|
|
"completions/min_length": 11.0,
|
|
"completions/min_terminated_length": 11.0,
|
|
"epoch": 0.1265,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 4.1040568351745605,
|
|
"kl": 1.1374231353402138,
|
|
"learning_rate": 2.7555555555555555e-05,
|
|
"loss": 0.0455,
|
|
"num_tokens": 626675.0,
|
|
"reward": 1.2806947231292725,
|
|
"reward_std": 0.3734249770641327,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": 1.0371298789978027,
|
|
"rewards/env_reward/std": 0.24895000457763672,
|
|
"rewards/format_valid/mean": 0.5,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 253
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.5,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 21.0,
|
|
"completions/mean_length": 24.0,
|
|
"completions/mean_terminated_length": 16.0,
|
|
"completions/min_length": 11.0,
|
|
"completions/min_terminated_length": 11.0,
|
|
"epoch": 0.127,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 5.15152645111084,
|
|
"kl": 1.8364887535572052,
|
|
"learning_rate": 2.7444444444444443e-05,
|
|
"loss": 0.0735,
|
|
"num_tokens": 629171.0,
|
|
"reward": -1.082712173461914,
|
|
"reward_std": 2.5781917572021484,
|
|
"rewards/action_legal/mean": -0.25,
|
|
"rewards/action_legal/std": 0.5,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": -0.5093082189559937,
|
|
"rewards/env_reward/std": 1.660461187362671,
|
|
"rewards/format_valid/mean": -0.125,
|
|
"rewards/format_valid/std": 1.25,
|
|
"step": 254
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.25,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 31.0,
|
|
"completions/mean_length": 25.25,
|
|
"completions/mean_terminated_length": 23.0,
|
|
"completions/min_length": 13.0,
|
|
"completions/min_terminated_length": 13.0,
|
|
"epoch": 0.1275,
|
|
"frac_reward_zero_std": 1.0,
|
|
"grad_norm": 0.1334175318479538,
|
|
"kl": 1.0996833890676498,
|
|
"learning_rate": 2.733333333333333e-05,
|
|
"loss": 0.044,
|
|
"num_tokens": 631672.0,
|
|
"reward": 1.756896734237671,
|
|
"reward_std": 0.0,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": 1.354597806930542,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 0.5,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 255
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.25,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 24.0,
|
|
"completions/mean_length": 18.5,
|
|
"completions/mean_terminated_length": 14.0,
|
|
"completions/min_length": 7.0,
|
|
"completions/min_terminated_length": 7.0,
|
|
"epoch": 0.128,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 2.445394515991211,
|
|
"kl": 1.3922849670052528,
|
|
"learning_rate": 2.7222222222222223e-05,
|
|
"loss": 0.0557,
|
|
"num_tokens": 634146.0,
|
|
"reward": 0.24904996156692505,
|
|
"reward_std": 1.0076991319656372,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": 0.3493666648864746,
|
|
"rewards/env_reward/std": 0.6717994809150696,
|
|
"rewards/format_valid/mean": 0.5,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 256
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 21.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 21.0,
|
|
"completions/max_terminated_length": 21.0,
|
|
"completions/mean_length": 14.0,
|
|
"completions/mean_terminated_length": 14.0,
|
|
"completions/min_length": 8.0,
|
|
"completions/min_terminated_length": 8.0,
|
|
"epoch": 0.1285,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 5.241879463195801,
|
|
"kl": 1.8242901861667633,
|
|
"learning_rate": 2.7111111111111114e-05,
|
|
"loss": 0.073,
|
|
"num_tokens": 636602.0,
|
|
"reward": -0.19289076328277588,
|
|
"reward_std": 3.193817377090454,
|
|
"rewards/action_legal/mean": -0.25,
|
|
"rewards/action_legal/std": 0.5,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": 0.08390611410140991,
|
|
"rewards/env_reward/std": 2.0712993144989014,
|
|
"rewards/format_valid/mean": -0.125,
|
|
"rewards/format_valid/std": 1.25,
|
|
"step": 257
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 24.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 24.0,
|
|
"completions/max_terminated_length": 24.0,
|
|
"completions/mean_length": 19.0,
|
|
"completions/mean_terminated_length": 19.0,
|
|
"completions/min_length": 14.0,
|
|
"completions/min_terminated_length": 14.0,
|
|
"epoch": 0.129,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 6.70985221862793,
|
|
"kl": 1.6169143170118332,
|
|
"learning_rate": 2.7000000000000002e-05,
|
|
"loss": 0.0647,
|
|
"num_tokens": 639078.0,
|
|
"reward": 0.679303765296936,
|
|
"reward_std": 0.7435195446014404,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": 0.6362025737762451,
|
|
"rewards/env_reward/std": 0.4956797957420349,
|
|
"rewards/format_valid/mean": 0.5,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 258
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.25,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 28.0,
|
|
"completions/mean_length": 22.25,
|
|
"completions/mean_terminated_length": 19.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 0.1295,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 2.5632526874542236,
|
|
"kl": 1.0557297468185425,
|
|
"learning_rate": 2.688888888888889e-05,
|
|
"loss": 0.0422,
|
|
"num_tokens": 641567.0,
|
|
"reward": -0.45305585861206055,
|
|
"reward_std": 3.029899835586548,
|
|
"rewards/action_legal/mean": -0.25,
|
|
"rewards/action_legal/std": 0.5,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": -0.08953723311424255,
|
|
"rewards/env_reward/std": 1.962233066558838,
|
|
"rewards/format_valid/mean": -0.125,
|
|
"rewards/format_valid/std": 1.25,
|
|
"step": 259
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.5,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 18.0,
|
|
"completions/mean_length": 24.75,
|
|
"completions/mean_terminated_length": 17.5,
|
|
"completions/min_length": 17.0,
|
|
"completions/min_terminated_length": 17.0,
|
|
"epoch": 0.13,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 3.4922609329223633,
|
|
"kl": 1.2477368414402008,
|
|
"learning_rate": 2.677777777777778e-05,
|
|
"loss": 0.0499,
|
|
"num_tokens": 644066.0,
|
|
"reward": -2.3447818756103516,
|
|
"reward_std": 3.0328142642974854,
|
|
"rewards/action_legal/mean": -0.5,
|
|
"rewards/action_legal/std": 0.5773502588272095,
|
|
"rewards/belief_accuracy/mean": -0.09054364264011383,
|
|
"rewards/belief_accuracy/std": 0.018912728875875473,
|
|
"rewards/env_reward/mean": -1.3446006774902344,
|
|
"rewards/env_reward/std": 1.92401123046875,
|
|
"rewards/format_valid/mean": -0.625,
|
|
"rewards/format_valid/std": 1.6007810831069946,
|
|
"step": 260
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.25,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 29.0,
|
|
"completions/mean_length": 20.75,
|
|
"completions/mean_terminated_length": 17.0,
|
|
"completions/min_length": 8.0,
|
|
"completions/min_terminated_length": 8.0,
|
|
"epoch": 0.1305,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 2.4958338737487793,
|
|
"kl": 1.4933066070079803,
|
|
"learning_rate": 2.6666666666666667e-05,
|
|
"loss": 0.0597,
|
|
"num_tokens": 646549.0,
|
|
"reward": 1.619019865989685,
|
|
"reward_std": 0.47918403148651123,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.08888889104127884,
|
|
"rewards/belief_accuracy/std": 0.02222222276031971,
|
|
"rewards/env_reward/mean": 1.2362910509109497,
|
|
"rewards/env_reward/std": 0.28824105858802795,
|
|
"rewards/format_valid/mean": 0.625,
|
|
"rewards/format_valid/std": 0.25,
|
|
"step": 261
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.25,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 26.0,
|
|
"completions/mean_length": 21.5,
|
|
"completions/mean_terminated_length": 18.0,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.131,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 2.8226537704467773,
|
|
"kl": 1.6106074303388596,
|
|
"learning_rate": 2.6555555555555555e-05,
|
|
"loss": 0.0644,
|
|
"num_tokens": 649035.0,
|
|
"reward": 0.7369977235794067,
|
|
"reward_std": 0.2108081877231598,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": 0.6746652126312256,
|
|
"rewards/env_reward/std": 0.14053881168365479,
|
|
"rewards/format_valid/mean": 0.5,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 262
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 25.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 25.0,
|
|
"completions/max_terminated_length": 25.0,
|
|
"completions/mean_length": 16.75,
|
|
"completions/mean_terminated_length": 16.75,
|
|
"completions/min_length": 8.0,
|
|
"completions/min_terminated_length": 8.0,
|
|
"epoch": 0.1315,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 4.192112922668457,
|
|
"kl": 2.2760011553764343,
|
|
"learning_rate": 2.6444444444444443e-05,
|
|
"loss": 0.091,
|
|
"num_tokens": 651502.0,
|
|
"reward": 0.622403085231781,
|
|
"reward_std": 0.43449920415878296,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": 0.5982687473297119,
|
|
"rewards/env_reward/std": 0.28966614603996277,
|
|
"rewards/format_valid/mean": 0.5,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 263
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 19.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 19.0,
|
|
"completions/max_terminated_length": 19.0,
|
|
"completions/mean_length": 16.25,
|
|
"completions/mean_terminated_length": 16.25,
|
|
"completions/min_length": 15.0,
|
|
"completions/min_terminated_length": 15.0,
|
|
"epoch": 0.132,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 4.731379985809326,
|
|
"kl": 1.393263503909111,
|
|
"learning_rate": 2.633333333333333e-05,
|
|
"loss": 0.0557,
|
|
"num_tokens": 653967.0,
|
|
"reward": 0.10616789758205414,
|
|
"reward_std": 0.26564618945121765,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": 0.2541119456291199,
|
|
"rewards/env_reward/std": 0.17709745466709137,
|
|
"rewards/format_valid/mean": 0.5,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 264
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.5,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 30.0,
|
|
"completions/mean_length": 26.25,
|
|
"completions/mean_terminated_length": 20.5,
|
|
"completions/min_length": 11.0,
|
|
"completions/min_terminated_length": 11.0,
|
|
"epoch": 0.1325,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 4.605635166168213,
|
|
"kl": 1.3393024802207947,
|
|
"learning_rate": 2.6222222222222226e-05,
|
|
"loss": 0.0536,
|
|
"num_tokens": 656472.0,
|
|
"reward": -1.1023221015930176,
|
|
"reward_std": 2.5652267932891846,
|
|
"rewards/action_legal/mean": -0.25,
|
|
"rewards/action_legal/std": 0.5,
|
|
"rewards/belief_accuracy/mean": -0.11666667461395264,
|
|
"rewards/belief_accuracy/std": 0.03333333507180214,
|
|
"rewards/env_reward/mean": -0.49321478605270386,
|
|
"rewards/env_reward/std": 1.6726853847503662,
|
|
"rewards/format_valid/mean": 0.0,
|
|
"rewards/format_valid/std": 1.3540064096450806,
|
|
"step": 265
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.5,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 26.0,
|
|
"completions/mean_length": 27.0,
|
|
"completions/mean_terminated_length": 22.0,
|
|
"completions/min_length": 18.0,
|
|
"completions/min_terminated_length": 18.0,
|
|
"epoch": 0.133,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 6.43408727645874,
|
|
"kl": 0.9438246488571167,
|
|
"learning_rate": 2.6111111111111114e-05,
|
|
"loss": 0.0378,
|
|
"num_tokens": 658980.0,
|
|
"reward": -0.46058040857315063,
|
|
"reward_std": 2.992946147918701,
|
|
"rewards/action_legal/mean": -0.25,
|
|
"rewards/action_legal/std": 0.5,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": -0.09455358982086182,
|
|
"rewards/env_reward/std": 1.9369643926620483,
|
|
"rewards/format_valid/mean": -0.125,
|
|
"rewards/format_valid/std": 1.25,
|
|
"step": 266
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.5,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 25.0,
|
|
"completions/mean_length": 24.5,
|
|
"completions/mean_terminated_length": 17.0,
|
|
"completions/min_length": 9.0,
|
|
"completions/min_terminated_length": 9.0,
|
|
"epoch": 0.1335,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 5.037111759185791,
|
|
"kl": 1.522897057235241,
|
|
"learning_rate": 2.6000000000000002e-05,
|
|
"loss": 0.0609,
|
|
"num_tokens": 661478.0,
|
|
"reward": -0.8440333604812622,
|
|
"reward_std": 2.7461934089660645,
|
|
"rewards/action_legal/mean": -0.25,
|
|
"rewards/action_legal/std": 0.5,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": -0.3501889407634735,
|
|
"rewards/env_reward/std": 1.7726572751998901,
|
|
"rewards/format_valid/mean": -0.125,
|
|
"rewards/format_valid/std": 1.25,
|
|
"step": 267
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.5,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 15.0,
|
|
"completions/mean_length": 21.75,
|
|
"completions/mean_terminated_length": 11.5,
|
|
"completions/min_length": 8.0,
|
|
"completions/min_terminated_length": 8.0,
|
|
"epoch": 0.134,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 3.4638547897338867,
|
|
"kl": 1.6383226662874222,
|
|
"learning_rate": 2.588888888888889e-05,
|
|
"loss": 0.0655,
|
|
"num_tokens": 663965.0,
|
|
"reward": -0.6360792517662048,
|
|
"reward_std": 0.01327502727508545,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": -0.2407195121049881,
|
|
"rewards/env_reward/std": 0.008850008249282837,
|
|
"rewards/format_valid/mean": 0.5,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 268
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.25,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 28.0,
|
|
"completions/mean_length": 23.5,
|
|
"completions/mean_terminated_length": 20.666667938232422,
|
|
"completions/min_length": 17.0,
|
|
"completions/min_terminated_length": 17.0,
|
|
"epoch": 0.1345,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 2.5850181579589844,
|
|
"kl": 1.492052584886551,
|
|
"learning_rate": 2.5777777777777778e-05,
|
|
"loss": 0.0597,
|
|
"num_tokens": 666459.0,
|
|
"reward": -0.4809204339981079,
|
|
"reward_std": 2.983957052230835,
|
|
"rewards/action_legal/mean": -0.25,
|
|
"rewards/action_legal/std": 0.5,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": -0.10811367630958557,
|
|
"rewards/env_reward/std": 1.9310635328292847,
|
|
"rewards/format_valid/mean": -0.125,
|
|
"rewards/format_valid/std": 1.25,
|
|
"step": 269
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.5,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 9.0,
|
|
"completions/mean_length": 20.25,
|
|
"completions/mean_terminated_length": 8.5,
|
|
"completions/min_length": 8.0,
|
|
"completions/min_terminated_length": 8.0,
|
|
"epoch": 0.135,
|
|
"frac_reward_zero_std": 1.0,
|
|
"grad_norm": 2.2055325508117676,
|
|
"kl": 2.1733334064483643,
|
|
"learning_rate": 2.5666666666666666e-05,
|
|
"loss": 0.0869,
|
|
"num_tokens": 668940.0,
|
|
"reward": 0.015844523906707764,
|
|
"reward_std": 0.0,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": 0.1938963532447815,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 0.5,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 270
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 11.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 11.0,
|
|
"completions/max_terminated_length": 11.0,
|
|
"completions/mean_length": 8.75,
|
|
"completions/mean_terminated_length": 8.75,
|
|
"completions/min_length": 8.0,
|
|
"completions/min_terminated_length": 8.0,
|
|
"epoch": 0.1355,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 5.974431991577148,
|
|
"kl": 1.7352482229471207,
|
|
"learning_rate": 2.5555555555555554e-05,
|
|
"loss": 0.0694,
|
|
"num_tokens": 671151.0,
|
|
"reward": 1.4853503704071045,
|
|
"reward_std": 0.053439658135175705,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.0,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": 0.9569003582000732,
|
|
"rewards/env_reward/std": 0.03562644124031067,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 271
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.75,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 13.0,
|
|
"completions/mean_length": 27.25,
|
|
"completions/mean_terminated_length": 13.0,
|
|
"completions/min_length": 13.0,
|
|
"completions/min_terminated_length": 13.0,
|
|
"epoch": 0.136,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 7.03952169418335,
|
|
"kl": 0.8288602530956268,
|
|
"learning_rate": 2.5444444444444442e-05,
|
|
"loss": 0.0332,
|
|
"num_tokens": 673660.0,
|
|
"reward": -0.24595004320144653,
|
|
"reward_std": 3.136033058166504,
|
|
"rewards/action_legal/mean": -0.25,
|
|
"rewards/action_legal/std": 0.5,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": 0.04853332042694092,
|
|
"rewards/env_reward/std": 2.032355546951294,
|
|
"rewards/format_valid/mean": -0.125,
|
|
"rewards/format_valid/std": 1.25,
|
|
"step": 272
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.25,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 24.0,
|
|
"completions/mean_length": 20.75,
|
|
"completions/mean_terminated_length": 17.0,
|
|
"completions/min_length": 11.0,
|
|
"completions/min_terminated_length": 11.0,
|
|
"epoch": 0.1365,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 2.225179672241211,
|
|
"kl": 1.4488344937562943,
|
|
"learning_rate": 2.5333333333333337e-05,
|
|
"loss": 0.058,
|
|
"num_tokens": 676143.0,
|
|
"reward": 1.1019805669784546,
|
|
"reward_std": 0.872570812702179,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.08888889104127884,
|
|
"rewards/belief_accuracy/std": 0.02222222276031971,
|
|
"rewards/env_reward/mean": 0.8915981650352478,
|
|
"rewards/env_reward/std": 0.5619891881942749,
|
|
"rewards/format_valid/mean": 0.625,
|
|
"rewards/format_valid/std": 0.25,
|
|
"step": 273
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.5,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 25.0,
|
|
"completions/mean_length": 24.25,
|
|
"completions/mean_terminated_length": 16.5,
|
|
"completions/min_length": 8.0,
|
|
"completions/min_terminated_length": 8.0,
|
|
"epoch": 0.137,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 2.0150835514068604,
|
|
"kl": 1.9037371575832367,
|
|
"learning_rate": 2.5222222222222225e-05,
|
|
"loss": 0.0761,
|
|
"num_tokens": 678640.0,
|
|
"reward": 0.020011983811855316,
|
|
"reward_std": 0.2474244087934494,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.07959593832492828,
|
|
"rewards/belief_accuracy/std": 0.04080813378095627,
|
|
"rewards/env_reward/mean": 0.15169985592365265,
|
|
"rewards/env_reward/std": 0.07499999552965164,
|
|
"rewards/format_valid/mean": 0.625,
|
|
"rewards/format_valid/std": 0.25,
|
|
"step": 274
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 12.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 10.0,
|
|
"completions/mean_terminated_length": 10.0,
|
|
"completions/min_length": 8.0,
|
|
"completions/min_terminated_length": 8.0,
|
|
"epoch": 0.1375,
|
|
"frac_reward_zero_std": 1.0,
|
|
"grad_norm": 0.30538198351860046,
|
|
"kl": 3.629801630973816,
|
|
"learning_rate": 2.5111111111111113e-05,
|
|
"loss": 0.1452,
|
|
"num_tokens": 681080.0,
|
|
"reward": 0.16994282603263855,
|
|
"reward_std": 0.0,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": 0.29662856459617615,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 0.5,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 275
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.25,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 26.0,
|
|
"completions/mean_length": 22.25,
|
|
"completions/mean_terminated_length": 19.0,
|
|
"completions/min_length": 13.0,
|
|
"completions/min_terminated_length": 13.0,
|
|
"epoch": 0.138,
|
|
"frac_reward_zero_std": 1.0,
|
|
"grad_norm": 0.29680636525154114,
|
|
"kl": 1.799863338470459,
|
|
"learning_rate": 2.5e-05,
|
|
"loss": 0.072,
|
|
"num_tokens": 683569.0,
|
|
"reward": 0.978661835193634,
|
|
"reward_std": 0.0,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": 0.8357745409011841,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 0.5,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 276
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 15.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 15.0,
|
|
"completions/max_terminated_length": 15.0,
|
|
"completions/mean_length": 13.5,
|
|
"completions/mean_terminated_length": 13.5,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.1385,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 3.594503402709961,
|
|
"kl": 1.047157421708107,
|
|
"learning_rate": 2.488888888888889e-05,
|
|
"loss": 0.0419,
|
|
"num_tokens": 686023.0,
|
|
"reward": -0.408236563205719,
|
|
"reward_std": 0.6185159087181091,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.14728286862373352,
|
|
"rewards/belief_accuracy/std": 0.1718742847442627,
|
|
"rewards/env_reward/mean": -0.010925263166427612,
|
|
"rewards/env_reward/std": 0.3019493520259857,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 277
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.25,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 21.0,
|
|
"completions/mean_length": 17.5,
|
|
"completions/mean_terminated_length": 12.666666984558105,
|
|
"completions/min_length": 8.0,
|
|
"completions/min_terminated_length": 8.0,
|
|
"epoch": 0.139,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 4.271994113922119,
|
|
"kl": 1.7188260853290558,
|
|
"learning_rate": 2.477777777777778e-05,
|
|
"loss": 0.0688,
|
|
"num_tokens": 688493.0,
|
|
"reward": -0.4882531464099884,
|
|
"reward_std": 2.9744977951049805,
|
|
"rewards/action_legal/mean": -0.25,
|
|
"rewards/action_legal/std": 0.5,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": -0.11300215125083923,
|
|
"rewards/env_reward/std": 1.9246653318405151,
|
|
"rewards/format_valid/mean": -0.125,
|
|
"rewards/format_valid/std": 1.25,
|
|
"step": 278
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.25,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 14.0,
|
|
"completions/mean_length": 17.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.1395,
|
|
"frac_reward_zero_std": 1.0,
|
|
"grad_norm": 8.145206451416016,
|
|
"kl": 3.5787951350212097,
|
|
"learning_rate": 2.466666666666667e-05,
|
|
"loss": 0.1432,
|
|
"num_tokens": 690961.0,
|
|
"reward": -0.254070907831192,
|
|
"reward_std": 0.0,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": 0.013952743262052536,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 0.5,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 279
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.25,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 20.0,
|
|
"completions/mean_length": 21.25,
|
|
"completions/mean_terminated_length": 17.666667938232422,
|
|
"completions/min_length": 16.0,
|
|
"completions/min_terminated_length": 16.0,
|
|
"epoch": 0.14,
|
|
"frac_reward_zero_std": 1.0,
|
|
"grad_norm": 0.3156006634235382,
|
|
"kl": 1.6092262268066406,
|
|
"learning_rate": 2.4555555555555557e-05,
|
|
"loss": 0.0644,
|
|
"num_tokens": 693446.0,
|
|
"reward": 1.0831952095031738,
|
|
"reward_std": 0.0,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": 0.9054635763168335,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 0.5,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 280
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.25,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 14.0,
|
|
"completions/mean_length": 17.5,
|
|
"completions/mean_terminated_length": 12.666666984558105,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 0.1405,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 3.0812392234802246,
|
|
"kl": 2.202967807650566,
|
|
"learning_rate": 2.4444444444444445e-05,
|
|
"loss": 0.0881,
|
|
"num_tokens": 695916.0,
|
|
"reward": -0.5145425200462341,
|
|
"reward_std": 0.07837500423192978,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": -0.15969499945640564,
|
|
"rewards/env_reward/std": 0.052250005304813385,
|
|
"rewards/format_valid/mean": 0.5,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 281
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.75,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 9.0,
|
|
"completions/mean_length": 26.25,
|
|
"completions/mean_terminated_length": 9.0,
|
|
"completions/min_length": 9.0,
|
|
"completions/min_terminated_length": 9.0,
|
|
"epoch": 0.141,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 2.6371960639953613,
|
|
"kl": 0.2116354387253523,
|
|
"learning_rate": 2.4333333333333336e-05,
|
|
"loss": 0.0085,
|
|
"num_tokens": 698421.0,
|
|
"reward": -0.10934996604919434,
|
|
"reward_std": 0.24821718037128448,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": 0.11043336987495422,
|
|
"rewards/env_reward/std": 0.16547811031341553,
|
|
"rewards/format_valid/mean": 0.5,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 282
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 22.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 22.0,
|
|
"completions/max_terminated_length": 22.0,
|
|
"completions/mean_length": 15.75,
|
|
"completions/mean_terminated_length": 15.75,
|
|
"completions/min_length": 11.0,
|
|
"completions/min_terminated_length": 11.0,
|
|
"epoch": 0.1415,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 4.447051525115967,
|
|
"kl": 1.6157226860523224,
|
|
"learning_rate": 2.4222222222222224e-05,
|
|
"loss": 0.0646,
|
|
"num_tokens": 700884.0,
|
|
"reward": 0.7572240829467773,
|
|
"reward_std": 0.34561485052108765,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": 0.6881493330001831,
|
|
"rewards/env_reward/std": 0.2304098904132843,
|
|
"rewards/format_valid/mean": 0.5,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 283
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.25,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 20.0,
|
|
"completions/mean_length": 18.75,
|
|
"completions/mean_terminated_length": 14.333333969116211,
|
|
"completions/min_length": 9.0,
|
|
"completions/min_terminated_length": 9.0,
|
|
"epoch": 0.142,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 3.0768420696258545,
|
|
"kl": 1.9094894081354141,
|
|
"learning_rate": 2.4111111111111113e-05,
|
|
"loss": 0.0764,
|
|
"num_tokens": 703359.0,
|
|
"reward": -0.14820998907089233,
|
|
"reward_std": 3.201193332672119,
|
|
"rewards/action_legal/mean": -0.25,
|
|
"rewards/action_legal/std": 0.5,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": 0.11369338631629944,
|
|
"rewards/env_reward/std": 2.0757956504821777,
|
|
"rewards/format_valid/mean": -0.125,
|
|
"rewards/format_valid/std": 1.25,
|
|
"step": 284
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.75,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 25.0,
|
|
"completions/mean_length": 30.25,
|
|
"completions/mean_terminated_length": 25.0,
|
|
"completions/min_length": 25.0,
|
|
"completions/min_terminated_length": 25.0,
|
|
"epoch": 0.1425,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 3.422678232192993,
|
|
"kl": 1.0840960815548897,
|
|
"learning_rate": 2.4e-05,
|
|
"loss": 0.0434,
|
|
"num_tokens": 705880.0,
|
|
"reward": -1.8789894580841064,
|
|
"reward_std": 3.5502729415893555,
|
|
"rewards/action_legal/mean": -0.5,
|
|
"rewards/action_legal/std": 0.5773502588272095,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": -1.0109930038452148,
|
|
"rewards/env_reward/std": 2.2995729446411133,
|
|
"rewards/format_valid/mean": -0.75,
|
|
"rewards/format_valid/std": 1.4433757066726685,
|
|
"step": 285
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.25,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 22.0,
|
|
"completions/mean_length": 19.5,
|
|
"completions/mean_terminated_length": 15.333333969116211,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.143,
|
|
"frac_reward_zero_std": 1.0,
|
|
"grad_norm": 0.17799264192581177,
|
|
"kl": 1.9597035348415375,
|
|
"learning_rate": 2.3888888888888892e-05,
|
|
"loss": 0.0784,
|
|
"num_tokens": 708358.0,
|
|
"reward": 0.1835106611251831,
|
|
"reward_std": 0.0,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": 0.3056737780570984,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 0.5,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 286
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.5,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 9.0,
|
|
"completions/mean_length": 20.0,
|
|
"completions/mean_terminated_length": 8.0,
|
|
"completions/min_length": 7.0,
|
|
"completions/min_terminated_length": 7.0,
|
|
"epoch": 0.1435,
|
|
"frac_reward_zero_std": 1.0,
|
|
"grad_norm": 0.08594390004873276,
|
|
"kl": 1.2184199467301369,
|
|
"learning_rate": 2.377777777777778e-05,
|
|
"loss": 0.0487,
|
|
"num_tokens": 710838.0,
|
|
"reward": 0.28326112031936646,
|
|
"reward_std": 0.0,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": 0.37217411398887634,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 0.5,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 287
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 14.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 14.0,
|
|
"completions/max_terminated_length": 14.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 9.0,
|
|
"completions/min_terminated_length": 9.0,
|
|
"epoch": 0.144,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 4.5200910568237305,
|
|
"kl": 1.4814732670783997,
|
|
"learning_rate": 2.3666666666666668e-05,
|
|
"loss": 0.0593,
|
|
"num_tokens": 713286.0,
|
|
"reward": -1.652359962463379,
|
|
"reward_std": 2.199069023132324,
|
|
"rewards/action_legal/mean": -0.25,
|
|
"rewards/action_legal/std": 0.5,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": -0.889073371887207,
|
|
"rewards/env_reward/std": 1.4077305793762207,
|
|
"rewards/format_valid/mean": -0.125,
|
|
"rewards/format_valid/std": 1.25,
|
|
"step": 288
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 29.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 29.0,
|
|
"completions/max_terminated_length": 29.0,
|
|
"completions/mean_length": 19.5,
|
|
"completions/mean_terminated_length": 19.5,
|
|
"completions/min_length": 13.0,
|
|
"completions/min_terminated_length": 13.0,
|
|
"epoch": 0.1445,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 3.1203649044036865,
|
|
"kl": 1.7035606056451797,
|
|
"learning_rate": 2.3555555555555556e-05,
|
|
"loss": 0.0681,
|
|
"num_tokens": 715764.0,
|
|
"reward": -0.1590951681137085,
|
|
"reward_std": 0.13472223281860352,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.07962962985038757,
|
|
"rewards/belief_accuracy/std": 0.04074074327945709,
|
|
"rewards/env_reward/mean": 0.03236249089241028,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 0.625,
|
|
"rewards/format_valid/std": 0.25,
|
|
"step": 289
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.25,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 31.0,
|
|
"completions/mean_length": 24.25,
|
|
"completions/mean_terminated_length": 21.666667938232422,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.145,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 4.187851905822754,
|
|
"kl": 1.007147029042244,
|
|
"learning_rate": 2.3444444444444448e-05,
|
|
"loss": 0.0403,
|
|
"num_tokens": 718261.0,
|
|
"reward": -0.09939317405223846,
|
|
"reward_std": 0.23847907781600952,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": 0.1170712262392044,
|
|
"rewards/env_reward/std": 0.15898606181144714,
|
|
"rewards/format_valid/mean": 0.5,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 290
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 14.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 14.0,
|
|
"completions/max_terminated_length": 14.0,
|
|
"completions/mean_length": 9.75,
|
|
"completions/mean_terminated_length": 9.75,
|
|
"completions/min_length": 7.0,
|
|
"completions/min_terminated_length": 7.0,
|
|
"epoch": 0.1455,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 3.5976037979125977,
|
|
"kl": 1.7104326635599136,
|
|
"learning_rate": 2.3333333333333336e-05,
|
|
"loss": 0.0684,
|
|
"num_tokens": 720700.0,
|
|
"reward": 1.4686640501022339,
|
|
"reward_std": 0.1929871290922165,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": 1.162442684173584,
|
|
"rewards/env_reward/std": 0.12865811586380005,
|
|
"rewards/format_valid/mean": 0.5,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 291
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.25,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 15.0,
|
|
"completions/mean_length": 17.25,
|
|
"completions/mean_terminated_length": 12.333333969116211,
|
|
"completions/min_length": 9.0,
|
|
"completions/min_terminated_length": 9.0,
|
|
"epoch": 0.146,
|
|
"frac_reward_zero_std": 1.0,
|
|
"grad_norm": 0.35819676518440247,
|
|
"kl": 1.510893777012825,
|
|
"learning_rate": 2.3222222222222224e-05,
|
|
"loss": 0.0604,
|
|
"num_tokens": 723169.0,
|
|
"reward": -0.17388460040092468,
|
|
"reward_std": 0.0,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": 0.06741027534008026,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 0.5,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 292
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.5,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 23.0,
|
|
"completions/mean_length": 24.5,
|
|
"completions/mean_terminated_length": 17.0,
|
|
"completions/min_length": 11.0,
|
|
"completions/min_terminated_length": 11.0,
|
|
"epoch": 0.1465,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 4.2439751625061035,
|
|
"kl": 0.9592940956354141,
|
|
"learning_rate": 2.3111111111111112e-05,
|
|
"loss": 0.0384,
|
|
"num_tokens": 725667.0,
|
|
"reward": 1.141730546951294,
|
|
"reward_std": 0.49770018458366394,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": 0.9444870352745056,
|
|
"rewards/env_reward/std": 0.3318001627922058,
|
|
"rewards/format_valid/mean": 0.5,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 293
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.25,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 19.0,
|
|
"completions/mean_length": 17.5,
|
|
"completions/mean_terminated_length": 12.666666984558105,
|
|
"completions/min_length": 9.0,
|
|
"completions/min_terminated_length": 9.0,
|
|
"epoch": 0.147,
|
|
"frac_reward_zero_std": 1.0,
|
|
"grad_norm": 0.11825627088546753,
|
|
"kl": 1.7975642383098602,
|
|
"learning_rate": 2.3000000000000003e-05,
|
|
"loss": 0.0719,
|
|
"num_tokens": 728137.0,
|
|
"reward": 0.03742995858192444,
|
|
"reward_std": 0.0,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": 0.20828664302825928,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 0.5,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 294
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 30.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 30.0,
|
|
"completions/max_terminated_length": 30.0,
|
|
"completions/mean_length": 15.75,
|
|
"completions/mean_terminated_length": 15.75,
|
|
"completions/min_length": 7.0,
|
|
"completions/min_terminated_length": 7.0,
|
|
"epoch": 0.1475,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 2.948641777038574,
|
|
"kl": 1.2299351058900356,
|
|
"learning_rate": 2.288888888888889e-05,
|
|
"loss": 0.0492,
|
|
"num_tokens": 730600.0,
|
|
"reward": 0.7943815588951111,
|
|
"reward_std": 0.04808274284005165,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": 0.7129210233688354,
|
|
"rewards/env_reward/std": 0.03205517679452896,
|
|
"rewards/format_valid/mean": 0.5,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 295
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 30.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 30.0,
|
|
"completions/max_terminated_length": 30.0,
|
|
"completions/mean_length": 14.25,
|
|
"completions/mean_terminated_length": 14.25,
|
|
"completions/min_length": 7.0,
|
|
"completions/min_terminated_length": 7.0,
|
|
"epoch": 0.148,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 1.8954527378082275,
|
|
"kl": 1.4944463968276978,
|
|
"learning_rate": 2.277777777777778e-05,
|
|
"loss": 0.0598,
|
|
"num_tokens": 733057.0,
|
|
"reward": -1.1060881614685059,
|
|
"reward_std": 2.5627288818359375,
|
|
"rewards/action_legal/mean": -0.25,
|
|
"rewards/action_legal/std": 0.5,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": -0.5248920917510986,
|
|
"rewards/env_reward/std": 1.6501553058624268,
|
|
"rewards/format_valid/mean": -0.125,
|
|
"rewards/format_valid/std": 1.25,
|
|
"step": 296
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.25,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 13.0,
|
|
"completions/mean_length": 15.75,
|
|
"completions/mean_terminated_length": 10.333333969116211,
|
|
"completions/min_length": 8.0,
|
|
"completions/min_terminated_length": 8.0,
|
|
"epoch": 0.1485,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 2.3740248680114746,
|
|
"kl": 1.4237545728683472,
|
|
"learning_rate": 2.2666666666666668e-05,
|
|
"loss": 0.057,
|
|
"num_tokens": 735520.0,
|
|
"reward": 1.2797008752822876,
|
|
"reward_std": 0.22939470410346985,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": 1.0364673137664795,
|
|
"rewards/env_reward/std": 0.15292981266975403,
|
|
"rewards/format_valid/mean": 0.5,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 297
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.25,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 28.0,
|
|
"completions/mean_length": 23.5,
|
|
"completions/mean_terminated_length": 20.666667938232422,
|
|
"completions/min_length": 14.0,
|
|
"completions/min_terminated_length": 14.0,
|
|
"epoch": 0.149,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 3.193483829498291,
|
|
"kl": 1.3601939976215363,
|
|
"learning_rate": 2.255555555555556e-05,
|
|
"loss": 0.0544,
|
|
"num_tokens": 738014.0,
|
|
"reward": 0.5479322671890259,
|
|
"reward_std": 0.025000015273690224,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.11666667461395264,
|
|
"rewards/belief_accuracy/std": 0.03333333507180214,
|
|
"rewards/env_reward/mean": 0.577788233757019,
|
|
"rewards/env_reward/std": 0.07499998807907104,
|
|
"rewards/format_valid/mean": 0.625,
|
|
"rewards/format_valid/std": 0.25,
|
|
"step": 298
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 18.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 18.0,
|
|
"completions/max_terminated_length": 18.0,
|
|
"completions/mean_length": 15.5,
|
|
"completions/mean_terminated_length": 15.5,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 0.1495,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 3.508397102355957,
|
|
"kl": 2.330005407333374,
|
|
"learning_rate": 2.2444444444444447e-05,
|
|
"loss": 0.0932,
|
|
"num_tokens": 740476.0,
|
|
"reward": 0.6145496368408203,
|
|
"reward_std": 0.276214063167572,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": 0.5930330753326416,
|
|
"rewards/env_reward/std": 0.18414270877838135,
|
|
"rewards/format_valid/mean": 0.5,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 299
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.25,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 31.0,
|
|
"completions/mean_length": 22.25,
|
|
"completions/mean_terminated_length": 19.0,
|
|
"completions/min_length": 11.0,
|
|
"completions/min_terminated_length": 11.0,
|
|
"epoch": 0.15,
|
|
"frac_reward_zero_std": 1.0,
|
|
"grad_norm": 0.23656092584133148,
|
|
"kl": 1.6847761273384094,
|
|
"learning_rate": 2.2333333333333335e-05,
|
|
"loss": 0.0674,
|
|
"num_tokens": 742965.0,
|
|
"reward": -0.4485160708427429,
|
|
"reward_std": 0.0,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": -0.11567738652229309,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 0.5,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 300
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 10.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 10.0,
|
|
"completions/max_terminated_length": 10.0,
|
|
"completions/mean_length": 8.5,
|
|
"completions/mean_terminated_length": 8.5,
|
|
"completions/min_length": 8.0,
|
|
"completions/min_terminated_length": 8.0,
|
|
"epoch": 0.1505,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 2.4569952487945557,
|
|
"kl": 2.1653302907943726,
|
|
"learning_rate": 2.2222222222222223e-05,
|
|
"loss": 0.0866,
|
|
"num_tokens": 745399.0,
|
|
"reward": 0.7054736614227295,
|
|
"reward_std": 0.29305312037467957,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.29629629850387573,
|
|
"rewards/belief_accuracy/std": 0.03703702986240387,
|
|
"rewards/env_reward/mean": 1.0295751094818115,
|
|
"rewards/env_reward/std": 0.2694428265094757,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 301
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.25,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 21.0,
|
|
"completions/mean_length": 20.5,
|
|
"completions/mean_terminated_length": 16.666667938232422,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 0.151,
|
|
"frac_reward_zero_std": 1.0,
|
|
"grad_norm": 2.0652999877929688,
|
|
"kl": 2.023993544280529,
|
|
"learning_rate": 2.211111111111111e-05,
|
|
"loss": 0.081,
|
|
"num_tokens": 747881.0,
|
|
"reward": 0.03393635153770447,
|
|
"reward_std": 0.0,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": 0.2059575766324997,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 0.5,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 302
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.5,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 9.0,
|
|
"completions/mean_length": 20.5,
|
|
"completions/mean_terminated_length": 9.0,
|
|
"completions/min_length": 9.0,
|
|
"completions/min_terminated_length": 9.0,
|
|
"epoch": 0.1515,
|
|
"frac_reward_zero_std": 1.0,
|
|
"grad_norm": 0.17844082415103912,
|
|
"kl": 1.3347703516483307,
|
|
"learning_rate": 2.2000000000000003e-05,
|
|
"loss": 0.0534,
|
|
"num_tokens": 750363.0,
|
|
"reward": -0.07117094099521637,
|
|
"reward_std": 0.0,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": 0.1358860433101654,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 0.5,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 303
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 19.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 19.0,
|
|
"completions/max_terminated_length": 19.0,
|
|
"completions/mean_length": 14.0,
|
|
"completions/mean_terminated_length": 14.0,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.152,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 4.173818111419678,
|
|
"kl": 2.3482457399368286,
|
|
"learning_rate": 2.188888888888889e-05,
|
|
"loss": 0.0939,
|
|
"num_tokens": 752819.0,
|
|
"reward": 0.19658201932907104,
|
|
"reward_std": 0.014906898140907288,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": 0.3143880367279053,
|
|
"rewards/env_reward/std": 0.009937942028045654,
|
|
"rewards/format_valid/mean": 0.5,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 304
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.25,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 23.0,
|
|
"completions/mean_length": 22.25,
|
|
"completions/mean_terminated_length": 19.0,
|
|
"completions/min_length": 17.0,
|
|
"completions/min_terminated_length": 17.0,
|
|
"epoch": 0.1525,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 3.2245287895202637,
|
|
"kl": 0.9445049874484539,
|
|
"learning_rate": 2.177777777777778e-05,
|
|
"loss": 0.0378,
|
|
"num_tokens": 755308.0,
|
|
"reward": -0.8592178821563721,
|
|
"reward_std": 2.7382078170776367,
|
|
"rewards/action_legal/mean": -0.25,
|
|
"rewards/action_legal/std": 0.5,
|
|
"rewards/belief_accuracy/mean": -0.13583332300186157,
|
|
"rewards/belief_accuracy/std": 0.059961408376693726,
|
|
"rewards/env_reward/mean": -0.3011452257633209,
|
|
"rewards/env_reward/std": 1.803206205368042,
|
|
"rewards/format_valid/mean": 0.25,
|
|
"rewards/format_valid/std": 1.5,
|
|
"step": 305
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 25.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 25.0,
|
|
"completions/max_terminated_length": 25.0,
|
|
"completions/mean_length": 17.5,
|
|
"completions/mean_terminated_length": 17.5,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.153,
|
|
"frac_reward_zero_std": 1.0,
|
|
"grad_norm": 0.128830686211586,
|
|
"kl": 1.6012963205575943,
|
|
"learning_rate": 2.1666666666666667e-05,
|
|
"loss": 0.0641,
|
|
"num_tokens": 757778.0,
|
|
"reward": 2.284616231918335,
|
|
"reward_std": 0.0,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": 1.7064106464385986,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 0.5,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 306
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.25,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 25.0,
|
|
"completions/mean_length": 22.0,
|
|
"completions/mean_terminated_length": 18.666667938232422,
|
|
"completions/min_length": 8.0,
|
|
"completions/min_terminated_length": 8.0,
|
|
"epoch": 0.1535,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 2.645291805267334,
|
|
"kl": 1.772395834326744,
|
|
"learning_rate": 2.1555555555555555e-05,
|
|
"loss": 0.0709,
|
|
"num_tokens": 760266.0,
|
|
"reward": 0.8521826267242432,
|
|
"reward_std": 0.31403204798698425,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.04974466934800148,
|
|
"rewards/belief_accuracy/std": 0.10051066428422928,
|
|
"rewards/env_reward/mean": 0.6467777490615845,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 0.625,
|
|
"rewards/format_valid/std": 0.25,
|
|
"step": 307
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 16.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 16.0,
|
|
"completions/max_terminated_length": 16.0,
|
|
"completions/mean_length": 13.25,
|
|
"completions/mean_terminated_length": 13.25,
|
|
"completions/min_length": 11.0,
|
|
"completions/min_terminated_length": 11.0,
|
|
"epoch": 0.154,
|
|
"frac_reward_zero_std": 1.0,
|
|
"grad_norm": 0.47690078616142273,
|
|
"kl": 2.9113578498363495,
|
|
"learning_rate": 2.1444444444444443e-05,
|
|
"loss": 0.1165,
|
|
"num_tokens": 762719.0,
|
|
"reward": 1.4255595207214355,
|
|
"reward_std": 0.0,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": 1.1337064504623413,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 0.5,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 308
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 21.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 21.0,
|
|
"completions/max_terminated_length": 21.0,
|
|
"completions/mean_length": 15.0,
|
|
"completions/mean_terminated_length": 15.0,
|
|
"completions/min_length": 8.0,
|
|
"completions/min_terminated_length": 8.0,
|
|
"epoch": 0.1545,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 4.587830066680908,
|
|
"kl": 2.2061780989170074,
|
|
"learning_rate": 2.1333333333333335e-05,
|
|
"loss": 0.0882,
|
|
"num_tokens": 765179.0,
|
|
"reward": -0.2726000249385834,
|
|
"reward_std": 0.008999993093311787,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": 0.001600000075995922,
|
|
"rewards/env_reward/std": 0.005999999586492777,
|
|
"rewards/format_valid/mean": 0.5,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 309
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 26.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 26.0,
|
|
"completions/max_terminated_length": 26.0,
|
|
"completions/mean_length": 15.75,
|
|
"completions/mean_terminated_length": 15.75,
|
|
"completions/min_length": 11.0,
|
|
"completions/min_terminated_length": 11.0,
|
|
"epoch": 0.155,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 3.092574119567871,
|
|
"kl": 1.9001564681529999,
|
|
"learning_rate": 2.1222222222222223e-05,
|
|
"loss": 0.076,
|
|
"num_tokens": 767642.0,
|
|
"reward": 1.1732711791992188,
|
|
"reward_std": 0.32429030537605286,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.04803495109081268,
|
|
"rewards/belief_accuracy/std": 0.10393010079860687,
|
|
"rewards/env_reward/mean": 0.8574174642562866,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 0.625,
|
|
"rewards/format_valid/std": 0.25,
|
|
"step": 310
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.25,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 18.0,
|
|
"completions/mean_length": 18.5,
|
|
"completions/mean_terminated_length": 14.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 0.1555,
|
|
"frac_reward_zero_std": 1.0,
|
|
"grad_norm": 0.12916934490203857,
|
|
"kl": 1.6546337455511093,
|
|
"learning_rate": 2.111111111111111e-05,
|
|
"loss": 0.0662,
|
|
"num_tokens": 770116.0,
|
|
"reward": 1.206794023513794,
|
|
"reward_std": 0.0,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": 0.9878627061843872,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 0.5,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 311
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.25,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 28.0,
|
|
"completions/mean_length": 21.75,
|
|
"completions/mean_terminated_length": 18.33333396911621,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 0.156,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 1.9729405641555786,
|
|
"kl": 1.226298600435257,
|
|
"learning_rate": 2.1e-05,
|
|
"loss": 0.0491,
|
|
"num_tokens": 772603.0,
|
|
"reward": -0.8138376474380493,
|
|
"reward_std": 2.757441520690918,
|
|
"rewards/action_legal/mean": -0.25,
|
|
"rewards/action_legal/std": 0.5,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": -0.3300584852695465,
|
|
"rewards/env_reward/std": 1.7799609899520874,
|
|
"rewards/format_valid/mean": -0.125,
|
|
"rewards/format_valid/std": 1.25,
|
|
"step": 312
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 17.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 17.0,
|
|
"completions/max_terminated_length": 17.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 8.0,
|
|
"completions/min_terminated_length": 8.0,
|
|
"epoch": 0.1565,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 3.1958553791046143,
|
|
"kl": 1.9549526572227478,
|
|
"learning_rate": 2.088888888888889e-05,
|
|
"loss": 0.0782,
|
|
"num_tokens": 775051.0,
|
|
"reward": -0.8436893820762634,
|
|
"reward_std": 0.03750000521540642,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.10833333432674408,
|
|
"rewards/belief_accuracy/std": 0.01666666939854622,
|
|
"rewards/env_reward/mean": -0.36662623286247253,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 0.625,
|
|
"rewards/format_valid/std": 0.25,
|
|
"step": 313
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 20.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 20.0,
|
|
"completions/max_terminated_length": 20.0,
|
|
"completions/mean_length": 17.0,
|
|
"completions/mean_terminated_length": 17.0,
|
|
"completions/min_length": 13.0,
|
|
"completions/min_terminated_length": 13.0,
|
|
"epoch": 0.157,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 4.083063125610352,
|
|
"kl": 1.7084714621305466,
|
|
"learning_rate": 2.077777777777778e-05,
|
|
"loss": 0.0683,
|
|
"num_tokens": 777519.0,
|
|
"reward": -0.9321072697639465,
|
|
"reward_std": 2.689814329147339,
|
|
"rewards/action_legal/mean": -0.25,
|
|
"rewards/action_legal/std": 0.5,
|
|
"rewards/belief_accuracy/mean": -0.0886182188987732,
|
|
"rewards/belief_accuracy/std": 0.022763576358556747,
|
|
"rewards/env_reward/mean": -0.4358351230621338,
|
|
"rewards/env_reward/std": 1.7134547233581543,
|
|
"rewards/format_valid/mean": 0.0,
|
|
"rewards/format_valid/std": 1.3540064096450806,
|
|
"step": 314
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 28.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 28.0,
|
|
"completions/max_terminated_length": 28.0,
|
|
"completions/mean_length": 22.25,
|
|
"completions/mean_terminated_length": 22.25,
|
|
"completions/min_length": 15.0,
|
|
"completions/min_terminated_length": 15.0,
|
|
"epoch": 0.1575,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 2.9729490280151367,
|
|
"kl": 1.67718306183815,
|
|
"learning_rate": 2.0666666666666666e-05,
|
|
"loss": 0.0671,
|
|
"num_tokens": 780008.0,
|
|
"reward": 0.4863058924674988,
|
|
"reward_std": 0.024692803621292114,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.09369880706071854,
|
|
"rewards/belief_accuracy/std": 0.012602388858795166,
|
|
"rewards/env_reward/mean": 0.4907682240009308,
|
|
"rewards/env_reward/std": 0.050000011920928955,
|
|
"rewards/format_valid/mean": 0.625,
|
|
"rewards/format_valid/std": 0.25,
|
|
"step": 315
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.25,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 10.0,
|
|
"completions/mean_length": 15.0,
|
|
"completions/mean_terminated_length": 9.333333969116211,
|
|
"completions/min_length": 9.0,
|
|
"completions/min_terminated_length": 9.0,
|
|
"epoch": 0.158,
|
|
"frac_reward_zero_std": 1.0,
|
|
"grad_norm": 0.2093023657798767,
|
|
"kl": 1.742059737443924,
|
|
"learning_rate": 2.0555555555555555e-05,
|
|
"loss": 0.0697,
|
|
"num_tokens": 782468.0,
|
|
"reward": 0.46483826637268066,
|
|
"reward_std": 0.0,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": 0.4932255446910858,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 0.5,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 316
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 22.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 22.0,
|
|
"completions/max_terminated_length": 22.0,
|
|
"completions/mean_length": 16.0,
|
|
"completions/mean_terminated_length": 16.0,
|
|
"completions/min_length": 13.0,
|
|
"completions/min_terminated_length": 13.0,
|
|
"epoch": 0.1585,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 4.33806037902832,
|
|
"kl": 1.9989095479249954,
|
|
"learning_rate": 2.0444444444444446e-05,
|
|
"loss": 0.08,
|
|
"num_tokens": 784932.0,
|
|
"reward": 1.7721874713897705,
|
|
"reward_std": 0.821945071220398,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.08935601264238358,
|
|
"rewards/belief_accuracy/std": 0.021287977695465088,
|
|
"rewards/env_reward/mean": 1.3393369913101196,
|
|
"rewards/env_reward/std": 0.5330812931060791,
|
|
"rewards/format_valid/mean": 0.625,
|
|
"rewards/format_valid/std": 0.25,
|
|
"step": 317
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 32.0,
|
|
"completions/mean_length": 23.0,
|
|
"completions/mean_terminated_length": 23.0,
|
|
"completions/min_length": 18.0,
|
|
"completions/min_terminated_length": 18.0,
|
|
"epoch": 0.159,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 4.552093505859375,
|
|
"kl": 1.7921818047761917,
|
|
"learning_rate": 2.0333333333333334e-05,
|
|
"loss": 0.0717,
|
|
"num_tokens": 787424.0,
|
|
"reward": 0.2771155834197998,
|
|
"reward_std": 0.052816081792116165,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": 0.3680770993232727,
|
|
"rewards/env_reward/std": 0.03521072119474411,
|
|
"rewards/format_valid/mean": 0.5,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 318
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 26.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 26.0,
|
|
"completions/max_terminated_length": 26.0,
|
|
"completions/mean_length": 20.25,
|
|
"completions/mean_terminated_length": 20.25,
|
|
"completions/min_length": 16.0,
|
|
"completions/min_terminated_length": 16.0,
|
|
"epoch": 0.1595,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 5.766949653625488,
|
|
"kl": 1.8316420912742615,
|
|
"learning_rate": 2.0222222222222222e-05,
|
|
"loss": 0.0733,
|
|
"num_tokens": 789905.0,
|
|
"reward": 0.7448418140411377,
|
|
"reward_std": 0.2648809254169464,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.125,
|
|
"rewards/belief_accuracy/std": 0.05000000074505806,
|
|
"rewards/env_reward/mean": 0.725727915763855,
|
|
"rewards/env_reward/std": 0.21821647882461548,
|
|
"rewards/format_valid/mean": 0.625,
|
|
"rewards/format_valid/std": 0.25,
|
|
"step": 319
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 22.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 22.0,
|
|
"completions/max_terminated_length": 22.0,
|
|
"completions/mean_length": 13.0,
|
|
"completions/mean_terminated_length": 13.0,
|
|
"completions/min_length": 8.0,
|
|
"completions/min_terminated_length": 8.0,
|
|
"epoch": 0.16,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 3.683352470397949,
|
|
"kl": 1.9505991637706757,
|
|
"learning_rate": 2.011111111111111e-05,
|
|
"loss": 0.078,
|
|
"num_tokens": 792357.0,
|
|
"reward": 2.3521366119384766,
|
|
"reward_std": 0.05138897895812988,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.0981481522321701,
|
|
"rewards/belief_accuracy/std": 0.0037037059664726257,
|
|
"rewards/env_reward/mean": 1.7435539960861206,
|
|
"rewards/env_reward/std": 0.050000011920928955,
|
|
"rewards/format_valid/mean": 0.625,
|
|
"rewards/format_valid/std": 0.25,
|
|
"step": 320
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.5,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 17.0,
|
|
"completions/mean_length": 23.5,
|
|
"completions/mean_terminated_length": 15.0,
|
|
"completions/min_length": 13.0,
|
|
"completions/min_terminated_length": 13.0,
|
|
"epoch": 0.1605,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 3.7608256340026855,
|
|
"kl": 1.294388547539711,
|
|
"learning_rate": 2e-05,
|
|
"loss": 0.0518,
|
|
"num_tokens": 794851.0,
|
|
"reward": -1.1710084676742554,
|
|
"reward_std": 2.5193276405334473,
|
|
"rewards/action_legal/mean": -0.25,
|
|
"rewards/action_legal/std": 0.5,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": -0.5681723356246948,
|
|
"rewards/env_reward/std": 1.6212184429168701,
|
|
"rewards/format_valid/mean": -0.125,
|
|
"rewards/format_valid/std": 1.25,
|
|
"step": 321
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.5,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 11.0,
|
|
"completions/mean_length": 20.75,
|
|
"completions/mean_terminated_length": 9.5,
|
|
"completions/min_length": 8.0,
|
|
"completions/min_terminated_length": 8.0,
|
|
"epoch": 0.161,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 2.618898391723633,
|
|
"kl": 1.0062730349600315,
|
|
"learning_rate": 1.988888888888889e-05,
|
|
"loss": 0.0403,
|
|
"num_tokens": 797334.0,
|
|
"reward": -1.3532192707061768,
|
|
"reward_std": 2.3978536128997803,
|
|
"rewards/action_legal/mean": -0.25,
|
|
"rewards/action_legal/std": 0.5,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": -0.6896461844444275,
|
|
"rewards/env_reward/std": 1.5402358770370483,
|
|
"rewards/format_valid/mean": -0.125,
|
|
"rewards/format_valid/std": 1.25,
|
|
"step": 322
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.25,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 16.0,
|
|
"completions/mean_length": 17.5,
|
|
"completions/mean_terminated_length": 12.666666984558105,
|
|
"completions/min_length": 11.0,
|
|
"completions/min_terminated_length": 11.0,
|
|
"epoch": 0.1615,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 2.3661885261535645,
|
|
"kl": 1.7326427102088928,
|
|
"learning_rate": 1.9777777777777778e-05,
|
|
"loss": 0.0693,
|
|
"num_tokens": 799804.0,
|
|
"reward": -0.9932671785354614,
|
|
"reward_std": 2.645886182785034,
|
|
"rewards/action_legal/mean": -0.25,
|
|
"rewards/action_legal/std": 0.5,
|
|
"rewards/belief_accuracy/mean": -0.09086865186691284,
|
|
"rewards/belief_accuracy/std": 0.018262699246406555,
|
|
"rewards/env_reward/mean": -0.47210749983787537,
|
|
"rewards/env_reward/std": 1.6948373317718506,
|
|
"rewards/format_valid/mean": 0.0,
|
|
"rewards/format_valid/std": 1.3540064096450806,
|
|
"step": 323
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.75,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 11.0,
|
|
"completions/mean_length": 26.75,
|
|
"completions/mean_terminated_length": 11.0,
|
|
"completions/min_length": 11.0,
|
|
"completions/min_terminated_length": 11.0,
|
|
"epoch": 0.162,
|
|
"frac_reward_zero_std": 1.0,
|
|
"grad_norm": 0.10320170223712921,
|
|
"kl": 1.0313833132386208,
|
|
"learning_rate": 1.9666666666666666e-05,
|
|
"loss": 0.0413,
|
|
"num_tokens": 802311.0,
|
|
"reward": 0.14523997902870178,
|
|
"reward_std": 0.0,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": 0.28015998005867004,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 0.5,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 324
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.5,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 14.0,
|
|
"completions/mean_length": 22.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.1625,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 2.900913953781128,
|
|
"kl": 1.2387384474277496,
|
|
"learning_rate": 1.9555555555555557e-05,
|
|
"loss": 0.0495,
|
|
"num_tokens": 804799.0,
|
|
"reward": -1.4717153310775757,
|
|
"reward_std": 2.3200571537017822,
|
|
"rewards/action_legal/mean": -0.25,
|
|
"rewards/action_legal/std": 0.5,
|
|
"rewards/belief_accuracy/mean": -0.08888889104127884,
|
|
"rewards/belief_accuracy/std": 0.02222222276031971,
|
|
"rewards/env_reward/mean": -0.7950325012207031,
|
|
"rewards/env_reward/std": 1.4699784517288208,
|
|
"rewards/format_valid/mean": 0.0,
|
|
"rewards/format_valid/std": 1.3540064096450806,
|
|
"step": 325
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 22.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 22.0,
|
|
"completions/max_terminated_length": 22.0,
|
|
"completions/mean_length": 19.25,
|
|
"completions/mean_terminated_length": 19.25,
|
|
"completions/min_length": 15.0,
|
|
"completions/min_terminated_length": 15.0,
|
|
"epoch": 0.163,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 1.6629624366760254,
|
|
"kl": 1.2183350324630737,
|
|
"learning_rate": 1.9444444444444445e-05,
|
|
"loss": 0.0487,
|
|
"num_tokens": 807276.0,
|
|
"reward": 1.4414076805114746,
|
|
"reward_std": 0.7849681973457336,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.03274526074528694,
|
|
"rewards/belief_accuracy/std": 0.046666666865348816,
|
|
"rewards/env_reward/mean": 0.8621145486831665,
|
|
"rewards/env_reward/std": 0.5469719767570496,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 326
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.75,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 18.0,
|
|
"completions/mean_length": 28.5,
|
|
"completions/mean_terminated_length": 18.0,
|
|
"completions/min_length": 18.0,
|
|
"completions/min_terminated_length": 18.0,
|
|
"epoch": 0.1635,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 3.7185583114624023,
|
|
"kl": 0.9632565826177597,
|
|
"learning_rate": 1.9333333333333333e-05,
|
|
"loss": 0.0385,
|
|
"num_tokens": 809790.0,
|
|
"reward": 0.584887683391571,
|
|
"reward_std": 0.04735124111175537,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.11294145882129669,
|
|
"rewards/belief_accuracy/std": 0.025882910937070847,
|
|
"rewards/env_reward/mean": 0.5949747562408447,
|
|
"rewards/env_reward/std": 0.07499998807907104,
|
|
"rewards/format_valid/mean": 0.625,
|
|
"rewards/format_valid/std": 0.25,
|
|
"step": 327
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 24.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 24.0,
|
|
"completions/max_terminated_length": 24.0,
|
|
"completions/mean_length": 20.0,
|
|
"completions/mean_terminated_length": 20.0,
|
|
"completions/min_length": 14.0,
|
|
"completions/min_terminated_length": 14.0,
|
|
"epoch": 0.164,
|
|
"frac_reward_zero_std": 1.0,
|
|
"grad_norm": 0.11391498893499374,
|
|
"kl": 1.3412380516529083,
|
|
"learning_rate": 1.922222222222222e-05,
|
|
"loss": 0.0536,
|
|
"num_tokens": 812270.0,
|
|
"reward": 0.08814990520477295,
|
|
"reward_std": 0.0,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": 0.24209994077682495,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 0.5,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 328
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 32.0,
|
|
"completions/mean_length": 16.5,
|
|
"completions/mean_terminated_length": 16.5,
|
|
"completions/min_length": 7.0,
|
|
"completions/min_terminated_length": 7.0,
|
|
"epoch": 0.1645,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 1.71048104763031,
|
|
"kl": 1.0797484368085861,
|
|
"learning_rate": 1.9111111111111113e-05,
|
|
"loss": 0.0432,
|
|
"num_tokens": 814736.0,
|
|
"reward": -0.007705964148044586,
|
|
"reward_std": 0.3800565302371979,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": 0.1781960427761078,
|
|
"rewards/env_reward/std": 0.2533710300922394,
|
|
"rewards/format_valid/mean": 0.5,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 329
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 22.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 22.0,
|
|
"completions/max_terminated_length": 22.0,
|
|
"completions/mean_length": 11.75,
|
|
"completions/mean_terminated_length": 11.75,
|
|
"completions/min_length": 8.0,
|
|
"completions/min_terminated_length": 8.0,
|
|
"epoch": 0.165,
|
|
"frac_reward_zero_std": 1.0,
|
|
"grad_norm": 0.2313052862882614,
|
|
"kl": 2.1413558423519135,
|
|
"learning_rate": 1.9e-05,
|
|
"loss": 0.0857,
|
|
"num_tokens": 817183.0,
|
|
"reward": 1.2785534858703613,
|
|
"reward_std": 0.0,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": 1.0357023477554321,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 0.5,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 330
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.25,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 19.0,
|
|
"completions/mean_length": 19.75,
|
|
"completions/mean_terminated_length": 15.666666984558105,
|
|
"completions/min_length": 14.0,
|
|
"completions/min_terminated_length": 14.0,
|
|
"epoch": 0.1655,
|
|
"frac_reward_zero_std": 1.0,
|
|
"grad_norm": 1.0023722648620605,
|
|
"kl": 1.8249645978212357,
|
|
"learning_rate": 1.888888888888889e-05,
|
|
"loss": 0.073,
|
|
"num_tokens": 819662.0,
|
|
"reward": 0.22990059852600098,
|
|
"reward_std": 0.0,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": 0.3366004228591919,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 0.5,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 331
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 24.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 24.0,
|
|
"completions/max_terminated_length": 24.0,
|
|
"completions/mean_length": 19.25,
|
|
"completions/mean_terminated_length": 19.25,
|
|
"completions/min_length": 13.0,
|
|
"completions/min_terminated_length": 13.0,
|
|
"epoch": 0.166,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 2.8686375617980957,
|
|
"kl": 1.2005413547158241,
|
|
"learning_rate": 1.8777777777777777e-05,
|
|
"loss": 0.048,
|
|
"num_tokens": 822139.0,
|
|
"reward": 0.4151918888092041,
|
|
"reward_std": 0.25368455052375793,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.19324073195457458,
|
|
"rewards/belief_accuracy/std": 0.08456152677536011,
|
|
"rewards/env_reward/mean": 0.6299427151679993,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 332
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 31.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 31.0,
|
|
"completions/max_terminated_length": 31.0,
|
|
"completions/mean_length": 22.25,
|
|
"completions/mean_terminated_length": 22.25,
|
|
"completions/min_length": 14.0,
|
|
"completions/min_terminated_length": 14.0,
|
|
"epoch": 0.1665,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 3.5046756267547607,
|
|
"kl": 1.418169043958187,
|
|
"learning_rate": 1.866666666666667e-05,
|
|
"loss": 0.0567,
|
|
"num_tokens": 824628.0,
|
|
"reward": -0.14963069558143616,
|
|
"reward_std": 0.08749999105930328,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.11666667461395264,
|
|
"rewards/belief_accuracy/std": 0.03333333507180214,
|
|
"rewards/env_reward/mean": 0.11274620145559311,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 0.625,
|
|
"rewards/format_valid/std": 0.25,
|
|
"step": 333
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 24.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 24.0,
|
|
"completions/max_terminated_length": 24.0,
|
|
"completions/mean_length": 14.5,
|
|
"completions/mean_terminated_length": 14.5,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.167,
|
|
"frac_reward_zero_std": 1.0,
|
|
"grad_norm": 0.3764250874519348,
|
|
"kl": 1.4826988205313683,
|
|
"learning_rate": 1.8555555555555557e-05,
|
|
"loss": 0.0593,
|
|
"num_tokens": 827086.0,
|
|
"reward": 0.7749629616737366,
|
|
"reward_std": 0.0,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": 0.6999753713607788,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 0.5,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 334
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 8.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 8.0,
|
|
"completions/max_terminated_length": 8.0,
|
|
"completions/mean_length": 8.0,
|
|
"completions/mean_terminated_length": 8.0,
|
|
"completions/min_length": 8.0,
|
|
"completions/min_terminated_length": 8.0,
|
|
"epoch": 0.1675,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 3.6749558448791504,
|
|
"kl": 0.48872488737106323,
|
|
"learning_rate": 1.8444444444444445e-05,
|
|
"loss": 0.0195,
|
|
"num_tokens": 829294.0,
|
|
"reward": 0.8840881586074829,
|
|
"reward_std": 0.5165320634841919,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.0,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": 0.5560587644577026,
|
|
"rewards/env_reward/std": 0.3443547189235687,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 335
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.25,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 18.0,
|
|
"completions/mean_length": 18.25,
|
|
"completions/mean_terminated_length": 13.666666984558105,
|
|
"completions/min_length": 11.0,
|
|
"completions/min_terminated_length": 11.0,
|
|
"epoch": 0.168,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 2.3814547061920166,
|
|
"kl": 1.7959783673286438,
|
|
"learning_rate": 1.8333333333333333e-05,
|
|
"loss": 0.0718,
|
|
"num_tokens": 831767.0,
|
|
"reward": 1.6337858438491821,
|
|
"reward_std": 0.598229169845581,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": 1.2725238800048828,
|
|
"rewards/env_reward/std": 0.39881935715675354,
|
|
"rewards/format_valid/mean": 0.5,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 336
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.25,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 31.0,
|
|
"completions/mean_length": 23.25,
|
|
"completions/mean_terminated_length": 20.33333396911621,
|
|
"completions/min_length": 13.0,
|
|
"completions/min_terminated_length": 13.0,
|
|
"epoch": 0.1685,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 2.819786548614502,
|
|
"kl": 1.5642708837985992,
|
|
"learning_rate": 1.8222222222222224e-05,
|
|
"loss": 0.0626,
|
|
"num_tokens": 834260.0,
|
|
"reward": 0.8285795450210571,
|
|
"reward_std": 0.7248117923736572,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": 0.7357197403907776,
|
|
"rewards/env_reward/std": 0.48320794105529785,
|
|
"rewards/format_valid/mean": 0.5,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 337
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 24.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 24.0,
|
|
"completions/max_terminated_length": 24.0,
|
|
"completions/mean_length": 16.5,
|
|
"completions/mean_terminated_length": 16.5,
|
|
"completions/min_length": 9.0,
|
|
"completions/min_terminated_length": 9.0,
|
|
"epoch": 0.169,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 3.874178647994995,
|
|
"kl": 1.9785820245742798,
|
|
"learning_rate": 1.8111111111111112e-05,
|
|
"loss": 0.0791,
|
|
"num_tokens": 836726.0,
|
|
"reward": -1.273527979850769,
|
|
"reward_std": 2.451591968536377,
|
|
"rewards/action_legal/mean": -0.25,
|
|
"rewards/action_legal/std": 0.5,
|
|
"rewards/belief_accuracy/mean": -0.09240995347499847,
|
|
"rewards/belief_accuracy/std": 0.015180099755525589,
|
|
"rewards/env_reward/mean": -0.6558654308319092,
|
|
"rewards/env_reward/std": 1.5627564191818237,
|
|
"rewards/format_valid/mean": 0.0,
|
|
"rewards/format_valid/std": 1.3540064096450806,
|
|
"step": 338
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 9.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 9.0,
|
|
"completions/max_terminated_length": 9.0,
|
|
"completions/mean_length": 8.5,
|
|
"completions/mean_terminated_length": 8.5,
|
|
"completions/min_length": 8.0,
|
|
"completions/min_terminated_length": 8.0,
|
|
"epoch": 0.1695,
|
|
"frac_reward_zero_std": 1.0,
|
|
"grad_norm": 1.4243091344833374,
|
|
"kl": 2.757372349500656,
|
|
"learning_rate": 1.8e-05,
|
|
"loss": 0.1103,
|
|
"num_tokens": 839160.0,
|
|
"reward": -0.0059850215911865234,
|
|
"reward_std": 0.0,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": 0.1793433278799057,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 0.5,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 339
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.75,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 24.0,
|
|
"completions/mean_length": 30.0,
|
|
"completions/mean_terminated_length": 24.0,
|
|
"completions/min_length": 24.0,
|
|
"completions/min_terminated_length": 24.0,
|
|
"epoch": 0.17,
|
|
"frac_reward_zero_std": 1.0,
|
|
"grad_norm": 0.5190337896347046,
|
|
"kl": 1.0218081027269363,
|
|
"learning_rate": 1.788888888888889e-05,
|
|
"loss": 0.0409,
|
|
"num_tokens": 841680.0,
|
|
"reward": 0.3096563220024109,
|
|
"reward_std": 0.0,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": 0.3897709250450134,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 0.5,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 340
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.25,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 24.0,
|
|
"completions/mean_length": 23.25,
|
|
"completions/mean_terminated_length": 20.33333396911621,
|
|
"completions/min_length": 14.0,
|
|
"completions/min_terminated_length": 14.0,
|
|
"epoch": 0.1705,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 3.4040679931640625,
|
|
"kl": 1.460461974143982,
|
|
"learning_rate": 1.777777777777778e-05,
|
|
"loss": 0.0584,
|
|
"num_tokens": 844173.0,
|
|
"reward": 0.343553364276886,
|
|
"reward_std": 0.12038552761077881,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.11296296119689941,
|
|
"rewards/belief_accuracy/std": 0.07037036865949631,
|
|
"rewards/env_reward/mean": 0.4257948398590088,
|
|
"rewards/env_reward/std": 0.08660253137350082,
|
|
"rewards/format_valid/mean": 0.875,
|
|
"rewards/format_valid/std": 0.25,
|
|
"step": 341
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 9.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 9.0,
|
|
"completions/max_terminated_length": 9.0,
|
|
"completions/mean_length": 8.0,
|
|
"completions/mean_terminated_length": 8.0,
|
|
"completions/min_length": 7.0,
|
|
"completions/min_terminated_length": 7.0,
|
|
"epoch": 0.171,
|
|
"frac_reward_zero_std": 1.0,
|
|
"grad_norm": 0.3424733579158783,
|
|
"kl": 2.612625613808632,
|
|
"learning_rate": 1.7666666666666668e-05,
|
|
"loss": 0.1045,
|
|
"num_tokens": 846605.0,
|
|
"reward": 0.7122496962547302,
|
|
"reward_std": 0.0,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": 0.6581664681434631,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 0.5,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 342
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.25,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 22.0,
|
|
"completions/mean_length": 21.75,
|
|
"completions/mean_terminated_length": 18.33333396911621,
|
|
"completions/min_length": 16.0,
|
|
"completions/min_terminated_length": 16.0,
|
|
"epoch": 0.1715,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 4.17530632019043,
|
|
"kl": 1.9749469459056854,
|
|
"learning_rate": 1.7555555555555556e-05,
|
|
"loss": 0.079,
|
|
"num_tokens": 849092.0,
|
|
"reward": 0.6472027897834778,
|
|
"reward_std": 0.02499997615814209,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.11666667461395264,
|
|
"rewards/belief_accuracy/std": 0.03333333507180214,
|
|
"rewards/env_reward/mean": 0.6439685821533203,
|
|
"rewards/env_reward/std": 0.07499998807907104,
|
|
"rewards/format_valid/mean": 0.625,
|
|
"rewards/format_valid/std": 0.25,
|
|
"step": 343
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 32.0,
|
|
"completions/mean_length": 22.75,
|
|
"completions/mean_terminated_length": 22.75,
|
|
"completions/min_length": 14.0,
|
|
"completions/min_terminated_length": 14.0,
|
|
"epoch": 0.172,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 1.9005358219146729,
|
|
"kl": 1.6942770928144455,
|
|
"learning_rate": 1.7444444444444448e-05,
|
|
"loss": 0.0678,
|
|
"num_tokens": 851583.0,
|
|
"reward": -0.1503353714942932,
|
|
"reward_std": 3.1997761726379395,
|
|
"rewards/action_legal/mean": -0.25,
|
|
"rewards/action_legal/std": 0.5,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": 0.11227646470069885,
|
|
"rewards/env_reward/std": 2.0748510360717773,
|
|
"rewards/format_valid/mean": -0.125,
|
|
"rewards/format_valid/std": 1.25,
|
|
"step": 344
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 18.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 18.0,
|
|
"completions/max_terminated_length": 18.0,
|
|
"completions/mean_length": 14.25,
|
|
"completions/mean_terminated_length": 14.25,
|
|
"completions/min_length": 11.0,
|
|
"completions/min_terminated_length": 11.0,
|
|
"epoch": 0.1725,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 3.6135246753692627,
|
|
"kl": 2.6989140063524246,
|
|
"learning_rate": 1.7333333333333336e-05,
|
|
"loss": 0.108,
|
|
"num_tokens": 854040.0,
|
|
"reward": 0.3334026038646698,
|
|
"reward_std": 0.41579148173332214,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.08210506290197372,
|
|
"rewards/belief_accuracy/std": 0.08616961538791656,
|
|
"rewards/env_reward/mean": 0.3614785075187683,
|
|
"rewards/env_reward/std": 0.10307764261960983,
|
|
"rewards/format_valid/mean": 0.75,
|
|
"rewards/format_valid/std": 0.28867512941360474,
|
|
"step": 345
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.25,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 6.0,
|
|
"completions/mean_length": 12.5,
|
|
"completions/mean_terminated_length": 6.0,
|
|
"completions/min_length": 6.0,
|
|
"completions/min_terminated_length": 6.0,
|
|
"epoch": 0.173,
|
|
"frac_reward_zero_std": 1.0,
|
|
"grad_norm": 0.07000944018363953,
|
|
"kl": 1.5389113873243332,
|
|
"learning_rate": 1.7222222222222224e-05,
|
|
"loss": 0.0616,
|
|
"num_tokens": 856490.0,
|
|
"reward": -0.5777875781059265,
|
|
"reward_std": 0.0,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": -0.20185838639736176,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 0.5,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 346
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.25,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 32.0,
|
|
"completions/mean_length": 22.75,
|
|
"completions/mean_terminated_length": 19.666667938232422,
|
|
"completions/min_length": 13.0,
|
|
"completions/min_terminated_length": 13.0,
|
|
"epoch": 0.1735,
|
|
"frac_reward_zero_std": 1.0,
|
|
"grad_norm": 0.1494380682706833,
|
|
"kl": 1.0276093482971191,
|
|
"learning_rate": 1.7111111111111112e-05,
|
|
"loss": 0.0411,
|
|
"num_tokens": 858981.0,
|
|
"reward": -0.03337675333023071,
|
|
"reward_std": 0.0,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": 0.1610821634531021,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 0.5,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 347
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 16.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 16.0,
|
|
"completions/max_terminated_length": 16.0,
|
|
"completions/mean_length": 13.0,
|
|
"completions/mean_terminated_length": 13.0,
|
|
"completions/min_length": 9.0,
|
|
"completions/min_terminated_length": 9.0,
|
|
"epoch": 0.174,
|
|
"frac_reward_zero_std": 1.0,
|
|
"grad_norm": 3.6167244911193848,
|
|
"kl": 2.8303582668304443,
|
|
"learning_rate": 1.7000000000000003e-05,
|
|
"loss": 0.1132,
|
|
"num_tokens": 861433.0,
|
|
"reward": -0.3828308582305908,
|
|
"reward_std": 0.0,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": -0.07188723236322403,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 0.5,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 348
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.25,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 17.0,
|
|
"completions/mean_length": 18.25,
|
|
"completions/mean_terminated_length": 13.666666984558105,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.1745,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 3.7456555366516113,
|
|
"kl": 1.0419511049985886,
|
|
"learning_rate": 1.688888888888889e-05,
|
|
"loss": 0.0417,
|
|
"num_tokens": 863906.0,
|
|
"reward": -1.5166230201721191,
|
|
"reward_std": 2.289083957672119,
|
|
"rewards/action_legal/mean": -0.25,
|
|
"rewards/action_legal/std": 0.5,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": -0.7985820770263672,
|
|
"rewards/env_reward/std": 1.4677271842956543,
|
|
"rewards/format_valid/mean": -0.125,
|
|
"rewards/format_valid/std": 1.25,
|
|
"step": 349
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 24.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 24.0,
|
|
"completions/max_terminated_length": 24.0,
|
|
"completions/mean_length": 15.0,
|
|
"completions/mean_terminated_length": 15.0,
|
|
"completions/min_length": 11.0,
|
|
"completions/min_terminated_length": 11.0,
|
|
"epoch": 0.175,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 4.4072089195251465,
|
|
"kl": 1.2833919078111649,
|
|
"learning_rate": 1.677777777777778e-05,
|
|
"loss": 0.0513,
|
|
"num_tokens": 866366.0,
|
|
"reward": -0.9303372502326965,
|
|
"reward_std": 2.7289562225341797,
|
|
"rewards/action_legal/mean": -0.25,
|
|
"rewards/action_legal/std": 0.5,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": -0.40772485733032227,
|
|
"rewards/env_reward/std": 1.762056589126587,
|
|
"rewards/format_valid/mean": -0.125,
|
|
"rewards/format_valid/std": 1.25,
|
|
"step": 350
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 17.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 17.0,
|
|
"completions/max_terminated_length": 17.0,
|
|
"completions/mean_length": 11.0,
|
|
"completions/mean_terminated_length": 11.0,
|
|
"completions/min_length": 7.0,
|
|
"completions/min_terminated_length": 7.0,
|
|
"epoch": 0.1755,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 5.789770603179932,
|
|
"kl": 2.8316257670521736,
|
|
"learning_rate": 1.6666666666666667e-05,
|
|
"loss": 0.1133,
|
|
"num_tokens": 868810.0,
|
|
"reward": -1.0804004669189453,
|
|
"reward_std": 2.5841238498687744,
|
|
"rewards/action_legal/mean": -0.25,
|
|
"rewards/action_legal/std": 0.5,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": -0.5077669620513916,
|
|
"rewards/env_reward/std": 1.6645185947418213,
|
|
"rewards/format_valid/mean": -0.125,
|
|
"rewards/format_valid/std": 1.25,
|
|
"step": 351
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 21.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 21.0,
|
|
"completions/max_terminated_length": 21.0,
|
|
"completions/mean_length": 12.75,
|
|
"completions/mean_terminated_length": 12.75,
|
|
"completions/min_length": 7.0,
|
|
"completions/min_terminated_length": 7.0,
|
|
"epoch": 0.176,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 4.470717430114746,
|
|
"kl": 1.4938061088323593,
|
|
"learning_rate": 1.655555555555556e-05,
|
|
"loss": 0.0598,
|
|
"num_tokens": 871261.0,
|
|
"reward": 0.675422191619873,
|
|
"reward_std": 0.3987497091293335,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": 0.6336148381233215,
|
|
"rewards/env_reward/std": 0.26583316922187805,
|
|
"rewards/format_valid/mean": 0.5,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 352
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 19.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 19.0,
|
|
"completions/max_terminated_length": 19.0,
|
|
"completions/mean_length": 15.0,
|
|
"completions/mean_terminated_length": 15.0,
|
|
"completions/min_length": 9.0,
|
|
"completions/min_terminated_length": 9.0,
|
|
"epoch": 0.1765,
|
|
"frac_reward_zero_std": 1.0,
|
|
"grad_norm": 0.12403933703899384,
|
|
"kl": 1.361063152551651,
|
|
"learning_rate": 1.6444444444444447e-05,
|
|
"loss": 0.0544,
|
|
"num_tokens": 873721.0,
|
|
"reward": 0.8822908997535706,
|
|
"reward_std": 0.0,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": 0.7715272903442383,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 0.5,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 353
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 8.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 8.0,
|
|
"completions/max_terminated_length": 8.0,
|
|
"completions/mean_length": 8.0,
|
|
"completions/mean_terminated_length": 8.0,
|
|
"completions/min_length": 8.0,
|
|
"completions/min_terminated_length": 8.0,
|
|
"epoch": 0.177,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 1.7063394784927368,
|
|
"kl": 0.3987312912940979,
|
|
"learning_rate": 1.6333333333333335e-05,
|
|
"loss": 0.0159,
|
|
"num_tokens": 876153.0,
|
|
"reward": 0.5504003167152405,
|
|
"reward_std": 0.21252882480621338,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.09899382293224335,
|
|
"rewards/belief_accuracy/std": 0.03703703358769417,
|
|
"rewards/env_reward/mean": 0.5315878391265869,
|
|
"rewards/env_reward/std": 0.06761179864406586,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 354
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.25,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 15.0,
|
|
"completions/mean_length": 16.0,
|
|
"completions/mean_terminated_length": 10.666666984558105,
|
|
"completions/min_length": 7.0,
|
|
"completions/min_terminated_length": 7.0,
|
|
"epoch": 0.1775,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 1.8553385734558105,
|
|
"kl": 1.883504644036293,
|
|
"learning_rate": 1.6222222222222223e-05,
|
|
"loss": 0.0753,
|
|
"num_tokens": 878617.0,
|
|
"reward": 1.5724425315856934,
|
|
"reward_std": 0.34154796600341797,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.06390867382287979,
|
|
"rewards/belief_accuracy/std": 0.07218265533447266,
|
|
"rewards/env_reward/mean": 1.1552791595458984,
|
|
"rewards/env_reward/std": 0.07499998807907104,
|
|
"rewards/format_valid/mean": 0.625,
|
|
"rewards/format_valid/std": 0.25,
|
|
"step": 355
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.25,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 18.0,
|
|
"completions/mean_length": 18.0,
|
|
"completions/mean_terminated_length": 13.333333969116211,
|
|
"completions/min_length": 9.0,
|
|
"completions/min_terminated_length": 9.0,
|
|
"epoch": 0.178,
|
|
"frac_reward_zero_std": 1.0,
|
|
"grad_norm": 12.448164939880371,
|
|
"kl": 2.9188559651374817,
|
|
"learning_rate": 1.6111111111111115e-05,
|
|
"loss": 0.1168,
|
|
"num_tokens": 881089.0,
|
|
"reward": 0.1419695019721985,
|
|
"reward_std": 0.0,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": 0.27797967195510864,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 0.5,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 356
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 20.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 20.0,
|
|
"completions/max_terminated_length": 20.0,
|
|
"completions/mean_length": 16.5,
|
|
"completions/mean_terminated_length": 16.5,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 0.1785,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 4.100508213043213,
|
|
"kl": 1.9373711496591568,
|
|
"learning_rate": 1.6000000000000003e-05,
|
|
"loss": 0.0775,
|
|
"num_tokens": 883555.0,
|
|
"reward": 0.9330868721008301,
|
|
"reward_std": 0.1425846964120865,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.07523864507675171,
|
|
"rewards/belief_accuracy/std": 0.04427932947874069,
|
|
"rewards/env_reward/mean": 0.747535228729248,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 0.75,
|
|
"rewards/format_valid/std": 0.28867512941360474,
|
|
"step": 357
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.25,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 28.0,
|
|
"completions/mean_length": 22.0,
|
|
"completions/mean_terminated_length": 18.666667938232422,
|
|
"completions/min_length": 9.0,
|
|
"completions/min_terminated_length": 9.0,
|
|
"epoch": 0.179,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 2.447638750076294,
|
|
"kl": 1.3387613743543625,
|
|
"learning_rate": 1.588888888888889e-05,
|
|
"loss": 0.0536,
|
|
"num_tokens": 886043.0,
|
|
"reward": -0.9939883947372437,
|
|
"reward_std": 2.637341022491455,
|
|
"rewards/action_legal/mean": -0.25,
|
|
"rewards/action_legal/std": 0.5,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": -0.450158953666687,
|
|
"rewards/env_reward/std": 1.6998939514160156,
|
|
"rewards/format_valid/mean": -0.125,
|
|
"rewards/format_valid/std": 1.25,
|
|
"step": 358
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 17.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 17.0,
|
|
"completions/max_terminated_length": 17.0,
|
|
"completions/mean_length": 15.0,
|
|
"completions/mean_terminated_length": 15.0,
|
|
"completions/min_length": 14.0,
|
|
"completions/min_terminated_length": 14.0,
|
|
"epoch": 0.1795,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 3.1080024242401123,
|
|
"kl": 1.570095743983984,
|
|
"learning_rate": 1.577777777777778e-05,
|
|
"loss": 0.0628,
|
|
"num_tokens": 888503.0,
|
|
"reward": 0.31804150342941284,
|
|
"reward_std": 0.24924513697624207,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.07387004047632217,
|
|
"rewards/belief_accuracy/std": 0.05280756205320358,
|
|
"rewards/env_reward/mean": 0.3264344036579132,
|
|
"rewards/env_reward/std": 0.07499998807907104,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 359
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 11.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 11.0,
|
|
"completions/max_terminated_length": 11.0,
|
|
"completions/mean_length": 8.25,
|
|
"completions/mean_terminated_length": 8.25,
|
|
"completions/min_length": 7.0,
|
|
"completions/min_terminated_length": 7.0,
|
|
"epoch": 0.18,
|
|
"frac_reward_zero_std": 1.0,
|
|
"grad_norm": 0.21039718389511108,
|
|
"kl": 1.6601714193820953,
|
|
"learning_rate": 1.5666666666666667e-05,
|
|
"loss": 0.0664,
|
|
"num_tokens": 890936.0,
|
|
"reward": 1.1850183010101318,
|
|
"reward_std": 0.0,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": 0.9733456373214722,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 0.5,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 360
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.25,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 20.0,
|
|
"completions/mean_length": 19.75,
|
|
"completions/mean_terminated_length": 15.666666984558105,
|
|
"completions/min_length": 13.0,
|
|
"completions/min_terminated_length": 13.0,
|
|
"epoch": 0.1805,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 4.855799674987793,
|
|
"kl": 2.307783365249634,
|
|
"learning_rate": 1.5555555555555555e-05,
|
|
"loss": 0.0923,
|
|
"num_tokens": 893415.0,
|
|
"reward": -0.09686481952667236,
|
|
"reward_std": 3.240438222885132,
|
|
"rewards/action_legal/mean": -0.25,
|
|
"rewards/action_legal/std": 0.5,
|
|
"rewards/belief_accuracy/mean": -0.10740740597248077,
|
|
"rewards/belief_accuracy/std": 0.014814812690019608,
|
|
"rewards/env_reward/mean": 0.1585715115070343,
|
|
"rewards/env_reward/std": 2.109255075454712,
|
|
"rewards/format_valid/mean": 0.0,
|
|
"rewards/format_valid/std": 1.3540064096450806,
|
|
"step": 361
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 9.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 9.0,
|
|
"completions/max_terminated_length": 9.0,
|
|
"completions/mean_length": 8.25,
|
|
"completions/mean_terminated_length": 8.25,
|
|
"completions/min_length": 8.0,
|
|
"completions/min_terminated_length": 8.0,
|
|
"epoch": 0.181,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 2.866522789001465,
|
|
"kl": 1.5097930580377579,
|
|
"learning_rate": 1.5444444444444446e-05,
|
|
"loss": 0.0604,
|
|
"num_tokens": 895624.0,
|
|
"reward": 0.7403470277786255,
|
|
"reward_std": 0.04096466302871704,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.0,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": 0.46023130416870117,
|
|
"rewards/env_reward/std": 0.027309775352478027,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 362
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 18.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 18.0,
|
|
"completions/max_terminated_length": 18.0,
|
|
"completions/mean_length": 15.75,
|
|
"completions/mean_terminated_length": 15.75,
|
|
"completions/min_length": 14.0,
|
|
"completions/min_terminated_length": 14.0,
|
|
"epoch": 0.1815,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 3.6737067699432373,
|
|
"kl": 2.508372038602829,
|
|
"learning_rate": 1.5333333333333334e-05,
|
|
"loss": 0.1003,
|
|
"num_tokens": 898087.0,
|
|
"reward": 0.41143205761909485,
|
|
"reward_std": 0.1285679042339325,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.0962962955236435,
|
|
"rewards/belief_accuracy/std": 0.06063224375247955,
|
|
"rewards/env_reward/mean": 0.44188064336776733,
|
|
"rewards/env_reward/std": 0.07500001788139343,
|
|
"rewards/format_valid/mean": 0.75,
|
|
"rewards/format_valid/std": 0.28867512941360474,
|
|
"step": 363
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.5,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 23.0,
|
|
"completions/mean_length": 26.75,
|
|
"completions/mean_terminated_length": 21.5,
|
|
"completions/min_length": 20.0,
|
|
"completions/min_terminated_length": 20.0,
|
|
"epoch": 0.182,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 3.876965284347534,
|
|
"kl": 1.120678260922432,
|
|
"learning_rate": 1.5222222222222224e-05,
|
|
"loss": 0.0448,
|
|
"num_tokens": 900594.0,
|
|
"reward": -0.9049590826034546,
|
|
"reward_std": 2.696809768676758,
|
|
"rewards/action_legal/mean": -0.25,
|
|
"rewards/action_legal/std": 0.5,
|
|
"rewards/belief_accuracy/mean": -0.10109934210777283,
|
|
"rewards/belief_accuracy/std": 0.002198692411184311,
|
|
"rewards/env_reward/mean": -0.3927740752696991,
|
|
"rewards/env_reward/std": 1.7381988763809204,
|
|
"rewards/format_valid/mean": 0.0,
|
|
"rewards/format_valid/std": 1.3540064096450806,
|
|
"step": 364
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 17.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 17.0,
|
|
"completions/max_terminated_length": 17.0,
|
|
"completions/mean_length": 11.75,
|
|
"completions/mean_terminated_length": 11.75,
|
|
"completions/min_length": 7.0,
|
|
"completions/min_terminated_length": 7.0,
|
|
"epoch": 0.1825,
|
|
"frac_reward_zero_std": 1.0,
|
|
"grad_norm": 0.1522875726222992,
|
|
"kl": 2.180483788251877,
|
|
"learning_rate": 1.5111111111111112e-05,
|
|
"loss": 0.0872,
|
|
"num_tokens": 903041.0,
|
|
"reward": -0.1322389543056488,
|
|
"reward_std": 0.0,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": 0.09517402946949005,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 0.5,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 365
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 26.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 26.0,
|
|
"completions/max_terminated_length": 26.0,
|
|
"completions/mean_length": 12.5,
|
|
"completions/mean_terminated_length": 12.5,
|
|
"completions/min_length": 8.0,
|
|
"completions/min_terminated_length": 8.0,
|
|
"epoch": 0.183,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 1.5857621431350708,
|
|
"kl": 1.8570825457572937,
|
|
"learning_rate": 1.5e-05,
|
|
"loss": 0.0743,
|
|
"num_tokens": 905267.0,
|
|
"reward": 0.7447291612625122,
|
|
"reward_std": 0.43247494101524353,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.0,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": 0.46315279603004456,
|
|
"rewards/env_reward/std": 0.28831663727760315,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 366
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.25,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 23.0,
|
|
"completions/mean_length": 23.5,
|
|
"completions/mean_terminated_length": 20.666667938232422,
|
|
"completions/min_length": 17.0,
|
|
"completions/min_terminated_length": 17.0,
|
|
"epoch": 0.1835,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 3.6089088916778564,
|
|
"kl": 1.4628183841705322,
|
|
"learning_rate": 1.4888888888888888e-05,
|
|
"loss": 0.0585,
|
|
"num_tokens": 907761.0,
|
|
"reward": -0.8686517477035522,
|
|
"reward_std": 2.723708152770996,
|
|
"rewards/action_legal/mean": -0.25,
|
|
"rewards/action_legal/std": 0.5,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": -0.36660122871398926,
|
|
"rewards/env_reward/std": 1.757534384727478,
|
|
"rewards/format_valid/mean": -0.125,
|
|
"rewards/format_valid/std": 1.25,
|
|
"step": 367
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.5,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 17.0,
|
|
"completions/mean_length": 22.5,
|
|
"completions/mean_terminated_length": 13.0,
|
|
"completions/min_length": 9.0,
|
|
"completions/min_terminated_length": 9.0,
|
|
"epoch": 0.184,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 3.5371828079223633,
|
|
"kl": 1.56082084774971,
|
|
"learning_rate": 1.477777777777778e-05,
|
|
"loss": 0.0624,
|
|
"num_tokens": 910251.0,
|
|
"reward": -0.38402676582336426,
|
|
"reward_std": 0.11692270636558533,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": -0.07268451154232025,
|
|
"rewards/env_reward/std": 0.07794848084449768,
|
|
"rewards/format_valid/mean": 0.5,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 368
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.5,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 14.0,
|
|
"completions/mean_length": 22.25,
|
|
"completions/mean_terminated_length": 12.5,
|
|
"completions/min_length": 11.0,
|
|
"completions/min_terminated_length": 11.0,
|
|
"epoch": 0.1845,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 3.959965944290161,
|
|
"kl": 2.1547632068395615,
|
|
"learning_rate": 1.4666666666666668e-05,
|
|
"loss": 0.0862,
|
|
"num_tokens": 912740.0,
|
|
"reward": 2.330038070678711,
|
|
"reward_std": 1.4076868295669556,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": 1.736691951751709,
|
|
"rewards/env_reward/std": 0.9384578466415405,
|
|
"rewards/format_valid/mean": 0.5,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 369
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 25.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 25.0,
|
|
"completions/max_terminated_length": 25.0,
|
|
"completions/mean_length": 19.25,
|
|
"completions/mean_terminated_length": 19.25,
|
|
"completions/min_length": 14.0,
|
|
"completions/min_terminated_length": 14.0,
|
|
"epoch": 0.185,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 4.676491737365723,
|
|
"kl": 1.9522491097450256,
|
|
"learning_rate": 1.4555555555555556e-05,
|
|
"loss": 0.0781,
|
|
"num_tokens": 915217.0,
|
|
"reward": 0.3270930051803589,
|
|
"reward_std": 0.4210644066333771,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.1784050613641739,
|
|
"rewards/belief_accuracy/std": 0.11000154912471771,
|
|
"rewards/env_reward/mean": 0.5498721599578857,
|
|
"rewards/env_reward/std": 0.2555084824562073,
|
|
"rewards/format_valid/mean": 0.75,
|
|
"rewards/format_valid/std": 0.28867512941360474,
|
|
"step": 370
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 24.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 24.0,
|
|
"completions/max_terminated_length": 24.0,
|
|
"completions/mean_length": 15.5,
|
|
"completions/mean_terminated_length": 15.5,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 0.1855,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 3.2627854347229004,
|
|
"kl": 1.2735976241528988,
|
|
"learning_rate": 1.4444444444444444e-05,
|
|
"loss": 0.0509,
|
|
"num_tokens": 917679.0,
|
|
"reward": -1.3109471797943115,
|
|
"reward_std": 2.4330153465270996,
|
|
"rewards/action_legal/mean": -0.25,
|
|
"rewards/action_legal/std": 0.5,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": -0.6614648103713989,
|
|
"rewards/env_reward/std": 1.5638506412506104,
|
|
"rewards/format_valid/mean": -0.125,
|
|
"rewards/format_valid/std": 1.25,
|
|
"step": 371
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.5,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 14.0,
|
|
"completions/mean_length": 21.75,
|
|
"completions/mean_terminated_length": 11.5,
|
|
"completions/min_length": 9.0,
|
|
"completions/min_terminated_length": 9.0,
|
|
"epoch": 0.186,
|
|
"frac_reward_zero_std": 1.0,
|
|
"grad_norm": 0.16428321599960327,
|
|
"kl": 1.3368073627352715,
|
|
"learning_rate": 1.4333333333333334e-05,
|
|
"loss": 0.0535,
|
|
"num_tokens": 920166.0,
|
|
"reward": 1.3904471397399902,
|
|
"reward_std": 0.0,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": 1.1102981567382812,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 0.5,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 372
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 9.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 9.0,
|
|
"completions/max_terminated_length": 9.0,
|
|
"completions/mean_length": 8.25,
|
|
"completions/mean_terminated_length": 8.25,
|
|
"completions/min_length": 8.0,
|
|
"completions/min_terminated_length": 8.0,
|
|
"epoch": 0.1865,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 2.8244247436523438,
|
|
"kl": 1.7505681961774826,
|
|
"learning_rate": 1.4222222222222224e-05,
|
|
"loss": 0.07,
|
|
"num_tokens": 922599.0,
|
|
"reward": -0.15878620743751526,
|
|
"reward_std": 0.2896662950515747,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.21296297013759613,
|
|
"rewards/belief_accuracy/std": 0.055555559694767,
|
|
"rewards/env_reward/mean": 0.2867351174354553,
|
|
"rewards/env_reward/std": 0.0973709374666214,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 373
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.25,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 7.0,
|
|
"completions/mean_length": 13.25,
|
|
"completions/mean_terminated_length": 7.0,
|
|
"completions/min_length": 7.0,
|
|
"completions/min_terminated_length": 7.0,
|
|
"epoch": 0.187,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 1.4690308570861816,
|
|
"kl": 1.8018342107534409,
|
|
"learning_rate": 1.4111111111111112e-05,
|
|
"loss": 0.0721,
|
|
"num_tokens": 925052.0,
|
|
"reward": -0.061834536492824554,
|
|
"reward_std": 0.3926420509815216,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": 0.14211031794548035,
|
|
"rewards/env_reward/std": 0.2617613673210144,
|
|
"rewards/format_valid/mean": 0.5,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 374
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.25,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 20.0,
|
|
"completions/mean_length": 19.25,
|
|
"completions/mean_terminated_length": 15.0,
|
|
"completions/min_length": 7.0,
|
|
"completions/min_terminated_length": 7.0,
|
|
"epoch": 0.1875,
|
|
"frac_reward_zero_std": 1.0,
|
|
"grad_norm": 0.06469111144542694,
|
|
"kl": 1.3878820985555649,
|
|
"learning_rate": 1.4000000000000001e-05,
|
|
"loss": 0.0555,
|
|
"num_tokens": 927529.0,
|
|
"reward": 1.96260666847229,
|
|
"reward_std": 0.0,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": 1.4917376041412354,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 0.5,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 375
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 13.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 13.0,
|
|
"completions/max_terminated_length": 13.0,
|
|
"completions/mean_length": 11.75,
|
|
"completions/mean_terminated_length": 11.75,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.188,
|
|
"frac_reward_zero_std": 1.0,
|
|
"grad_norm": 0.1889788955450058,
|
|
"kl": 2.4346917867660522,
|
|
"learning_rate": 1.388888888888889e-05,
|
|
"loss": 0.0974,
|
|
"num_tokens": 929976.0,
|
|
"reward": -0.43010014295578003,
|
|
"reward_std": 0.0,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": -0.1034000813961029,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 0.5,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 376
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.25,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 22.0,
|
|
"completions/mean_length": 19.0,
|
|
"completions/mean_terminated_length": 14.666666984558105,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.1885,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 3.5201666355133057,
|
|
"kl": 1.3663449361920357,
|
|
"learning_rate": 1.3777777777777778e-05,
|
|
"loss": 0.0547,
|
|
"num_tokens": 932452.0,
|
|
"reward": 0.06843796372413635,
|
|
"reward_std": 0.21172499656677246,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": 0.2289586365222931,
|
|
"rewards/env_reward/std": 0.14114999771118164,
|
|
"rewards/format_valid/mean": 0.5,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 377
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.75,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 26.0,
|
|
"completions/mean_length": 30.5,
|
|
"completions/mean_terminated_length": 26.0,
|
|
"completions/min_length": 26.0,
|
|
"completions/min_terminated_length": 26.0,
|
|
"epoch": 0.189,
|
|
"frac_reward_zero_std": 1.0,
|
|
"grad_norm": 0.14993655681610107,
|
|
"kl": 0.6987491399049759,
|
|
"learning_rate": 1.3666666666666666e-05,
|
|
"loss": 0.0279,
|
|
"num_tokens": 934974.0,
|
|
"reward": 1.5713868141174316,
|
|
"reward_std": 0.0,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": 1.2309246063232422,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 0.5,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 378
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.25,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 28.0,
|
|
"completions/mean_length": 20.5,
|
|
"completions/mean_terminated_length": 16.666667938232422,
|
|
"completions/min_length": 11.0,
|
|
"completions/min_terminated_length": 11.0,
|
|
"epoch": 0.1895,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 2.221292018890381,
|
|
"kl": 1.250510759651661,
|
|
"learning_rate": 1.3555555555555557e-05,
|
|
"loss": 0.05,
|
|
"num_tokens": 937456.0,
|
|
"reward": -1.592812180519104,
|
|
"reward_std": 2.260319471359253,
|
|
"rewards/action_legal/mean": -0.25,
|
|
"rewards/action_legal/std": 0.5,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": -0.8493747711181641,
|
|
"rewards/env_reward/std": 1.4491422176361084,
|
|
"rewards/format_valid/mean": -0.125,
|
|
"rewards/format_valid/std": 1.25,
|
|
"step": 379
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.25,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 13.0,
|
|
"completions/mean_length": 17.25,
|
|
"completions/mean_terminated_length": 12.333333969116211,
|
|
"completions/min_length": 11.0,
|
|
"completions/min_terminated_length": 11.0,
|
|
"epoch": 0.19,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 2.452324628829956,
|
|
"kl": 1.871617242693901,
|
|
"learning_rate": 1.3444444444444445e-05,
|
|
"loss": 0.0749,
|
|
"num_tokens": 939925.0,
|
|
"reward": -1.566220998764038,
|
|
"reward_std": 2.2558834552764893,
|
|
"rewards/action_legal/mean": -0.25,
|
|
"rewards/action_legal/std": 0.5,
|
|
"rewards/belief_accuracy/mean": -0.1041666716337204,
|
|
"rewards/belief_accuracy/std": 0.008333333767950535,
|
|
"rewards/env_reward/mean": -0.8274806141853333,
|
|
"rewards/env_reward/std": 1.4483462572097778,
|
|
"rewards/format_valid/mean": 0.0,
|
|
"rewards/format_valid/std": 1.3540064096450806,
|
|
"step": 380
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.25,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 25.0,
|
|
"completions/mean_length": 22.75,
|
|
"completions/mean_terminated_length": 19.666667938232422,
|
|
"completions/min_length": 16.0,
|
|
"completions/min_terminated_length": 16.0,
|
|
"epoch": 0.1905,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 2.3323750495910645,
|
|
"kl": 1.2848467752337456,
|
|
"learning_rate": 1.3333333333333333e-05,
|
|
"loss": 0.0514,
|
|
"num_tokens": 942416.0,
|
|
"reward": -0.6763423681259155,
|
|
"reward_std": 2.849104881286621,
|
|
"rewards/action_legal/mean": -0.25,
|
|
"rewards/action_legal/std": 0.5,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": -0.23839491605758667,
|
|
"rewards/env_reward/std": 1.8410701751708984,
|
|
"rewards/format_valid/mean": -0.125,
|
|
"rewards/format_valid/std": 1.25,
|
|
"step": 381
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.25,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 23.0,
|
|
"completions/mean_length": 20.75,
|
|
"completions/mean_terminated_length": 17.0,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.191,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 2.2046873569488525,
|
|
"kl": 1.6044066846370697,
|
|
"learning_rate": 1.3222222222222221e-05,
|
|
"loss": 0.0642,
|
|
"num_tokens": 944899.0,
|
|
"reward": -0.8772720694541931,
|
|
"reward_std": 2.7223305702209473,
|
|
"rewards/action_legal/mean": -0.25,
|
|
"rewards/action_legal/std": 0.5,
|
|
"rewards/belief_accuracy/mean": -0.11666667461395264,
|
|
"rewards/belief_accuracy/std": 0.03333333507180214,
|
|
"rewards/env_reward/mean": -0.3431813418865204,
|
|
"rewards/env_reward/std": 1.7810261249542236,
|
|
"rewards/format_valid/mean": 0.0,
|
|
"rewards/format_valid/std": 1.3540064096450806,
|
|
"step": 382
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.25,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 19.0,
|
|
"completions/mean_length": 19.75,
|
|
"completions/mean_terminated_length": 15.666666984558105,
|
|
"completions/min_length": 13.0,
|
|
"completions/min_terminated_length": 13.0,
|
|
"epoch": 0.1915,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 2.325809955596924,
|
|
"kl": 1.2760074064135551,
|
|
"learning_rate": 1.3111111111111113e-05,
|
|
"loss": 0.051,
|
|
"num_tokens": 947378.0,
|
|
"reward": 0.27768200635910034,
|
|
"reward_std": 0.5863049030303955,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": 0.3684546947479248,
|
|
"rewards/env_reward/std": 0.39086994528770447,
|
|
"rewards/format_valid/mean": 0.5,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 383
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 17.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 17.0,
|
|
"completions/max_terminated_length": 17.0,
|
|
"completions/mean_length": 10.5,
|
|
"completions/mean_terminated_length": 10.5,
|
|
"completions/min_length": 7.0,
|
|
"completions/min_terminated_length": 7.0,
|
|
"epoch": 0.192,
|
|
"frac_reward_zero_std": 1.0,
|
|
"grad_norm": 3.8383851051330566,
|
|
"kl": 2.990824520587921,
|
|
"learning_rate": 1.3000000000000001e-05,
|
|
"loss": 0.1196,
|
|
"num_tokens": 949820.0,
|
|
"reward": -0.10776805877685547,
|
|
"reward_std": 0.0,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": 0.1114879697561264,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 0.5,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 384
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.5,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 20.0,
|
|
"completions/mean_length": 25.75,
|
|
"completions/mean_terminated_length": 19.5,
|
|
"completions/min_length": 19.0,
|
|
"completions/min_terminated_length": 19.0,
|
|
"epoch": 0.1925,
|
|
"frac_reward_zero_std": 1.0,
|
|
"grad_norm": 0.13448657095432281,
|
|
"kl": 1.0713577568531036,
|
|
"learning_rate": 1.2888888888888889e-05,
|
|
"loss": 0.0429,
|
|
"num_tokens": 952323.0,
|
|
"reward": 1.157989263534546,
|
|
"reward_std": 0.0,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": 0.9553261399269104,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 0.5,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 385
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.5,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 15.0,
|
|
"completions/mean_length": 23.25,
|
|
"completions/mean_terminated_length": 14.5,
|
|
"completions/min_length": 14.0,
|
|
"completions/min_terminated_length": 14.0,
|
|
"epoch": 0.193,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 1.9450551271438599,
|
|
"kl": 0.9243371933698654,
|
|
"learning_rate": 1.2777777777777777e-05,
|
|
"loss": 0.037,
|
|
"num_tokens": 954816.0,
|
|
"reward": -0.5924662947654724,
|
|
"reward_std": 0.07494938373565674,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": -0.2116442173719406,
|
|
"rewards/env_reward/std": 0.049966249614953995,
|
|
"rewards/format_valid/mean": 0.5,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 386
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.25,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 27.0,
|
|
"completions/mean_length": 21.25,
|
|
"completions/mean_terminated_length": 17.666667938232422,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.1935,
|
|
"frac_reward_zero_std": 1.0,
|
|
"grad_norm": 0.15212330222129822,
|
|
"kl": 1.260749876499176,
|
|
"learning_rate": 1.2666666666666668e-05,
|
|
"loss": 0.0504,
|
|
"num_tokens": 957301.0,
|
|
"reward": 1.0907130241394043,
|
|
"reward_std": 0.0,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": 0.9104753136634827,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 0.5,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 387
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.25,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 21.0,
|
|
"completions/mean_length": 17.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 7.0,
|
|
"completions/min_terminated_length": 7.0,
|
|
"epoch": 0.194,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 3.1250762939453125,
|
|
"kl": 1.1979729011654854,
|
|
"learning_rate": 1.2555555555555557e-05,
|
|
"loss": 0.0479,
|
|
"num_tokens": 959769.0,
|
|
"reward": -1.0788286924362183,
|
|
"reward_std": 2.5807807445526123,
|
|
"rewards/action_legal/mean": -0.25,
|
|
"rewards/action_legal/std": 0.5,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": -0.5067191123962402,
|
|
"rewards/env_reward/std": 1.6621873378753662,
|
|
"rewards/format_valid/mean": -0.125,
|
|
"rewards/format_valid/std": 1.25,
|
|
"step": 388
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 14.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 14.0,
|
|
"completions/max_terminated_length": 14.0,
|
|
"completions/mean_length": 10.75,
|
|
"completions/mean_terminated_length": 10.75,
|
|
"completions/min_length": 7.0,
|
|
"completions/min_terminated_length": 7.0,
|
|
"epoch": 0.1945,
|
|
"frac_reward_zero_std": 1.0,
|
|
"grad_norm": 5.245925426483154,
|
|
"kl": 3.0835418105125427,
|
|
"learning_rate": 1.2444444444444445e-05,
|
|
"loss": 0.1233,
|
|
"num_tokens": 962212.0,
|
|
"reward": 0.7963976263999939,
|
|
"reward_std": 0.0,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": 0.7142651081085205,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 0.5,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 389
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 8.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 8.0,
|
|
"completions/max_terminated_length": 8.0,
|
|
"completions/mean_length": 8.0,
|
|
"completions/mean_terminated_length": 8.0,
|
|
"completions/min_length": 8.0,
|
|
"completions/min_terminated_length": 8.0,
|
|
"epoch": 0.195,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 1.65029776096344,
|
|
"kl": 0.22941425442695618,
|
|
"learning_rate": 1.2333333333333334e-05,
|
|
"loss": 0.0092,
|
|
"num_tokens": 964644.0,
|
|
"reward": -0.39757806062698364,
|
|
"reward_std": 0.055555522441864014,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.28703704476356506,
|
|
"rewards/belief_accuracy/std": 0.018518507480621338,
|
|
"rewards/env_reward/mean": 0.27568870782852173,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 390
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 8.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 8.0,
|
|
"completions/max_terminated_length": 8.0,
|
|
"completions/mean_length": 8.0,
|
|
"completions/mean_terminated_length": 8.0,
|
|
"completions/min_length": 8.0,
|
|
"completions/min_terminated_length": 8.0,
|
|
"epoch": 0.1955,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 2.8236050605773926,
|
|
"kl": 0.451984953135252,
|
|
"learning_rate": 1.2222222222222222e-05,
|
|
"loss": 0.0181,
|
|
"num_tokens": 967076.0,
|
|
"reward": 0.8589808344841003,
|
|
"reward_std": 0.166666641831398,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.1035659909248352,
|
|
"rewards/belief_accuracy/std": 0.0555555522441864,
|
|
"rewards/env_reward/mean": 0.7464525699615479,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 391
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.5,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 15.0,
|
|
"completions/mean_length": 22.75,
|
|
"completions/mean_terminated_length": 13.5,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 0.196,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 3.8952887058258057,
|
|
"kl": 1.2005406767129898,
|
|
"learning_rate": 1.2111111111111112e-05,
|
|
"loss": 0.048,
|
|
"num_tokens": 969567.0,
|
|
"reward": -0.6726483106613159,
|
|
"reward_std": 2.8525443077087402,
|
|
"rewards/action_legal/mean": -0.25,
|
|
"rewards/action_legal/std": 0.5,
|
|
"rewards/belief_accuracy/mean": -0.08888889104127884,
|
|
"rewards/belief_accuracy/std": 0.02222222276031971,
|
|
"rewards/env_reward/mean": -0.2623211145401001,
|
|
"rewards/env_reward/std": 1.8251193761825562,
|
|
"rewards/format_valid/mean": 0.0,
|
|
"rewards/format_valid/std": 1.3540064096450806,
|
|
"step": 392
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.25,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 25.0,
|
|
"completions/mean_length": 20.5,
|
|
"completions/mean_terminated_length": 16.666667938232422,
|
|
"completions/min_length": 9.0,
|
|
"completions/min_terminated_length": 9.0,
|
|
"epoch": 0.1965,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 2.469162702560425,
|
|
"kl": 1.197481319308281,
|
|
"learning_rate": 1.2e-05,
|
|
"loss": 0.0479,
|
|
"num_tokens": 972049.0,
|
|
"reward": 0.8975731134414673,
|
|
"reward_std": 0.051388900727033615,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.09351852536201477,
|
|
"rewards/belief_accuracy/std": 0.012962963432073593,
|
|
"rewards/env_reward/mean": 0.7645858526229858,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 0.625,
|
|
"rewards/format_valid/std": 0.25,
|
|
"step": 393
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 20.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 20.0,
|
|
"completions/max_terminated_length": 20.0,
|
|
"completions/mean_length": 17.5,
|
|
"completions/mean_terminated_length": 17.5,
|
|
"completions/min_length": 14.0,
|
|
"completions/min_terminated_length": 14.0,
|
|
"epoch": 0.197,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 2.2335853576660156,
|
|
"kl": 1.0094499439001083,
|
|
"learning_rate": 1.188888888888889e-05,
|
|
"loss": 0.0404,
|
|
"num_tokens": 974519.0,
|
|
"reward": 1.5189111232757568,
|
|
"reward_std": 1.0298280715942383,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.07871220260858536,
|
|
"rewards/belief_accuracy/std": 0.022310344502329826,
|
|
"rewards/env_reward/mean": 0.8218495845794678,
|
|
"rewards/env_reward/std": 0.7057132720947266,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 394
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.25,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 27.0,
|
|
"completions/mean_length": 21.25,
|
|
"completions/mean_terminated_length": 17.666667938232422,
|
|
"completions/min_length": 8.0,
|
|
"completions/min_terminated_length": 8.0,
|
|
"epoch": 0.1975,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 2.5887300968170166,
|
|
"kl": 1.6546301878988743,
|
|
"learning_rate": 1.1777777777777778e-05,
|
|
"loss": 0.0662,
|
|
"num_tokens": 977004.0,
|
|
"reward": 0.33133554458618164,
|
|
"reward_std": 3.520890235900879,
|
|
"rewards/action_legal/mean": -0.25,
|
|
"rewards/action_legal/std": 0.5,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": 0.4333902597427368,
|
|
"rewards/env_reward/std": 2.288926839828491,
|
|
"rewards/format_valid/mean": -0.125,
|
|
"rewards/format_valid/std": 1.25,
|
|
"step": 395
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 11.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 11.0,
|
|
"completions/max_terminated_length": 11.0,
|
|
"completions/mean_length": 9.0,
|
|
"completions/mean_terminated_length": 9.0,
|
|
"completions/min_length": 8.0,
|
|
"completions/min_terminated_length": 8.0,
|
|
"epoch": 0.198,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 2.219426155090332,
|
|
"kl": 1.7353740334510803,
|
|
"learning_rate": 1.1666666666666668e-05,
|
|
"loss": 0.0694,
|
|
"num_tokens": 979440.0,
|
|
"reward": 0.5393995642662048,
|
|
"reward_std": 0.5616854429244995,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.1666666716337204,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": 0.6595997214317322,
|
|
"rewards/env_reward/std": 0.3744569718837738,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 396
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 8.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 8.0,
|
|
"completions/max_terminated_length": 8.0,
|
|
"completions/mean_length": 8.0,
|
|
"completions/mean_terminated_length": 8.0,
|
|
"completions/min_length": 8.0,
|
|
"completions/min_terminated_length": 8.0,
|
|
"epoch": 0.1985,
|
|
"frac_reward_zero_std": 1.0,
|
|
"grad_norm": 0.01091479230672121,
|
|
"kl": 0.2686502933502197,
|
|
"learning_rate": 1.1555555555555556e-05,
|
|
"loss": 0.0107,
|
|
"num_tokens": 981872.0,
|
|
"reward": 0.04923933744430542,
|
|
"reward_std": 0.0,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.2777777910232544,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": 0.5550484657287598,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 397
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.25,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 14.0,
|
|
"completions/mean_length": 15.75,
|
|
"completions/mean_terminated_length": 10.333333969116211,
|
|
"completions/min_length": 8.0,
|
|
"completions/min_terminated_length": 8.0,
|
|
"epoch": 0.199,
|
|
"frac_reward_zero_std": 1.0,
|
|
"grad_norm": 0.06738461554050446,
|
|
"kl": 1.1423609554767609,
|
|
"learning_rate": 1.1444444444444446e-05,
|
|
"loss": 0.0457,
|
|
"num_tokens": 984335.0,
|
|
"reward": 0.34038877487182617,
|
|
"reward_std": 0.0,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": 0.4102592170238495,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 0.5,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 398
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.25,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 18.0,
|
|
"completions/mean_length": 19.5,
|
|
"completions/mean_terminated_length": 15.333333969116211,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 0.1995,
|
|
"frac_reward_zero_std": 1.0,
|
|
"grad_norm": 0.11339511722326279,
|
|
"kl": 1.6856607422232628,
|
|
"learning_rate": 1.1333333333333334e-05,
|
|
"loss": 0.0674,
|
|
"num_tokens": 986813.0,
|
|
"reward": 1.0687165260314941,
|
|
"reward_std": 0.0,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": 0.8958110809326172,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 0.5,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 399
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.25,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 23.0,
|
|
"completions/mean_length": 21.75,
|
|
"completions/mean_terminated_length": 18.33333396911621,
|
|
"completions/min_length": 15.0,
|
|
"completions/min_terminated_length": 15.0,
|
|
"epoch": 0.2,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 2.5888490676879883,
|
|
"kl": 1.7492893785238266,
|
|
"learning_rate": 1.1222222222222224e-05,
|
|
"loss": 0.07,
|
|
"num_tokens": 989300.0,
|
|
"reward": -0.39356112480163574,
|
|
"reward_std": 0.022566793486475945,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.08703704178333282,
|
|
"rewards/belief_accuracy/std": 0.021276157349348068,
|
|
"rewards/env_reward/mean": -0.11329999566078186,
|
|
"rewards/env_reward/std": 0.05000000074505806,
|
|
"rewards/format_valid/mean": 0.75,
|
|
"rewards/format_valid/std": 0.28867512941360474,
|
|
"step": 400
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.25,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 22.0,
|
|
"completions/mean_length": 19.25,
|
|
"completions/mean_terminated_length": 15.0,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.2005,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 1.996265172958374,
|
|
"kl": 1.6013427376747131,
|
|
"learning_rate": 1.1111111111111112e-05,
|
|
"loss": 0.0641,
|
|
"num_tokens": 991777.0,
|
|
"reward": -0.2611404061317444,
|
|
"reward_std": 0.13472223281860352,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.07962962985038757,
|
|
"rewards/belief_accuracy/std": 0.04074074327945709,
|
|
"rewards/env_reward/mean": -0.035667672753334045,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 0.625,
|
|
"rewards/format_valid/std": 0.25,
|
|
"step": 401
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.25,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 28.0,
|
|
"completions/mean_length": 24.25,
|
|
"completions/mean_terminated_length": 21.666667938232422,
|
|
"completions/min_length": 13.0,
|
|
"completions/min_terminated_length": 13.0,
|
|
"epoch": 0.201,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 2.6548614501953125,
|
|
"kl": 1.0493617877364159,
|
|
"learning_rate": 1.1000000000000001e-05,
|
|
"loss": 0.042,
|
|
"num_tokens": 994274.0,
|
|
"reward": -0.802269697189331,
|
|
"reward_std": 2.765153408050537,
|
|
"rewards/action_legal/mean": -0.25,
|
|
"rewards/action_legal/std": 0.5,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": -0.3223464787006378,
|
|
"rewards/env_reward/std": 1.785102367401123,
|
|
"rewards/format_valid/mean": -0.125,
|
|
"rewards/format_valid/std": 1.25,
|
|
"step": 402
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.5,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 31.0,
|
|
"completions/mean_length": 27.75,
|
|
"completions/mean_terminated_length": 23.5,
|
|
"completions/min_length": 16.0,
|
|
"completions/min_terminated_length": 16.0,
|
|
"epoch": 0.2015,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 2.575714588165283,
|
|
"kl": 0.9777809828519821,
|
|
"learning_rate": 1.088888888888889e-05,
|
|
"loss": 0.0391,
|
|
"num_tokens": 996785.0,
|
|
"reward": 0.05824078619480133,
|
|
"reward_std": 0.025000005960464478,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.11666667461395264,
|
|
"rewards/belief_accuracy/std": 0.03333333507180214,
|
|
"rewards/env_reward/mean": 0.25132718682289124,
|
|
"rewards/env_reward/std": 0.07500001043081284,
|
|
"rewards/format_valid/mean": 0.625,
|
|
"rewards/format_valid/std": 0.25,
|
|
"step": 403
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 27.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 27.0,
|
|
"completions/max_terminated_length": 27.0,
|
|
"completions/mean_length": 14.5,
|
|
"completions/mean_terminated_length": 14.5,
|
|
"completions/min_length": 8.0,
|
|
"completions/min_terminated_length": 8.0,
|
|
"epoch": 0.202,
|
|
"frac_reward_zero_std": 1.0,
|
|
"grad_norm": 0.14137566089630127,
|
|
"kl": 1.6623368486762047,
|
|
"learning_rate": 1.0777777777777778e-05,
|
|
"loss": 0.0665,
|
|
"num_tokens": 999243.0,
|
|
"reward": 0.7642103433609009,
|
|
"reward_std": 0.0,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.1859063357114792,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": 0.8479529023170471,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 404
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.25,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 23.0,
|
|
"completions/mean_length": 18.5,
|
|
"completions/mean_terminated_length": 14.0,
|
|
"completions/min_length": 8.0,
|
|
"completions/min_terminated_length": 8.0,
|
|
"epoch": 0.2025,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 3.3691229820251465,
|
|
"kl": 1.845133326947689,
|
|
"learning_rate": 1.0666666666666667e-05,
|
|
"loss": 0.0738,
|
|
"num_tokens": 1001717.0,
|
|
"reward": -1.5100492238998413,
|
|
"reward_std": 2.2933003902435303,
|
|
"rewards/action_legal/mean": -0.25,
|
|
"rewards/action_legal/std": 0.5,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": -0.794199526309967,
|
|
"rewards/env_reward/std": 1.4705337285995483,
|
|
"rewards/format_valid/mean": -0.125,
|
|
"rewards/format_valid/std": 1.25,
|
|
"step": 405
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 25.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 25.0,
|
|
"completions/max_terminated_length": 25.0,
|
|
"completions/mean_length": 18.25,
|
|
"completions/mean_terminated_length": 18.25,
|
|
"completions/min_length": 15.0,
|
|
"completions/min_terminated_length": 15.0,
|
|
"epoch": 0.203,
|
|
"frac_reward_zero_std": 1.0,
|
|
"grad_norm": 0.1297660768032074,
|
|
"kl": 1.9939737766981125,
|
|
"learning_rate": 1.0555555555555555e-05,
|
|
"loss": 0.0798,
|
|
"num_tokens": 1004190.0,
|
|
"reward": 0.8893073201179504,
|
|
"reward_std": 0.0,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": 0.7762049436569214,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 0.5,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 406
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.5,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 26.0,
|
|
"completions/mean_length": 24.5,
|
|
"completions/mean_terminated_length": 17.0,
|
|
"completions/min_length": 8.0,
|
|
"completions/min_terminated_length": 8.0,
|
|
"epoch": 0.2035,
|
|
"frac_reward_zero_std": 1.0,
|
|
"grad_norm": 0.20842799544334412,
|
|
"kl": 1.046971783041954,
|
|
"learning_rate": 1.0444444444444445e-05,
|
|
"loss": 0.0419,
|
|
"num_tokens": 1006688.0,
|
|
"reward": 0.3194568157196045,
|
|
"reward_std": 0.0,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": 0.39630457758903503,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 0.5,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 407
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 8.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 8.0,
|
|
"completions/max_terminated_length": 8.0,
|
|
"completions/mean_length": 8.0,
|
|
"completions/mean_terminated_length": 8.0,
|
|
"completions/min_length": 8.0,
|
|
"completions/min_terminated_length": 8.0,
|
|
"epoch": 0.204,
|
|
"frac_reward_zero_std": 1.0,
|
|
"grad_norm": 0.02985587902367115,
|
|
"kl": 0.5202329754829407,
|
|
"learning_rate": 1.0333333333333333e-05,
|
|
"loss": 0.0208,
|
|
"num_tokens": 1008896.0,
|
|
"reward": 1.842806100845337,
|
|
"reward_std": 0.0,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.0,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": 1.1952041387557983,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 408
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.5,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 10.0,
|
|
"completions/mean_length": 20.5,
|
|
"completions/mean_terminated_length": 9.0,
|
|
"completions/min_length": 8.0,
|
|
"completions/min_terminated_length": 8.0,
|
|
"epoch": 0.2045,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 2.0121374130249023,
|
|
"kl": 1.4389217272400856,
|
|
"learning_rate": 1.0222222222222223e-05,
|
|
"loss": 0.0576,
|
|
"num_tokens": 1011378.0,
|
|
"reward": 0.06736606359481812,
|
|
"reward_std": 0.008096039295196533,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.10073399543762207,
|
|
"rewards/belief_accuracy/std": 0.0014679878950119019,
|
|
"rewards/env_reward/mean": 0.22554537653923035,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 0.625,
|
|
"rewards/format_valid/std": 0.25,
|
|
"step": 409
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 15.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 15.0,
|
|
"completions/max_terminated_length": 15.0,
|
|
"completions/mean_length": 10.0,
|
|
"completions/mean_terminated_length": 10.0,
|
|
"completions/min_length": 8.0,
|
|
"completions/min_terminated_length": 8.0,
|
|
"epoch": 0.205,
|
|
"frac_reward_zero_std": 1.0,
|
|
"grad_norm": 6.176392078399658,
|
|
"kl": 3.0382063947618008,
|
|
"learning_rate": 1.0111111111111111e-05,
|
|
"loss": 0.1215,
|
|
"num_tokens": 1013594.0,
|
|
"reward": 1.1761062145233154,
|
|
"reward_std": 0.0,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.0,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": 0.7507375478744507,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 410
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.5,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 21.0,
|
|
"completions/mean_length": 23.5,
|
|
"completions/mean_terminated_length": 15.0,
|
|
"completions/min_length": 9.0,
|
|
"completions/min_terminated_length": 9.0,
|
|
"epoch": 0.2055,
|
|
"frac_reward_zero_std": 1.0,
|
|
"grad_norm": 0.10501120984554291,
|
|
"kl": 1.108486369252205,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.0443,
|
|
"num_tokens": 1016088.0,
|
|
"reward": 1.6641442775726318,
|
|
"reward_std": 0.0,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": 1.2927628755569458,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 0.5,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 411
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.5,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 10.0,
|
|
"completions/mean_length": 20.75,
|
|
"completions/mean_terminated_length": 9.5,
|
|
"completions/min_length": 9.0,
|
|
"completions/min_terminated_length": 9.0,
|
|
"epoch": 0.206,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 3.7421345710754395,
|
|
"kl": 1.0211132764816284,
|
|
"learning_rate": 9.888888888888889e-06,
|
|
"loss": 0.0408,
|
|
"num_tokens": 1018571.0,
|
|
"reward": -0.005715020000934601,
|
|
"reward_std": 0.26651811599731445,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.05766364932060242,
|
|
"rewards/belief_accuracy/std": 0.084672711789608,
|
|
"rewards/env_reward/mean": 0.0906839445233345,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 0.625,
|
|
"rewards/format_valid/std": 0.25,
|
|
"step": 412
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.25,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 8.0,
|
|
"completions/mean_length": 14.0,
|
|
"completions/mean_terminated_length": 8.0,
|
|
"completions/min_length": 8.0,
|
|
"completions/min_terminated_length": 8.0,
|
|
"epoch": 0.2065,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 2.120497226715088,
|
|
"kl": 1.77897572144866,
|
|
"learning_rate": 9.777777777777779e-06,
|
|
"loss": 0.0712,
|
|
"num_tokens": 1020803.0,
|
|
"reward": -0.13511165976524353,
|
|
"reward_std": 0.4962805509567261,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.0,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": -0.12340778857469559,
|
|
"rewards/env_reward/std": 0.33085373044013977,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 413
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 8.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 8.0,
|
|
"completions/max_terminated_length": 8.0,
|
|
"completions/mean_length": 8.0,
|
|
"completions/mean_terminated_length": 8.0,
|
|
"completions/min_length": 8.0,
|
|
"completions/min_terminated_length": 8.0,
|
|
"epoch": 0.207,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 3.10685396194458,
|
|
"kl": 0.5586464405059814,
|
|
"learning_rate": 9.666666666666667e-06,
|
|
"loss": 0.0223,
|
|
"num_tokens": 1023235.0,
|
|
"reward": 0.6811659336090088,
|
|
"reward_std": 0.055555541068315506,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.1930294632911682,
|
|
"rewards/belief_accuracy/std": 0.018518514931201935,
|
|
"rewards/env_reward/mean": 0.8068363070487976,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 414
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.25,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 30.0,
|
|
"completions/mean_length": 19.75,
|
|
"completions/mean_terminated_length": 15.666666984558105,
|
|
"completions/min_length": 7.0,
|
|
"completions/min_terminated_length": 7.0,
|
|
"epoch": 0.2075,
|
|
"frac_reward_zero_std": 1.0,
|
|
"grad_norm": 0.09224820137023926,
|
|
"kl": 1.9065433144569397,
|
|
"learning_rate": 9.555555555555556e-06,
|
|
"loss": 0.0763,
|
|
"num_tokens": 1025714.0,
|
|
"reward": 0.590803325176239,
|
|
"reward_std": 0.0,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": 0.5772022604942322,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 0.5,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 415
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.25,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 21.0,
|
|
"completions/mean_length": 18.0,
|
|
"completions/mean_terminated_length": 13.333333969116211,
|
|
"completions/min_length": 9.0,
|
|
"completions/min_terminated_length": 9.0,
|
|
"epoch": 0.208,
|
|
"frac_reward_zero_std": 1.0,
|
|
"grad_norm": 0.12466186285018921,
|
|
"kl": 1.163508489727974,
|
|
"learning_rate": 9.444444444444445e-06,
|
|
"loss": 0.0465,
|
|
"num_tokens": 1028186.0,
|
|
"reward": 1.1605198383331299,
|
|
"reward_std": 0.0,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": 0.9570131897926331,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 0.5,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 416
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.25,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 32.0,
|
|
"completions/mean_length": 24.75,
|
|
"completions/mean_terminated_length": 22.33333396911621,
|
|
"completions/min_length": 15.0,
|
|
"completions/min_terminated_length": 15.0,
|
|
"epoch": 0.2085,
|
|
"frac_reward_zero_std": 1.0,
|
|
"grad_norm": 0.09261281788349152,
|
|
"kl": 1.1156965792179108,
|
|
"learning_rate": 9.333333333333334e-06,
|
|
"loss": 0.0446,
|
|
"num_tokens": 1030685.0,
|
|
"reward": -1.266386866569519,
|
|
"reward_std": 0.0,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": -0.6609245538711548,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 0.5,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 417
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 22.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 22.0,
|
|
"completions/max_terminated_length": 22.0,
|
|
"completions/mean_length": 13.5,
|
|
"completions/mean_terminated_length": 13.5,
|
|
"completions/min_length": 9.0,
|
|
"completions/min_terminated_length": 9.0,
|
|
"epoch": 0.209,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 3.7526919841766357,
|
|
"kl": 1.766396388411522,
|
|
"learning_rate": 9.222222222222222e-06,
|
|
"loss": 0.0707,
|
|
"num_tokens": 1033139.0,
|
|
"reward": 1.3138718605041504,
|
|
"reward_std": 0.20983462035655975,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.05388889089226723,
|
|
"rewards/belief_accuracy/std": 0.06600598990917206,
|
|
"rewards/env_reward/mean": 0.9586923718452454,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 0.75,
|
|
"rewards/format_valid/std": 0.28867512941360474,
|
|
"step": 418
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.25,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 18.0,
|
|
"completions/mean_length": 18.25,
|
|
"completions/mean_terminated_length": 13.666666984558105,
|
|
"completions/min_length": 11.0,
|
|
"completions/min_terminated_length": 11.0,
|
|
"epoch": 0.2095,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 2.7062177658081055,
|
|
"kl": 1.5106076151132584,
|
|
"learning_rate": 9.111111111111112e-06,
|
|
"loss": 0.0604,
|
|
"num_tokens": 1035612.0,
|
|
"reward": -0.5769614577293396,
|
|
"reward_std": 0.08749997615814209,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.11666667461395264,
|
|
"rewards/belief_accuracy/std": 0.03333333507180214,
|
|
"rewards/env_reward/mean": -0.1721409559249878,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 0.625,
|
|
"rewards/format_valid/std": 0.25,
|
|
"step": 419
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.25,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 15.5,
|
|
"completions/mean_terminated_length": 10.0,
|
|
"completions/min_length": 7.0,
|
|
"completions/min_terminated_length": 7.0,
|
|
"epoch": 0.21,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 2.923778772354126,
|
|
"kl": 1.7271861359477043,
|
|
"learning_rate": 9e-06,
|
|
"loss": 0.0691,
|
|
"num_tokens": 1038074.0,
|
|
"reward": 0.25312697887420654,
|
|
"reward_std": 0.15385065972805023,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.07644156366586685,
|
|
"rewards/belief_accuracy/std": 0.047116879373788834,
|
|
"rewards/env_reward/mean": 0.3008011281490326,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 0.625,
|
|
"rewards/format_valid/std": 0.25,
|
|
"step": 420
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 14.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 14.0,
|
|
"completions/max_terminated_length": 14.0,
|
|
"completions/mean_length": 10.5,
|
|
"completions/mean_terminated_length": 10.5,
|
|
"completions/min_length": 8.0,
|
|
"completions/min_terminated_length": 8.0,
|
|
"epoch": 0.2105,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 5.066768169403076,
|
|
"kl": 2.71373450756073,
|
|
"learning_rate": 8.88888888888889e-06,
|
|
"loss": 0.1085,
|
|
"num_tokens": 1040516.0,
|
|
"reward": -0.3499433994293213,
|
|
"reward_std": 3.066704273223877,
|
|
"rewards/action_legal/mean": -0.25,
|
|
"rewards/action_legal/std": 0.5,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": -0.020795553922653198,
|
|
"rewards/env_reward/std": 1.9861361980438232,
|
|
"rewards/format_valid/mean": -0.125,
|
|
"rewards/format_valid/std": 1.25,
|
|
"step": 421
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.25,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 25.0,
|
|
"completions/mean_length": 17.75,
|
|
"completions/mean_terminated_length": 13.0,
|
|
"completions/min_length": 6.0,
|
|
"completions/min_terminated_length": 6.0,
|
|
"epoch": 0.211,
|
|
"frac_reward_zero_std": 1.0,
|
|
"grad_norm": 0.10381243377923965,
|
|
"kl": 1.082069344818592,
|
|
"learning_rate": 8.777777777777778e-06,
|
|
"loss": 0.0433,
|
|
"num_tokens": 1042987.0,
|
|
"reward": -0.5439499616622925,
|
|
"reward_std": 0.0,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": -0.179299995303154,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 0.5,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 422
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.25,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 14.75,
|
|
"completions/mean_terminated_length": 9.0,
|
|
"completions/min_length": 6.0,
|
|
"completions/min_terminated_length": 6.0,
|
|
"epoch": 0.2115,
|
|
"frac_reward_zero_std": 1.0,
|
|
"grad_norm": 0.11354008316993713,
|
|
"kl": 1.3446319997310638,
|
|
"learning_rate": 8.666666666666668e-06,
|
|
"loss": 0.0538,
|
|
"num_tokens": 1045446.0,
|
|
"reward": -0.7405999898910522,
|
|
"reward_std": 0.0,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": -0.31040000915527344,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 0.5,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 423
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 31.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 31.0,
|
|
"completions/max_terminated_length": 31.0,
|
|
"completions/mean_length": 19.0,
|
|
"completions/mean_terminated_length": 19.0,
|
|
"completions/min_length": 11.0,
|
|
"completions/min_terminated_length": 11.0,
|
|
"epoch": 0.212,
|
|
"frac_reward_zero_std": 1.0,
|
|
"grad_norm": 0.14288963377475739,
|
|
"kl": 1.392886459827423,
|
|
"learning_rate": 8.555555555555556e-06,
|
|
"loss": 0.0557,
|
|
"num_tokens": 1047922.0,
|
|
"reward": 1.2015879154205322,
|
|
"reward_std": 0.0,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": 0.9843919277191162,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 0.5,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 424
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.5,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 14.0,
|
|
"completions/mean_length": 22.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.2125,
|
|
"frac_reward_zero_std": 1.0,
|
|
"grad_norm": 0.17405332624912262,
|
|
"kl": 1.4787537753582,
|
|
"learning_rate": 8.444444444444446e-06,
|
|
"loss": 0.0592,
|
|
"num_tokens": 1050410.0,
|
|
"reward": 2.462561845779419,
|
|
"reward_std": 0.0,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": 1.8250410556793213,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 0.5,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 425
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.25,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 20.0,
|
|
"completions/mean_length": 22.0,
|
|
"completions/mean_terminated_length": 18.666667938232422,
|
|
"completions/min_length": 17.0,
|
|
"completions/min_terminated_length": 17.0,
|
|
"epoch": 0.213,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 2.2080841064453125,
|
|
"kl": 1.0187080278992653,
|
|
"learning_rate": 8.333333333333334e-06,
|
|
"loss": 0.0407,
|
|
"num_tokens": 1052898.0,
|
|
"reward": 0.6277508735656738,
|
|
"reward_std": 0.5285455584526062,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.21416667103767395,
|
|
"rewards/belief_accuracy/std": 0.008333340287208557,
|
|
"rewards/env_reward/mean": 0.8135005235671997,
|
|
"rewards/env_reward/std": 0.3554683029651642,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 426
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 20.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 20.0,
|
|
"completions/max_terminated_length": 20.0,
|
|
"completions/mean_length": 14.0,
|
|
"completions/mean_terminated_length": 14.0,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.2135,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 3.2846176624298096,
|
|
"kl": 1.8003827184438705,
|
|
"learning_rate": 8.222222222222223e-06,
|
|
"loss": 0.072,
|
|
"num_tokens": 1055354.0,
|
|
"reward": -0.048611536622047424,
|
|
"reward_std": 0.3375000059604645,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.15833333134651184,
|
|
"rewards/belief_accuracy/std": 0.11666666716337204,
|
|
"rewards/env_reward/mean": 0.26342564821243286,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 0.625,
|
|
"rewards/format_valid/std": 0.25,
|
|
"step": 427
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.25,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 14.0,
|
|
"completions/mean_length": 18.25,
|
|
"completions/mean_terminated_length": 13.666666984558105,
|
|
"completions/min_length": 13.0,
|
|
"completions/min_terminated_length": 13.0,
|
|
"epoch": 0.214,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 1.872238278388977,
|
|
"kl": 1.2767575085163116,
|
|
"learning_rate": 8.111111111111112e-06,
|
|
"loss": 0.0511,
|
|
"num_tokens": 1057827.0,
|
|
"reward": -0.1957390159368515,
|
|
"reward_std": 0.0825425460934639,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.12407407909631729,
|
|
"rewards/belief_accuracy/std": 0.03164445981383324,
|
|
"rewards/env_reward/mean": 0.0926554724574089,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 0.75,
|
|
"rewards/format_valid/std": 0.28867512941360474,
|
|
"step": 428
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.25,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 24.0,
|
|
"completions/mean_length": 20.5,
|
|
"completions/mean_terminated_length": 16.666667938232422,
|
|
"completions/min_length": 9.0,
|
|
"completions/min_terminated_length": 9.0,
|
|
"epoch": 0.2145,
|
|
"frac_reward_zero_std": 1.0,
|
|
"grad_norm": 0.09198321402072906,
|
|
"kl": 1.0719245225191116,
|
|
"learning_rate": 8.000000000000001e-06,
|
|
"loss": 0.0429,
|
|
"num_tokens": 1060309.0,
|
|
"reward": -1.1536386013031006,
|
|
"reward_std": 0.0,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": -0.5857589840888977,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 0.5,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 429
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.25,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 26.0,
|
|
"completions/mean_length": 23.75,
|
|
"completions/mean_terminated_length": 21.0,
|
|
"completions/min_length": 17.0,
|
|
"completions/min_terminated_length": 17.0,
|
|
"epoch": 0.215,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 3.5375423431396484,
|
|
"kl": 1.4234507977962494,
|
|
"learning_rate": 7.88888888888889e-06,
|
|
"loss": 0.0569,
|
|
"num_tokens": 1062804.0,
|
|
"reward": -0.4885933995246887,
|
|
"reward_std": 2.974423408508301,
|
|
"rewards/action_legal/mean": -0.25,
|
|
"rewards/action_legal/std": 0.5,
|
|
"rewards/belief_accuracy/mean": -0.10740740597248077,
|
|
"rewards/belief_accuracy/std": 0.014814812690019608,
|
|
"rewards/env_reward/mean": -0.10258075594902039,
|
|
"rewards/env_reward/std": 1.9316128492355347,
|
|
"rewards/format_valid/mean": 0.0,
|
|
"rewards/format_valid/std": 1.3540064096450806,
|
|
"step": 430
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 11.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 11.0,
|
|
"completions/max_terminated_length": 11.0,
|
|
"completions/mean_length": 8.75,
|
|
"completions/mean_terminated_length": 8.75,
|
|
"completions/min_length": 8.0,
|
|
"completions/min_terminated_length": 8.0,
|
|
"epoch": 0.2155,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 3.5626556873321533,
|
|
"kl": 1.6539364457130432,
|
|
"learning_rate": 7.777777777777777e-06,
|
|
"loss": 0.0662,
|
|
"num_tokens": 1065015.0,
|
|
"reward": 1.4107515811920166,
|
|
"reward_std": 0.4393588900566101,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.0,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": 0.9071676731109619,
|
|
"rewards/env_reward/std": 0.2929059565067291,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 431
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 12.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 9.0,
|
|
"completions/mean_terminated_length": 9.0,
|
|
"completions/min_length": 8.0,
|
|
"completions/min_terminated_length": 8.0,
|
|
"epoch": 0.216,
|
|
"frac_reward_zero_std": 1.0,
|
|
"grad_norm": 0.38476383686065674,
|
|
"kl": 3.078851878643036,
|
|
"learning_rate": 7.666666666666667e-06,
|
|
"loss": 0.1232,
|
|
"num_tokens": 1067451.0,
|
|
"reward": 0.9827893376350403,
|
|
"reward_std": 0.0,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": 0.8385262489318848,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 0.5,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 432
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.5,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 13.0,
|
|
"completions/mean_length": 22.5,
|
|
"completions/mean_terminated_length": 13.0,
|
|
"completions/min_length": 13.0,
|
|
"completions/min_terminated_length": 13.0,
|
|
"epoch": 0.2165,
|
|
"frac_reward_zero_std": 1.0,
|
|
"grad_norm": 0.14623303711414337,
|
|
"kl": 1.1345348209142685,
|
|
"learning_rate": 7.555555555555556e-06,
|
|
"loss": 0.0454,
|
|
"num_tokens": 1069941.0,
|
|
"reward": 1.2882843017578125,
|
|
"reward_std": 0.0,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": 1.042189598083496,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 0.5,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 433
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 8.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 8.0,
|
|
"completions/max_terminated_length": 8.0,
|
|
"completions/mean_length": 8.0,
|
|
"completions/mean_terminated_length": 8.0,
|
|
"completions/min_length": 8.0,
|
|
"completions/min_terminated_length": 8.0,
|
|
"epoch": 0.217,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 2.191251754760742,
|
|
"kl": 1.4331532195210457,
|
|
"learning_rate": 7.444444444444444e-06,
|
|
"loss": 0.0573,
|
|
"num_tokens": 1072149.0,
|
|
"reward": 0.511351466178894,
|
|
"reward_std": 0.19517327845096588,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.0,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": 0.30756765604019165,
|
|
"rewards/env_reward/std": 0.13011550903320312,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 434
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.5,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 24.0,
|
|
"completions/mean_length": 26.0,
|
|
"completions/mean_terminated_length": 20.0,
|
|
"completions/min_length": 16.0,
|
|
"completions/min_terminated_length": 16.0,
|
|
"epoch": 0.2175,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 3.433861017227173,
|
|
"kl": 1.154853031039238,
|
|
"learning_rate": 7.333333333333334e-06,
|
|
"loss": 0.0462,
|
|
"num_tokens": 1074653.0,
|
|
"reward": 1.1034250259399414,
|
|
"reward_std": 0.7286821603775024,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.10551381856203079,
|
|
"rewards/belief_accuracy/std": 0.04305478185415268,
|
|
"rewards/env_reward/mean": 0.9174776673316956,
|
|
"rewards/env_reward/std": 0.4061686098575592,
|
|
"rewards/format_valid/mean": 0.875,
|
|
"rewards/format_valid/std": 0.25,
|
|
"step": 435
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.25,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 25.0,
|
|
"completions/mean_length": 22.5,
|
|
"completions/mean_terminated_length": 19.33333396911621,
|
|
"completions/min_length": 8.0,
|
|
"completions/min_terminated_length": 8.0,
|
|
"epoch": 0.218,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 4.4158477783203125,
|
|
"kl": 1.3408091366291046,
|
|
"learning_rate": 7.222222222222222e-06,
|
|
"loss": 0.0536,
|
|
"num_tokens": 1077143.0,
|
|
"reward": -1.1806925535202026,
|
|
"reward_std": 2.512871503829956,
|
|
"rewards/action_legal/mean": -0.25,
|
|
"rewards/action_legal/std": 0.5,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": -0.5746283531188965,
|
|
"rewards/env_reward/std": 1.6169143915176392,
|
|
"rewards/format_valid/mean": -0.125,
|
|
"rewards/format_valid/std": 1.25,
|
|
"step": 436
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.75,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 11.0,
|
|
"completions/mean_length": 26.75,
|
|
"completions/mean_terminated_length": 11.0,
|
|
"completions/min_length": 11.0,
|
|
"completions/min_terminated_length": 11.0,
|
|
"epoch": 0.2185,
|
|
"frac_reward_zero_std": 1.0,
|
|
"grad_norm": 0.19002826511859894,
|
|
"kl": 0.8864307105541229,
|
|
"learning_rate": 7.111111111111112e-06,
|
|
"loss": 0.0355,
|
|
"num_tokens": 1079650.0,
|
|
"reward": 0.4842817187309265,
|
|
"reward_std": 0.0,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": 0.5061878561973572,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 0.5,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 437
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.25,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 19.0,
|
|
"completions/mean_length": 18.75,
|
|
"completions/mean_terminated_length": 14.333333969116211,
|
|
"completions/min_length": 11.0,
|
|
"completions/min_terminated_length": 11.0,
|
|
"epoch": 0.219,
|
|
"frac_reward_zero_std": 1.0,
|
|
"grad_norm": 0.25234878063201904,
|
|
"kl": 1.5277681648731232,
|
|
"learning_rate": 7.000000000000001e-06,
|
|
"loss": 0.0611,
|
|
"num_tokens": 1082125.0,
|
|
"reward": 1.2354192733764648,
|
|
"reward_std": 0.0,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": 1.0069462060928345,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 0.5,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 438
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 19.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 19.0,
|
|
"completions/max_terminated_length": 19.0,
|
|
"completions/mean_length": 13.0,
|
|
"completions/mean_terminated_length": 13.0,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.2195,
|
|
"frac_reward_zero_std": 1.0,
|
|
"grad_norm": 2.109527587890625,
|
|
"kl": 2.2620955407619476,
|
|
"learning_rate": 6.888888888888889e-06,
|
|
"loss": 0.0905,
|
|
"num_tokens": 1084577.0,
|
|
"reward": -0.516443133354187,
|
|
"reward_std": 0.0,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": -0.16096210479736328,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 0.5,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 439
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.25,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 27.0,
|
|
"completions/mean_length": 23.25,
|
|
"completions/mean_terminated_length": 20.33333396911621,
|
|
"completions/min_length": 11.0,
|
|
"completions/min_terminated_length": 11.0,
|
|
"epoch": 0.22,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 3.669363021850586,
|
|
"kl": 1.6046917140483856,
|
|
"learning_rate": 6.777777777777779e-06,
|
|
"loss": 0.0642,
|
|
"num_tokens": 1087070.0,
|
|
"reward": 0.5990688800811768,
|
|
"reward_std": 0.2774999737739563,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.05583333224058151,
|
|
"rewards/belief_accuracy/std": 0.08833333104848862,
|
|
"rewards/env_reward/mean": 0.490212619304657,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 0.625,
|
|
"rewards/format_valid/std": 0.25,
|
|
"step": 440
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.25,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 23.0,
|
|
"completions/mean_length": 20.5,
|
|
"completions/mean_terminated_length": 16.666667938232422,
|
|
"completions/min_length": 8.0,
|
|
"completions/min_terminated_length": 8.0,
|
|
"epoch": 0.2205,
|
|
"frac_reward_zero_std": 1.0,
|
|
"grad_norm": 0.10930506885051727,
|
|
"kl": 1.5687530785799026,
|
|
"learning_rate": 6.666666666666667e-06,
|
|
"loss": 0.0628,
|
|
"num_tokens": 1089552.0,
|
|
"reward": 0.4188321828842163,
|
|
"reward_std": 0.0,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": 0.46255481243133545,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 0.5,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 441
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.5,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 17.0,
|
|
"completions/mean_length": 23.25,
|
|
"completions/mean_terminated_length": 14.5,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 0.221,
|
|
"frac_reward_zero_std": 1.0,
|
|
"grad_norm": 0.11199460923671722,
|
|
"kl": 1.3371323496103287,
|
|
"learning_rate": 6.555555555555556e-06,
|
|
"loss": 0.0535,
|
|
"num_tokens": 1092045.0,
|
|
"reward": 1.3014123439788818,
|
|
"reward_std": 0.0,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": 1.0509415864944458,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 0.5,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 442
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 22.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 22.0,
|
|
"completions/max_terminated_length": 22.0,
|
|
"completions/mean_length": 13.75,
|
|
"completions/mean_terminated_length": 13.75,
|
|
"completions/min_length": 8.0,
|
|
"completions/min_terminated_length": 8.0,
|
|
"epoch": 0.2215,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 2.5043163299560547,
|
|
"kl": 2.0148645490407944,
|
|
"learning_rate": 6.4444444444444445e-06,
|
|
"loss": 0.0806,
|
|
"num_tokens": 1094500.0,
|
|
"reward": -0.14020271599292755,
|
|
"reward_std": 0.3196193277835846,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.03164725750684738,
|
|
"rewards/belief_accuracy/std": 0.0914727970957756,
|
|
"rewards/env_reward/mean": -0.055173955857753754,
|
|
"rewards/env_reward/std": 0.07499999552965164,
|
|
"rewards/format_valid/mean": 0.75,
|
|
"rewards/format_valid/std": 0.28867512941360474,
|
|
"step": 443
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.25,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 20.0,
|
|
"completions/mean_length": 19.0,
|
|
"completions/mean_terminated_length": 14.666666984558105,
|
|
"completions/min_length": 8.0,
|
|
"completions/min_terminated_length": 8.0,
|
|
"epoch": 0.222,
|
|
"frac_reward_zero_std": 1.0,
|
|
"grad_norm": 0.22092895209789276,
|
|
"kl": 1.370383344590664,
|
|
"learning_rate": 6.333333333333334e-06,
|
|
"loss": 0.0548,
|
|
"num_tokens": 1096976.0,
|
|
"reward": 1.091750144958496,
|
|
"reward_std": 0.0,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": 0.911166787147522,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 0.5,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 444
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.25,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 13.0,
|
|
"completions/mean_length": 16.0,
|
|
"completions/mean_terminated_length": 10.666666984558105,
|
|
"completions/min_length": 8.0,
|
|
"completions/min_terminated_length": 8.0,
|
|
"epoch": 0.2225,
|
|
"frac_reward_zero_std": 1.0,
|
|
"grad_norm": 0.20150171220302582,
|
|
"kl": 1.3967806994915009,
|
|
"learning_rate": 6.222222222222222e-06,
|
|
"loss": 0.0559,
|
|
"num_tokens": 1099440.0,
|
|
"reward": 0.020131230354309082,
|
|
"reward_std": 0.0,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": 0.19675415754318237,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 0.5,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 445
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 25.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 25.0,
|
|
"completions/max_terminated_length": 25.0,
|
|
"completions/mean_length": 16.0,
|
|
"completions/mean_terminated_length": 16.0,
|
|
"completions/min_length": 9.0,
|
|
"completions/min_terminated_length": 9.0,
|
|
"epoch": 0.223,
|
|
"frac_reward_zero_std": 1.0,
|
|
"grad_norm": 0.15836045145988464,
|
|
"kl": 2.047119751572609,
|
|
"learning_rate": 6.111111111111111e-06,
|
|
"loss": 0.0819,
|
|
"num_tokens": 1101904.0,
|
|
"reward": 0.2909669280052185,
|
|
"reward_std": 0.0,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": 0.3773113191127777,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 0.5,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 446
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.5,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 24.0,
|
|
"completions/mean_length": 24.75,
|
|
"completions/mean_terminated_length": 17.5,
|
|
"completions/min_length": 11.0,
|
|
"completions/min_terminated_length": 11.0,
|
|
"epoch": 0.2235,
|
|
"frac_reward_zero_std": 1.0,
|
|
"grad_norm": 0.17508073151111603,
|
|
"kl": 1.1480788886547089,
|
|
"learning_rate": 6e-06,
|
|
"loss": 0.0459,
|
|
"num_tokens": 1104403.0,
|
|
"reward": 1.1857414245605469,
|
|
"reward_std": 0.0,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": 0.9738277196884155,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 0.5,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 447
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 21.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 21.0,
|
|
"completions/max_terminated_length": 21.0,
|
|
"completions/mean_length": 15.0,
|
|
"completions/mean_terminated_length": 15.0,
|
|
"completions/min_length": 9.0,
|
|
"completions/min_terminated_length": 9.0,
|
|
"epoch": 0.224,
|
|
"frac_reward_zero_std": 1.0,
|
|
"grad_norm": 0.20271523296833038,
|
|
"kl": 1.4859704226255417,
|
|
"learning_rate": 5.888888888888889e-06,
|
|
"loss": 0.0594,
|
|
"num_tokens": 1106863.0,
|
|
"reward": -0.151106059551239,
|
|
"reward_std": 0.0,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": 0.08259596675634384,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 0.5,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 448
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.25,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 30.0,
|
|
"completions/mean_length": 21.5,
|
|
"completions/mean_terminated_length": 18.0,
|
|
"completions/min_length": 7.0,
|
|
"completions/min_terminated_length": 7.0,
|
|
"epoch": 0.2245,
|
|
"frac_reward_zero_std": 1.0,
|
|
"grad_norm": 0.14578959345817566,
|
|
"kl": 1.7562050223350525,
|
|
"learning_rate": 5.777777777777778e-06,
|
|
"loss": 0.0702,
|
|
"num_tokens": 1109349.0,
|
|
"reward": -0.9775741100311279,
|
|
"reward_std": 0.0,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": -0.46838271617889404,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 0.5,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 449
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.25,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 21.0,
|
|
"completions/mean_length": 22.5,
|
|
"completions/mean_terminated_length": 19.33333396911621,
|
|
"completions/min_length": 17.0,
|
|
"completions/min_terminated_length": 17.0,
|
|
"epoch": 0.225,
|
|
"frac_reward_zero_std": 1.0,
|
|
"grad_norm": 0.16451287269592285,
|
|
"kl": 1.11443629860878,
|
|
"learning_rate": 5.666666666666667e-06,
|
|
"loss": 0.0446,
|
|
"num_tokens": 1111839.0,
|
|
"reward": 0.36123454570770264,
|
|
"reward_std": 0.0,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": 0.42415639758110046,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 0.5,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 450
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 18.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 18.0,
|
|
"completions/max_terminated_length": 18.0,
|
|
"completions/mean_length": 13.0,
|
|
"completions/mean_terminated_length": 13.0,
|
|
"completions/min_length": 7.0,
|
|
"completions/min_terminated_length": 7.0,
|
|
"epoch": 0.2255,
|
|
"frac_reward_zero_std": 1.0,
|
|
"grad_norm": 0.42332857847213745,
|
|
"kl": 1.8851738721132278,
|
|
"learning_rate": 5.555555555555556e-06,
|
|
"loss": 0.0754,
|
|
"num_tokens": 1114291.0,
|
|
"reward": 0.47187477350234985,
|
|
"reward_std": 0.0,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": 0.4979165196418762,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 0.5,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 451
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.75,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 11.0,
|
|
"completions/mean_length": 26.75,
|
|
"completions/mean_terminated_length": 11.0,
|
|
"completions/min_length": 11.0,
|
|
"completions/min_terminated_length": 11.0,
|
|
"epoch": 0.226,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 3.978724718093872,
|
|
"kl": 1.1868759840726852,
|
|
"learning_rate": 5.444444444444445e-06,
|
|
"loss": 0.0475,
|
|
"num_tokens": 1116798.0,
|
|
"reward": 1.1072639226913452,
|
|
"reward_std": 0.21830104291439056,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": 0.9215093851089478,
|
|
"rewards/env_reward/std": 0.14553406834602356,
|
|
"rewards/format_valid/mean": 0.5,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 452
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 23.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 23.0,
|
|
"completions/max_terminated_length": 23.0,
|
|
"completions/mean_length": 16.0,
|
|
"completions/mean_terminated_length": 16.0,
|
|
"completions/min_length": 8.0,
|
|
"completions/min_terminated_length": 8.0,
|
|
"epoch": 0.2265,
|
|
"frac_reward_zero_std": 1.0,
|
|
"grad_norm": 0.23165586590766907,
|
|
"kl": 1.9865920096635818,
|
|
"learning_rate": 5.333333333333334e-06,
|
|
"loss": 0.0795,
|
|
"num_tokens": 1119262.0,
|
|
"reward": -0.4410591125488281,
|
|
"reward_std": 0.0,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": -0.11070608347654343,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 0.5,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 453
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.5,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 20.0,
|
|
"completions/mean_length": 23.5,
|
|
"completions/mean_terminated_length": 15.0,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.227,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 3.738908290863037,
|
|
"kl": 1.3586938455700874,
|
|
"learning_rate": 5.2222222222222226e-06,
|
|
"loss": 0.0543,
|
|
"num_tokens": 1121756.0,
|
|
"reward": 0.3270404040813446,
|
|
"reward_std": 0.1799306422472,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": 0.40136027336120605,
|
|
"rewards/env_reward/std": 0.11995376646518707,
|
|
"rewards/format_valid/mean": 0.5,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 454
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 12.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 10.0,
|
|
"completions/mean_terminated_length": 10.0,
|
|
"completions/min_length": 7.0,
|
|
"completions/min_terminated_length": 7.0,
|
|
"epoch": 0.2275,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 5.68782901763916,
|
|
"kl": 2.2840545773506165,
|
|
"learning_rate": 5.1111111111111115e-06,
|
|
"loss": 0.0914,
|
|
"num_tokens": 1124196.0,
|
|
"reward": 0.7064803242683411,
|
|
"reward_std": 0.6930884122848511,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": 0.654320240020752,
|
|
"rewards/env_reward/std": 0.4620589315891266,
|
|
"rewards/format_valid/mean": 0.5,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 455
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.25,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 25.0,
|
|
"completions/mean_length": 19.5,
|
|
"completions/mean_terminated_length": 15.333333969116211,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.228,
|
|
"frac_reward_zero_std": 1.0,
|
|
"grad_norm": 0.14986330270767212,
|
|
"kl": 1.6589103937149048,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.0664,
|
|
"num_tokens": 1126674.0,
|
|
"reward": 1.4591007232666016,
|
|
"reward_std": 0.0,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": 1.156067132949829,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 0.5,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 456
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.5,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 26.0,
|
|
"completions/mean_length": 25.0,
|
|
"completions/mean_terminated_length": 18.0,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.2285,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 2.5412628650665283,
|
|
"kl": 0.8467085063457489,
|
|
"learning_rate": 4.888888888888889e-06,
|
|
"loss": 0.0339,
|
|
"num_tokens": 1129174.0,
|
|
"reward": 0.6049916744232178,
|
|
"reward_std": 0.3218177258968353,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": 0.586661159992218,
|
|
"rewards/env_reward/std": 0.21454516053199768,
|
|
"rewards/format_valid/mean": 0.5,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 457
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.25,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 27.0,
|
|
"completions/mean_length": 19.5,
|
|
"completions/mean_terminated_length": 15.333333969116211,
|
|
"completions/min_length": 9.0,
|
|
"completions/min_terminated_length": 9.0,
|
|
"epoch": 0.229,
|
|
"frac_reward_zero_std": 1.0,
|
|
"grad_norm": 0.17499062418937683,
|
|
"kl": 1.236129179596901,
|
|
"learning_rate": 4.777777777777778e-06,
|
|
"loss": 0.0494,
|
|
"num_tokens": 1131652.0,
|
|
"reward": -0.4889889657497406,
|
|
"reward_std": 0.0,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": -0.14265930652618408,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 0.5,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 458
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.25,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 32.0,
|
|
"completions/mean_length": 23.75,
|
|
"completions/mean_terminated_length": 21.0,
|
|
"completions/min_length": 13.0,
|
|
"completions/min_terminated_length": 13.0,
|
|
"epoch": 0.2295,
|
|
"frac_reward_zero_std": 1.0,
|
|
"grad_norm": 0.23451776802539825,
|
|
"kl": 1.0196955502033234,
|
|
"learning_rate": 4.666666666666667e-06,
|
|
"loss": 0.0408,
|
|
"num_tokens": 1134147.0,
|
|
"reward": 0.9018483757972717,
|
|
"reward_std": 0.0,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": 0.7845656275749207,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 0.5,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 459
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 8.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 8.0,
|
|
"completions/max_terminated_length": 8.0,
|
|
"completions/mean_length": 8.0,
|
|
"completions/mean_terminated_length": 8.0,
|
|
"completions/min_length": 8.0,
|
|
"completions/min_terminated_length": 8.0,
|
|
"epoch": 0.23,
|
|
"frac_reward_zero_std": 1.0,
|
|
"grad_norm": 0.290233314037323,
|
|
"kl": 0.6138657331466675,
|
|
"learning_rate": 4.555555555555556e-06,
|
|
"loss": 0.0246,
|
|
"num_tokens": 1136355.0,
|
|
"reward": 1.1506599187850952,
|
|
"reward_std": 0.0,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.0,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": 0.7337732911109924,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 460
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 23.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 23.0,
|
|
"completions/max_terminated_length": 23.0,
|
|
"completions/mean_length": 14.75,
|
|
"completions/mean_terminated_length": 14.75,
|
|
"completions/min_length": 9.0,
|
|
"completions/min_terminated_length": 9.0,
|
|
"epoch": 0.2305,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 4.5567545890808105,
|
|
"kl": 1.081925056874752,
|
|
"learning_rate": 4.444444444444445e-06,
|
|
"loss": 0.0433,
|
|
"num_tokens": 1138814.0,
|
|
"reward": -0.3285835385322571,
|
|
"reward_std": 3.080944299697876,
|
|
"rewards/action_legal/mean": -0.25,
|
|
"rewards/action_legal/std": 0.5,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": -0.006555706262588501,
|
|
"rewards/env_reward/std": 1.9956294298171997,
|
|
"rewards/format_valid/mean": -0.125,
|
|
"rewards/format_valid/std": 1.25,
|
|
"step": 461
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 28.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 28.0,
|
|
"completions/max_terminated_length": 28.0,
|
|
"completions/mean_length": 16.5,
|
|
"completions/mean_terminated_length": 16.5,
|
|
"completions/min_length": 9.0,
|
|
"completions/min_terminated_length": 9.0,
|
|
"epoch": 0.231,
|
|
"frac_reward_zero_std": 1.0,
|
|
"grad_norm": 0.20140407979488373,
|
|
"kl": 1.9097826182842255,
|
|
"learning_rate": 4.333333333333334e-06,
|
|
"loss": 0.0764,
|
|
"num_tokens": 1141280.0,
|
|
"reward": 0.6215116381645203,
|
|
"reward_std": 0.0,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": 0.5976744294166565,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 0.5,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 462
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.25,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 28.0,
|
|
"completions/mean_length": 22.25,
|
|
"completions/mean_terminated_length": 19.0,
|
|
"completions/min_length": 11.0,
|
|
"completions/min_terminated_length": 11.0,
|
|
"epoch": 0.2315,
|
|
"frac_reward_zero_std": 1.0,
|
|
"grad_norm": 0.226417675614357,
|
|
"kl": 1.086095541715622,
|
|
"learning_rate": 4.222222222222223e-06,
|
|
"loss": 0.0434,
|
|
"num_tokens": 1143769.0,
|
|
"reward": 1.0261733531951904,
|
|
"reward_std": 0.0,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": 0.8674488663673401,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 0.5,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 463
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 28.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 28.0,
|
|
"completions/max_terminated_length": 28.0,
|
|
"completions/mean_length": 23.0,
|
|
"completions/mean_terminated_length": 23.0,
|
|
"completions/min_length": 20.0,
|
|
"completions/min_terminated_length": 20.0,
|
|
"epoch": 0.232,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 3.3003480434417725,
|
|
"kl": 1.2525041699409485,
|
|
"learning_rate": 4.111111111111112e-06,
|
|
"loss": 0.0501,
|
|
"num_tokens": 1146261.0,
|
|
"reward": 0.5109961032867432,
|
|
"reward_std": 0.11840394884347916,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.10109934210777283,
|
|
"rewards/belief_accuracy/std": 0.002198692411184311,
|
|
"rewards/env_reward/mean": 0.5220293998718262,
|
|
"rewards/env_reward/std": 0.07499998807907104,
|
|
"rewards/format_valid/mean": 0.625,
|
|
"rewards/format_valid/std": 0.25,
|
|
"step": 464
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.5,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 6.0,
|
|
"completions/mean_length": 19.0,
|
|
"completions/mean_terminated_length": 6.0,
|
|
"completions/min_length": 6.0,
|
|
"completions/min_terminated_length": 6.0,
|
|
"epoch": 0.2325,
|
|
"frac_reward_zero_std": 1.0,
|
|
"grad_norm": 0.3115181028842926,
|
|
"kl": 1.4747485369443893,
|
|
"learning_rate": 4.000000000000001e-06,
|
|
"loss": 0.059,
|
|
"num_tokens": 1148737.0,
|
|
"reward": -1.0867280960083008,
|
|
"reward_std": 0.0,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": -0.5411520600318909,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 0.5,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 465
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 6.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 6.0,
|
|
"completions/max_terminated_length": 6.0,
|
|
"completions/mean_length": 6.0,
|
|
"completions/mean_terminated_length": 6.0,
|
|
"completions/min_length": 6.0,
|
|
"completions/min_terminated_length": 6.0,
|
|
"epoch": 0.233,
|
|
"frac_reward_zero_std": 1.0,
|
|
"grad_norm": 0.24885617196559906,
|
|
"kl": 1.9252333343029022,
|
|
"learning_rate": 3.888888888888889e-06,
|
|
"loss": 0.077,
|
|
"num_tokens": 1151161.0,
|
|
"reward": 0.2553900480270386,
|
|
"reward_std": 0.0,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": 0.3535933792591095,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 0.5,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 466
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.25,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 28.0,
|
|
"completions/mean_length": 20.75,
|
|
"completions/mean_terminated_length": 17.0,
|
|
"completions/min_length": 7.0,
|
|
"completions/min_terminated_length": 7.0,
|
|
"epoch": 0.2335,
|
|
"frac_reward_zero_std": 1.0,
|
|
"grad_norm": 0.16646219789981842,
|
|
"kl": 1.6244457215070724,
|
|
"learning_rate": 3.777777777777778e-06,
|
|
"loss": 0.065,
|
|
"num_tokens": 1153644.0,
|
|
"reward": 1.1453707218170166,
|
|
"reward_std": 0.0,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": 0.9469137787818909,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 0.5,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 467
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.25,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 20.0,
|
|
"completions/mean_length": 19.25,
|
|
"completions/mean_terminated_length": 15.0,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.234,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 9.400774955749512,
|
|
"kl": 1.2703840136528015,
|
|
"learning_rate": 3.666666666666667e-06,
|
|
"loss": 0.0508,
|
|
"num_tokens": 1156121.0,
|
|
"reward": -0.3506682515144348,
|
|
"reward_std": 0.22495710849761963,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": -0.050445497035980225,
|
|
"rewards/env_reward/std": 0.14997142553329468,
|
|
"rewards/format_valid/mean": 0.5,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 468
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.25,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 17.0,
|
|
"completions/mean_length": 17.75,
|
|
"completions/mean_terminated_length": 13.0,
|
|
"completions/min_length": 8.0,
|
|
"completions/min_terminated_length": 8.0,
|
|
"epoch": 0.2345,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 2.9656176567077637,
|
|
"kl": 1.1122565567493439,
|
|
"learning_rate": 3.555555555555556e-06,
|
|
"loss": 0.0445,
|
|
"num_tokens": 1158592.0,
|
|
"reward": 0.20864993333816528,
|
|
"reward_std": 0.5419493913650513,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": 0.32243332266807556,
|
|
"rewards/env_reward/std": 0.36129963397979736,
|
|
"rewards/format_valid/mean": 0.5,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 469
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.5,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 20.0,
|
|
"completions/mean_length": 25.5,
|
|
"completions/mean_terminated_length": 19.0,
|
|
"completions/min_length": 18.0,
|
|
"completions/min_terminated_length": 18.0,
|
|
"epoch": 0.235,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 2.9808571338653564,
|
|
"kl": 1.142757073044777,
|
|
"learning_rate": 3.4444444444444444e-06,
|
|
"loss": 0.0457,
|
|
"num_tokens": 1161094.0,
|
|
"reward": 0.7835899591445923,
|
|
"reward_std": 0.17083337903022766,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.06111111119389534,
|
|
"rewards/belief_accuracy/std": 0.07777778059244156,
|
|
"rewards/env_reward/mean": 0.6237821578979492,
|
|
"rewards/env_reward/std": 0.04999998211860657,
|
|
"rewards/format_valid/mean": 0.625,
|
|
"rewards/format_valid/std": 0.25,
|
|
"step": 470
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 12.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 11.0,
|
|
"completions/mean_terminated_length": 11.0,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.2355,
|
|
"frac_reward_zero_std": 1.0,
|
|
"grad_norm": 0.19158995151519775,
|
|
"kl": 1.6587612330913544,
|
|
"learning_rate": 3.3333333333333333e-06,
|
|
"loss": 0.0664,
|
|
"num_tokens": 1163538.0,
|
|
"reward": -0.2371583878993988,
|
|
"reward_std": 0.0,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": 0.02522774413228035,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 0.5,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 471
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.5,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 14.0,
|
|
"completions/mean_length": 21.25,
|
|
"completions/mean_terminated_length": 10.5,
|
|
"completions/min_length": 7.0,
|
|
"completions/min_terminated_length": 7.0,
|
|
"epoch": 0.236,
|
|
"frac_reward_zero_std": 1.0,
|
|
"grad_norm": 0.27240458130836487,
|
|
"kl": 0.896557942032814,
|
|
"learning_rate": 3.2222222222222222e-06,
|
|
"loss": 0.0359,
|
|
"num_tokens": 1166023.0,
|
|
"reward": 1.0796802043914795,
|
|
"reward_std": 0.0,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": 0.903120219707489,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 0.5,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 472
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.5,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 27.0,
|
|
"completions/mean_length": 25.25,
|
|
"completions/mean_terminated_length": 18.5,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.2365,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 2.825577735900879,
|
|
"kl": 0.8296048268675804,
|
|
"learning_rate": 3.111111111111111e-06,
|
|
"loss": 0.0332,
|
|
"num_tokens": 1168524.0,
|
|
"reward": 1.6908910274505615,
|
|
"reward_std": 0.2124999761581421,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.06666666269302368,
|
|
"rewards/belief_accuracy/std": 0.06666667014360428,
|
|
"rewards/env_reward/mean": 1.2397607564926147,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 0.625,
|
|
"rewards/format_valid/std": 0.25,
|
|
"step": 473
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.25,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 26.0,
|
|
"completions/mean_length": 19.5,
|
|
"completions/mean_terminated_length": 15.333333969116211,
|
|
"completions/min_length": 8.0,
|
|
"completions/min_terminated_length": 8.0,
|
|
"epoch": 0.237,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 2.382891893386841,
|
|
"kl": 1.113198146224022,
|
|
"learning_rate": 3e-06,
|
|
"loss": 0.0445,
|
|
"num_tokens": 1171002.0,
|
|
"reward": 1.2636022567749023,
|
|
"reward_std": 0.38985100388526917,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": 1.0257349014282227,
|
|
"rewards/env_reward/std": 0.2599007189273834,
|
|
"rewards/format_valid/mean": 0.5,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 474
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.5,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 15.0,
|
|
"completions/mean_length": 21.75,
|
|
"completions/mean_terminated_length": 11.5,
|
|
"completions/min_length": 8.0,
|
|
"completions/min_terminated_length": 8.0,
|
|
"epoch": 0.2375,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 2.8809125423431396,
|
|
"kl": 1.4048431143164635,
|
|
"learning_rate": 2.888888888888889e-06,
|
|
"loss": 0.0562,
|
|
"num_tokens": 1173489.0,
|
|
"reward": -0.06511135399341583,
|
|
"reward_std": 0.047866158187389374,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.07713063061237335,
|
|
"rewards/belief_accuracy/std": 0.045738738030195236,
|
|
"rewards/env_reward/mean": 0.09002035856246948,
|
|
"rewards/env_reward/std": 0.06790003925561905,
|
|
"rewards/format_valid/mean": 0.625,
|
|
"rewards/format_valid/std": 0.25,
|
|
"step": 475
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.25,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 11.0,
|
|
"completions/mean_length": 14.75,
|
|
"completions/mean_terminated_length": 9.0,
|
|
"completions/min_length": 8.0,
|
|
"completions/min_terminated_length": 8.0,
|
|
"epoch": 0.238,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 3.7238736152648926,
|
|
"kl": 1.2160827964544296,
|
|
"learning_rate": 2.777777777777778e-06,
|
|
"loss": 0.0486,
|
|
"num_tokens": 1175948.0,
|
|
"reward": -0.18826928734779358,
|
|
"reward_std": 0.18310457468032837,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.0715659037232399,
|
|
"rewards/belief_accuracy/std": 0.05686819180846214,
|
|
"rewards/env_reward/mean": -0.0032143734861165285,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 0.625,
|
|
"rewards/format_valid/std": 0.25,
|
|
"step": 476
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.25,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 26.0,
|
|
"completions/mean_length": 18.75,
|
|
"completions/mean_terminated_length": 14.333333969116211,
|
|
"completions/min_length": 8.0,
|
|
"completions/min_terminated_length": 8.0,
|
|
"epoch": 0.2385,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 4.449535846710205,
|
|
"kl": 1.1292494237422943,
|
|
"learning_rate": 2.666666666666667e-06,
|
|
"loss": 0.0452,
|
|
"num_tokens": 1178423.0,
|
|
"reward": 0.36471137404441833,
|
|
"reward_std": 0.32429030537605286,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.04803495109081268,
|
|
"rewards/belief_accuracy/std": 0.10393010079860687,
|
|
"rewards/env_reward/mean": 0.3183774948120117,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 0.625,
|
|
"rewards/format_valid/std": 0.25,
|
|
"step": 477
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 23.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 23.0,
|
|
"completions/max_terminated_length": 23.0,
|
|
"completions/mean_length": 16.5,
|
|
"completions/mean_terminated_length": 16.5,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.239,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 4.696188926696777,
|
|
"kl": 1.435573399066925,
|
|
"learning_rate": 2.5555555555555557e-06,
|
|
"loss": 0.0574,
|
|
"num_tokens": 1180889.0,
|
|
"reward": 1.4060673713684082,
|
|
"reward_std": 0.02499997615814209,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.11666667461395264,
|
|
"rewards/belief_accuracy/std": 0.03333333507180214,
|
|
"rewards/env_reward/mean": 1.1498782634735107,
|
|
"rewards/env_reward/std": 0.07499998807907104,
|
|
"rewards/format_valid/mean": 0.625,
|
|
"rewards/format_valid/std": 0.25,
|
|
"step": 478
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.75,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 8.0,
|
|
"completions/mean_length": 26.0,
|
|
"completions/mean_terminated_length": 8.0,
|
|
"completions/min_length": 8.0,
|
|
"completions/min_terminated_length": 8.0,
|
|
"epoch": 0.2395,
|
|
"frac_reward_zero_std": 1.0,
|
|
"grad_norm": 1.0978991985321045,
|
|
"kl": 1.2610341310501099,
|
|
"learning_rate": 2.4444444444444447e-06,
|
|
"loss": 0.0504,
|
|
"num_tokens": 1183393.0,
|
|
"reward": -0.08134973049163818,
|
|
"reward_std": 0.0,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": 0.12910018861293793,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 0.5,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 479
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 9.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 9.0,
|
|
"completions/max_terminated_length": 9.0,
|
|
"completions/mean_length": 8.25,
|
|
"completions/mean_terminated_length": 8.25,
|
|
"completions/min_length": 8.0,
|
|
"completions/min_terminated_length": 8.0,
|
|
"epoch": 0.24,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 3.3614814281463623,
|
|
"kl": 1.5769162476062775,
|
|
"learning_rate": 2.3333333333333336e-06,
|
|
"loss": 0.0631,
|
|
"num_tokens": 1185826.0,
|
|
"reward": -0.26373326778411865,
|
|
"reward_std": 1.0362647771835327,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.1836622953414917,
|
|
"rewards/belief_accuracy/std": 0.10475655645132065,
|
|
"rewards/env_reward/mean": 0.15816909074783325,
|
|
"rewards/env_reward/std": 0.49835386872291565,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 480
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 27.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 27.0,
|
|
"completions/max_terminated_length": 27.0,
|
|
"completions/mean_length": 15.0,
|
|
"completions/mean_terminated_length": 15.0,
|
|
"completions/min_length": 9.0,
|
|
"completions/min_terminated_length": 9.0,
|
|
"epoch": 0.2405,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 3.7180566787719727,
|
|
"kl": 1.8103009164333344,
|
|
"learning_rate": 2.2222222222222225e-06,
|
|
"loss": 0.0724,
|
|
"num_tokens": 1188286.0,
|
|
"reward": 0.1593647599220276,
|
|
"reward_std": 0.6118594408035278,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": 0.28957653045654297,
|
|
"rewards/env_reward/std": 0.40790632367134094,
|
|
"rewards/format_valid/mean": 0.5,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 481
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.25,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 26.0,
|
|
"completions/mean_length": 23.25,
|
|
"completions/mean_terminated_length": 20.33333396911621,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.241,
|
|
"frac_reward_zero_std": 1.0,
|
|
"grad_norm": 0.3375169038772583,
|
|
"kl": 0.9421973675489426,
|
|
"learning_rate": 2.1111111111111114e-06,
|
|
"loss": 0.0377,
|
|
"num_tokens": 1190779.0,
|
|
"reward": 0.6196067929267883,
|
|
"reward_std": 0.0,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": 0.5964045524597168,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 0.5,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 482
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.25,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 27.0,
|
|
"completions/mean_length": 23.75,
|
|
"completions/mean_terminated_length": 21.0,
|
|
"completions/min_length": 16.0,
|
|
"completions/min_terminated_length": 16.0,
|
|
"epoch": 0.2415,
|
|
"frac_reward_zero_std": 1.0,
|
|
"grad_norm": 0.2469029277563095,
|
|
"kl": 1.1642567813396454,
|
|
"learning_rate": 2.0000000000000003e-06,
|
|
"loss": 0.0466,
|
|
"num_tokens": 1193274.0,
|
|
"reward": -0.17581841349601746,
|
|
"reward_std": 0.0,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": 0.06612106412649155,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 0.5,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 483
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 20.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 20.0,
|
|
"completions/max_terminated_length": 20.0,
|
|
"completions/mean_length": 13.25,
|
|
"completions/mean_terminated_length": 13.25,
|
|
"completions/min_length": 7.0,
|
|
"completions/min_terminated_length": 7.0,
|
|
"epoch": 0.242,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 4.307186603546143,
|
|
"kl": 1.7343009114265442,
|
|
"learning_rate": 1.888888888888889e-06,
|
|
"loss": 0.0694,
|
|
"num_tokens": 1195727.0,
|
|
"reward": 1.351982831954956,
|
|
"reward_std": 0.02499997615814209,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.11666667461395264,
|
|
"rewards/belief_accuracy/std": 0.03333333507180214,
|
|
"rewards/env_reward/mean": 1.1138218641281128,
|
|
"rewards/env_reward/std": 0.07499998807907104,
|
|
"rewards/format_valid/mean": 0.625,
|
|
"rewards/format_valid/std": 0.25,
|
|
"step": 484
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.25,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 31.0,
|
|
"completions/mean_length": 26.75,
|
|
"completions/mean_terminated_length": 25.0,
|
|
"completions/min_length": 13.0,
|
|
"completions/min_terminated_length": 13.0,
|
|
"epoch": 0.2425,
|
|
"frac_reward_zero_std": 1.0,
|
|
"grad_norm": 0.24575987458229065,
|
|
"kl": 1.0231992602348328,
|
|
"learning_rate": 1.777777777777778e-06,
|
|
"loss": 0.0409,
|
|
"num_tokens": 1198234.0,
|
|
"reward": -0.02136892080307007,
|
|
"reward_std": 0.0,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": 0.16908739507198334,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 0.5,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 485
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.25,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 15.0,
|
|
"completions/mean_length": 15.25,
|
|
"completions/mean_terminated_length": 9.666666984558105,
|
|
"completions/min_length": 6.0,
|
|
"completions/min_terminated_length": 6.0,
|
|
"epoch": 0.243,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 5.336447715759277,
|
|
"kl": 1.3090898543596268,
|
|
"learning_rate": 1.6666666666666667e-06,
|
|
"loss": 0.0524,
|
|
"num_tokens": 1200695.0,
|
|
"reward": -1.9136090278625488,
|
|
"reward_std": 2.0242605209350586,
|
|
"rewards/action_legal/mean": -0.25,
|
|
"rewards/action_legal/std": 0.5,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": -1.063239336013794,
|
|
"rewards/env_reward/std": 1.2911738157272339,
|
|
"rewards/format_valid/mean": -0.125,
|
|
"rewards/format_valid/std": 1.25,
|
|
"step": 486
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.25,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 15.5,
|
|
"completions/mean_terminated_length": 10.0,
|
|
"completions/min_length": 9.0,
|
|
"completions/min_terminated_length": 9.0,
|
|
"epoch": 0.2435,
|
|
"frac_reward_zero_std": 1.0,
|
|
"grad_norm": 0.26777753233909607,
|
|
"kl": 1.0288607757538557,
|
|
"learning_rate": 1.5555555555555556e-06,
|
|
"loss": 0.0412,
|
|
"num_tokens": 1203157.0,
|
|
"reward": 1.5726299285888672,
|
|
"reward_std": 0.0,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": 1.2317533493041992,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 0.5,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 487
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 1.0,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 0.0,
|
|
"completions/mean_length": 32.0,
|
|
"completions/mean_terminated_length": 0.0,
|
|
"completions/min_length": 32.0,
|
|
"completions/min_terminated_length": 0.0,
|
|
"epoch": 0.244,
|
|
"frac_reward_zero_std": 1.0,
|
|
"grad_norm": 0.23503462970256805,
|
|
"kl": 0.559322252869606,
|
|
"learning_rate": 1.4444444444444445e-06,
|
|
"loss": 0.0224,
|
|
"num_tokens": 1205685.0,
|
|
"reward": 1.3888003826141357,
|
|
"reward_std": 0.0,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": 1.109200358390808,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 0.5,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 488
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.25,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 15.0,
|
|
"completions/mean_length": 18.0,
|
|
"completions/mean_terminated_length": 13.333333969116211,
|
|
"completions/min_length": 11.0,
|
|
"completions/min_terminated_length": 11.0,
|
|
"epoch": 0.2445,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 3.5624420642852783,
|
|
"kl": 1.6142105609178543,
|
|
"learning_rate": 1.3333333333333334e-06,
|
|
"loss": 0.0646,
|
|
"num_tokens": 1208157.0,
|
|
"reward": 0.8933224678039551,
|
|
"reward_std": 0.2008855640888214,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.08735239505767822,
|
|
"rewards/belief_accuracy/std": 0.025295214727520943,
|
|
"rewards/env_reward/mean": 0.7494198083877563,
|
|
"rewards/env_reward/std": 0.07499998807907104,
|
|
"rewards/format_valid/mean": 0.625,
|
|
"rewards/format_valid/std": 0.25,
|
|
"step": 489
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.75,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 10.0,
|
|
"completions/mean_length": 26.5,
|
|
"completions/mean_terminated_length": 10.0,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.245,
|
|
"frac_reward_zero_std": 1.0,
|
|
"grad_norm": 0.3598785996437073,
|
|
"kl": 0.725849099457264,
|
|
"learning_rate": 1.2222222222222223e-06,
|
|
"loss": 0.029,
|
|
"num_tokens": 1210663.0,
|
|
"reward": 1.589991807937622,
|
|
"reward_std": 0.0,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": 1.2433278560638428,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 0.5,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 490
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 32.0,
|
|
"completions/mean_length": 17.25,
|
|
"completions/mean_terminated_length": 17.25,
|
|
"completions/min_length": 7.0,
|
|
"completions/min_terminated_length": 7.0,
|
|
"epoch": 0.2455,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 2.048388719558716,
|
|
"kl": 0.6919376142323017,
|
|
"learning_rate": 1.1111111111111112e-06,
|
|
"loss": 0.0277,
|
|
"num_tokens": 1213132.0,
|
|
"reward": -0.04184141755104065,
|
|
"reward_std": 0.977747917175293,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": 0.1554390788078308,
|
|
"rewards/env_reward/std": 0.6518319249153137,
|
|
"rewards/format_valid/mean": 0.5,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 491
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.25,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 29.0,
|
|
"completions/mean_length": 23.75,
|
|
"completions/mean_terminated_length": 21.0,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.246,
|
|
"frac_reward_zero_std": 1.0,
|
|
"grad_norm": 0.5983917713165283,
|
|
"kl": 1.0341777577996254,
|
|
"learning_rate": 1.0000000000000002e-06,
|
|
"loss": 0.0414,
|
|
"num_tokens": 1215627.0,
|
|
"reward": 0.6422335505485535,
|
|
"reward_std": 0.0,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": 0.6114890575408936,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 0.5,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 492
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 8.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 8.0,
|
|
"completions/max_terminated_length": 8.0,
|
|
"completions/mean_length": 8.0,
|
|
"completions/mean_terminated_length": 8.0,
|
|
"completions/min_length": 8.0,
|
|
"completions/min_terminated_length": 8.0,
|
|
"epoch": 0.2465,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 2.7554574012756348,
|
|
"kl": 0.5243086963891983,
|
|
"learning_rate": 8.88888888888889e-07,
|
|
"loss": 0.021,
|
|
"num_tokens": 1217835.0,
|
|
"reward": 1.290155291557312,
|
|
"reward_std": 0.05980373173952103,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": 0.0,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": 0.8267701864242554,
|
|
"rewards/env_reward/std": 0.039869144558906555,
|
|
"rewards/format_valid/mean": 1.0,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 493
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.25,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 17.0,
|
|
"completions/mean_length": 18.25,
|
|
"completions/mean_terminated_length": 13.666666984558105,
|
|
"completions/min_length": 11.0,
|
|
"completions/min_terminated_length": 11.0,
|
|
"epoch": 0.247,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 4.632870197296143,
|
|
"kl": 1.0271499007940292,
|
|
"learning_rate": 7.777777777777778e-07,
|
|
"loss": 0.0411,
|
|
"num_tokens": 1220308.0,
|
|
"reward": 0.16512584686279297,
|
|
"reward_std": 0.20950853824615479,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": 0.2934172749519348,
|
|
"rewards/env_reward/std": 0.13967236876487732,
|
|
"rewards/format_valid/mean": 0.5,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 494
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 25.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 25.0,
|
|
"completions/max_terminated_length": 25.0,
|
|
"completions/mean_length": 15.75,
|
|
"completions/mean_terminated_length": 15.75,
|
|
"completions/min_length": 8.0,
|
|
"completions/min_terminated_length": 8.0,
|
|
"epoch": 0.2475,
|
|
"frac_reward_zero_std": 1.0,
|
|
"grad_norm": 0.19535107910633087,
|
|
"kl": 1.8744446635246277,
|
|
"learning_rate": 6.666666666666667e-07,
|
|
"loss": 0.075,
|
|
"num_tokens": 1222771.0,
|
|
"reward": 1.4706852436065674,
|
|
"reward_std": 0.0,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": 1.163790225982666,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 0.5,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 495
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.25,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 15.5,
|
|
"completions/mean_terminated_length": 10.0,
|
|
"completions/min_length": 8.0,
|
|
"completions/min_terminated_length": 8.0,
|
|
"epoch": 0.248,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 2.917879581451416,
|
|
"kl": 1.0786767601966858,
|
|
"learning_rate": 5.555555555555556e-07,
|
|
"loss": 0.0431,
|
|
"num_tokens": 1225233.0,
|
|
"reward": -0.2330722212791443,
|
|
"reward_std": 0.031944431364536285,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.10740740597248077,
|
|
"rewards/belief_accuracy/std": 0.014814812690019608,
|
|
"rewards/env_reward/mean": 0.038600001484155655,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 0.625,
|
|
"rewards/format_valid/std": 0.25,
|
|
"step": 496
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 28.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 28.0,
|
|
"completions/max_terminated_length": 28.0,
|
|
"completions/mean_length": 16.5,
|
|
"completions/mean_terminated_length": 16.5,
|
|
"completions/min_length": 11.0,
|
|
"completions/min_terminated_length": 11.0,
|
|
"epoch": 0.2485,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 2.6436572074890137,
|
|
"kl": 1.1599683165550232,
|
|
"learning_rate": 4.444444444444445e-07,
|
|
"loss": 0.0464,
|
|
"num_tokens": 1227699.0,
|
|
"reward": 1.316786289215088,
|
|
"reward_std": 0.02361110784113407,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.0981481522321701,
|
|
"rewards/belief_accuracy/std": 0.0037037059664726257,
|
|
"rewards/env_reward/mean": 1.0533205270767212,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 0.625,
|
|
"rewards/format_valid/std": 0.25,
|
|
"step": 497
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 16.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 16.0,
|
|
"completions/max_terminated_length": 16.0,
|
|
"completions/mean_length": 11.5,
|
|
"completions/mean_terminated_length": 11.5,
|
|
"completions/min_length": 9.0,
|
|
"completions/min_terminated_length": 9.0,
|
|
"epoch": 0.249,
|
|
"frac_reward_zero_std": 1.0,
|
|
"grad_norm": 0.323911190032959,
|
|
"kl": 1.486757069826126,
|
|
"learning_rate": 3.3333333333333335e-07,
|
|
"loss": 0.0595,
|
|
"num_tokens": 1230145.0,
|
|
"reward": 1.410203218460083,
|
|
"reward_std": 0.0,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": 1.1234688758850098,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 0.5,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 498
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 32.0,
|
|
"completions/clipped_ratio": 0.5,
|
|
"completions/max_length": 32.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 21.75,
|
|
"completions/mean_terminated_length": 11.5,
|
|
"completions/min_length": 11.0,
|
|
"completions/min_terminated_length": 11.0,
|
|
"epoch": 0.2495,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 3.010908603668213,
|
|
"kl": 0.8313806504011154,
|
|
"learning_rate": 2.2222222222222224e-07,
|
|
"loss": 0.0333,
|
|
"num_tokens": 1232632.0,
|
|
"reward": -0.182204470038414,
|
|
"reward_std": 0.07837501168251038,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": 0.06186369061470032,
|
|
"rewards/env_reward/std": 0.052250005304813385,
|
|
"rewards/format_valid/mean": 0.5,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 499
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 13.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 13.0,
|
|
"completions/max_terminated_length": 13.0,
|
|
"completions/mean_length": 10.0,
|
|
"completions/mean_terminated_length": 10.0,
|
|
"completions/min_length": 8.0,
|
|
"completions/min_terminated_length": 8.0,
|
|
"epoch": 0.25,
|
|
"frac_reward_zero_std": 1.0,
|
|
"grad_norm": 0.2790248990058899,
|
|
"kl": 1.8732239753007889,
|
|
"learning_rate": 1.1111111111111112e-07,
|
|
"loss": 0.0749,
|
|
"num_tokens": 1235072.0,
|
|
"reward": 1.3762123584747314,
|
|
"reward_std": 0.0,
|
|
"rewards/action_legal/mean": 0.0,
|
|
"rewards/action_legal/std": 0.0,
|
|
"rewards/belief_accuracy/mean": -0.10000000149011612,
|
|
"rewards/belief_accuracy/std": 0.0,
|
|
"rewards/env_reward/mean": 1.1008082628250122,
|
|
"rewards/env_reward/std": 0.0,
|
|
"rewards/format_valid/mean": 0.5,
|
|
"rewards/format_valid/std": 0.0,
|
|
"step": 500
|
|
}
|
|
],
|
|
"logging_steps": 1,
|
|
"max_steps": 500,
|
|
"num_input_tokens_seen": 1235072,
|
|
"num_train_epochs": 1,
|
|
"save_steps": 250,
|
|
"stateful_callbacks": {
|
|
"TrainerControl": {
|
|
"args": {
|
|
"should_epoch_stop": false,
|
|
"should_evaluate": false,
|
|
"should_log": false,
|
|
"should_save": true,
|
|
"should_training_stop": true
|
|
},
|
|
"attributes": {}
|
|
}
|
|
},
|
|
"total_flos": 0.0,
|
|
"train_batch_size": 1,
|
|
"trial_name": null,
|
|
"trial_params": null
|
|
}
|