1794 lines
73 KiB
JSON
1794 lines
73 KiB
JSON
{
|
|
"best_global_step": null,
|
|
"best_metric": null,
|
|
"best_model_checkpoint": null,
|
|
"epoch": 0.008888888888888889,
|
|
"eval_steps": 500,
|
|
"global_step": 100,
|
|
"is_hyper_param_search": false,
|
|
"is_local_process_zero": true,
|
|
"is_world_process_zero": true,
|
|
"log_history": [
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.125,
|
|
"completions/max_length": 1024.0,
|
|
"completions/max_terminated_length": 996.0,
|
|
"completions/mean_length": 579.78125,
|
|
"completions/mean_terminated_length": 516.3214416503906,
|
|
"completions/min_length": 165.5,
|
|
"completions/min_terminated_length": 165.5,
|
|
"entropy": 1.209167167544365,
|
|
"epoch": 0.00017777777777777779,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 1.0451635456006854,
|
|
"kl": 0.0,
|
|
"learning_rate": 1.6666666666666665e-07,
|
|
"loss": 0.0857,
|
|
"num_tokens": 46890.0,
|
|
"reward": 0.234375,
|
|
"reward_std": 0.4299773871898651,
|
|
"rewards/equation_reward_func/mean": 0.046875,
|
|
"rewards/equation_reward_func/std": 0.21135568618774414,
|
|
"rewards/format_reward_func/mean": 0.1875,
|
|
"rewards/format_reward_func/std": 0.3965577781200409,
|
|
"sampling/importance_sampling_ratio/max": 2.733101010322571,
|
|
"sampling/importance_sampling_ratio/mean": 0.7042568325996399,
|
|
"sampling/importance_sampling_ratio/min": 0.0,
|
|
"sampling/sampling_logp_difference/max": 0.6484301090240479,
|
|
"sampling/sampling_logp_difference/mean": 0.023067950271070004,
|
|
"step": 2,
|
|
"step_time": 13.367690222221427
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.046875,
|
|
"completions/max_length": 1024.0,
|
|
"completions/max_terminated_length": 998.0,
|
|
"completions/mean_length": 511.703125,
|
|
"completions/mean_terminated_length": 486.53440856933594,
|
|
"completions/min_length": 149.0,
|
|
"completions/min_terminated_length": 149.0,
|
|
"entropy": 1.150175679475069,
|
|
"epoch": 0.00035555555555555557,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 1.3727134073809244,
|
|
"kl": 0.0004912198037345661,
|
|
"learning_rate": 5e-07,
|
|
"loss": 0.1102,
|
|
"num_tokens": 89431.0,
|
|
"reward": 0.28125,
|
|
"reward_std": 0.45680341124534607,
|
|
"rewards/equation_reward_func/mean": 0.03125,
|
|
"rewards/equation_reward_func/std": 0.1767766922712326,
|
|
"rewards/format_reward_func/mean": 0.25,
|
|
"rewards/format_reward_func/std": 0.4399413466453552,
|
|
"sampling/importance_sampling_ratio/max": 2.4883073568344116,
|
|
"sampling/importance_sampling_ratio/mean": 0.6917034089565277,
|
|
"sampling/importance_sampling_ratio/min": 0.0,
|
|
"sampling/sampling_logp_difference/max": 0.7433785200119019,
|
|
"sampling/sampling_logp_difference/mean": 0.02369655668735504,
|
|
"step": 4,
|
|
"step_time": 11.878318977192976
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.078125,
|
|
"completions/max_length": 1024.0,
|
|
"completions/max_terminated_length": 965.5,
|
|
"completions/mean_length": 543.25,
|
|
"completions/mean_terminated_length": 502.3490753173828,
|
|
"completions/min_length": 128.0,
|
|
"completions/min_terminated_length": 128.0,
|
|
"entropy": 1.2386715859174728,
|
|
"epoch": 0.0005333333333333334,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 1.4488715982157625,
|
|
"kl": 0.0005299820822983747,
|
|
"learning_rate": 4.994757065594279e-07,
|
|
"loss": 0.2079,
|
|
"num_tokens": 133991.0,
|
|
"reward": 0.265625,
|
|
"reward_std": 0.44547125697135925,
|
|
"rewards/equation_reward_func/mean": 0.03125,
|
|
"rewards/equation_reward_func/std": 0.12296734005212784,
|
|
"rewards/format_reward_func/mean": 0.234375,
|
|
"rewards/format_reward_func/std": 0.4299773871898651,
|
|
"sampling/importance_sampling_ratio/max": 2.242772936820984,
|
|
"sampling/importance_sampling_ratio/mean": 0.7461096942424774,
|
|
"sampling/importance_sampling_ratio/min": 0.0,
|
|
"sampling/sampling_logp_difference/max": 0.5018917918205261,
|
|
"sampling/sampling_logp_difference/mean": 0.02481890842318535,
|
|
"step": 6,
|
|
"step_time": 11.690943353110924
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.03125,
|
|
"completions/max_length": 1024.0,
|
|
"completions/max_terminated_length": 1013.0,
|
|
"completions/mean_length": 464.96875,
|
|
"completions/mean_terminated_length": 446.9354705810547,
|
|
"completions/min_length": 149.0,
|
|
"completions/min_terminated_length": 149.0,
|
|
"entropy": 1.145722646266222,
|
|
"epoch": 0.0007111111111111111,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 1.5202398474744325,
|
|
"kl": 0.0004978575179848121,
|
|
"learning_rate": 4.979050253066063e-07,
|
|
"loss": 0.0416,
|
|
"num_tokens": 173509.0,
|
|
"reward": 0.265625,
|
|
"reward_std": 0.44837237894535065,
|
|
"rewards/equation_reward_func/mean": 0.015625,
|
|
"rewards/equation_reward_func/std": 0.0883883461356163,
|
|
"rewards/format_reward_func/mean": 0.25,
|
|
"rewards/format_reward_func/std": 0.43840841948986053,
|
|
"sampling/importance_sampling_ratio/max": 1.867400348186493,
|
|
"sampling/importance_sampling_ratio/mean": 0.6012143194675446,
|
|
"sampling/importance_sampling_ratio/min": 0.08801080286502838,
|
|
"sampling/sampling_logp_difference/max": 0.5550211668014526,
|
|
"sampling/sampling_logp_difference/mean": 0.023779811337590218,
|
|
"step": 8,
|
|
"step_time": 11.771737957373261
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.03125,
|
|
"completions/max_length": 1023.5,
|
|
"completions/max_terminated_length": 999.0,
|
|
"completions/mean_length": 512.875,
|
|
"completions/mean_terminated_length": 496.10626220703125,
|
|
"completions/min_length": 166.5,
|
|
"completions/min_terminated_length": 166.5,
|
|
"entropy": 1.1838660538196564,
|
|
"epoch": 0.0008888888888888889,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 0.872521471943838,
|
|
"kl": 0.000666440657369094,
|
|
"learning_rate": 4.952945442245597e-07,
|
|
"loss": 0.0792,
|
|
"num_tokens": 216061.0,
|
|
"reward": 0.34375,
|
|
"reward_std": 0.5400001406669617,
|
|
"rewards/equation_reward_func/mean": 0.046875,
|
|
"rewards/equation_reward_func/std": 0.1480722874403,
|
|
"rewards/format_reward_func/mean": 0.296875,
|
|
"rewards/format_reward_func/std": 0.4638662487268448,
|
|
"sampling/importance_sampling_ratio/max": 2.8262017965316772,
|
|
"sampling/importance_sampling_ratio/mean": 0.6557013094425201,
|
|
"sampling/importance_sampling_ratio/min": 0.0,
|
|
"sampling/sampling_logp_difference/max": 0.612011194229126,
|
|
"sampling/sampling_logp_difference/mean": 0.02344651333987713,
|
|
"step": 10,
|
|
"step_time": 11.873887687921524
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.015625,
|
|
"completions/max_length": 918.0,
|
|
"completions/max_terminated_length": 915.0,
|
|
"completions/mean_length": 463.296875,
|
|
"completions/mean_terminated_length": 454.6552276611328,
|
|
"completions/min_length": 152.5,
|
|
"completions/min_terminated_length": 152.5,
|
|
"entropy": 1.1539763174951077,
|
|
"epoch": 0.0010666666666666667,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 1.5031066308786476,
|
|
"kl": 0.0008330309847224271,
|
|
"learning_rate": 4.916552125781528e-07,
|
|
"loss": 0.0814,
|
|
"num_tokens": 255432.0,
|
|
"reward": 0.53125,
|
|
"reward_std": 0.5670737028121948,
|
|
"rewards/equation_reward_func/mean": 0.0625,
|
|
"rewards/equation_reward_func/std": 0.24593468010425568,
|
|
"rewards/format_reward_func/mean": 0.46875,
|
|
"rewards/format_reward_func/std": 0.507007360458374,
|
|
"sampling/importance_sampling_ratio/max": 2.125624656677246,
|
|
"sampling/importance_sampling_ratio/mean": 0.7945938110351562,
|
|
"sampling/importance_sampling_ratio/min": 0.01008664257824421,
|
|
"sampling/sampling_logp_difference/max": 0.6622226238250732,
|
|
"sampling/sampling_logp_difference/mean": 0.023904340341687202,
|
|
"step": 12,
|
|
"step_time": 11.331206714501604
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.078125,
|
|
"completions/max_length": 1024.0,
|
|
"completions/max_terminated_length": 895.5,
|
|
"completions/mean_length": 559.359375,
|
|
"completions/mean_terminated_length": 520.5178833007812,
|
|
"completions/min_length": 217.5,
|
|
"completions/min_terminated_length": 217.5,
|
|
"entropy": 1.2339040115475655,
|
|
"epoch": 0.0012444444444444445,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 1.3283721770836783,
|
|
"kl": 0.0012322509137447923,
|
|
"learning_rate": 4.870022949890676e-07,
|
|
"loss": 0.0188,
|
|
"num_tokens": 301031.0,
|
|
"reward": 0.421875,
|
|
"reward_std": 0.5294715315103531,
|
|
"rewards/equation_reward_func/mean": 0.015625,
|
|
"rewards/equation_reward_func/std": 0.0883883461356163,
|
|
"rewards/format_reward_func/mean": 0.40625,
|
|
"rewards/format_reward_func/std": 0.4979427307844162,
|
|
"sampling/importance_sampling_ratio/max": 2.192352533340454,
|
|
"sampling/importance_sampling_ratio/mean": 0.6494036614894867,
|
|
"sampling/importance_sampling_ratio/min": 0.0,
|
|
"sampling/sampling_logp_difference/max": 0.9156644344329834,
|
|
"sampling/sampling_logp_difference/mean": 0.024133121594786644,
|
|
"step": 14,
|
|
"step_time": 11.763766457792372
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.03125,
|
|
"completions/max_length": 1024.0,
|
|
"completions/max_terminated_length": 949.0,
|
|
"completions/mean_length": 472.375,
|
|
"completions/mean_terminated_length": 454.58062744140625,
|
|
"completions/min_length": 152.5,
|
|
"completions/min_terminated_length": 152.5,
|
|
"entropy": 1.2082068026065826,
|
|
"epoch": 0.0014222222222222223,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 2.9893722134021776,
|
|
"kl": 0.002285250819113571,
|
|
"learning_rate": 4.81355307410676e-07,
|
|
"loss": -0.0683,
|
|
"num_tokens": 341063.0,
|
|
"reward": 0.5,
|
|
"reward_std": 0.5569138079881668,
|
|
"rewards/equation_reward_func/mean": 0.046875,
|
|
"rewards/equation_reward_func/std": 0.1480722874403,
|
|
"rewards/format_reward_func/mean": 0.453125,
|
|
"rewards/format_reward_func/std": 0.5034956932067871,
|
|
"sampling/importance_sampling_ratio/max": 2.4907275438308716,
|
|
"sampling/importance_sampling_ratio/mean": 0.8273349404335022,
|
|
"sampling/importance_sampling_ratio/min": 0.0,
|
|
"sampling/sampling_logp_difference/max": 0.7957509160041809,
|
|
"sampling/sampling_logp_difference/mean": 0.023882606998085976,
|
|
"step": 16,
|
|
"step_time": 11.711858246591873
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.015625,
|
|
"completions/max_length": 951.5,
|
|
"completions/max_terminated_length": 923.5,
|
|
"completions/mean_length": 427.71875,
|
|
"completions/mean_terminated_length": 418.4737854003906,
|
|
"completions/min_length": 135.5,
|
|
"completions/min_terminated_length": 135.5,
|
|
"entropy": 1.137543722987175,
|
|
"epoch": 0.0016,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 1.5075918715668166,
|
|
"kl": 0.002901972911786288,
|
|
"learning_rate": 4.747379352713488e-07,
|
|
"loss": 0.0204,
|
|
"num_tokens": 378157.0,
|
|
"reward": 0.625,
|
|
"reward_std": 0.6221709847450256,
|
|
"rewards/equation_reward_func/mean": 0.09375,
|
|
"rewards/equation_reward_func/std": 0.2961445748806,
|
|
"rewards/format_reward_func/mean": 0.53125,
|
|
"rewards/format_reward_func/std": 0.4907747954130173,
|
|
"sampling/importance_sampling_ratio/max": 2.210850417613983,
|
|
"sampling/importance_sampling_ratio/mean": 0.7625480890274048,
|
|
"sampling/importance_sampling_ratio/min": 0.005571374204009771,
|
|
"sampling/sampling_logp_difference/max": 0.5609427690505981,
|
|
"sampling/sampling_logp_difference/mean": 0.02348453551530838,
|
|
"step": 18,
|
|
"step_time": 11.282247766968794
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.015625,
|
|
"completions/max_length": 899.0,
|
|
"completions/max_terminated_length": 825.5,
|
|
"completions/mean_length": 447.6875,
|
|
"completions/mean_terminated_length": 438.1275177001953,
|
|
"completions/min_length": 107.5,
|
|
"completions/min_terminated_length": 107.5,
|
|
"entropy": 1.1992060840129852,
|
|
"epoch": 0.0017777777777777779,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 1.6052725065567501,
|
|
"kl": 0.004966819164110348,
|
|
"learning_rate": 4.6717793412953776e-07,
|
|
"loss": 0.17,
|
|
"num_tokens": 416601.0,
|
|
"reward": 0.578125,
|
|
"reward_std": 0.5266626179218292,
|
|
"rewards/equation_reward_func/mean": 0.03125,
|
|
"rewards/equation_reward_func/std": 0.1767766922712326,
|
|
"rewards/format_reward_func/mean": 0.546875,
|
|
"rewards/format_reward_func/std": 0.4994383603334427,
|
|
"sampling/importance_sampling_ratio/max": 2.251875877380371,
|
|
"sampling/importance_sampling_ratio/mean": 0.7426749467849731,
|
|
"sampling/importance_sampling_ratio/min": 0.0,
|
|
"sampling/sampling_logp_difference/max": 0.4947061240673065,
|
|
"sampling/sampling_logp_difference/mean": 0.024829605594277382,
|
|
"step": 20,
|
|
"step_time": 11.085346231702715
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.015625,
|
|
"completions/max_length": 931.5,
|
|
"completions/max_terminated_length": 833.5,
|
|
"completions/mean_length": 474.609375,
|
|
"completions/mean_terminated_length": 465.7101745605469,
|
|
"completions/min_length": 155.5,
|
|
"completions/min_terminated_length": 155.5,
|
|
"entropy": 1.189589723944664,
|
|
"epoch": 0.0019555555555555554,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 1.568319079842556,
|
|
"kl": 0.005503996828338131,
|
|
"learning_rate": 4.5870701325731773e-07,
|
|
"loss": 0.1169,
|
|
"num_tokens": 456736.0,
|
|
"reward": 0.59375,
|
|
"reward_std": 0.5569138079881668,
|
|
"rewards/equation_reward_func/mean": 0.03125,
|
|
"rewards/equation_reward_func/std": 0.12296734005212784,
|
|
"rewards/format_reward_func/mean": 0.5625,
|
|
"rewards/format_reward_func/std": 0.5029991269111633,
|
|
"sampling/importance_sampling_ratio/max": 2.329828381538391,
|
|
"sampling/importance_sampling_ratio/mean": 0.7178902626037598,
|
|
"sampling/importance_sampling_ratio/min": 0.01708686724305153,
|
|
"sampling/sampling_logp_difference/max": 0.7085881531238556,
|
|
"sampling/sampling_logp_difference/mean": 0.023770778439939022,
|
|
"step": 22,
|
|
"step_time": 11.324711623368785
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.03125,
|
|
"completions/max_length": 1024.0,
|
|
"completions/max_terminated_length": 821.0,
|
|
"completions/mean_length": 485.90625,
|
|
"completions/mean_terminated_length": 468.5483856201172,
|
|
"completions/min_length": 174.0,
|
|
"completions/min_terminated_length": 174.0,
|
|
"entropy": 1.2347300872206688,
|
|
"epoch": 0.0021333333333333334,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 1.5103693400119262,
|
|
"kl": 0.0056485196109861135,
|
|
"learning_rate": 4.4936070264068016e-07,
|
|
"loss": 0.0751,
|
|
"num_tokens": 497594.0,
|
|
"reward": 0.65625,
|
|
"reward_std": 0.5729349255561829,
|
|
"rewards/equation_reward_func/mean": 0.0625,
|
|
"rewards/equation_reward_func/std": 0.2364606335759163,
|
|
"rewards/format_reward_func/mean": 0.59375,
|
|
"rewards/format_reward_func/std": 0.49899089336395264,
|
|
"sampling/importance_sampling_ratio/max": 2.513130784034729,
|
|
"sampling/importance_sampling_ratio/mean": 0.6623396277427673,
|
|
"sampling/importance_sampling_ratio/min": 0.052559204399585724,
|
|
"sampling/sampling_logp_difference/max": 0.6741056442260742,
|
|
"sampling/sampling_logp_difference/mean": 0.024482053704559803,
|
|
"step": 24,
|
|
"step_time": 11.668942423537374
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.015625,
|
|
"completions/max_length": 902.5,
|
|
"completions/max_terminated_length": 803.0,
|
|
"completions/mean_length": 479.09375,
|
|
"completions/mean_terminated_length": 469.92791748046875,
|
|
"completions/min_length": 175.5,
|
|
"completions/min_terminated_length": 175.5,
|
|
"entropy": 1.2349426448345184,
|
|
"epoch": 0.002311111111111111,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 1.3908582983128641,
|
|
"kl": 0.008100319362711161,
|
|
"learning_rate": 4.391782039544238e-07,
|
|
"loss": 0.0008,
|
|
"num_tokens": 538080.0,
|
|
"reward": 0.671875,
|
|
"reward_std": 0.532937303185463,
|
|
"rewards/equation_reward_func/mean": 0.046875,
|
|
"rewards/equation_reward_func/std": 0.21135568618774414,
|
|
"rewards/format_reward_func/mean": 0.625,
|
|
"rewards/format_reward_func/std": 0.4874725937843323,
|
|
"sampling/importance_sampling_ratio/max": 1.898095965385437,
|
|
"sampling/importance_sampling_ratio/mean": 0.6496416926383972,
|
|
"sampling/importance_sampling_ratio/min": 0.020795006304979324,
|
|
"sampling/sampling_logp_difference/max": 0.6364502906799316,
|
|
"sampling/sampling_logp_difference/mean": 0.024783543311059475,
|
|
"step": 26,
|
|
"step_time": 11.404050085111521
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.03125,
|
|
"completions/max_length": 1024.0,
|
|
"completions/max_terminated_length": 845.0,
|
|
"completions/mean_length": 475.859375,
|
|
"completions/mean_terminated_length": 458.1773986816406,
|
|
"completions/min_length": 208.0,
|
|
"completions/min_terminated_length": 208.0,
|
|
"entropy": 1.2003328688442707,
|
|
"epoch": 0.002488888888888889,
|
|
"frac_reward_zero_std": 0.125,
|
|
"grad_norm": 1.138678539986136,
|
|
"kl": 0.007092042971635237,
|
|
"learning_rate": 4.282022261367073e-07,
|
|
"loss": -0.0762,
|
|
"num_tokens": 578223.0,
|
|
"reward": 0.734375,
|
|
"reward_std": 0.5117992758750916,
|
|
"rewards/equation_reward_func/mean": 0.03125,
|
|
"rewards/equation_reward_func/std": 0.12296734005212784,
|
|
"rewards/format_reward_func/mean": 0.703125,
|
|
"rewards/format_reward_func/std": 0.4638662487268448,
|
|
"sampling/importance_sampling_ratio/max": 2.868867874145508,
|
|
"sampling/importance_sampling_ratio/mean": 0.8748505115509033,
|
|
"sampling/importance_sampling_ratio/min": 0.0,
|
|
"sampling/sampling_logp_difference/max": 0.386928454041481,
|
|
"sampling/sampling_logp_difference/mean": 0.02374311164021492,
|
|
"step": 28,
|
|
"step_time": 12.464717164519243
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.015625,
|
|
"completions/max_length": 950.5,
|
|
"completions/max_terminated_length": 844.0,
|
|
"completions/mean_length": 486.171875,
|
|
"completions/mean_terminated_length": 477.2635955810547,
|
|
"completions/min_length": 157.5,
|
|
"completions/min_terminated_length": 157.5,
|
|
"entropy": 1.1861281506717205,
|
|
"epoch": 0.0026666666666666666,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 1.0375770292654916,
|
|
"kl": 0.009247849928215146,
|
|
"learning_rate": 4.1647880625292027e-07,
|
|
"loss": -0.0282,
|
|
"num_tokens": 619074.0,
|
|
"reward": 0.78125,
|
|
"reward_std": 0.5395806133747101,
|
|
"rewards/equation_reward_func/mean": 0.0625,
|
|
"rewards/equation_reward_func/std": 0.16800537705421448,
|
|
"rewards/format_reward_func/mean": 0.71875,
|
|
"rewards/format_reward_func/std": 0.45128606259822845,
|
|
"sampling/importance_sampling_ratio/max": 2.3657628297805786,
|
|
"sampling/importance_sampling_ratio/mean": 0.7253041863441467,
|
|
"sampling/importance_sampling_ratio/min": 0.0,
|
|
"sampling/sampling_logp_difference/max": 0.46275559067726135,
|
|
"sampling/sampling_logp_difference/mean": 0.023301721550524235,
|
|
"step": 30,
|
|
"step_time": 13.156236473936588
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 847.5,
|
|
"completions/max_terminated_length": 847.5,
|
|
"completions/mean_length": 408.296875,
|
|
"completions/mean_terminated_length": 408.296875,
|
|
"completions/min_length": 180.5,
|
|
"completions/min_terminated_length": 180.5,
|
|
"entropy": 1.1347957476973534,
|
|
"epoch": 0.0028444444444444446,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 2.3739216172676234,
|
|
"kl": 0.011383652017684653,
|
|
"learning_rate": 4.040571164002318e-07,
|
|
"loss": 0.2237,
|
|
"num_tokens": 654981.0,
|
|
"reward": 0.859375,
|
|
"reward_std": 0.5548828095197678,
|
|
"rewards/equation_reward_func/mean": 0.109375,
|
|
"rewards/equation_reward_func/std": 0.3074183538556099,
|
|
"rewards/format_reward_func/mean": 0.75,
|
|
"rewards/format_reward_func/std": 0.43840841948986053,
|
|
"sampling/importance_sampling_ratio/max": 2.9178740978240967,
|
|
"sampling/importance_sampling_ratio/mean": 0.8742310702800751,
|
|
"sampling/importance_sampling_ratio/min": 0.0,
|
|
"sampling/sampling_logp_difference/max": 0.6507634520530701,
|
|
"sampling/sampling_logp_difference/mean": 0.02339162491261959,
|
|
"step": 32,
|
|
"step_time": 11.533732229378074
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 873.0,
|
|
"completions/max_terminated_length": 873.0,
|
|
"completions/mean_length": 459.3125,
|
|
"completions/mean_terminated_length": 459.3125,
|
|
"completions/min_length": 162.0,
|
|
"completions/min_terminated_length": 162.0,
|
|
"entropy": 1.1569931730628014,
|
|
"epoch": 0.003022222222222222,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 1.6685795867340696,
|
|
"kl": 0.008567514276364818,
|
|
"learning_rate": 3.909892574627266e-07,
|
|
"loss": 0.1375,
|
|
"num_tokens": 694097.0,
|
|
"reward": 1.0,
|
|
"reward_std": 0.5303300619125366,
|
|
"rewards/equation_reward_func/mean": 0.140625,
|
|
"rewards/equation_reward_func/std": 0.34635117650032043,
|
|
"rewards/format_reward_func/mean": 0.859375,
|
|
"rewards/format_reward_func/std": 0.34635117650032043,
|
|
"sampling/importance_sampling_ratio/max": 2.5157090425491333,
|
|
"sampling/importance_sampling_ratio/mean": 0.7955585420131683,
|
|
"sampling/importance_sampling_ratio/min": 0.0,
|
|
"sampling/sampling_logp_difference/max": 0.4740889072418213,
|
|
"sampling/sampling_logp_difference/mean": 0.02304172795265913,
|
|
"step": 34,
|
|
"step_time": 10.761377388611436
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 986.0,
|
|
"completions/max_terminated_length": 986.0,
|
|
"completions/mean_length": 480.90625,
|
|
"completions/mean_terminated_length": 480.90625,
|
|
"completions/min_length": 162.5,
|
|
"completions/min_terminated_length": 162.5,
|
|
"entropy": 1.2352817580103874,
|
|
"epoch": 0.0032,
|
|
"frac_reward_zero_std": 0.125,
|
|
"grad_norm": 1.1771067362481362,
|
|
"kl": 0.010575773718301207,
|
|
"learning_rate": 3.773300405821908e-07,
|
|
"loss": -0.0276,
|
|
"num_tokens": 734619.0,
|
|
"reward": 0.84375,
|
|
"reward_std": 0.5194454491138458,
|
|
"rewards/equation_reward_func/mean": 0.09375,
|
|
"rewards/equation_reward_func/std": 0.19827888906002045,
|
|
"rewards/format_reward_func/mean": 0.75,
|
|
"rewards/format_reward_func/std": 0.4337434321641922,
|
|
"sampling/importance_sampling_ratio/max": 2.6484490633010864,
|
|
"sampling/importance_sampling_ratio/mean": 0.782801479101181,
|
|
"sampling/importance_sampling_ratio/min": 0.0,
|
|
"sampling/sampling_logp_difference/max": 0.4273035526275635,
|
|
"sampling/sampling_logp_difference/mean": 0.024281970225274563,
|
|
"step": 36,
|
|
"step_time": 11.392230691039003
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.015625,
|
|
"completions/max_length": 886.5,
|
|
"completions/max_terminated_length": 880.5,
|
|
"completions/mean_length": 444.296875,
|
|
"completions/mean_terminated_length": 436.0539245605469,
|
|
"completions/min_length": 162.5,
|
|
"completions/min_terminated_length": 162.5,
|
|
"entropy": 1.15547876060009,
|
|
"epoch": 0.0033777777777777777,
|
|
"frac_reward_zero_std": 0.125,
|
|
"grad_norm": 3.7113272904633505,
|
|
"kl": 0.01313028542790562,
|
|
"learning_rate": 3.6313675726113475e-07,
|
|
"loss": 0.066,
|
|
"num_tokens": 772838.0,
|
|
"reward": 0.921875,
|
|
"reward_std": 0.40442168712615967,
|
|
"rewards/equation_reward_func/mean": 0.046875,
|
|
"rewards/equation_reward_func/std": 0.1480722874403,
|
|
"rewards/format_reward_func/mean": 0.875,
|
|
"rewards/format_reward_func/std": 0.33252330124378204,
|
|
"sampling/importance_sampling_ratio/max": 2.566834807395935,
|
|
"sampling/importance_sampling_ratio/mean": 0.829992413520813,
|
|
"sampling/importance_sampling_ratio/min": 0.0,
|
|
"sampling/sampling_logp_difference/max": 0.5897403955459595,
|
|
"sampling/sampling_logp_difference/mean": 0.023365739732980728,
|
|
"step": 38,
|
|
"step_time": 10.986438499065116
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.046875,
|
|
"completions/max_length": 1024.0,
|
|
"completions/max_terminated_length": 890.0,
|
|
"completions/mean_length": 527.90625,
|
|
"completions/mean_terminated_length": 503.8693542480469,
|
|
"completions/min_length": 215.0,
|
|
"completions/min_terminated_length": 215.0,
|
|
"entropy": 1.2300068363547325,
|
|
"epoch": 0.0035555555555555557,
|
|
"frac_reward_zero_std": 0.125,
|
|
"grad_norm": 1.6216510890549307,
|
|
"kl": 0.01843011923483573,
|
|
"learning_rate": 3.484689390623218e-07,
|
|
"loss": 0.0636,
|
|
"num_tokens": 816408.0,
|
|
"reward": 0.796875,
|
|
"reward_std": 0.4437006711959839,
|
|
"rewards/equation_reward_func/mean": 0.015625,
|
|
"rewards/equation_reward_func/std": 0.0883883461356163,
|
|
"rewards/format_reward_func/mean": 0.78125,
|
|
"rewards/format_reward_func/std": 0.41824956238269806,
|
|
"sampling/importance_sampling_ratio/max": 2.566046714782715,
|
|
"sampling/importance_sampling_ratio/mean": 0.8348058462142944,
|
|
"sampling/importance_sampling_ratio/min": 0.0,
|
|
"sampling/sampling_logp_difference/max": 1.2292534112930298,
|
|
"sampling/sampling_logp_difference/mean": 0.0241514528170228,
|
|
"step": 40,
|
|
"step_time": 11.432389895198867
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.015625,
|
|
"completions/max_length": 916.5,
|
|
"completions/max_terminated_length": 799.0,
|
|
"completions/mean_length": 489.21875,
|
|
"completions/mean_terminated_length": 480.28125,
|
|
"completions/min_length": 213.0,
|
|
"completions/min_terminated_length": 213.0,
|
|
"entropy": 1.2188931107521057,
|
|
"epoch": 0.0037333333333333333,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 0.9636777693135964,
|
|
"kl": 0.01053301602951251,
|
|
"learning_rate": 3.3338810791270517e-07,
|
|
"loss": 0.0198,
|
|
"num_tokens": 857494.0,
|
|
"reward": 0.921875,
|
|
"reward_std": 0.4847814738750458,
|
|
"rewards/equation_reward_func/mean": 0.078125,
|
|
"rewards/equation_reward_func/std": 0.2710396274924278,
|
|
"rewards/format_reward_func/mean": 0.84375,
|
|
"rewards/format_reward_func/std": 0.3689020276069641,
|
|
"sampling/importance_sampling_ratio/max": 1.9368165135383606,
|
|
"sampling/importance_sampling_ratio/mean": 0.5968808531761169,
|
|
"sampling/importance_sampling_ratio/min": 0.0,
|
|
"sampling/sampling_logp_difference/max": 0.6993489265441895,
|
|
"sampling/sampling_logp_difference/mean": 0.02366031240671873,
|
|
"step": 42,
|
|
"step_time": 11.470696586417034
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.015625,
|
|
"completions/max_length": 1009.5,
|
|
"completions/max_terminated_length": 985.0,
|
|
"completions/mean_length": 440.140625,
|
|
"completions/mean_terminated_length": 431.2227783203125,
|
|
"completions/min_length": 168.0,
|
|
"completions/min_terminated_length": 168.0,
|
|
"entropy": 1.1604873463511467,
|
|
"epoch": 0.003911111111111111,
|
|
"frac_reward_zero_std": 0.125,
|
|
"grad_norm": 1.2045214320430864,
|
|
"kl": 0.010937573766568676,
|
|
"learning_rate": 3.179575180590857e-07,
|
|
"loss": -0.0545,
|
|
"num_tokens": 895415.0,
|
|
"reward": 0.921875,
|
|
"reward_std": 0.43845126032829285,
|
|
"rewards/equation_reward_func/mean": 0.078125,
|
|
"rewards/equation_reward_func/std": 0.18445101380348206,
|
|
"rewards/format_reward_func/mean": 0.84375,
|
|
"rewards/format_reward_func/std": 0.3689020276069641,
|
|
"sampling/importance_sampling_ratio/max": 2.7295889854431152,
|
|
"sampling/importance_sampling_ratio/mean": 0.689439982175827,
|
|
"sampling/importance_sampling_ratio/min": 0.0,
|
|
"sampling/sampling_logp_difference/max": 0.5454497933387756,
|
|
"sampling/sampling_logp_difference/mean": 0.023939015343785286,
|
|
"step": 44,
|
|
"step_time": 11.862673799390905
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0625,
|
|
"completions/max_length": 1024.0,
|
|
"completions/max_terminated_length": 985.5,
|
|
"completions/mean_length": 501.640625,
|
|
"completions/mean_terminated_length": 466.8166809082031,
|
|
"completions/min_length": 209.5,
|
|
"completions/min_terminated_length": 209.5,
|
|
"entropy": 1.2363386303186417,
|
|
"epoch": 0.004088888888888889,
|
|
"frac_reward_zero_std": 0.125,
|
|
"grad_norm": 1.7502632857924376,
|
|
"kl": 0.008666158799314871,
|
|
"learning_rate": 3.022418907578188e-07,
|
|
"loss": 0.0883,
|
|
"num_tokens": 937352.0,
|
|
"reward": 0.9375,
|
|
"reward_std": 0.49288448691368103,
|
|
"rewards/equation_reward_func/mean": 0.09375,
|
|
"rewards/equation_reward_func/std": 0.27283935993909836,
|
|
"rewards/format_reward_func/mean": 0.84375,
|
|
"rewards/format_reward_func/std": 0.3662842661142349,
|
|
"sampling/importance_sampling_ratio/max": 2.052061378955841,
|
|
"sampling/importance_sampling_ratio/mean": 0.7052308619022369,
|
|
"sampling/importance_sampling_ratio/min": 0.0,
|
|
"sampling/sampling_logp_difference/max": 0.9969043135643005,
|
|
"sampling/sampling_logp_difference/mean": 0.02375571522861719,
|
|
"step": 46,
|
|
"step_time": 11.741644776426256
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 876.0,
|
|
"completions/max_terminated_length": 876.0,
|
|
"completions/mean_length": 458.84375,
|
|
"completions/mean_terminated_length": 458.84375,
|
|
"completions/min_length": 144.0,
|
|
"completions/min_terminated_length": 144.0,
|
|
"entropy": 1.1406226977705956,
|
|
"epoch": 0.004266666666666667,
|
|
"frac_reward_zero_std": 0.125,
|
|
"grad_norm": 2.1569663279164213,
|
|
"kl": 0.011335209826938808,
|
|
"learning_rate": 2.863071428113726e-07,
|
|
"loss": 0.0549,
|
|
"num_tokens": 976550.0,
|
|
"reward": 0.9375,
|
|
"reward_std": 0.458977147936821,
|
|
"rewards/equation_reward_func/mean": 0.078125,
|
|
"rewards/equation_reward_func/std": 0.2563937231898308,
|
|
"rewards/format_reward_func/mean": 0.859375,
|
|
"rewards/format_reward_func/std": 0.34635117650032043,
|
|
"sampling/importance_sampling_ratio/max": 2.231783628463745,
|
|
"sampling/importance_sampling_ratio/mean": 0.7619136869907379,
|
|
"sampling/importance_sampling_ratio/min": 0.0,
|
|
"sampling/sampling_logp_difference/max": 0.5767736434936523,
|
|
"sampling/sampling_logp_difference/mean": 0.023336266167461872,
|
|
"step": 48,
|
|
"step_time": 11.434072194388136
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 920.5,
|
|
"completions/max_terminated_length": 920.5,
|
|
"completions/mean_length": 459.34375,
|
|
"completions/mean_terminated_length": 459.34375,
|
|
"completions/min_length": 156.0,
|
|
"completions/min_terminated_length": 156.0,
|
|
"entropy": 1.2100868672132492,
|
|
"epoch": 0.0044444444444444444,
|
|
"frac_reward_zero_std": 0.125,
|
|
"grad_norm": 1.728209453317298,
|
|
"kl": 0.01852725149365142,
|
|
"learning_rate": 2.7022011009035107e-07,
|
|
"loss": 0.1187,
|
|
"num_tokens": 1015740.0,
|
|
"reward": 0.890625,
|
|
"reward_std": 0.4356408715248108,
|
|
"rewards/equation_reward_func/mean": 0.046875,
|
|
"rewards/equation_reward_func/std": 0.21135568618774414,
|
|
"rewards/format_reward_func/mean": 0.84375,
|
|
"rewards/format_reward_func/std": 0.3580790013074875,
|
|
"sampling/importance_sampling_ratio/max": 2.3857074975967407,
|
|
"sampling/importance_sampling_ratio/mean": 0.827456921339035,
|
|
"sampling/importance_sampling_ratio/min": 0.08825718238949776,
|
|
"sampling/sampling_logp_difference/max": 0.6624305248260498,
|
|
"sampling/sampling_logp_difference/mean": 0.023832999169826508,
|
|
"step": 50,
|
|
"step_time": 11.557595945429057
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.015625,
|
|
"completions/max_length": 909.0,
|
|
"completions/max_terminated_length": 836.0,
|
|
"completions/mean_length": 452.5,
|
|
"completions/mean_terminated_length": 443.2046356201172,
|
|
"completions/min_length": 201.0,
|
|
"completions/min_terminated_length": 201.0,
|
|
"entropy": 1.1419423446059227,
|
|
"epoch": 0.004622222222222222,
|
|
"frac_reward_zero_std": 0.25,
|
|
"grad_norm": 1.6088939390705712,
|
|
"kl": 0.012844312441302463,
|
|
"learning_rate": 2.540482672006254e-07,
|
|
"loss": -0.0029,
|
|
"num_tokens": 1054428.0,
|
|
"reward": 0.96875,
|
|
"reward_std": 0.46261392533779144,
|
|
"rewards/equation_reward_func/mean": 0.09375,
|
|
"rewards/equation_reward_func/std": 0.27283935993909836,
|
|
"rewards/format_reward_func/mean": 0.875,
|
|
"rewards/format_reward_func/std": 0.33252330124378204,
|
|
"sampling/importance_sampling_ratio/max": 2.2242526412010193,
|
|
"sampling/importance_sampling_ratio/mean": 0.7612048387527466,
|
|
"sampling/importance_sampling_ratio/min": 0.0,
|
|
"sampling/sampling_logp_difference/max": 0.6042303442955017,
|
|
"sampling/sampling_logp_difference/mean": 0.023195499554276466,
|
|
"step": 52,
|
|
"step_time": 11.590359390014783
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.03125,
|
|
"completions/max_length": 1024.0,
|
|
"completions/max_terminated_length": 817.0,
|
|
"completions/mean_length": 422.1875,
|
|
"completions/mean_terminated_length": 402.77418518066406,
|
|
"completions/min_length": 132.0,
|
|
"completions/min_terminated_length": 132.0,
|
|
"entropy": 1.0865775719285011,
|
|
"epoch": 0.0048,
|
|
"frac_reward_zero_std": 0.125,
|
|
"grad_norm": 1.370478804799173,
|
|
"kl": 0.012116663216147572,
|
|
"learning_rate": 2.37859444471388e-07,
|
|
"loss": 0.0846,
|
|
"num_tokens": 1091144.0,
|
|
"reward": 1.109375,
|
|
"reward_std": 0.47186581790447235,
|
|
"rewards/equation_reward_func/mean": 0.171875,
|
|
"rewards/equation_reward_func/std": 0.378012090921402,
|
|
"rewards/format_reward_func/mean": 0.9375,
|
|
"rewards/format_reward_func/std": 0.2364606335759163,
|
|
"sampling/importance_sampling_ratio/max": 2.3138152956962585,
|
|
"sampling/importance_sampling_ratio/mean": 0.7851577699184418,
|
|
"sampling/importance_sampling_ratio/min": 0.0,
|
|
"sampling/sampling_logp_difference/max": 0.5188883543014526,
|
|
"sampling/sampling_logp_difference/mean": 0.022715235128998756,
|
|
"step": 54,
|
|
"step_time": 13.548070080811158
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 1007.0,
|
|
"completions/max_terminated_length": 1007.0,
|
|
"completions/mean_length": 452.09375,
|
|
"completions/mean_terminated_length": 452.09375,
|
|
"completions/min_length": 160.5,
|
|
"completions/min_terminated_length": 160.5,
|
|
"entropy": 1.1417308785021305,
|
|
"epoch": 0.004977777777777778,
|
|
"frac_reward_zero_std": 0.5,
|
|
"grad_norm": 1.4286278134907313,
|
|
"kl": 0.012324006151175126,
|
|
"learning_rate": 2.2172154345117894e-07,
|
|
"loss": 0.0483,
|
|
"num_tokens": 1129814.0,
|
|
"reward": 0.9375,
|
|
"reward_std": 0.30280280113220215,
|
|
"rewards/equation_reward_func/mean": 0.015625,
|
|
"rewards/equation_reward_func/std": 0.0883883461356163,
|
|
"rewards/format_reward_func/mean": 0.921875,
|
|
"rewards/format_reward_func/std": 0.2710396274924278,
|
|
"sampling/importance_sampling_ratio/max": 2.3811429738998413,
|
|
"sampling/importance_sampling_ratio/mean": 0.8520744442939758,
|
|
"sampling/importance_sampling_ratio/min": 0.0,
|
|
"sampling/sampling_logp_difference/max": 0.6289049386978149,
|
|
"sampling/sampling_logp_difference/mean": 0.02345012128353119,
|
|
"step": 56,
|
|
"step_time": 12.447777467081323
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.015625,
|
|
"completions/max_length": 931.0,
|
|
"completions/max_terminated_length": 871.0,
|
|
"completions/mean_length": 443.71875,
|
|
"completions/mean_terminated_length": 434.61692810058594,
|
|
"completions/min_length": 156.0,
|
|
"completions/min_terminated_length": 156.0,
|
|
"entropy": 1.1414429359138012,
|
|
"epoch": 0.005155555555555556,
|
|
"frac_reward_zero_std": 0.25,
|
|
"grad_norm": 1.63905040062684,
|
|
"kl": 0.018355112959397957,
|
|
"learning_rate": 2.0570225210519433e-07,
|
|
"loss": 0.0939,
|
|
"num_tokens": 1167916.0,
|
|
"reward": 0.984375,
|
|
"reward_std": 0.4889104962348938,
|
|
"rewards/equation_reward_func/mean": 0.109375,
|
|
"rewards/equation_reward_func/std": 0.31607766449451447,
|
|
"rewards/format_reward_func/mean": 0.875,
|
|
"rewards/format_reward_func/std": 0.33252330124378204,
|
|
"sampling/importance_sampling_ratio/max": 1.8928932547569275,
|
|
"sampling/importance_sampling_ratio/mean": 0.7523285746574402,
|
|
"sampling/importance_sampling_ratio/min": 0.0,
|
|
"sampling/sampling_logp_difference/max": 0.5896574258804321,
|
|
"sampling/sampling_logp_difference/mean": 0.02342431340366602,
|
|
"step": 58,
|
|
"step_time": 10.975074479705654
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.015625,
|
|
"completions/max_length": 958.5,
|
|
"completions/max_terminated_length": 940.5,
|
|
"completions/mean_length": 453.6875,
|
|
"completions/mean_terminated_length": 444.2217712402344,
|
|
"completions/min_length": 161.0,
|
|
"completions/min_terminated_length": 161.0,
|
|
"entropy": 1.136627346277237,
|
|
"epoch": 0.005333333333333333,
|
|
"frac_reward_zero_std": 0.25,
|
|
"grad_norm": 1.401252124600878,
|
|
"kl": 0.010728320659836754,
|
|
"learning_rate": 1.8986876090843664e-07,
|
|
"loss": 0.0514,
|
|
"num_tokens": 1206712.0,
|
|
"reward": 1.0,
|
|
"reward_std": 0.346970796585083,
|
|
"rewards/equation_reward_func/mean": 0.0625,
|
|
"rewards/equation_reward_func/std": 0.2364606335759163,
|
|
"rewards/format_reward_func/mean": 0.9375,
|
|
"rewards/format_reward_func/std": 0.2364606335759163,
|
|
"sampling/importance_sampling_ratio/max": 2.603978991508484,
|
|
"sampling/importance_sampling_ratio/mean": 0.6841319799423218,
|
|
"sampling/importance_sampling_ratio/min": 0.0,
|
|
"sampling/sampling_logp_difference/max": 0.5754774510860443,
|
|
"sampling/sampling_logp_difference/mean": 0.02338473778218031,
|
|
"step": 60,
|
|
"step_time": 11.381625101552345
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.03125,
|
|
"completions/max_length": 1024.0,
|
|
"completions/max_terminated_length": 881.0,
|
|
"completions/mean_length": 465.359375,
|
|
"completions/mean_terminated_length": 447.3386993408203,
|
|
"completions/min_length": 151.5,
|
|
"completions/min_terminated_length": 151.5,
|
|
"entropy": 1.1641199924051762,
|
|
"epoch": 0.005511111111111111,
|
|
"frac_reward_zero_std": 0.25,
|
|
"grad_norm": 1.0389216878895204,
|
|
"kl": 0.012553387088701129,
|
|
"learning_rate": 1.7428748102551234e-07,
|
|
"loss": -0.0206,
|
|
"num_tokens": 1246335.0,
|
|
"reward": 0.859375,
|
|
"reward_std": 0.42767396569252014,
|
|
"rewards/equation_reward_func/mean": 0.03125,
|
|
"rewards/equation_reward_func/std": 0.12296734005212784,
|
|
"rewards/format_reward_func/mean": 0.828125,
|
|
"rewards/format_reward_func/std": 0.378012090921402,
|
|
"sampling/importance_sampling_ratio/max": 2.5562033653259277,
|
|
"sampling/importance_sampling_ratio/mean": 0.6826831996440887,
|
|
"sampling/importance_sampling_ratio/min": 0.0,
|
|
"sampling/sampling_logp_difference/max": 0.546184629201889,
|
|
"sampling/sampling_logp_difference/mean": 0.02384145464748144,
|
|
"step": 62,
|
|
"step_time": 11.716152066364884
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 873.0,
|
|
"completions/max_terminated_length": 873.0,
|
|
"completions/mean_length": 472.4375,
|
|
"completions/mean_terminated_length": 472.4375,
|
|
"completions/min_length": 165.0,
|
|
"completions/min_terminated_length": 165.0,
|
|
"entropy": 1.1974952705204487,
|
|
"epoch": 0.005688888888888889,
|
|
"frac_reward_zero_std": 0.375,
|
|
"grad_norm": 0.8405866770711337,
|
|
"kl": 0.011443487601354718,
|
|
"learning_rate": 1.5902376575912814e-07,
|
|
"loss": 0.0695,
|
|
"num_tokens": 1286315.0,
|
|
"reward": 0.984375,
|
|
"reward_std": 0.43038569390773773,
|
|
"rewards/equation_reward_func/mean": 0.09375,
|
|
"rewards/equation_reward_func/std": 0.19827888906002045,
|
|
"rewards/format_reward_func/mean": 0.890625,
|
|
"rewards/format_reward_func/std": 0.3074183538556099,
|
|
"sampling/importance_sampling_ratio/max": 2.7265862226486206,
|
|
"sampling/importance_sampling_ratio/mean": 0.8386669158935547,
|
|
"sampling/importance_sampling_ratio/min": 0.0,
|
|
"sampling/sampling_logp_difference/max": 0.4937933683395386,
|
|
"sampling/sampling_logp_difference/mean": 0.024114561267197132,
|
|
"step": 64,
|
|
"step_time": 11.237366109970026
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.015625,
|
|
"completions/max_length": 970.5,
|
|
"completions/max_terminated_length": 838.5,
|
|
"completions/mean_length": 450.359375,
|
|
"completions/mean_terminated_length": 441.2227783203125,
|
|
"completions/min_length": 193.0,
|
|
"completions/min_terminated_length": 193.0,
|
|
"entropy": 1.1044728867709637,
|
|
"epoch": 0.005866666666666667,
|
|
"frac_reward_zero_std": 0.25,
|
|
"grad_norm": 1.423044848014041,
|
|
"kl": 0.010465700703207403,
|
|
"learning_rate": 1.4414163643562753e-07,
|
|
"loss": 0.2456,
|
|
"num_tokens": 1324906.0,
|
|
"reward": 1.0625,
|
|
"reward_std": 0.49249379336833954,
|
|
"rewards/equation_reward_func/mean": 0.15625,
|
|
"rewards/equation_reward_func/std": 0.3662842661142349,
|
|
"rewards/format_reward_func/mean": 0.90625,
|
|
"rewards/format_reward_func/std": 0.27283935993909836,
|
|
"sampling/importance_sampling_ratio/max": 2.4483842849731445,
|
|
"sampling/importance_sampling_ratio/mean": 0.8248867988586426,
|
|
"sampling/importance_sampling_ratio/min": 0.05962574481964111,
|
|
"sampling/sampling_logp_difference/max": 0.7402658462524414,
|
|
"sampling/sampling_logp_difference/mean": 0.022917624562978745,
|
|
"step": 66,
|
|
"step_time": 11.092585265287198
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 834.5,
|
|
"completions/max_terminated_length": 834.5,
|
|
"completions/mean_length": 434.6875,
|
|
"completions/mean_terminated_length": 434.6875,
|
|
"completions/min_length": 175.0,
|
|
"completions/min_terminated_length": 175.0,
|
|
"entropy": 1.1426044255495071,
|
|
"epoch": 0.006044444444444444,
|
|
"frac_reward_zero_std": 0.25,
|
|
"grad_norm": 2.225802835067467,
|
|
"kl": 0.013302074105013162,
|
|
"learning_rate": 1.2970351387729872e-07,
|
|
"loss": 0.027,
|
|
"num_tokens": 1362414.0,
|
|
"reward": 1.015625,
|
|
"reward_std": 0.3343358188867569,
|
|
"rewards/equation_reward_func/mean": 0.0625,
|
|
"rewards/equation_reward_func/std": 0.24593468010425568,
|
|
"rewards/format_reward_func/mean": 0.953125,
|
|
"rewards/format_reward_func/std": 0.21135568618774414,
|
|
"sampling/importance_sampling_ratio/max": 2.4642796516418457,
|
|
"sampling/importance_sampling_ratio/mean": 0.7873663902282715,
|
|
"sampling/importance_sampling_ratio/min": 0.0,
|
|
"sampling/sampling_logp_difference/max": 0.51364666223526,
|
|
"sampling/sampling_logp_difference/mean": 0.023219610564410686,
|
|
"step": 68,
|
|
"step_time": 10.780604753526859
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.03125,
|
|
"completions/max_length": 1024.0,
|
|
"completions/max_terminated_length": 959.5,
|
|
"completions/mean_length": 473.90625,
|
|
"completions/mean_terminated_length": 456.1612854003906,
|
|
"completions/min_length": 175.5,
|
|
"completions/min_terminated_length": 175.5,
|
|
"entropy": 1.1317069344222546,
|
|
"epoch": 0.006222222222222222,
|
|
"frac_reward_zero_std": 0.25,
|
|
"grad_norm": 0.8320287770406333,
|
|
"kl": 0.009612397610908374,
|
|
"learning_rate": 1.1576995658775404e-07,
|
|
"loss": 0.1579,
|
|
"num_tokens": 1402544.0,
|
|
"reward": 0.9375,
|
|
"reward_std": 0.3952517956495285,
|
|
"rewards/equation_reward_func/mean": 0.046875,
|
|
"rewards/equation_reward_func/std": 0.21135568618774414,
|
|
"rewards/format_reward_func/mean": 0.890625,
|
|
"rewards/format_reward_func/std": 0.31607766449451447,
|
|
"sampling/importance_sampling_ratio/max": 2.471542716026306,
|
|
"sampling/importance_sampling_ratio/mean": 0.7965124845504761,
|
|
"sampling/importance_sampling_ratio/min": 0.0,
|
|
"sampling/sampling_logp_difference/max": 0.6067328453063965,
|
|
"sampling/sampling_logp_difference/mean": 0.023130498826503754,
|
|
"step": 70,
|
|
"step_time": 11.402438224526122
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 980.5,
|
|
"completions/max_terminated_length": 980.5,
|
|
"completions/mean_length": 444.0625,
|
|
"completions/mean_terminated_length": 444.0625,
|
|
"completions/min_length": 204.0,
|
|
"completions/min_terminated_length": 204.0,
|
|
"entropy": 1.1372172087430954,
|
|
"epoch": 0.0064,
|
|
"frac_reward_zero_std": 0.25,
|
|
"grad_norm": 2.599974610451281,
|
|
"kl": 0.018978118430823088,
|
|
"learning_rate": 1.0239940674851941e-07,
|
|
"loss": 0.1768,
|
|
"num_tokens": 1440692.0,
|
|
"reward": 0.90625,
|
|
"reward_std": 0.3873825669288635,
|
|
"rewards/equation_reward_func/mean": 0.03125,
|
|
"rewards/equation_reward_func/std": 0.1767766922712326,
|
|
"rewards/format_reward_func/mean": 0.875,
|
|
"rewards/format_reward_func/std": 0.33252330124378204,
|
|
"sampling/importance_sampling_ratio/max": 2.7498836517333984,
|
|
"sampling/importance_sampling_ratio/mean": 0.9847005903720856,
|
|
"sampling/importance_sampling_ratio/min": 0.034967128187417984,
|
|
"sampling/sampling_logp_difference/max": 0.48557257652282715,
|
|
"sampling/sampling_logp_difference/mean": 0.023576208390295506,
|
|
"step": 72,
|
|
"step_time": 11.201861241133884
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.046875,
|
|
"completions/max_length": 1024.0,
|
|
"completions/max_terminated_length": 806.5,
|
|
"completions/mean_length": 495.859375,
|
|
"completions/mean_terminated_length": 469.6763458251953,
|
|
"completions/min_length": 194.5,
|
|
"completions/min_terminated_length": 194.5,
|
|
"entropy": 1.099432535469532,
|
|
"epoch": 0.006577777777777778,
|
|
"frac_reward_zero_std": 0.25,
|
|
"grad_norm": 0.84127080621119,
|
|
"kl": 0.010809146391693503,
|
|
"learning_rate": 8.964794509221507e-08,
|
|
"loss": 0.006,
|
|
"num_tokens": 1482227.0,
|
|
"reward": 0.90625,
|
|
"reward_std": 0.4278488904237747,
|
|
"rewards/equation_reward_func/mean": 0.046875,
|
|
"rewards/equation_reward_func/std": 0.21135568618774414,
|
|
"rewards/format_reward_func/mean": 0.859375,
|
|
"rewards/format_reward_func/std": 0.35245639085769653,
|
|
"sampling/importance_sampling_ratio/max": 2.2808090448379517,
|
|
"sampling/importance_sampling_ratio/mean": 0.7656073570251465,
|
|
"sampling/importance_sampling_ratio/min": 0.0,
|
|
"sampling/sampling_logp_difference/max": 0.6886538863182068,
|
|
"sampling/sampling_logp_difference/mean": 0.022277969866991043,
|
|
"step": 74,
|
|
"step_time": 11.536407661740668
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.015625,
|
|
"completions/max_length": 982.0,
|
|
"completions/max_terminated_length": 961.5,
|
|
"completions/mean_length": 452.0625,
|
|
"completions/mean_terminated_length": 443.0055389404297,
|
|
"completions/min_length": 176.5,
|
|
"completions/min_terminated_length": 176.5,
|
|
"entropy": 1.1252660602331161,
|
|
"epoch": 0.0067555555555555554,
|
|
"frac_reward_zero_std": 0.5,
|
|
"grad_norm": 1.0980278842121856,
|
|
"kl": 0.012123662454541773,
|
|
"learning_rate": 7.756905568047392e-08,
|
|
"loss": -0.0284,
|
|
"num_tokens": 1520855.0,
|
|
"reward": 0.984375,
|
|
"reward_std": 0.37185215950012207,
|
|
"rewards/equation_reward_func/mean": 0.0625,
|
|
"rewards/equation_reward_func/std": 0.2364606335759163,
|
|
"rewards/format_reward_func/mean": 0.921875,
|
|
"rewards/format_reward_func/std": 0.2563937231898308,
|
|
"sampling/importance_sampling_ratio/max": 2.532285451889038,
|
|
"sampling/importance_sampling_ratio/mean": 0.7447502017021179,
|
|
"sampling/importance_sampling_ratio/min": 0.0,
|
|
"sampling/sampling_logp_difference/max": 0.8389689922332764,
|
|
"sampling/sampling_logp_difference/mean": 0.022948664613068104,
|
|
"step": 76,
|
|
"step_time": 11.632032112334855
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.015625,
|
|
"completions/max_length": 937.0,
|
|
"completions/max_terminated_length": 863.5,
|
|
"completions/mean_length": 457.578125,
|
|
"completions/mean_terminated_length": 449.3684387207031,
|
|
"completions/min_length": 199.5,
|
|
"completions/min_terminated_length": 199.5,
|
|
"entropy": 1.1281827799975872,
|
|
"epoch": 0.006933333333333333,
|
|
"frac_reward_zero_std": 0.375,
|
|
"grad_norm": 0.7703809090962176,
|
|
"kl": 0.011204674287000671,
|
|
"learning_rate": 6.621340157319996e-08,
|
|
"loss": 0.025,
|
|
"num_tokens": 1559956.0,
|
|
"reward": 0.984375,
|
|
"reward_std": 0.26799365133047104,
|
|
"rewards/equation_reward_func/mean": 0.03125,
|
|
"rewards/equation_reward_func/std": 0.12296734005212784,
|
|
"rewards/format_reward_func/mean": 0.953125,
|
|
"rewards/format_reward_func/std": 0.21135568618774414,
|
|
"sampling/importance_sampling_ratio/max": 2.72747004032135,
|
|
"sampling/importance_sampling_ratio/mean": 0.7423470318317413,
|
|
"sampling/importance_sampling_ratio/min": 0.0,
|
|
"sampling/sampling_logp_difference/max": 0.5065812468528748,
|
|
"sampling/sampling_logp_difference/mean": 0.023134512826800346,
|
|
"step": 78,
|
|
"step_time": 11.912406253744848
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.015625,
|
|
"completions/max_length": 917.5,
|
|
"completions/max_terminated_length": 906.0,
|
|
"completions/mean_length": 449.234375,
|
|
"completions/mean_terminated_length": 439.91380310058594,
|
|
"completions/min_length": 168.5,
|
|
"completions/min_terminated_length": 168.5,
|
|
"entropy": 1.1397205702960491,
|
|
"epoch": 0.0071111111111111115,
|
|
"frac_reward_zero_std": 0.25,
|
|
"grad_norm": 1.157999149874959,
|
|
"kl": 0.010615026520099491,
|
|
"learning_rate": 5.5628612330087724e-08,
|
|
"loss": 0.1383,
|
|
"num_tokens": 1598475.0,
|
|
"reward": 0.96875,
|
|
"reward_std": 0.348248615860939,
|
|
"rewards/equation_reward_func/mean": 0.046875,
|
|
"rewards/equation_reward_func/std": 0.1480722874403,
|
|
"rewards/format_reward_func/mean": 0.921875,
|
|
"rewards/format_reward_func/std": 0.2710396274924278,
|
|
"sampling/importance_sampling_ratio/max": 2.596957802772522,
|
|
"sampling/importance_sampling_ratio/mean": 0.781058132648468,
|
|
"sampling/importance_sampling_ratio/min": 0.047918595373630524,
|
|
"sampling/sampling_logp_difference/max": 0.6418575942516327,
|
|
"sampling/sampling_logp_difference/mean": 0.023468288592994213,
|
|
"step": 80,
|
|
"step_time": 13.718183192540891
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.046875,
|
|
"completions/max_length": 1024.0,
|
|
"completions/max_terminated_length": 944.5,
|
|
"completions/mean_length": 480.703125,
|
|
"completions/mean_terminated_length": 453.83656311035156,
|
|
"completions/min_length": 184.5,
|
|
"completions/min_terminated_length": 184.5,
|
|
"entropy": 1.099193848669529,
|
|
"epoch": 0.007288888888888889,
|
|
"frac_reward_zero_std": 0.5,
|
|
"grad_norm": 0.6986674805632416,
|
|
"kl": 0.01772310130763799,
|
|
"learning_rate": 4.5859084235697235e-08,
|
|
"loss": 0.0228,
|
|
"num_tokens": 1639032.0,
|
|
"reward": 1.0,
|
|
"reward_std": 0.3549068421125412,
|
|
"rewards/equation_reward_func/mean": 0.0625,
|
|
"rewards/equation_reward_func/std": 0.2364606335759163,
|
|
"rewards/format_reward_func/mean": 0.9375,
|
|
"rewards/format_reward_func/std": 0.24593468010425568,
|
|
"sampling/importance_sampling_ratio/max": 2.1860596537590027,
|
|
"sampling/importance_sampling_ratio/mean": 0.8112323880195618,
|
|
"sampling/importance_sampling_ratio/min": 0.0,
|
|
"sampling/sampling_logp_difference/max": 0.4584404528141022,
|
|
"sampling/sampling_logp_difference/mean": 0.022659837268292904,
|
|
"step": 82,
|
|
"step_time": 12.119231592281722
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 1011.5,
|
|
"completions/max_terminated_length": 1011.5,
|
|
"completions/mean_length": 462.8125,
|
|
"completions/mean_terminated_length": 462.8125,
|
|
"completions/min_length": 165.5,
|
|
"completions/min_terminated_length": 165.5,
|
|
"entropy": 1.1106774359941483,
|
|
"epoch": 0.007466666666666667,
|
|
"frac_reward_zero_std": 0.5,
|
|
"grad_norm": 0.4259301507183561,
|
|
"kl": 0.010471002315171063,
|
|
"learning_rate": 3.6945794086007705e-08,
|
|
"loss": -0.0222,
|
|
"num_tokens": 1678484.0,
|
|
"reward": 0.984375,
|
|
"reward_std": 0.27769785374403,
|
|
"rewards/equation_reward_func/mean": 0.03125,
|
|
"rewards/equation_reward_func/std": 0.12296734005212784,
|
|
"rewards/format_reward_func/mean": 0.953125,
|
|
"rewards/format_reward_func/std": 0.21135568618774414,
|
|
"sampling/importance_sampling_ratio/max": 2.785693407058716,
|
|
"sampling/importance_sampling_ratio/mean": 0.7308302521705627,
|
|
"sampling/importance_sampling_ratio/min": 0.0,
|
|
"sampling/sampling_logp_difference/max": 0.5246871709823608,
|
|
"sampling/sampling_logp_difference/mean": 0.023393068462610245,
|
|
"step": 84,
|
|
"step_time": 11.7816762131406
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0625,
|
|
"completions/max_length": 1024.0,
|
|
"completions/max_terminated_length": 868.5,
|
|
"completions/mean_length": 492.796875,
|
|
"completions/mean_terminated_length": 457.38336181640625,
|
|
"completions/min_length": 166.0,
|
|
"completions/min_terminated_length": 166.0,
|
|
"entropy": 1.1102127395570278,
|
|
"epoch": 0.007644444444444444,
|
|
"frac_reward_zero_std": 0.125,
|
|
"grad_norm": 1.0147696038970972,
|
|
"kl": 0.013029435416683555,
|
|
"learning_rate": 2.892612731749414e-08,
|
|
"loss": 0.1513,
|
|
"num_tokens": 1719751.0,
|
|
"reward": 0.953125,
|
|
"reward_std": 0.4890725910663605,
|
|
"rewards/equation_reward_func/mean": 0.109375,
|
|
"rewards/equation_reward_func/std": 0.31607766449451447,
|
|
"rewards/format_reward_func/mean": 0.84375,
|
|
"rewards/format_reward_func/std": 0.3689020276069641,
|
|
"sampling/importance_sampling_ratio/max": 2.4325342178344727,
|
|
"sampling/importance_sampling_ratio/mean": 0.8387036323547363,
|
|
"sampling/importance_sampling_ratio/min": 0.044968899339437485,
|
|
"sampling/sampling_logp_difference/max": 0.5210357308387756,
|
|
"sampling/sampling_logp_difference/mean": 0.023624008521437645,
|
|
"step": 86,
|
|
"step_time": 11.889815866597928
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 885.0,
|
|
"completions/max_terminated_length": 885.0,
|
|
"completions/mean_length": 510.671875,
|
|
"completions/mean_terminated_length": 510.671875,
|
|
"completions/min_length": 149.5,
|
|
"completions/min_terminated_length": 149.5,
|
|
"entropy": 1.1568988785147667,
|
|
"epoch": 0.007822222222222222,
|
|
"frac_reward_zero_std": 0.5,
|
|
"grad_norm": 1.0407740082069517,
|
|
"kl": 0.01194384231348522,
|
|
"learning_rate": 2.183372119961499e-08,
|
|
"loss": -0.0151,
|
|
"num_tokens": 1762194.0,
|
|
"reward": 1.03125,
|
|
"reward_std": 0.3549068421125412,
|
|
"rewards/equation_reward_func/mean": 0.078125,
|
|
"rewards/equation_reward_func/std": 0.2710396274924278,
|
|
"rewards/format_reward_func/mean": 0.953125,
|
|
"rewards/format_reward_func/std": 0.21135568618774414,
|
|
"sampling/importance_sampling_ratio/max": 2.4364527463912964,
|
|
"sampling/importance_sampling_ratio/mean": 0.7585195302963257,
|
|
"sampling/importance_sampling_ratio/min": 0.021672163158655167,
|
|
"sampling/sampling_logp_difference/max": 0.5484427809715271,
|
|
"sampling/sampling_logp_difference/mean": 0.023598327301442623,
|
|
"step": 88,
|
|
"step_time": 11.175127660972066
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.046875,
|
|
"completions/max_length": 1024.0,
|
|
"completions/max_terminated_length": 786.5,
|
|
"completions/mean_length": 464.078125,
|
|
"completions/mean_terminated_length": 437.3440856933594,
|
|
"completions/min_length": 165.0,
|
|
"completions/min_terminated_length": 165.0,
|
|
"entropy": 1.149987280368805,
|
|
"epoch": 0.008,
|
|
"frac_reward_zero_std": 0.25,
|
|
"grad_norm": 1.236076165142051,
|
|
"kl": 0.015072842070367187,
|
|
"learning_rate": 1.5698323748414122e-08,
|
|
"loss": 0.17,
|
|
"num_tokens": 1801639.0,
|
|
"reward": 0.96875,
|
|
"reward_std": 0.3917950540781021,
|
|
"rewards/equation_reward_func/mean": 0.078125,
|
|
"rewards/equation_reward_func/std": 0.2710396274924278,
|
|
"rewards/format_reward_func/mean": 0.890625,
|
|
"rewards/format_reward_func/std": 0.31607766449451447,
|
|
"sampling/importance_sampling_ratio/max": 2.3776625394821167,
|
|
"sampling/importance_sampling_ratio/mean": 0.9204491376876831,
|
|
"sampling/importance_sampling_ratio/min": 0.040661390870809555,
|
|
"sampling/sampling_logp_difference/max": 0.4495445489883423,
|
|
"sampling/sampling_logp_difference/mean": 0.023287806659936905,
|
|
"step": 90,
|
|
"step_time": 11.483706569299102
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 940.5,
|
|
"completions/max_terminated_length": 940.5,
|
|
"completions/mean_length": 455.109375,
|
|
"completions/mean_terminated_length": 455.109375,
|
|
"completions/min_length": 176.0,
|
|
"completions/min_terminated_length": 176.0,
|
|
"entropy": 1.1072595864534378,
|
|
"epoch": 0.008177777777777779,
|
|
"frac_reward_zero_std": 0.5,
|
|
"grad_norm": 1.0362568226042652,
|
|
"kl": 0.012551067571621388,
|
|
"learning_rate": 1.054566895300324e-08,
|
|
"loss": 0.0268,
|
|
"num_tokens": 1840558.0,
|
|
"reward": 0.984375,
|
|
"reward_std": 0.3083590194582939,
|
|
"rewards/equation_reward_func/mean": 0.046875,
|
|
"rewards/equation_reward_func/std": 0.1480722874403,
|
|
"rewards/format_reward_func/mean": 0.9375,
|
|
"rewards/format_reward_func/std": 0.2364606335759163,
|
|
"sampling/importance_sampling_ratio/max": 2.757253050804138,
|
|
"sampling/importance_sampling_ratio/mean": 0.8551563322544098,
|
|
"sampling/importance_sampling_ratio/min": 0.0,
|
|
"sampling/sampling_logp_difference/max": 0.5598160922527313,
|
|
"sampling/sampling_logp_difference/mean": 0.023077418096363544,
|
|
"step": 92,
|
|
"step_time": 11.143339851987548
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.015625,
|
|
"completions/max_length": 966.5,
|
|
"completions/max_terminated_length": 867.5,
|
|
"completions/mean_length": 463.78125,
|
|
"completions/mean_terminated_length": 455.2777099609375,
|
|
"completions/min_length": 182.0,
|
|
"completions/min_terminated_length": 182.0,
|
|
"entropy": 1.143265001475811,
|
|
"epoch": 0.008355555555555555,
|
|
"frac_reward_zero_std": 0.625,
|
|
"grad_norm": 0.0025484313004221675,
|
|
"kl": 0.0112493826309219,
|
|
"learning_rate": 6.397368838268496e-09,
|
|
"loss": 0.0941,
|
|
"num_tokens": 1880016.0,
|
|
"reward": 0.953125,
|
|
"reward_std": 0.19507546722888947,
|
|
"rewards/equation_reward_func/mean": 0.015625,
|
|
"rewards/equation_reward_func/std": 0.0883883461356163,
|
|
"rewards/format_reward_func/mean": 0.9375,
|
|
"rewards/format_reward_func/std": 0.16800537705421448,
|
|
"sampling/importance_sampling_ratio/max": 2.5143080949783325,
|
|
"sampling/importance_sampling_ratio/mean": 0.8339663445949554,
|
|
"sampling/importance_sampling_ratio/min": 0.0,
|
|
"sampling/sampling_logp_difference/max": 0.6856141686439514,
|
|
"sampling/sampling_logp_difference/mean": 0.023781022988259792,
|
|
"step": 94,
|
|
"step_time": 11.370905907824636
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.015625,
|
|
"completions/max_length": 998.0,
|
|
"completions/max_terminated_length": 946.5,
|
|
"completions/mean_length": 464.390625,
|
|
"completions/mean_terminated_length": 455.1602783203125,
|
|
"completions/min_length": 192.5,
|
|
"completions/min_terminated_length": 192.5,
|
|
"entropy": 1.1343590430915356,
|
|
"epoch": 0.008533333333333334,
|
|
"frac_reward_zero_std": 0.25,
|
|
"grad_norm": 2.0803027651220756,
|
|
"kl": 0.01363659166963771,
|
|
"learning_rate": 3.2708228165273244e-09,
|
|
"loss": 0.1137,
|
|
"num_tokens": 1919505.0,
|
|
"reward": 1.015625,
|
|
"reward_std": 0.37185215950012207,
|
|
"rewards/equation_reward_func/mean": 0.078125,
|
|
"rewards/equation_reward_func/std": 0.2563937231898308,
|
|
"rewards/format_reward_func/mean": 0.9375,
|
|
"rewards/format_reward_func/std": 0.2364606335759163,
|
|
"sampling/importance_sampling_ratio/max": 2.7199188470840454,
|
|
"sampling/importance_sampling_ratio/mean": 0.7955746948719025,
|
|
"sampling/importance_sampling_ratio/min": 0.0,
|
|
"sampling/sampling_logp_difference/max": 0.569873571395874,
|
|
"sampling/sampling_logp_difference/mean": 0.0229880353435874,
|
|
"step": 96,
|
|
"step_time": 11.320387034327723
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.015625,
|
|
"completions/max_length": 963.0,
|
|
"completions/max_terminated_length": 919.0,
|
|
"completions/mean_length": 465.53125,
|
|
"completions/mean_terminated_length": 456.23638916015625,
|
|
"completions/min_length": 199.0,
|
|
"completions/min_terminated_length": 199.0,
|
|
"entropy": 1.1051006130874157,
|
|
"epoch": 0.00871111111111111,
|
|
"frac_reward_zero_std": 0.5,
|
|
"grad_norm": 1.38482587569594,
|
|
"kl": 0.010673947690520436,
|
|
"learning_rate": 1.1791447083465133e-09,
|
|
"loss": 0.034,
|
|
"num_tokens": 1959051.0,
|
|
"reward": 0.984375,
|
|
"reward_std": 0.26799365133047104,
|
|
"rewards/equation_reward_func/mean": 0.03125,
|
|
"rewards/equation_reward_func/std": 0.12296734005212784,
|
|
"rewards/format_reward_func/mean": 0.953125,
|
|
"rewards/format_reward_func/std": 0.21135568618774414,
|
|
"sampling/importance_sampling_ratio/max": 2.532159209251404,
|
|
"sampling/importance_sampling_ratio/mean": 0.8075034618377686,
|
|
"sampling/importance_sampling_ratio/min": 0.0,
|
|
"sampling/sampling_logp_difference/max": 0.4299898147583008,
|
|
"sampling/sampling_logp_difference/mean": 0.02303027454763651,
|
|
"step": 98,
|
|
"step_time": 11.303778560250066
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 915.5,
|
|
"completions/max_terminated_length": 915.5,
|
|
"completions/mean_length": 452.484375,
|
|
"completions/mean_terminated_length": 452.484375,
|
|
"completions/min_length": 153.0,
|
|
"completions/min_terminated_length": 153.0,
|
|
"entropy": 1.0667904503643513,
|
|
"epoch": 0.008888888888888889,
|
|
"frac_reward_zero_std": 0.625,
|
|
"grad_norm": 0.8448295748463005,
|
|
"kl": 0.018678127584280446,
|
|
"learning_rate": 1.3110773862126667e-10,
|
|
"loss": 0.0456,
|
|
"num_tokens": 1997698.0,
|
|
"reward": 1.015625,
|
|
"reward_std": 0.2710396274924278,
|
|
"rewards/equation_reward_func/mean": 0.046875,
|
|
"rewards/equation_reward_func/std": 0.1480722874403,
|
|
"rewards/format_reward_func/mean": 0.96875,
|
|
"rewards/format_reward_func/std": 0.12296734005212784,
|
|
"sampling/importance_sampling_ratio/max": 2.723245859146118,
|
|
"sampling/importance_sampling_ratio/mean": 0.8844414949417114,
|
|
"sampling/importance_sampling_ratio/min": 0.0,
|
|
"sampling/sampling_logp_difference/max": 0.6081928014755249,
|
|
"sampling/sampling_logp_difference/mean": 0.02220182679593563,
|
|
"step": 100,
|
|
"step_time": 11.397719835629687
|
|
},
|
|
{
|
|
"epoch": 0.008888888888888889,
|
|
"step": 100,
|
|
"total_flos": 0.0,
|
|
"train_loss": 0.06438189143314958,
|
|
"train_runtime": 1344.9766,
|
|
"train_samples_per_second": 2.379,
|
|
"train_steps_per_second": 0.074
|
|
}
|
|
],
|
|
"logging_steps": 2,
|
|
"max_steps": 100,
|
|
"num_input_tokens_seen": 1997698,
|
|
"num_train_epochs": 1,
|
|
"save_steps": 25,
|
|
"stateful_callbacks": {
|
|
"TrainerControl": {
|
|
"args": {
|
|
"should_epoch_stop": false,
|
|
"should_evaluate": false,
|
|
"should_log": false,
|
|
"should_save": true,
|
|
"should_training_stop": true
|
|
},
|
|
"attributes": {}
|
|
}
|
|
},
|
|
"total_flos": 0.0,
|
|
"train_batch_size": 1,
|
|
"trial_name": null,
|
|
"trial_params": null
|
|
}
|