{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.008888888888888889, "eval_steps": 500, "global_step": 100, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 996.0, "completions/mean_length": 579.78125, "completions/mean_terminated_length": 516.3214416503906, "completions/min_length": 165.5, "completions/min_terminated_length": 165.5, "entropy": 1.209167167544365, "epoch": 0.00017777777777777779, "frac_reward_zero_std": 0.0, "grad_norm": 1.0451635456006854, "kl": 0.0, "learning_rate": 1.6666666666666665e-07, "loss": 0.0857, "num_tokens": 46890.0, "reward": 0.234375, "reward_std": 0.4299773871898651, "rewards/equation_reward_func/mean": 0.046875, "rewards/equation_reward_func/std": 0.21135568618774414, "rewards/format_reward_func/mean": 0.1875, "rewards/format_reward_func/std": 0.3965577781200409, "sampling/importance_sampling_ratio/max": 2.733101010322571, "sampling/importance_sampling_ratio/mean": 0.7042568325996399, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.6484301090240479, "sampling/sampling_logp_difference/mean": 0.023067950271070004, "step": 2, "step_time": 13.367690222221427 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 1024.0, "completions/max_terminated_length": 998.0, "completions/mean_length": 511.703125, "completions/mean_terminated_length": 486.53440856933594, "completions/min_length": 149.0, "completions/min_terminated_length": 149.0, "entropy": 1.150175679475069, "epoch": 0.00035555555555555557, "frac_reward_zero_std": 0.0, "grad_norm": 1.3727134073809244, "kl": 0.0004912198037345661, "learning_rate": 5e-07, "loss": 0.1102, "num_tokens": 89431.0, "reward": 0.28125, "reward_std": 0.45680341124534607, "rewards/equation_reward_func/mean": 0.03125, "rewards/equation_reward_func/std": 0.1767766922712326, "rewards/format_reward_func/mean": 0.25, "rewards/format_reward_func/std": 0.4399413466453552, "sampling/importance_sampling_ratio/max": 2.4883073568344116, "sampling/importance_sampling_ratio/mean": 0.6917034089565277, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.7433785200119019, "sampling/sampling_logp_difference/mean": 0.02369655668735504, "step": 4, "step_time": 11.878318977192976 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.078125, "completions/max_length": 1024.0, "completions/max_terminated_length": 965.5, "completions/mean_length": 543.25, "completions/mean_terminated_length": 502.3490753173828, "completions/min_length": 128.0, "completions/min_terminated_length": 128.0, "entropy": 1.2386715859174728, "epoch": 0.0005333333333333334, "frac_reward_zero_std": 0.0, "grad_norm": 1.4488715982157625, "kl": 0.0005299820822983747, "learning_rate": 4.994757065594279e-07, "loss": 0.2079, "num_tokens": 133991.0, "reward": 0.265625, "reward_std": 0.44547125697135925, "rewards/equation_reward_func/mean": 0.03125, "rewards/equation_reward_func/std": 0.12296734005212784, "rewards/format_reward_func/mean": 0.234375, "rewards/format_reward_func/std": 0.4299773871898651, "sampling/importance_sampling_ratio/max": 2.242772936820984, "sampling/importance_sampling_ratio/mean": 0.7461096942424774, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.5018917918205261, "sampling/sampling_logp_difference/mean": 0.02481890842318535, "step": 6, "step_time": 11.690943353110924 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1013.0, "completions/mean_length": 464.96875, "completions/mean_terminated_length": 446.9354705810547, "completions/min_length": 149.0, "completions/min_terminated_length": 149.0, "entropy": 1.145722646266222, "epoch": 0.0007111111111111111, "frac_reward_zero_std": 0.0, "grad_norm": 1.5202398474744325, "kl": 0.0004978575179848121, "learning_rate": 4.979050253066063e-07, "loss": 0.0416, "num_tokens": 173509.0, "reward": 0.265625, "reward_std": 0.44837237894535065, "rewards/equation_reward_func/mean": 0.015625, "rewards/equation_reward_func/std": 0.0883883461356163, "rewards/format_reward_func/mean": 0.25, "rewards/format_reward_func/std": 0.43840841948986053, "sampling/importance_sampling_ratio/max": 1.867400348186493, "sampling/importance_sampling_ratio/mean": 0.6012143194675446, "sampling/importance_sampling_ratio/min": 0.08801080286502838, "sampling/sampling_logp_difference/max": 0.5550211668014526, "sampling/sampling_logp_difference/mean": 0.023779811337590218, "step": 8, "step_time": 11.771737957373261 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1023.5, "completions/max_terminated_length": 999.0, "completions/mean_length": 512.875, "completions/mean_terminated_length": 496.10626220703125, "completions/min_length": 166.5, "completions/min_terminated_length": 166.5, "entropy": 1.1838660538196564, "epoch": 0.0008888888888888889, "frac_reward_zero_std": 0.0, "grad_norm": 0.872521471943838, "kl": 0.000666440657369094, "learning_rate": 4.952945442245597e-07, "loss": 0.0792, "num_tokens": 216061.0, "reward": 0.34375, "reward_std": 0.5400001406669617, "rewards/equation_reward_func/mean": 0.046875, "rewards/equation_reward_func/std": 0.1480722874403, "rewards/format_reward_func/mean": 0.296875, "rewards/format_reward_func/std": 0.4638662487268448, "sampling/importance_sampling_ratio/max": 2.8262017965316772, "sampling/importance_sampling_ratio/mean": 0.6557013094425201, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.612011194229126, "sampling/sampling_logp_difference/mean": 0.02344651333987713, "step": 10, "step_time": 11.873887687921524 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 918.0, "completions/max_terminated_length": 915.0, "completions/mean_length": 463.296875, "completions/mean_terminated_length": 454.6552276611328, "completions/min_length": 152.5, "completions/min_terminated_length": 152.5, "entropy": 1.1539763174951077, "epoch": 0.0010666666666666667, "frac_reward_zero_std": 0.0, "grad_norm": 1.5031066308786476, "kl": 0.0008330309847224271, "learning_rate": 4.916552125781528e-07, "loss": 0.0814, "num_tokens": 255432.0, "reward": 0.53125, "reward_std": 0.5670737028121948, "rewards/equation_reward_func/mean": 0.0625, "rewards/equation_reward_func/std": 0.24593468010425568, "rewards/format_reward_func/mean": 0.46875, "rewards/format_reward_func/std": 0.507007360458374, "sampling/importance_sampling_ratio/max": 2.125624656677246, "sampling/importance_sampling_ratio/mean": 0.7945938110351562, "sampling/importance_sampling_ratio/min": 0.01008664257824421, "sampling/sampling_logp_difference/max": 0.6622226238250732, "sampling/sampling_logp_difference/mean": 0.023904340341687202, "step": 12, "step_time": 11.331206714501604 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.078125, "completions/max_length": 1024.0, "completions/max_terminated_length": 895.5, "completions/mean_length": 559.359375, "completions/mean_terminated_length": 520.5178833007812, "completions/min_length": 217.5, "completions/min_terminated_length": 217.5, "entropy": 1.2339040115475655, "epoch": 0.0012444444444444445, "frac_reward_zero_std": 0.0, "grad_norm": 1.3283721770836783, "kl": 0.0012322509137447923, "learning_rate": 4.870022949890676e-07, "loss": 0.0188, "num_tokens": 301031.0, "reward": 0.421875, "reward_std": 0.5294715315103531, "rewards/equation_reward_func/mean": 0.015625, "rewards/equation_reward_func/std": 0.0883883461356163, "rewards/format_reward_func/mean": 0.40625, "rewards/format_reward_func/std": 0.4979427307844162, "sampling/importance_sampling_ratio/max": 2.192352533340454, "sampling/importance_sampling_ratio/mean": 0.6494036614894867, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.9156644344329834, "sampling/sampling_logp_difference/mean": 0.024133121594786644, "step": 14, "step_time": 11.763766457792372 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 949.0, "completions/mean_length": 472.375, "completions/mean_terminated_length": 454.58062744140625, "completions/min_length": 152.5, "completions/min_terminated_length": 152.5, "entropy": 1.2082068026065826, "epoch": 0.0014222222222222223, "frac_reward_zero_std": 0.0, "grad_norm": 2.9893722134021776, "kl": 0.002285250819113571, "learning_rate": 4.81355307410676e-07, "loss": -0.0683, "num_tokens": 341063.0, "reward": 0.5, "reward_std": 0.5569138079881668, "rewards/equation_reward_func/mean": 0.046875, "rewards/equation_reward_func/std": 0.1480722874403, "rewards/format_reward_func/mean": 0.453125, "rewards/format_reward_func/std": 0.5034956932067871, "sampling/importance_sampling_ratio/max": 2.4907275438308716, "sampling/importance_sampling_ratio/mean": 0.8273349404335022, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.7957509160041809, "sampling/sampling_logp_difference/mean": 0.023882606998085976, "step": 16, "step_time": 11.711858246591873 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 951.5, "completions/max_terminated_length": 923.5, "completions/mean_length": 427.71875, "completions/mean_terminated_length": 418.4737854003906, "completions/min_length": 135.5, "completions/min_terminated_length": 135.5, "entropy": 1.137543722987175, "epoch": 0.0016, "frac_reward_zero_std": 0.0, "grad_norm": 1.5075918715668166, "kl": 0.002901972911786288, "learning_rate": 4.747379352713488e-07, "loss": 0.0204, "num_tokens": 378157.0, "reward": 0.625, "reward_std": 0.6221709847450256, "rewards/equation_reward_func/mean": 0.09375, "rewards/equation_reward_func/std": 0.2961445748806, "rewards/format_reward_func/mean": 0.53125, "rewards/format_reward_func/std": 0.4907747954130173, "sampling/importance_sampling_ratio/max": 2.210850417613983, "sampling/importance_sampling_ratio/mean": 0.7625480890274048, "sampling/importance_sampling_ratio/min": 0.005571374204009771, "sampling/sampling_logp_difference/max": 0.5609427690505981, "sampling/sampling_logp_difference/mean": 0.02348453551530838, "step": 18, "step_time": 11.282247766968794 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 899.0, "completions/max_terminated_length": 825.5, "completions/mean_length": 447.6875, "completions/mean_terminated_length": 438.1275177001953, "completions/min_length": 107.5, "completions/min_terminated_length": 107.5, "entropy": 1.1992060840129852, "epoch": 0.0017777777777777779, "frac_reward_zero_std": 0.0, "grad_norm": 1.6052725065567501, "kl": 0.004966819164110348, "learning_rate": 4.6717793412953776e-07, "loss": 0.17, "num_tokens": 416601.0, "reward": 0.578125, "reward_std": 0.5266626179218292, "rewards/equation_reward_func/mean": 0.03125, "rewards/equation_reward_func/std": 0.1767766922712326, "rewards/format_reward_func/mean": 0.546875, "rewards/format_reward_func/std": 0.4994383603334427, "sampling/importance_sampling_ratio/max": 2.251875877380371, "sampling/importance_sampling_ratio/mean": 0.7426749467849731, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.4947061240673065, "sampling/sampling_logp_difference/mean": 0.024829605594277382, "step": 20, "step_time": 11.085346231702715 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 931.5, "completions/max_terminated_length": 833.5, "completions/mean_length": 474.609375, "completions/mean_terminated_length": 465.7101745605469, "completions/min_length": 155.5, "completions/min_terminated_length": 155.5, "entropy": 1.189589723944664, "epoch": 0.0019555555555555554, "frac_reward_zero_std": 0.0, "grad_norm": 1.568319079842556, "kl": 0.005503996828338131, "learning_rate": 4.5870701325731773e-07, "loss": 0.1169, "num_tokens": 456736.0, "reward": 0.59375, "reward_std": 0.5569138079881668, "rewards/equation_reward_func/mean": 0.03125, "rewards/equation_reward_func/std": 0.12296734005212784, "rewards/format_reward_func/mean": 0.5625, "rewards/format_reward_func/std": 0.5029991269111633, "sampling/importance_sampling_ratio/max": 2.329828381538391, "sampling/importance_sampling_ratio/mean": 0.7178902626037598, "sampling/importance_sampling_ratio/min": 0.01708686724305153, "sampling/sampling_logp_difference/max": 0.7085881531238556, "sampling/sampling_logp_difference/mean": 0.023770778439939022, "step": 22, "step_time": 11.324711623368785 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 821.0, "completions/mean_length": 485.90625, "completions/mean_terminated_length": 468.5483856201172, "completions/min_length": 174.0, "completions/min_terminated_length": 174.0, "entropy": 1.2347300872206688, "epoch": 0.0021333333333333334, "frac_reward_zero_std": 0.0, "grad_norm": 1.5103693400119262, "kl": 0.0056485196109861135, "learning_rate": 4.4936070264068016e-07, "loss": 0.0751, "num_tokens": 497594.0, "reward": 0.65625, "reward_std": 0.5729349255561829, "rewards/equation_reward_func/mean": 0.0625, "rewards/equation_reward_func/std": 0.2364606335759163, "rewards/format_reward_func/mean": 0.59375, "rewards/format_reward_func/std": 0.49899089336395264, "sampling/importance_sampling_ratio/max": 2.513130784034729, "sampling/importance_sampling_ratio/mean": 0.6623396277427673, "sampling/importance_sampling_ratio/min": 0.052559204399585724, "sampling/sampling_logp_difference/max": 0.6741056442260742, "sampling/sampling_logp_difference/mean": 0.024482053704559803, "step": 24, "step_time": 11.668942423537374 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 902.5, "completions/max_terminated_length": 803.0, "completions/mean_length": 479.09375, "completions/mean_terminated_length": 469.92791748046875, "completions/min_length": 175.5, "completions/min_terminated_length": 175.5, "entropy": 1.2349426448345184, "epoch": 0.002311111111111111, "frac_reward_zero_std": 0.0, "grad_norm": 1.3908582983128641, "kl": 0.008100319362711161, "learning_rate": 4.391782039544238e-07, "loss": 0.0008, "num_tokens": 538080.0, "reward": 0.671875, "reward_std": 0.532937303185463, "rewards/equation_reward_func/mean": 0.046875, "rewards/equation_reward_func/std": 0.21135568618774414, "rewards/format_reward_func/mean": 0.625, "rewards/format_reward_func/std": 0.4874725937843323, "sampling/importance_sampling_ratio/max": 1.898095965385437, "sampling/importance_sampling_ratio/mean": 0.6496416926383972, "sampling/importance_sampling_ratio/min": 0.020795006304979324, "sampling/sampling_logp_difference/max": 0.6364502906799316, "sampling/sampling_logp_difference/mean": 0.024783543311059475, "step": 26, "step_time": 11.404050085111521 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 845.0, "completions/mean_length": 475.859375, "completions/mean_terminated_length": 458.1773986816406, "completions/min_length": 208.0, "completions/min_terminated_length": 208.0, "entropy": 1.2003328688442707, "epoch": 0.002488888888888889, "frac_reward_zero_std": 0.125, "grad_norm": 1.138678539986136, "kl": 0.007092042971635237, "learning_rate": 4.282022261367073e-07, "loss": -0.0762, "num_tokens": 578223.0, "reward": 0.734375, "reward_std": 0.5117992758750916, "rewards/equation_reward_func/mean": 0.03125, "rewards/equation_reward_func/std": 0.12296734005212784, "rewards/format_reward_func/mean": 0.703125, "rewards/format_reward_func/std": 0.4638662487268448, "sampling/importance_sampling_ratio/max": 2.868867874145508, "sampling/importance_sampling_ratio/mean": 0.8748505115509033, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.386928454041481, "sampling/sampling_logp_difference/mean": 0.02374311164021492, "step": 28, "step_time": 12.464717164519243 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 950.5, "completions/max_terminated_length": 844.0, "completions/mean_length": 486.171875, "completions/mean_terminated_length": 477.2635955810547, "completions/min_length": 157.5, "completions/min_terminated_length": 157.5, "entropy": 1.1861281506717205, "epoch": 0.0026666666666666666, "frac_reward_zero_std": 0.0, "grad_norm": 1.0375770292654916, "kl": 0.009247849928215146, "learning_rate": 4.1647880625292027e-07, "loss": -0.0282, "num_tokens": 619074.0, "reward": 0.78125, "reward_std": 0.5395806133747101, "rewards/equation_reward_func/mean": 0.0625, "rewards/equation_reward_func/std": 0.16800537705421448, "rewards/format_reward_func/mean": 0.71875, "rewards/format_reward_func/std": 0.45128606259822845, "sampling/importance_sampling_ratio/max": 2.3657628297805786, "sampling/importance_sampling_ratio/mean": 0.7253041863441467, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.46275559067726135, "sampling/sampling_logp_difference/mean": 0.023301721550524235, "step": 30, "step_time": 13.156236473936588 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 847.5, "completions/max_terminated_length": 847.5, "completions/mean_length": 408.296875, "completions/mean_terminated_length": 408.296875, "completions/min_length": 180.5, "completions/min_terminated_length": 180.5, "entropy": 1.1347957476973534, "epoch": 0.0028444444444444446, "frac_reward_zero_std": 0.0, "grad_norm": 2.3739216172676234, "kl": 0.011383652017684653, "learning_rate": 4.040571164002318e-07, "loss": 0.2237, "num_tokens": 654981.0, "reward": 0.859375, "reward_std": 0.5548828095197678, "rewards/equation_reward_func/mean": 0.109375, "rewards/equation_reward_func/std": 0.3074183538556099, "rewards/format_reward_func/mean": 0.75, "rewards/format_reward_func/std": 0.43840841948986053, "sampling/importance_sampling_ratio/max": 2.9178740978240967, "sampling/importance_sampling_ratio/mean": 0.8742310702800751, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.6507634520530701, "sampling/sampling_logp_difference/mean": 0.02339162491261959, "step": 32, "step_time": 11.533732229378074 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 873.0, "completions/max_terminated_length": 873.0, "completions/mean_length": 459.3125, "completions/mean_terminated_length": 459.3125, "completions/min_length": 162.0, "completions/min_terminated_length": 162.0, "entropy": 1.1569931730628014, "epoch": 0.003022222222222222, "frac_reward_zero_std": 0.0, "grad_norm": 1.6685795867340696, "kl": 0.008567514276364818, "learning_rate": 3.909892574627266e-07, "loss": 0.1375, "num_tokens": 694097.0, "reward": 1.0, "reward_std": 0.5303300619125366, "rewards/equation_reward_func/mean": 0.140625, "rewards/equation_reward_func/std": 0.34635117650032043, "rewards/format_reward_func/mean": 0.859375, "rewards/format_reward_func/std": 0.34635117650032043, "sampling/importance_sampling_ratio/max": 2.5157090425491333, "sampling/importance_sampling_ratio/mean": 0.7955585420131683, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.4740889072418213, "sampling/sampling_logp_difference/mean": 0.02304172795265913, "step": 34, "step_time": 10.761377388611436 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 986.0, "completions/max_terminated_length": 986.0, "completions/mean_length": 480.90625, "completions/mean_terminated_length": 480.90625, "completions/min_length": 162.5, "completions/min_terminated_length": 162.5, "entropy": 1.2352817580103874, "epoch": 0.0032, "frac_reward_zero_std": 0.125, "grad_norm": 1.1771067362481362, "kl": 0.010575773718301207, "learning_rate": 3.773300405821908e-07, "loss": -0.0276, "num_tokens": 734619.0, "reward": 0.84375, "reward_std": 0.5194454491138458, "rewards/equation_reward_func/mean": 0.09375, "rewards/equation_reward_func/std": 0.19827888906002045, "rewards/format_reward_func/mean": 0.75, "rewards/format_reward_func/std": 0.4337434321641922, "sampling/importance_sampling_ratio/max": 2.6484490633010864, "sampling/importance_sampling_ratio/mean": 0.782801479101181, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.4273035526275635, "sampling/sampling_logp_difference/mean": 0.024281970225274563, "step": 36, "step_time": 11.392230691039003 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 886.5, "completions/max_terminated_length": 880.5, "completions/mean_length": 444.296875, "completions/mean_terminated_length": 436.0539245605469, "completions/min_length": 162.5, "completions/min_terminated_length": 162.5, "entropy": 1.15547876060009, "epoch": 0.0033777777777777777, "frac_reward_zero_std": 0.125, "grad_norm": 3.7113272904633505, "kl": 0.01313028542790562, "learning_rate": 3.6313675726113475e-07, "loss": 0.066, "num_tokens": 772838.0, "reward": 0.921875, "reward_std": 0.40442168712615967, "rewards/equation_reward_func/mean": 0.046875, "rewards/equation_reward_func/std": 0.1480722874403, "rewards/format_reward_func/mean": 0.875, "rewards/format_reward_func/std": 0.33252330124378204, "sampling/importance_sampling_ratio/max": 2.566834807395935, "sampling/importance_sampling_ratio/mean": 0.829992413520813, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.5897403955459595, "sampling/sampling_logp_difference/mean": 0.023365739732980728, "step": 38, "step_time": 10.986438499065116 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 1024.0, "completions/max_terminated_length": 890.0, "completions/mean_length": 527.90625, "completions/mean_terminated_length": 503.8693542480469, "completions/min_length": 215.0, "completions/min_terminated_length": 215.0, "entropy": 1.2300068363547325, "epoch": 0.0035555555555555557, "frac_reward_zero_std": 0.125, "grad_norm": 1.6216510890549307, "kl": 0.01843011923483573, "learning_rate": 3.484689390623218e-07, "loss": 0.0636, "num_tokens": 816408.0, "reward": 0.796875, "reward_std": 0.4437006711959839, "rewards/equation_reward_func/mean": 0.015625, "rewards/equation_reward_func/std": 0.0883883461356163, "rewards/format_reward_func/mean": 0.78125, "rewards/format_reward_func/std": 0.41824956238269806, "sampling/importance_sampling_ratio/max": 2.566046714782715, "sampling/importance_sampling_ratio/mean": 0.8348058462142944, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 1.2292534112930298, "sampling/sampling_logp_difference/mean": 0.0241514528170228, "step": 40, "step_time": 11.432389895198867 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 916.5, "completions/max_terminated_length": 799.0, "completions/mean_length": 489.21875, "completions/mean_terminated_length": 480.28125, "completions/min_length": 213.0, "completions/min_terminated_length": 213.0, "entropy": 1.2188931107521057, "epoch": 0.0037333333333333333, "frac_reward_zero_std": 0.0, "grad_norm": 0.9636777693135964, "kl": 0.01053301602951251, "learning_rate": 3.3338810791270517e-07, "loss": 0.0198, "num_tokens": 857494.0, "reward": 0.921875, "reward_std": 0.4847814738750458, "rewards/equation_reward_func/mean": 0.078125, "rewards/equation_reward_func/std": 0.2710396274924278, "rewards/format_reward_func/mean": 0.84375, "rewards/format_reward_func/std": 0.3689020276069641, "sampling/importance_sampling_ratio/max": 1.9368165135383606, "sampling/importance_sampling_ratio/mean": 0.5968808531761169, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.6993489265441895, "sampling/sampling_logp_difference/mean": 0.02366031240671873, "step": 42, "step_time": 11.470696586417034 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 1009.5, "completions/max_terminated_length": 985.0, "completions/mean_length": 440.140625, "completions/mean_terminated_length": 431.2227783203125, "completions/min_length": 168.0, "completions/min_terminated_length": 168.0, "entropy": 1.1604873463511467, "epoch": 0.003911111111111111, "frac_reward_zero_std": 0.125, "grad_norm": 1.2045214320430864, "kl": 0.010937573766568676, "learning_rate": 3.179575180590857e-07, "loss": -0.0545, "num_tokens": 895415.0, "reward": 0.921875, "reward_std": 0.43845126032829285, "rewards/equation_reward_func/mean": 0.078125, "rewards/equation_reward_func/std": 0.18445101380348206, "rewards/format_reward_func/mean": 0.84375, "rewards/format_reward_func/std": 0.3689020276069641, "sampling/importance_sampling_ratio/max": 2.7295889854431152, "sampling/importance_sampling_ratio/mean": 0.689439982175827, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.5454497933387756, "sampling/sampling_logp_difference/mean": 0.023939015343785286, "step": 44, "step_time": 11.862673799390905 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 985.5, "completions/mean_length": 501.640625, "completions/mean_terminated_length": 466.8166809082031, "completions/min_length": 209.5, "completions/min_terminated_length": 209.5, "entropy": 1.2363386303186417, "epoch": 0.004088888888888889, "frac_reward_zero_std": 0.125, "grad_norm": 1.7502632857924376, "kl": 0.008666158799314871, "learning_rate": 3.022418907578188e-07, "loss": 0.0883, "num_tokens": 937352.0, "reward": 0.9375, "reward_std": 0.49288448691368103, "rewards/equation_reward_func/mean": 0.09375, "rewards/equation_reward_func/std": 0.27283935993909836, "rewards/format_reward_func/mean": 0.84375, "rewards/format_reward_func/std": 0.3662842661142349, "sampling/importance_sampling_ratio/max": 2.052061378955841, "sampling/importance_sampling_ratio/mean": 0.7052308619022369, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.9969043135643005, "sampling/sampling_logp_difference/mean": 0.02375571522861719, "step": 46, "step_time": 11.741644776426256 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 876.0, "completions/max_terminated_length": 876.0, "completions/mean_length": 458.84375, "completions/mean_terminated_length": 458.84375, "completions/min_length": 144.0, "completions/min_terminated_length": 144.0, "entropy": 1.1406226977705956, "epoch": 0.004266666666666667, "frac_reward_zero_std": 0.125, "grad_norm": 2.1569663279164213, "kl": 0.011335209826938808, "learning_rate": 2.863071428113726e-07, "loss": 0.0549, "num_tokens": 976550.0, "reward": 0.9375, "reward_std": 0.458977147936821, "rewards/equation_reward_func/mean": 0.078125, "rewards/equation_reward_func/std": 0.2563937231898308, "rewards/format_reward_func/mean": 0.859375, "rewards/format_reward_func/std": 0.34635117650032043, "sampling/importance_sampling_ratio/max": 2.231783628463745, "sampling/importance_sampling_ratio/mean": 0.7619136869907379, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.5767736434936523, "sampling/sampling_logp_difference/mean": 0.023336266167461872, "step": 48, "step_time": 11.434072194388136 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 920.5, "completions/max_terminated_length": 920.5, "completions/mean_length": 459.34375, "completions/mean_terminated_length": 459.34375, "completions/min_length": 156.0, "completions/min_terminated_length": 156.0, "entropy": 1.2100868672132492, "epoch": 0.0044444444444444444, "frac_reward_zero_std": 0.125, "grad_norm": 1.728209453317298, "kl": 0.01852725149365142, "learning_rate": 2.7022011009035107e-07, "loss": 0.1187, "num_tokens": 1015740.0, "reward": 0.890625, "reward_std": 0.4356408715248108, "rewards/equation_reward_func/mean": 0.046875, "rewards/equation_reward_func/std": 0.21135568618774414, "rewards/format_reward_func/mean": 0.84375, "rewards/format_reward_func/std": 0.3580790013074875, "sampling/importance_sampling_ratio/max": 2.3857074975967407, "sampling/importance_sampling_ratio/mean": 0.827456921339035, "sampling/importance_sampling_ratio/min": 0.08825718238949776, "sampling/sampling_logp_difference/max": 0.6624305248260498, "sampling/sampling_logp_difference/mean": 0.023832999169826508, "step": 50, "step_time": 11.557595945429057 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 909.0, "completions/max_terminated_length": 836.0, "completions/mean_length": 452.5, "completions/mean_terminated_length": 443.2046356201172, "completions/min_length": 201.0, "completions/min_terminated_length": 201.0, "entropy": 1.1419423446059227, "epoch": 0.004622222222222222, "frac_reward_zero_std": 0.25, "grad_norm": 1.6088939390705712, "kl": 0.012844312441302463, "learning_rate": 2.540482672006254e-07, "loss": -0.0029, "num_tokens": 1054428.0, "reward": 0.96875, "reward_std": 0.46261392533779144, "rewards/equation_reward_func/mean": 0.09375, "rewards/equation_reward_func/std": 0.27283935993909836, "rewards/format_reward_func/mean": 0.875, "rewards/format_reward_func/std": 0.33252330124378204, "sampling/importance_sampling_ratio/max": 2.2242526412010193, "sampling/importance_sampling_ratio/mean": 0.7612048387527466, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.6042303442955017, "sampling/sampling_logp_difference/mean": 0.023195499554276466, "step": 52, "step_time": 11.590359390014783 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 817.0, "completions/mean_length": 422.1875, "completions/mean_terminated_length": 402.77418518066406, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "entropy": 1.0865775719285011, "epoch": 0.0048, "frac_reward_zero_std": 0.125, "grad_norm": 1.370478804799173, "kl": 0.012116663216147572, "learning_rate": 2.37859444471388e-07, "loss": 0.0846, "num_tokens": 1091144.0, "reward": 1.109375, "reward_std": 0.47186581790447235, "rewards/equation_reward_func/mean": 0.171875, "rewards/equation_reward_func/std": 0.378012090921402, "rewards/format_reward_func/mean": 0.9375, "rewards/format_reward_func/std": 0.2364606335759163, "sampling/importance_sampling_ratio/max": 2.3138152956962585, "sampling/importance_sampling_ratio/mean": 0.7851577699184418, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.5188883543014526, "sampling/sampling_logp_difference/mean": 0.022715235128998756, "step": 54, "step_time": 13.548070080811158 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1007.0, "completions/max_terminated_length": 1007.0, "completions/mean_length": 452.09375, "completions/mean_terminated_length": 452.09375, "completions/min_length": 160.5, "completions/min_terminated_length": 160.5, "entropy": 1.1417308785021305, "epoch": 0.004977777777777778, "frac_reward_zero_std": 0.5, "grad_norm": 1.4286278134907313, "kl": 0.012324006151175126, "learning_rate": 2.2172154345117894e-07, "loss": 0.0483, "num_tokens": 1129814.0, "reward": 0.9375, "reward_std": 0.30280280113220215, "rewards/equation_reward_func/mean": 0.015625, "rewards/equation_reward_func/std": 0.0883883461356163, "rewards/format_reward_func/mean": 0.921875, "rewards/format_reward_func/std": 0.2710396274924278, "sampling/importance_sampling_ratio/max": 2.3811429738998413, "sampling/importance_sampling_ratio/mean": 0.8520744442939758, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.6289049386978149, "sampling/sampling_logp_difference/mean": 0.02345012128353119, "step": 56, "step_time": 12.447777467081323 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 931.0, "completions/max_terminated_length": 871.0, "completions/mean_length": 443.71875, "completions/mean_terminated_length": 434.61692810058594, "completions/min_length": 156.0, "completions/min_terminated_length": 156.0, "entropy": 1.1414429359138012, "epoch": 0.005155555555555556, "frac_reward_zero_std": 0.25, "grad_norm": 1.63905040062684, "kl": 0.018355112959397957, "learning_rate": 2.0570225210519433e-07, "loss": 0.0939, "num_tokens": 1167916.0, "reward": 0.984375, "reward_std": 0.4889104962348938, "rewards/equation_reward_func/mean": 0.109375, "rewards/equation_reward_func/std": 0.31607766449451447, "rewards/format_reward_func/mean": 0.875, "rewards/format_reward_func/std": 0.33252330124378204, "sampling/importance_sampling_ratio/max": 1.8928932547569275, "sampling/importance_sampling_ratio/mean": 0.7523285746574402, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.5896574258804321, "sampling/sampling_logp_difference/mean": 0.02342431340366602, "step": 58, "step_time": 10.975074479705654 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 958.5, "completions/max_terminated_length": 940.5, "completions/mean_length": 453.6875, "completions/mean_terminated_length": 444.2217712402344, "completions/min_length": 161.0, "completions/min_terminated_length": 161.0, "entropy": 1.136627346277237, "epoch": 0.005333333333333333, "frac_reward_zero_std": 0.25, "grad_norm": 1.401252124600878, "kl": 0.010728320659836754, "learning_rate": 1.8986876090843664e-07, "loss": 0.0514, "num_tokens": 1206712.0, "reward": 1.0, "reward_std": 0.346970796585083, "rewards/equation_reward_func/mean": 0.0625, "rewards/equation_reward_func/std": 0.2364606335759163, "rewards/format_reward_func/mean": 0.9375, "rewards/format_reward_func/std": 0.2364606335759163, "sampling/importance_sampling_ratio/max": 2.603978991508484, "sampling/importance_sampling_ratio/mean": 0.6841319799423218, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.5754774510860443, "sampling/sampling_logp_difference/mean": 0.02338473778218031, "step": 60, "step_time": 11.381625101552345 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 881.0, "completions/mean_length": 465.359375, "completions/mean_terminated_length": 447.3386993408203, "completions/min_length": 151.5, "completions/min_terminated_length": 151.5, "entropy": 1.1641199924051762, "epoch": 0.005511111111111111, "frac_reward_zero_std": 0.25, "grad_norm": 1.0389216878895204, "kl": 0.012553387088701129, "learning_rate": 1.7428748102551234e-07, "loss": -0.0206, "num_tokens": 1246335.0, "reward": 0.859375, "reward_std": 0.42767396569252014, "rewards/equation_reward_func/mean": 0.03125, "rewards/equation_reward_func/std": 0.12296734005212784, "rewards/format_reward_func/mean": 0.828125, "rewards/format_reward_func/std": 0.378012090921402, "sampling/importance_sampling_ratio/max": 2.5562033653259277, "sampling/importance_sampling_ratio/mean": 0.6826831996440887, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.546184629201889, "sampling/sampling_logp_difference/mean": 0.02384145464748144, "step": 62, "step_time": 11.716152066364884 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 873.0, "completions/max_terminated_length": 873.0, "completions/mean_length": 472.4375, "completions/mean_terminated_length": 472.4375, "completions/min_length": 165.0, "completions/min_terminated_length": 165.0, "entropy": 1.1974952705204487, "epoch": 0.005688888888888889, "frac_reward_zero_std": 0.375, "grad_norm": 0.8405866770711337, "kl": 0.011443487601354718, "learning_rate": 1.5902376575912814e-07, "loss": 0.0695, "num_tokens": 1286315.0, "reward": 0.984375, "reward_std": 0.43038569390773773, "rewards/equation_reward_func/mean": 0.09375, "rewards/equation_reward_func/std": 0.19827888906002045, "rewards/format_reward_func/mean": 0.890625, "rewards/format_reward_func/std": 0.3074183538556099, "sampling/importance_sampling_ratio/max": 2.7265862226486206, "sampling/importance_sampling_ratio/mean": 0.8386669158935547, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.4937933683395386, "sampling/sampling_logp_difference/mean": 0.024114561267197132, "step": 64, "step_time": 11.237366109970026 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 970.5, "completions/max_terminated_length": 838.5, "completions/mean_length": 450.359375, "completions/mean_terminated_length": 441.2227783203125, "completions/min_length": 193.0, "completions/min_terminated_length": 193.0, "entropy": 1.1044728867709637, "epoch": 0.005866666666666667, "frac_reward_zero_std": 0.25, "grad_norm": 1.423044848014041, "kl": 0.010465700703207403, "learning_rate": 1.4414163643562753e-07, "loss": 0.2456, "num_tokens": 1324906.0, "reward": 1.0625, "reward_std": 0.49249379336833954, "rewards/equation_reward_func/mean": 0.15625, "rewards/equation_reward_func/std": 0.3662842661142349, "rewards/format_reward_func/mean": 0.90625, "rewards/format_reward_func/std": 0.27283935993909836, "sampling/importance_sampling_ratio/max": 2.4483842849731445, "sampling/importance_sampling_ratio/mean": 0.8248867988586426, "sampling/importance_sampling_ratio/min": 0.05962574481964111, "sampling/sampling_logp_difference/max": 0.7402658462524414, "sampling/sampling_logp_difference/mean": 0.022917624562978745, "step": 66, "step_time": 11.092585265287198 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 834.5, "completions/max_terminated_length": 834.5, "completions/mean_length": 434.6875, "completions/mean_terminated_length": 434.6875, "completions/min_length": 175.0, "completions/min_terminated_length": 175.0, "entropy": 1.1426044255495071, "epoch": 0.006044444444444444, "frac_reward_zero_std": 0.25, "grad_norm": 2.225802835067467, "kl": 0.013302074105013162, "learning_rate": 1.2970351387729872e-07, "loss": 0.027, "num_tokens": 1362414.0, "reward": 1.015625, "reward_std": 0.3343358188867569, "rewards/equation_reward_func/mean": 0.0625, "rewards/equation_reward_func/std": 0.24593468010425568, "rewards/format_reward_func/mean": 0.953125, "rewards/format_reward_func/std": 0.21135568618774414, "sampling/importance_sampling_ratio/max": 2.4642796516418457, "sampling/importance_sampling_ratio/mean": 0.7873663902282715, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.51364666223526, "sampling/sampling_logp_difference/mean": 0.023219610564410686, "step": 68, "step_time": 10.780604753526859 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 959.5, "completions/mean_length": 473.90625, "completions/mean_terminated_length": 456.1612854003906, "completions/min_length": 175.5, "completions/min_terminated_length": 175.5, "entropy": 1.1317069344222546, "epoch": 0.006222222222222222, "frac_reward_zero_std": 0.25, "grad_norm": 0.8320287770406333, "kl": 0.009612397610908374, "learning_rate": 1.1576995658775404e-07, "loss": 0.1579, "num_tokens": 1402544.0, "reward": 0.9375, "reward_std": 0.3952517956495285, "rewards/equation_reward_func/mean": 0.046875, "rewards/equation_reward_func/std": 0.21135568618774414, "rewards/format_reward_func/mean": 0.890625, "rewards/format_reward_func/std": 0.31607766449451447, "sampling/importance_sampling_ratio/max": 2.471542716026306, "sampling/importance_sampling_ratio/mean": 0.7965124845504761, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.6067328453063965, "sampling/sampling_logp_difference/mean": 0.023130498826503754, "step": 70, "step_time": 11.402438224526122 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 980.5, "completions/max_terminated_length": 980.5, "completions/mean_length": 444.0625, "completions/mean_terminated_length": 444.0625, "completions/min_length": 204.0, "completions/min_terminated_length": 204.0, "entropy": 1.1372172087430954, "epoch": 0.0064, "frac_reward_zero_std": 0.25, "grad_norm": 2.599974610451281, "kl": 0.018978118430823088, "learning_rate": 1.0239940674851941e-07, "loss": 0.1768, "num_tokens": 1440692.0, "reward": 0.90625, "reward_std": 0.3873825669288635, "rewards/equation_reward_func/mean": 0.03125, "rewards/equation_reward_func/std": 0.1767766922712326, "rewards/format_reward_func/mean": 0.875, "rewards/format_reward_func/std": 0.33252330124378204, "sampling/importance_sampling_ratio/max": 2.7498836517333984, "sampling/importance_sampling_ratio/mean": 0.9847005903720856, "sampling/importance_sampling_ratio/min": 0.034967128187417984, "sampling/sampling_logp_difference/max": 0.48557257652282715, "sampling/sampling_logp_difference/mean": 0.023576208390295506, "step": 72, "step_time": 11.201861241133884 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 1024.0, "completions/max_terminated_length": 806.5, "completions/mean_length": 495.859375, "completions/mean_terminated_length": 469.6763458251953, "completions/min_length": 194.5, "completions/min_terminated_length": 194.5, "entropy": 1.099432535469532, "epoch": 0.006577777777777778, "frac_reward_zero_std": 0.25, "grad_norm": 0.84127080621119, "kl": 0.010809146391693503, "learning_rate": 8.964794509221507e-08, "loss": 0.006, "num_tokens": 1482227.0, "reward": 0.90625, "reward_std": 0.4278488904237747, "rewards/equation_reward_func/mean": 0.046875, "rewards/equation_reward_func/std": 0.21135568618774414, "rewards/format_reward_func/mean": 0.859375, "rewards/format_reward_func/std": 0.35245639085769653, "sampling/importance_sampling_ratio/max": 2.2808090448379517, "sampling/importance_sampling_ratio/mean": 0.7656073570251465, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.6886538863182068, "sampling/sampling_logp_difference/mean": 0.022277969866991043, "step": 74, "step_time": 11.536407661740668 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 982.0, "completions/max_terminated_length": 961.5, "completions/mean_length": 452.0625, "completions/mean_terminated_length": 443.0055389404297, "completions/min_length": 176.5, "completions/min_terminated_length": 176.5, "entropy": 1.1252660602331161, "epoch": 0.0067555555555555554, "frac_reward_zero_std": 0.5, "grad_norm": 1.0980278842121856, "kl": 0.012123662454541773, "learning_rate": 7.756905568047392e-08, "loss": -0.0284, "num_tokens": 1520855.0, "reward": 0.984375, "reward_std": 0.37185215950012207, "rewards/equation_reward_func/mean": 0.0625, "rewards/equation_reward_func/std": 0.2364606335759163, "rewards/format_reward_func/mean": 0.921875, "rewards/format_reward_func/std": 0.2563937231898308, "sampling/importance_sampling_ratio/max": 2.532285451889038, "sampling/importance_sampling_ratio/mean": 0.7447502017021179, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.8389689922332764, "sampling/sampling_logp_difference/mean": 0.022948664613068104, "step": 76, "step_time": 11.632032112334855 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 937.0, "completions/max_terminated_length": 863.5, "completions/mean_length": 457.578125, "completions/mean_terminated_length": 449.3684387207031, "completions/min_length": 199.5, "completions/min_terminated_length": 199.5, "entropy": 1.1281827799975872, "epoch": 0.006933333333333333, "frac_reward_zero_std": 0.375, "grad_norm": 0.7703809090962176, "kl": 0.011204674287000671, "learning_rate": 6.621340157319996e-08, "loss": 0.025, "num_tokens": 1559956.0, "reward": 0.984375, "reward_std": 0.26799365133047104, "rewards/equation_reward_func/mean": 0.03125, "rewards/equation_reward_func/std": 0.12296734005212784, "rewards/format_reward_func/mean": 0.953125, "rewards/format_reward_func/std": 0.21135568618774414, "sampling/importance_sampling_ratio/max": 2.72747004032135, "sampling/importance_sampling_ratio/mean": 0.7423470318317413, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.5065812468528748, "sampling/sampling_logp_difference/mean": 0.023134512826800346, "step": 78, "step_time": 11.912406253744848 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 917.5, "completions/max_terminated_length": 906.0, "completions/mean_length": 449.234375, "completions/mean_terminated_length": 439.91380310058594, "completions/min_length": 168.5, "completions/min_terminated_length": 168.5, "entropy": 1.1397205702960491, "epoch": 0.0071111111111111115, "frac_reward_zero_std": 0.25, "grad_norm": 1.157999149874959, "kl": 0.010615026520099491, "learning_rate": 5.5628612330087724e-08, "loss": 0.1383, "num_tokens": 1598475.0, "reward": 0.96875, "reward_std": 0.348248615860939, "rewards/equation_reward_func/mean": 0.046875, "rewards/equation_reward_func/std": 0.1480722874403, "rewards/format_reward_func/mean": 0.921875, "rewards/format_reward_func/std": 0.2710396274924278, "sampling/importance_sampling_ratio/max": 2.596957802772522, "sampling/importance_sampling_ratio/mean": 0.781058132648468, "sampling/importance_sampling_ratio/min": 0.047918595373630524, "sampling/sampling_logp_difference/max": 0.6418575942516327, "sampling/sampling_logp_difference/mean": 0.023468288592994213, "step": 80, "step_time": 13.718183192540891 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 1024.0, "completions/max_terminated_length": 944.5, "completions/mean_length": 480.703125, "completions/mean_terminated_length": 453.83656311035156, "completions/min_length": 184.5, "completions/min_terminated_length": 184.5, "entropy": 1.099193848669529, "epoch": 0.007288888888888889, "frac_reward_zero_std": 0.5, "grad_norm": 0.6986674805632416, "kl": 0.01772310130763799, "learning_rate": 4.5859084235697235e-08, "loss": 0.0228, "num_tokens": 1639032.0, "reward": 1.0, "reward_std": 0.3549068421125412, "rewards/equation_reward_func/mean": 0.0625, "rewards/equation_reward_func/std": 0.2364606335759163, "rewards/format_reward_func/mean": 0.9375, "rewards/format_reward_func/std": 0.24593468010425568, "sampling/importance_sampling_ratio/max": 2.1860596537590027, "sampling/importance_sampling_ratio/mean": 0.8112323880195618, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.4584404528141022, "sampling/sampling_logp_difference/mean": 0.022659837268292904, "step": 82, "step_time": 12.119231592281722 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1011.5, "completions/max_terminated_length": 1011.5, "completions/mean_length": 462.8125, "completions/mean_terminated_length": 462.8125, "completions/min_length": 165.5, "completions/min_terminated_length": 165.5, "entropy": 1.1106774359941483, "epoch": 0.007466666666666667, "frac_reward_zero_std": 0.5, "grad_norm": 0.4259301507183561, "kl": 0.010471002315171063, "learning_rate": 3.6945794086007705e-08, "loss": -0.0222, "num_tokens": 1678484.0, "reward": 0.984375, "reward_std": 0.27769785374403, "rewards/equation_reward_func/mean": 0.03125, "rewards/equation_reward_func/std": 0.12296734005212784, "rewards/format_reward_func/mean": 0.953125, "rewards/format_reward_func/std": 0.21135568618774414, "sampling/importance_sampling_ratio/max": 2.785693407058716, "sampling/importance_sampling_ratio/mean": 0.7308302521705627, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.5246871709823608, "sampling/sampling_logp_difference/mean": 0.023393068462610245, "step": 84, "step_time": 11.7816762131406 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 868.5, "completions/mean_length": 492.796875, "completions/mean_terminated_length": 457.38336181640625, "completions/min_length": 166.0, "completions/min_terminated_length": 166.0, "entropy": 1.1102127395570278, "epoch": 0.007644444444444444, "frac_reward_zero_std": 0.125, "grad_norm": 1.0147696038970972, "kl": 0.013029435416683555, "learning_rate": 2.892612731749414e-08, "loss": 0.1513, "num_tokens": 1719751.0, "reward": 0.953125, "reward_std": 0.4890725910663605, "rewards/equation_reward_func/mean": 0.109375, "rewards/equation_reward_func/std": 0.31607766449451447, "rewards/format_reward_func/mean": 0.84375, "rewards/format_reward_func/std": 0.3689020276069641, "sampling/importance_sampling_ratio/max": 2.4325342178344727, "sampling/importance_sampling_ratio/mean": 0.8387036323547363, "sampling/importance_sampling_ratio/min": 0.044968899339437485, "sampling/sampling_logp_difference/max": 0.5210357308387756, "sampling/sampling_logp_difference/mean": 0.023624008521437645, "step": 86, "step_time": 11.889815866597928 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 885.0, "completions/max_terminated_length": 885.0, "completions/mean_length": 510.671875, "completions/mean_terminated_length": 510.671875, "completions/min_length": 149.5, "completions/min_terminated_length": 149.5, "entropy": 1.1568988785147667, "epoch": 0.007822222222222222, "frac_reward_zero_std": 0.5, "grad_norm": 1.0407740082069517, "kl": 0.01194384231348522, "learning_rate": 2.183372119961499e-08, "loss": -0.0151, "num_tokens": 1762194.0, "reward": 1.03125, "reward_std": 0.3549068421125412, "rewards/equation_reward_func/mean": 0.078125, "rewards/equation_reward_func/std": 0.2710396274924278, "rewards/format_reward_func/mean": 0.953125, "rewards/format_reward_func/std": 0.21135568618774414, "sampling/importance_sampling_ratio/max": 2.4364527463912964, "sampling/importance_sampling_ratio/mean": 0.7585195302963257, "sampling/importance_sampling_ratio/min": 0.021672163158655167, "sampling/sampling_logp_difference/max": 0.5484427809715271, "sampling/sampling_logp_difference/mean": 0.023598327301442623, "step": 88, "step_time": 11.175127660972066 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 1024.0, "completions/max_terminated_length": 786.5, "completions/mean_length": 464.078125, "completions/mean_terminated_length": 437.3440856933594, "completions/min_length": 165.0, "completions/min_terminated_length": 165.0, "entropy": 1.149987280368805, "epoch": 0.008, "frac_reward_zero_std": 0.25, "grad_norm": 1.236076165142051, "kl": 0.015072842070367187, "learning_rate": 1.5698323748414122e-08, "loss": 0.17, "num_tokens": 1801639.0, "reward": 0.96875, "reward_std": 0.3917950540781021, "rewards/equation_reward_func/mean": 0.078125, "rewards/equation_reward_func/std": 0.2710396274924278, "rewards/format_reward_func/mean": 0.890625, "rewards/format_reward_func/std": 0.31607766449451447, "sampling/importance_sampling_ratio/max": 2.3776625394821167, "sampling/importance_sampling_ratio/mean": 0.9204491376876831, "sampling/importance_sampling_ratio/min": 0.040661390870809555, "sampling/sampling_logp_difference/max": 0.4495445489883423, "sampling/sampling_logp_difference/mean": 0.023287806659936905, "step": 90, "step_time": 11.483706569299102 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 940.5, "completions/max_terminated_length": 940.5, "completions/mean_length": 455.109375, "completions/mean_terminated_length": 455.109375, "completions/min_length": 176.0, "completions/min_terminated_length": 176.0, "entropy": 1.1072595864534378, "epoch": 0.008177777777777779, "frac_reward_zero_std": 0.5, "grad_norm": 1.0362568226042652, "kl": 0.012551067571621388, "learning_rate": 1.054566895300324e-08, "loss": 0.0268, "num_tokens": 1840558.0, "reward": 0.984375, "reward_std": 0.3083590194582939, "rewards/equation_reward_func/mean": 0.046875, "rewards/equation_reward_func/std": 0.1480722874403, "rewards/format_reward_func/mean": 0.9375, "rewards/format_reward_func/std": 0.2364606335759163, "sampling/importance_sampling_ratio/max": 2.757253050804138, "sampling/importance_sampling_ratio/mean": 0.8551563322544098, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.5598160922527313, "sampling/sampling_logp_difference/mean": 0.023077418096363544, "step": 92, "step_time": 11.143339851987548 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 966.5, "completions/max_terminated_length": 867.5, "completions/mean_length": 463.78125, "completions/mean_terminated_length": 455.2777099609375, "completions/min_length": 182.0, "completions/min_terminated_length": 182.0, "entropy": 1.143265001475811, "epoch": 0.008355555555555555, "frac_reward_zero_std": 0.625, "grad_norm": 0.0025484313004221675, "kl": 0.0112493826309219, "learning_rate": 6.397368838268496e-09, "loss": 0.0941, "num_tokens": 1880016.0, "reward": 0.953125, "reward_std": 0.19507546722888947, "rewards/equation_reward_func/mean": 0.015625, "rewards/equation_reward_func/std": 0.0883883461356163, "rewards/format_reward_func/mean": 0.9375, "rewards/format_reward_func/std": 0.16800537705421448, "sampling/importance_sampling_ratio/max": 2.5143080949783325, "sampling/importance_sampling_ratio/mean": 0.8339663445949554, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.6856141686439514, "sampling/sampling_logp_difference/mean": 0.023781022988259792, "step": 94, "step_time": 11.370905907824636 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 998.0, "completions/max_terminated_length": 946.5, "completions/mean_length": 464.390625, "completions/mean_terminated_length": 455.1602783203125, "completions/min_length": 192.5, "completions/min_terminated_length": 192.5, "entropy": 1.1343590430915356, "epoch": 0.008533333333333334, "frac_reward_zero_std": 0.25, "grad_norm": 2.0803027651220756, "kl": 0.01363659166963771, "learning_rate": 3.2708228165273244e-09, "loss": 0.1137, "num_tokens": 1919505.0, "reward": 1.015625, "reward_std": 0.37185215950012207, "rewards/equation_reward_func/mean": 0.078125, "rewards/equation_reward_func/std": 0.2563937231898308, "rewards/format_reward_func/mean": 0.9375, "rewards/format_reward_func/std": 0.2364606335759163, "sampling/importance_sampling_ratio/max": 2.7199188470840454, "sampling/importance_sampling_ratio/mean": 0.7955746948719025, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.569873571395874, "sampling/sampling_logp_difference/mean": 0.0229880353435874, "step": 96, "step_time": 11.320387034327723 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 963.0, "completions/max_terminated_length": 919.0, "completions/mean_length": 465.53125, "completions/mean_terminated_length": 456.23638916015625, "completions/min_length": 199.0, "completions/min_terminated_length": 199.0, "entropy": 1.1051006130874157, "epoch": 0.00871111111111111, "frac_reward_zero_std": 0.5, "grad_norm": 1.38482587569594, "kl": 0.010673947690520436, "learning_rate": 1.1791447083465133e-09, "loss": 0.034, "num_tokens": 1959051.0, "reward": 0.984375, "reward_std": 0.26799365133047104, "rewards/equation_reward_func/mean": 0.03125, "rewards/equation_reward_func/std": 0.12296734005212784, "rewards/format_reward_func/mean": 0.953125, "rewards/format_reward_func/std": 0.21135568618774414, "sampling/importance_sampling_ratio/max": 2.532159209251404, "sampling/importance_sampling_ratio/mean": 0.8075034618377686, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.4299898147583008, "sampling/sampling_logp_difference/mean": 0.02303027454763651, "step": 98, "step_time": 11.303778560250066 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 915.5, "completions/max_terminated_length": 915.5, "completions/mean_length": 452.484375, "completions/mean_terminated_length": 452.484375, "completions/min_length": 153.0, "completions/min_terminated_length": 153.0, "entropy": 1.0667904503643513, "epoch": 0.008888888888888889, "frac_reward_zero_std": 0.625, "grad_norm": 0.8448295748463005, "kl": 0.018678127584280446, "learning_rate": 1.3110773862126667e-10, "loss": 0.0456, "num_tokens": 1997698.0, "reward": 1.015625, "reward_std": 0.2710396274924278, "rewards/equation_reward_func/mean": 0.046875, "rewards/equation_reward_func/std": 0.1480722874403, "rewards/format_reward_func/mean": 0.96875, "rewards/format_reward_func/std": 0.12296734005212784, "sampling/importance_sampling_ratio/max": 2.723245859146118, "sampling/importance_sampling_ratio/mean": 0.8844414949417114, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.6081928014755249, "sampling/sampling_logp_difference/mean": 0.02220182679593563, "step": 100, "step_time": 11.397719835629687 }, { "epoch": 0.008888888888888889, "step": 100, "total_flos": 0.0, "train_loss": 0.06438189143314958, "train_runtime": 1344.9766, "train_samples_per_second": 2.379, "train_steps_per_second": 0.074 } ], "logging_steps": 2, "max_steps": 100, "num_input_tokens_seen": 1997698, "num_train_epochs": 1, "save_steps": 25, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }