{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 565, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 820.0, "completions/max_terminated_length": 820.0, "completions/mean_length": 262.59375, "completions/mean_terminated_length": 262.59375, "completions/min_length": 85.0, "completions/min_terminated_length": 85.0, "entropy": 0.2447393834590912, "epoch": 0.0017699115044247787, "frac_reward_zero_std": 0.75, "grad_norm": 1.5595464151479566, "kl": 0.0, "learning_rate": 0.0, "loss": 0.0239, "num_tokens": 26470.0, "reward": 0.40625, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.40625, "rewards/decision_reward_func/std": 0.9209855198860168, "sampling/importance_sampling_ratio/max": 1.600738286972046, "sampling/importance_sampling_ratio/mean": 0.9992677569389343, "sampling/importance_sampling_ratio/min": 0.40164387226104736, "sampling/sampling_logp_difference/max": 0.9121894836425781, "sampling/sampling_logp_difference/mean": 0.014379385858774185, "step": 1 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2894.0, "completions/max_terminated_length": 2894.0, "completions/mean_length": 818.125, "completions/mean_terminated_length": 818.125, "completions/min_length": 69.0, "completions/min_terminated_length": 69.0, "entropy": 0.22707729041576385, "epoch": 0.0035398230088495575, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 4.4247787610619464e-09, "loss": 0.0, "num_tokens": 88990.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.7312631607055664, "sampling/importance_sampling_ratio/mean": 1.0000094175338745, "sampling/importance_sampling_ratio/min": 0.5038022994995117, "sampling/sampling_logp_difference/max": 0.6855714321136475, "sampling/sampling_logp_difference/mean": 0.012125600129365921, "step": 2 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3438.0, "completions/max_terminated_length": 3438.0, "completions/mean_length": 1156.875, "completions/mean_terminated_length": 1156.875, "completions/min_length": 206.0, "completions/min_terminated_length": 206.0, "entropy": 0.2879234552383423, "epoch": 0.005309734513274336, "frac_reward_zero_std": 0.5, "grad_norm": 0.621693786753474, "kl": 0.0006478896248154342, "learning_rate": 8.849557522123893e-09, "loss": 0.0486, "num_tokens": 176710.0, "reward": 0.84375, "reward_std": 0.34860679507255554, "rewards/decision_reward_func/mean": 0.84375, "rewards/decision_reward_func/std": 0.5409794449806213, "sampling/importance_sampling_ratio/max": 1.7699054479599, "sampling/importance_sampling_ratio/mean": 1.000090479850769, "sampling/importance_sampling_ratio/min": 0.11653055995702744, "sampling/sampling_logp_difference/max": 2.149601697921753, "sampling/sampling_logp_difference/mean": 0.01389176957309246, "step": 3 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1679.0, "completions/max_terminated_length": 1679.0, "completions/mean_length": 467.734375, "completions/mean_terminated_length": 467.734375, "completions/min_length": 162.0, "completions/min_terminated_length": 162.0, "entropy": 0.3299303650856018, "epoch": 0.007079646017699115, "frac_reward_zero_std": 0.75, "grad_norm": 0.8180003694970631, "kl": 0.0006852427031844854, "learning_rate": 1.327433628318584e-08, "loss": 0.0495, "num_tokens": 218885.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0003230571746826, "sampling/importance_sampling_ratio/min": 0.292945921421051, "sampling/sampling_logp_difference/max": 1.2277672290802002, "sampling/sampling_logp_difference/mean": 0.015444125048816204, "step": 4 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2167.0, "completions/max_terminated_length": 2167.0, "completions/mean_length": 433.453125, "completions/mean_terminated_length": 433.453125, "completions/min_length": 84.0, "completions/min_terminated_length": 84.0, "entropy": 0.17077282071113586, "epoch": 0.008849557522123894, "frac_reward_zero_std": 0.75, "grad_norm": 0.9764477145240158, "kl": 0.0006499545415863395, "learning_rate": 1.7699115044247786e-08, "loss": -0.0156, "num_tokens": 258066.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9996541738510132, "sampling/importance_sampling_ratio/min": 0.6060442924499512, "sampling/sampling_logp_difference/max": 1.1632978916168213, "sampling/sampling_logp_difference/mean": 0.011157813481986523, "step": 5 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1753.0, "completions/max_terminated_length": 1753.0, "completions/mean_length": 392.96875, "completions/mean_terminated_length": 392.96875, "completions/min_length": 85.0, "completions/min_terminated_length": 85.0, "entropy": 0.16383656859397888, "epoch": 0.010619469026548672, "frac_reward_zero_std": 0.5, "grad_norm": 1.4558761916855691, "kl": 0.0007315415423363447, "learning_rate": 2.2123893805309735e-08, "loss": 0.0112, "num_tokens": 294112.0, "reward": 0.15625, "reward_std": 0.34860679507255554, "rewards/decision_reward_func/mean": 0.15625, "rewards/decision_reward_func/std": 0.9955257177352905, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0008143186569214, "sampling/importance_sampling_ratio/min": 0.36688920855522156, "sampling/sampling_logp_difference/max": 1.0026954412460327, "sampling/sampling_logp_difference/mean": 0.012575799599289894, "step": 6 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1527.0, "completions/max_terminated_length": 1527.0, "completions/mean_length": 386.09375, "completions/mean_terminated_length": 386.09375, "completions/min_length": 203.0, "completions/min_terminated_length": 203.0, "entropy": 0.2359360009431839, "epoch": 0.012389380530973451, "frac_reward_zero_std": 0.5, "grad_norm": 1.2755841013110207, "kl": 0.0007870513945817947, "learning_rate": 2.654867256637168e-08, "loss": -0.0535, "num_tokens": 333334.0, "reward": 0.53125, "reward_std": 0.29578250646591187, "rewards/decision_reward_func/mean": 0.53125, "rewards/decision_reward_func/std": 0.8539125919342041, "sampling/importance_sampling_ratio/max": 1.8721435070037842, "sampling/importance_sampling_ratio/mean": 1.000199556350708, "sampling/importance_sampling_ratio/min": 0.4781617224216461, "sampling/sampling_logp_difference/max": 0.7378063201904297, "sampling/sampling_logp_difference/mean": 0.014220990240573883, "step": 7 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1172.0, "completions/max_terminated_length": 1172.0, "completions/mean_length": 286.640625, "completions/mean_terminated_length": 286.640625, "completions/min_length": 166.0, "completions/min_terminated_length": 166.0, "entropy": 0.38627898693084717, "epoch": 0.01415929203539823, "frac_reward_zero_std": 0.5, "grad_norm": 1.8363533600375614, "kl": 0.0008938107639551163, "learning_rate": 3.0973451327433626e-08, "loss": 0.0361, "num_tokens": 362655.0, "reward": 0.5, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.8012892007827759, "sampling/importance_sampling_ratio/mean": 1.0002480745315552, "sampling/importance_sampling_ratio/min": 0.42482879757881165, "sampling/sampling_logp_difference/max": 0.8560690879821777, "sampling/sampling_logp_difference/mean": 0.02014569193124771, "step": 8 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 372.0, "completions/max_terminated_length": 372.0, "completions/mean_length": 213.75, "completions/mean_terminated_length": 213.75, "completions/min_length": 69.0, "completions/min_terminated_length": 69.0, "entropy": 0.13230177760124207, "epoch": 0.01592920353982301, "frac_reward_zero_std": 1.0, "grad_norm": 0.012042533990998158, "kl": 0.0011251253308728337, "learning_rate": 3.539823008849557e-08, "loss": 0.0, "num_tokens": 386143.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.8962767124176025, "sampling/importance_sampling_ratio/mean": 0.9996401071548462, "sampling/importance_sampling_ratio/min": 0.4186244606971741, "sampling/sampling_logp_difference/max": 0.8707810640335083, "sampling/sampling_logp_difference/mean": 0.012253036722540855, "step": 9 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1176.0, "completions/max_terminated_length": 1176.0, "completions/mean_length": 408.96875, "completions/mean_terminated_length": 408.96875, "completions/min_length": 90.0, "completions/min_terminated_length": 90.0, "entropy": 0.20597517490386963, "epoch": 0.017699115044247787, "frac_reward_zero_std": 0.5, "grad_norm": 1.4611894627130437, "kl": 0.0007102562813088298, "learning_rate": 3.982300884955752e-08, "loss": 0.0506, "num_tokens": 423997.0, "reward": 0.28125, "reward_std": 0.4629635810852051, "rewards/decision_reward_func/mean": 0.28125, "rewards/decision_reward_func/std": 0.9672207236289978, "sampling/importance_sampling_ratio/max": 1.6281522512435913, "sampling/importance_sampling_ratio/mean": 1.0003232955932617, "sampling/importance_sampling_ratio/min": 0.0888504907488823, "sampling/sampling_logp_difference/max": 2.42080020904541, "sampling/sampling_logp_difference/mean": 0.013152622617781162, "step": 10 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2374.0, "completions/max_terminated_length": 2374.0, "completions/mean_length": 383.109375, "completions/mean_terminated_length": 383.109375, "completions/min_length": 169.0, "completions/min_terminated_length": 169.0, "entropy": 0.24951666593551636, "epoch": 0.019469026548672566, "frac_reward_zero_std": 0.75, "grad_norm": 0.9576815857490207, "kl": 0.0005503022694028914, "learning_rate": 4.424778761061947e-08, "loss": -0.0, "num_tokens": 458500.0, "reward": 0.5625, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.5625, "rewards/decision_reward_func/std": 0.8333333730697632, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0000211000442505, "sampling/importance_sampling_ratio/min": 0.24836736917495728, "sampling/sampling_logp_difference/max": 1.3928463459014893, "sampling/sampling_logp_difference/mean": 0.01329069398343563, "step": 11 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2513.0, "completions/max_terminated_length": 2513.0, "completions/mean_length": 387.390625, "completions/mean_terminated_length": 387.390625, "completions/min_length": 80.0, "completions/min_terminated_length": 80.0, "entropy": 0.1801387071609497, "epoch": 0.021238938053097345, "frac_reward_zero_std": 1.0, "grad_norm": 0.02991785137611809, "kl": 0.0009193093865178525, "learning_rate": 4.8672566371681415e-08, "loss": 0.0, "num_tokens": 493597.0, "reward": 0.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0000786781311035, "sampling/importance_sampling_ratio/min": 0.09502150863409042, "sampling/sampling_logp_difference/max": 2.353652000427246, "sampling/sampling_logp_difference/mean": 0.01282397098839283, "step": 12 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3048.0, "completions/max_terminated_length": 3048.0, "completions/mean_length": 556.90625, "completions/mean_terminated_length": 556.90625, "completions/min_length": 165.0, "completions/min_terminated_length": 165.0, "entropy": 0.2227586954832077, "epoch": 0.023008849557522124, "frac_reward_zero_std": 0.5, "grad_norm": 0.9056433896268239, "kl": 0.0006606071256101131, "learning_rate": 5.309734513274336e-08, "loss": 0.0231, "num_tokens": 540599.0, "reward": 0.875, "reward_std": 0.3265564441680908, "rewards/decision_reward_func/mean": 0.875, "rewards/decision_reward_func/std": 0.48795005679130554, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9997807741165161, "sampling/importance_sampling_ratio/min": 0.5085330009460449, "sampling/sampling_logp_difference/max": 0.7273645401000977, "sampling/sampling_logp_difference/mean": 0.012460462749004364, "step": 13 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1563.0, "completions/max_terminated_length": 1563.0, "completions/mean_length": 371.8125, "completions/mean_terminated_length": 371.8125, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "entropy": 0.3668808937072754, "epoch": 0.024778761061946902, "frac_reward_zero_std": 1.0, "grad_norm": 0.005794712959171329, "kl": 0.0006699314690195024, "learning_rate": 5.7522123893805306e-08, "loss": 0.0, "num_tokens": 577851.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.755883812904358, "sampling/importance_sampling_ratio/mean": 0.9999116659164429, "sampling/importance_sampling_ratio/min": 0.22802656888961792, "sampling/sampling_logp_difference/max": 1.4782931804656982, "sampling/sampling_logp_difference/mean": 0.01673159934580326, "step": 14 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3462.0, "completions/max_terminated_length": 3462.0, "completions/mean_length": 695.1875, "completions/mean_terminated_length": 695.1875, "completions/min_length": 197.0, "completions/min_terminated_length": 197.0, "entropy": 0.2063179463148117, "epoch": 0.02654867256637168, "frac_reward_zero_std": 0.75, "grad_norm": 0.64290960709478, "kl": 0.0006773715140298009, "learning_rate": 6.194690265486725e-08, "loss": -0.0225, "num_tokens": 635543.0, "reward": 0.0625, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.0625, "rewards/decision_reward_func/std": 1.0059348344802856, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.000427007675171, "sampling/importance_sampling_ratio/min": 0.29496005177497864, "sampling/sampling_logp_difference/max": 1.2209153175354004, "sampling/sampling_logp_difference/mean": 0.011625310406088829, "step": 15 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2659.0, "completions/max_terminated_length": 2659.0, "completions/mean_length": 461.609375, "completions/mean_terminated_length": 461.609375, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "entropy": 0.15815351903438568, "epoch": 0.02831858407079646, "frac_reward_zero_std": 0.75, "grad_norm": 0.8293898860703909, "kl": 0.0005979533307254314, "learning_rate": 6.63716814159292e-08, "loss": 0.0093, "num_tokens": 676302.0, "reward": 0.46875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0003345012664795, "sampling/importance_sampling_ratio/min": 0.37565356492996216, "sampling/sampling_logp_difference/max": 1.6588034629821777, "sampling/sampling_logp_difference/mean": 0.011140658520162106, "step": 16 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3518.0, "completions/max_terminated_length": 3518.0, "completions/mean_length": 930.796875, "completions/mean_terminated_length": 930.796875, "completions/min_length": 155.0, "completions/min_terminated_length": 155.0, "entropy": 0.20787391066551208, "epoch": 0.03008849557522124, "frac_reward_zero_std": 0.25, "grad_norm": 0.886145803981697, "kl": 0.00047615449875593185, "learning_rate": 7.079646017699114e-08, "loss": 0.0709, "num_tokens": 746417.0, "reward": 0.4375, "reward_std": 0.42078250646591187, "rewards/decision_reward_func/mean": 0.4375, "rewards/decision_reward_func/std": 0.9063270092010498, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0001921653747559, "sampling/importance_sampling_ratio/min": 0.23708303272724152, "sampling/sampling_logp_difference/max": 1.439344882965088, "sampling/sampling_logp_difference/mean": 0.011404094286262989, "step": 17 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4045.0, "completions/max_terminated_length": 4045.0, "completions/mean_length": 841.34375, "completions/mean_terminated_length": 841.34375, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "entropy": 0.13792183995246887, "epoch": 0.03185840707964602, "frac_reward_zero_std": 0.5, "grad_norm": 0.7064945396733404, "kl": 0.00036049369373358786, "learning_rate": 7.52212389380531e-08, "loss": 0.0229, "num_tokens": 810647.0, "reward": -0.125, "reward_std": 0.36435678601264954, "rewards/decision_reward_func/mean": -0.125, "rewards/decision_reward_func/std": 1.0, "sampling/importance_sampling_ratio/max": 1.9877030849456787, "sampling/importance_sampling_ratio/mean": 0.9998019933700562, "sampling/importance_sampling_ratio/min": 0.28008216619491577, "sampling/sampling_logp_difference/max": 1.2726722955703735, "sampling/sampling_logp_difference/mean": 0.00801979098469019, "step": 18 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3158.0, "completions/max_terminated_length": 3158.0, "completions/mean_length": 726.546875, "completions/mean_terminated_length": 726.546875, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "entropy": 0.18931840360164642, "epoch": 0.033628318584070796, "frac_reward_zero_std": 0.75, "grad_norm": 0.2779996347658486, "kl": 0.0006244736723601818, "learning_rate": 7.964601769911503e-08, "loss": -0.0137, "num_tokens": 868794.0, "reward": 0.46875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.000356674194336, "sampling/importance_sampling_ratio/min": 0.15183261036872864, "sampling/sampling_logp_difference/max": 1.8849766254425049, "sampling/sampling_logp_difference/mean": 0.010994978249073029, "step": 19 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 715.0, "completions/max_terminated_length": 715.0, "completions/mean_length": 268.40625, "completions/mean_terminated_length": 268.40625, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "entropy": 0.1918586790561676, "epoch": 0.035398230088495575, "frac_reward_zero_std": 0.75, "grad_norm": 1.690897672422581, "kl": 0.0007604224956594408, "learning_rate": 8.4070796460177e-08, "loss": 0.0117, "num_tokens": 895908.0, "reward": 0.9375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.9375, "rewards/decision_reward_func/std": 0.35073620080947876, "sampling/importance_sampling_ratio/max": 1.9094728231430054, "sampling/importance_sampling_ratio/mean": 0.9998387694358826, "sampling/importance_sampling_ratio/min": 0.44529712200164795, "sampling/sampling_logp_difference/max": 0.8090136051177979, "sampling/sampling_logp_difference/mean": 0.012725591659545898, "step": 20 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1197.0, "completions/max_terminated_length": 1197.0, "completions/mean_length": 383.859375, "completions/mean_terminated_length": 383.859375, "completions/min_length": 156.0, "completions/min_terminated_length": 156.0, "entropy": 0.2141551971435547, "epoch": 0.03716814159292035, "frac_reward_zero_std": 0.5, "grad_norm": 1.274939211696711, "kl": 0.0006758609088137746, "learning_rate": 8.849557522123894e-08, "loss": 0.0166, "num_tokens": 930827.0, "reward": 0.6875, "reward_std": 0.4577302038669586, "rewards/decision_reward_func/mean": 0.6875, "rewards/decision_reward_func/std": 0.7319250702857971, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0001835823059082, "sampling/importance_sampling_ratio/min": 0.41596999764442444, "sampling/sampling_logp_difference/max": 0.8771421909332275, "sampling/sampling_logp_difference/mean": 0.013427350670099258, "step": 21 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 5000.0, "completions/max_terminated_length": 4887.0, "completions/mean_length": 848.6875, "completions/mean_terminated_length": 782.793701171875, "completions/min_length": 133.0, "completions/min_terminated_length": 133.0, "entropy": 0.26531967520713806, "epoch": 0.03893805309734513, "frac_reward_zero_std": 0.75, "grad_norm": 0.39330840488484464, "kl": 0.0007933778688311577, "learning_rate": 9.292035398230089e-08, "loss": -0.0005, "num_tokens": 997831.0, "reward": 0.53125, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.53125, "rewards/decision_reward_func/std": 0.8539125919342041, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0000817775726318, "sampling/importance_sampling_ratio/min": 0.28262588381767273, "sampling/sampling_logp_difference/max": 1.2636312246322632, "sampling/sampling_logp_difference/mean": 0.014393738470971584, "step": 22 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 725.0, "completions/max_terminated_length": 725.0, "completions/mean_length": 310.171875, "completions/mean_terminated_length": 310.171875, "completions/min_length": 86.0, "completions/min_terminated_length": 86.0, "entropy": 0.2961429953575134, "epoch": 0.04070796460176991, "frac_reward_zero_std": 0.75, "grad_norm": 1.1328112198297104, "kl": 0.0008447502041235566, "learning_rate": 9.734513274336283e-08, "loss": 0.0655, "num_tokens": 1028050.0, "reward": 0.71875, "reward_std": 0.2561737596988678, "rewards/decision_reward_func/mean": 0.71875, "rewards/decision_reward_func/std": 0.7007648944854736, "sampling/importance_sampling_ratio/max": 1.5744941234588623, "sampling/importance_sampling_ratio/mean": 1.0000591278076172, "sampling/importance_sampling_ratio/min": 0.47894352674484253, "sampling/sampling_logp_difference/max": 0.7361726760864258, "sampling/sampling_logp_difference/mean": 0.0162695050239563, "step": 23 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2986.0, "completions/max_terminated_length": 2986.0, "completions/mean_length": 555.703125, "completions/mean_terminated_length": 555.703125, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "entropy": 0.19198361039161682, "epoch": 0.04247787610619469, "frac_reward_zero_std": 0.75, "grad_norm": 0.760094823964093, "kl": 0.0008954770746640861, "learning_rate": 1.0176991150442478e-07, "loss": 0.019, "num_tokens": 1074495.0, "reward": 0.78125, "reward_std": 0.2561737596988678, "rewards/decision_reward_func/mean": 0.78125, "rewards/decision_reward_func/std": 0.6291528940200806, "sampling/importance_sampling_ratio/max": 1.697697401046753, "sampling/importance_sampling_ratio/mean": 0.9993278384208679, "sampling/importance_sampling_ratio/min": 0.20796842873096466, "sampling/sampling_logp_difference/max": 1.570369005203247, "sampling/sampling_logp_difference/mean": 0.012709707021713257, "step": 24 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2839.0, "completions/max_terminated_length": 2839.0, "completions/mean_length": 349.921875, "completions/mean_terminated_length": 349.921875, "completions/min_length": 166.0, "completions/min_terminated_length": 166.0, "entropy": 0.26390600204467773, "epoch": 0.04424778761061947, "frac_reward_zero_std": 0.75, "grad_norm": 0.8604528017064463, "kl": 0.0007257124525494874, "learning_rate": 1.0619469026548672e-07, "loss": 0.0022, "num_tokens": 1109322.0, "reward": 0.9375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.9375, "rewards/decision_reward_func/std": 0.35073620080947876, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0000317096710205, "sampling/importance_sampling_ratio/min": 0.317127525806427, "sampling/sampling_logp_difference/max": 1.148451328277588, "sampling/sampling_logp_difference/mean": 0.014507940039038658, "step": 25 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1704.0, "completions/max_terminated_length": 1704.0, "completions/mean_length": 605.8125, "completions/mean_terminated_length": 605.8125, "completions/min_length": 201.0, "completions/min_terminated_length": 201.0, "entropy": 0.2048119753599167, "epoch": 0.04601769911504425, "frac_reward_zero_std": 0.25, "grad_norm": 1.195772641651234, "kl": 0.0005360489012673497, "learning_rate": 1.1061946902654867e-07, "loss": 0.0918, "num_tokens": 1159310.0, "reward": 0.28125, "reward_std": 0.5539814233779907, "rewards/decision_reward_func/mean": 0.28125, "rewards/decision_reward_func/std": 0.9672207236289978, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9997979402542114, "sampling/importance_sampling_ratio/min": 0.36839407682418823, "sampling/sampling_logp_difference/max": 0.9986021518707275, "sampling/sampling_logp_difference/mean": 0.011539291590452194, "step": 26 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1722.0, "completions/max_terminated_length": 1722.0, "completions/mean_length": 407.15625, "completions/mean_terminated_length": 407.15625, "completions/min_length": 157.0, "completions/min_terminated_length": 157.0, "entropy": 0.20068798959255219, "epoch": 0.047787610619469026, "frac_reward_zero_std": 0.75, "grad_norm": 0.946855382812821, "kl": 0.0009745345450937748, "learning_rate": 1.1504424778761061e-07, "loss": 0.0009, "num_tokens": 1195448.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0007233619689941, "sampling/importance_sampling_ratio/min": 0.3684752285480499, "sampling/sampling_logp_difference/max": 0.9983818531036377, "sampling/sampling_logp_difference/mean": 0.01374245434999466, "step": 27 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1942.0, "completions/max_terminated_length": 1942.0, "completions/mean_length": 511.0625, "completions/mean_terminated_length": 511.0625, "completions/min_length": 153.0, "completions/min_terminated_length": 153.0, "entropy": 0.22395919263362885, "epoch": 0.049557522123893805, "frac_reward_zero_std": 1.0, "grad_norm": 0.020246243636341525, "kl": 0.0011588368797674775, "learning_rate": 1.1946902654867256e-07, "loss": 0.0, "num_tokens": 1239356.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9995445609092712, "sampling/importance_sampling_ratio/min": 0.18667632341384888, "sampling/sampling_logp_difference/max": 1.6783790588378906, "sampling/sampling_logp_difference/mean": 0.013424037955701351, "step": 28 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3585.0, "completions/max_terminated_length": 3585.0, "completions/mean_length": 661.421875, "completions/mean_terminated_length": 661.421875, "completions/min_length": 154.0, "completions/min_terminated_length": 154.0, "entropy": 0.17851462960243225, "epoch": 0.05132743362831858, "frac_reward_zero_std": 1.0, "grad_norm": 145.73091733393437, "kl": 0.11498642712831497, "learning_rate": 1.238938053097345e-07, "loss": 0.0003, "num_tokens": 1292119.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9997677206993103, "sampling/importance_sampling_ratio/min": 1.027458711178042e-05, "sampling/sampling_logp_difference/max": 11.48583698272705, "sampling/sampling_logp_difference/mean": 0.014424338936805725, "step": 29 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2401.0, "completions/max_terminated_length": 2401.0, "completions/mean_length": 576.84375, "completions/mean_terminated_length": 576.84375, "completions/min_length": 173.0, "completions/min_terminated_length": 173.0, "entropy": 0.3114885687828064, "epoch": 0.05309734513274336, "frac_reward_zero_std": 0.75, "grad_norm": 0.5741695335769373, "kl": 0.000637779594399035, "learning_rate": 1.2831858407079647e-07, "loss": -0.0048, "num_tokens": 1341117.0, "reward": 0.46875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 1.9316154718399048, "sampling/importance_sampling_ratio/mean": 0.9997850656509399, "sampling/importance_sampling_ratio/min": 0.49849677085876465, "sampling/sampling_logp_difference/max": 0.6961581707000732, "sampling/sampling_logp_difference/mean": 0.014712383039295673, "step": 30 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1997.0, "completions/max_terminated_length": 1997.0, "completions/mean_length": 856.15625, "completions/mean_terminated_length": 856.15625, "completions/min_length": 328.0, "completions/min_terminated_length": 328.0, "entropy": 0.17278479039669037, "epoch": 0.05486725663716814, "frac_reward_zero_std": 0.25, "grad_norm": 0.6705832608075616, "kl": 0.0005033203633502126, "learning_rate": 1.327433628318584e-07, "loss": 0.0047, "num_tokens": 1407911.0, "reward": -0.21875, "reward_std": 0.7033873796463013, "rewards/decision_reward_func/mean": -0.21875, "rewards/decision_reward_func/std": 0.983494758605957, "sampling/importance_sampling_ratio/max": 1.8798589706420898, "sampling/importance_sampling_ratio/mean": 0.9998142719268799, "sampling/importance_sampling_ratio/min": 0.1464228332042694, "sampling/sampling_logp_difference/max": 1.9212566614151, "sampling/sampling_logp_difference/mean": 0.009971060790121555, "step": 31 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1207.0, "completions/max_terminated_length": 1207.0, "completions/mean_length": 441.875, "completions/mean_terminated_length": 441.875, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "entropy": 0.28321388363838196, "epoch": 0.05663716814159292, "frac_reward_zero_std": 0.75, "grad_norm": 0.8537219583707908, "kl": 0.0007523877429775894, "learning_rate": 1.3716814159292035e-07, "loss": -0.0384, "num_tokens": 1448095.0, "reward": 0.6875, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.6875, "rewards/decision_reward_func/std": 0.7319250702857971, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.000489592552185, "sampling/importance_sampling_ratio/min": 0.3749425411224365, "sampling/sampling_logp_difference/max": 0.9809825420379639, "sampling/sampling_logp_difference/mean": 0.015449217520654202, "step": 32 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2226.0, "completions/max_terminated_length": 2226.0, "completions/mean_length": 632.953125, "completions/mean_terminated_length": 632.953125, "completions/min_length": 78.0, "completions/min_terminated_length": 78.0, "entropy": 0.21601766347885132, "epoch": 0.0584070796460177, "frac_reward_zero_std": 0.75, "grad_norm": 0.7724593995736468, "kl": 0.0006461909506469965, "learning_rate": 1.4159292035398229e-07, "loss": 0.0849, "num_tokens": 1499756.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.7502726316452026, "sampling/importance_sampling_ratio/mean": 1.0001163482666016, "sampling/importance_sampling_ratio/min": 0.3271975815296173, "sampling/sampling_logp_difference/max": 1.1171910762786865, "sampling/sampling_logp_difference/mean": 0.01224562432616949, "step": 33 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 5000.0, "completions/max_terminated_length": 4934.0, "completions/mean_length": 989.359375, "completions/mean_terminated_length": 859.9838256835938, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "entropy": 0.15647917985916138, "epoch": 0.06017699115044248, "frac_reward_zero_std": 0.5, "grad_norm": 0.688824634674777, "kl": 0.0006051391828805208, "learning_rate": 1.4601769911504425e-07, "loss": 0.0886, "num_tokens": 1574547.0, "reward": 0.46875, "reward_std": 0.3723389506340027, "rewards/decision_reward_func/mean": 0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0001945495605469, "sampling/importance_sampling_ratio/min": 0.07595821470022202, "sampling/sampling_logp_difference/max": 2.5775718688964844, "sampling/sampling_logp_difference/mean": 0.010248854756355286, "step": 34 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 965.0, "completions/max_terminated_length": 965.0, "completions/mean_length": 294.8125, "completions/mean_terminated_length": 294.8125, "completions/min_length": 90.0, "completions/min_terminated_length": 90.0, "entropy": 0.18555918335914612, "epoch": 0.061946902654867256, "frac_reward_zero_std": 0.75, "grad_norm": 1.4371204021731374, "kl": 0.0008803126984275877, "learning_rate": 1.504424778761062e-07, "loss": -0.0154, "num_tokens": 1606855.0, "reward": 0.59375, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.59375, "rewards/decision_reward_func/std": 0.8110105991363525, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9994372725486755, "sampling/importance_sampling_ratio/min": 0.25933316349983215, "sampling/sampling_logp_difference/max": 1.3496416807174683, "sampling/sampling_logp_difference/mean": 0.013926530256867409, "step": 35 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 423.0, "completions/max_terminated_length": 423.0, "completions/mean_length": 175.09375, "completions/mean_terminated_length": 175.09375, "completions/min_length": 73.0, "completions/min_terminated_length": 73.0, "entropy": 0.15468090772628784, "epoch": 0.06371681415929203, "frac_reward_zero_std": 0.75, "grad_norm": 2.2347650690507073, "kl": 0.0012406249297782779, "learning_rate": 1.5486725663716813e-07, "loss": 0.0628, "num_tokens": 1627437.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.9164198637008667, "sampling/importance_sampling_ratio/mean": 0.9998393058776855, "sampling/importance_sampling_ratio/min": 0.3409467041492462, "sampling/sampling_logp_difference/max": 1.0760290622711182, "sampling/sampling_logp_difference/mean": 0.015365189872682095, "step": 36 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 925.0, "completions/max_terminated_length": 925.0, "completions/mean_length": 350.0625, "completions/mean_terminated_length": 350.0625, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "entropy": 0.2835489511489868, "epoch": 0.06548672566371681, "frac_reward_zero_std": 0.75, "grad_norm": 1.5597864782907842, "kl": 0.0008251734543591738, "learning_rate": 1.5929203539823007e-07, "loss": 0.014, "num_tokens": 1664305.0, "reward": 0.40625, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.40625, "rewards/decision_reward_func/std": 0.9209855198860168, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0001027584075928, "sampling/importance_sampling_ratio/min": 0.2961605489253998, "sampling/sampling_logp_difference/max": 1.2168536186218262, "sampling/sampling_logp_difference/mean": 0.01588701829314232, "step": 37 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 891.0, "completions/max_terminated_length": 891.0, "completions/mean_length": 307.9375, "completions/mean_terminated_length": 307.9375, "completions/min_length": 90.0, "completions/min_terminated_length": 90.0, "entropy": 0.2015915811061859, "epoch": 0.06725663716814159, "frac_reward_zero_std": 0.5, "grad_norm": 1.3990133939844236, "kl": 0.0009952208492904902, "learning_rate": 1.6371681415929203e-07, "loss": -0.0664, "num_tokens": 1693821.0, "reward": 0.53125, "reward_std": 0.29578250646591187, "rewards/decision_reward_func/mean": 0.53125, "rewards/decision_reward_func/std": 0.8539125919342041, "sampling/importance_sampling_ratio/max": 1.691675066947937, "sampling/importance_sampling_ratio/mean": 0.9992905855178833, "sampling/importance_sampling_ratio/min": 0.3719075620174408, "sampling/sampling_logp_difference/max": 0.989109992980957, "sampling/sampling_logp_difference/mean": 0.014356608502566814, "step": 38 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1228.0, "completions/max_terminated_length": 1228.0, "completions/mean_length": 240.765625, "completions/mean_terminated_length": 240.765625, "completions/min_length": 74.0, "completions/min_terminated_length": 74.0, "entropy": 0.18796539306640625, "epoch": 0.06902654867256637, "frac_reward_zero_std": 0.75, "grad_norm": 1.7894718519040298, "kl": 0.0010300553403794765, "learning_rate": 1.68141592920354e-07, "loss": -0.23, "num_tokens": 1718622.0, "reward": 0.5625, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.5625, "rewards/decision_reward_func/std": 0.8333333730697632, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0001258850097656, "sampling/importance_sampling_ratio/min": 0.3569100499153137, "sampling/sampling_logp_difference/max": 1.0302715301513672, "sampling/sampling_logp_difference/mean": 0.014103789813816547, "step": 39 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3150.0, "completions/max_terminated_length": 3150.0, "completions/mean_length": 552.484375, "completions/mean_terminated_length": 552.484375, "completions/min_length": 164.0, "completions/min_terminated_length": 164.0, "entropy": 0.1429215371608734, "epoch": 0.07079646017699115, "frac_reward_zero_std": 0.5, "grad_norm": 0.9930222846479335, "kl": 0.0005817434866912663, "learning_rate": 1.725663716814159e-07, "loss": -0.0038, "num_tokens": 1765229.0, "reward": 0.0625, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.0625, "rewards/decision_reward_func/std": 1.0059348344802856, "sampling/importance_sampling_ratio/max": 1.9392271041870117, "sampling/importance_sampling_ratio/mean": 0.9998013377189636, "sampling/importance_sampling_ratio/min": 0.36287516355514526, "sampling/sampling_logp_difference/max": 1.0136964321136475, "sampling/sampling_logp_difference/mean": 0.009485583752393723, "step": 40 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 5000.0, "completions/max_terminated_length": 4451.0, "completions/mean_length": 871.859375, "completions/mean_terminated_length": 806.3333740234375, "completions/min_length": 80.0, "completions/min_terminated_length": 80.0, "entropy": 0.3113095760345459, "epoch": 0.07256637168141593, "frac_reward_zero_std": 0.75, "grad_norm": 0.6985815121362317, "kl": 0.0005374866304919124, "learning_rate": 1.7699115044247788e-07, "loss": 0.1372, "num_tokens": 1834404.0, "reward": 0.4375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.4375, "rewards/decision_reward_func/std": 0.9063270092010498, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9997199773788452, "sampling/importance_sampling_ratio/min": 0.3588027060031891, "sampling/sampling_logp_difference/max": 1.0249826908111572, "sampling/sampling_logp_difference/mean": 0.01366426981985569, "step": 41 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 5000.0, "completions/max_terminated_length": 3048.0, "completions/mean_length": 988.078125, "completions/mean_terminated_length": 924.3968505859375, "completions/min_length": 273.0, "completions/min_terminated_length": 273.0, "entropy": 0.22869350016117096, "epoch": 0.0743362831858407, "frac_reward_zero_std": 0.75, "grad_norm": 0.5065370139206125, "kl": 0.0008789548883214593, "learning_rate": 1.8141592920353982e-07, "loss": 0.0118, "num_tokens": 1909241.0, "reward": 0.21875, "reward_std": 0.2561737596988678, "rewards/decision_reward_func/mean": 0.21875, "rewards/decision_reward_func/std": 0.983494758605957, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9998897314071655, "sampling/importance_sampling_ratio/min": 0.34038442373275757, "sampling/sampling_logp_difference/max": 1.0776796340942383, "sampling/sampling_logp_difference/mean": 0.013303031213581562, "step": 42 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1822.0, "completions/max_terminated_length": 1822.0, "completions/mean_length": 552.171875, "completions/mean_terminated_length": 552.171875, "completions/min_length": 139.0, "completions/min_terminated_length": 139.0, "entropy": 0.2913553714752197, "epoch": 0.07610619469026549, "frac_reward_zero_std": 0.5, "grad_norm": 0.8507661451255459, "kl": 0.0005866308929398656, "learning_rate": 1.8584070796460178e-07, "loss": -0.0183, "num_tokens": 1955332.0, "reward": 0.53125, "reward_std": 0.29578250646591187, "rewards/decision_reward_func/mean": 0.53125, "rewards/decision_reward_func/std": 0.8539125919342041, "sampling/importance_sampling_ratio/max": 1.8114781379699707, "sampling/importance_sampling_ratio/mean": 0.9995744228363037, "sampling/importance_sampling_ratio/min": 0.4726124703884125, "sampling/sampling_logp_difference/max": 0.7494795322418213, "sampling/sampling_logp_difference/mean": 0.014351906254887581, "step": 43 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1697.0, "completions/max_terminated_length": 1697.0, "completions/mean_length": 395.828125, "completions/mean_terminated_length": 395.828125, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "entropy": 0.20017677545547485, "epoch": 0.07787610619469026, "frac_reward_zero_std": 0.75, "grad_norm": 0.6978249611184302, "kl": 0.0005770552670583129, "learning_rate": 1.902654867256637e-07, "loss": 0.0089, "num_tokens": 1992633.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.884521484375, "sampling/importance_sampling_ratio/mean": 1.000461220741272, "sampling/importance_sampling_ratio/min": 0.3758199214935303, "sampling/sampling_logp_difference/max": 0.9786452054977417, "sampling/sampling_logp_difference/mean": 0.012053586542606354, "step": 44 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 526.0, "completions/max_terminated_length": 526.0, "completions/mean_length": 254.109375, "completions/mean_terminated_length": 254.109375, "completions/min_length": 94.0, "completions/min_terminated_length": 94.0, "entropy": 0.12506231665611267, "epoch": 0.07964601769911504, "frac_reward_zero_std": 0.5, "grad_norm": 1.3715419951572747, "kl": 0.0008663627086207271, "learning_rate": 1.9469026548672566e-07, "loss": 0.0234, "num_tokens": 2017952.0, "reward": 0.78125, "reward_std": 0.4101392924785614, "rewards/decision_reward_func/mean": 0.78125, "rewards/decision_reward_func/std": 0.6291528940200806, "sampling/importance_sampling_ratio/max": 1.7751164436340332, "sampling/importance_sampling_ratio/mean": 0.9999139308929443, "sampling/importance_sampling_ratio/min": 0.48828360438346863, "sampling/sampling_logp_difference/max": 0.7168588638305664, "sampling/sampling_logp_difference/mean": 0.011228635907173157, "step": 45 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1907.0, "completions/max_terminated_length": 1907.0, "completions/mean_length": 551.96875, "completions/mean_terminated_length": 551.96875, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "entropy": 0.23611974716186523, "epoch": 0.08141592920353982, "frac_reward_zero_std": 0.5, "grad_norm": 1.0377870026280474, "kl": 0.0006693960167467594, "learning_rate": 1.991150442477876e-07, "loss": -0.0134, "num_tokens": 2064062.0, "reward": 0.28125, "reward_std": 0.42695626616477966, "rewards/decision_reward_func/mean": 0.28125, "rewards/decision_reward_func/std": 0.9672207236289978, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0003719329833984, "sampling/importance_sampling_ratio/min": 0.5111001133918762, "sampling/sampling_logp_difference/max": 0.7387268543243408, "sampling/sampling_logp_difference/mean": 0.013654202222824097, "step": 46 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2840.0, "completions/max_terminated_length": 2840.0, "completions/mean_length": 848.8125, "completions/mean_terminated_length": 848.8125, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "entropy": 0.23151327669620514, "epoch": 0.0831858407079646, "frac_reward_zero_std": 0.5, "grad_norm": 0.7903114840322307, "kl": 0.0005760455969721079, "learning_rate": 2.0353982300884956e-07, "loss": 0.0568, "num_tokens": 2128978.0, "reward": 0.3125, "reward_std": 0.36435678601264954, "rewards/decision_reward_func/mean": 0.3125, "rewards/decision_reward_func/std": 0.9574271440505981, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9997749328613281, "sampling/importance_sampling_ratio/min": 0.4756348133087158, "sampling/sampling_logp_difference/max": 0.7431049346923828, "sampling/sampling_logp_difference/mean": 0.011494416743516922, "step": 47 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1814.0, "completions/max_terminated_length": 1814.0, "completions/mean_length": 404.5625, "completions/mean_terminated_length": 404.5625, "completions/min_length": 169.0, "completions/min_terminated_length": 169.0, "entropy": 0.2120853215456009, "epoch": 0.08495575221238938, "frac_reward_zero_std": 0.75, "grad_norm": 0.721630097243313, "kl": 0.0007034215377643704, "learning_rate": 2.0796460176991148e-07, "loss": 0.0267, "num_tokens": 2164822.0, "reward": 0.65625, "reward_std": 0.23935678601264954, "rewards/decision_reward_func/mean": 0.65625, "rewards/decision_reward_func/std": 0.7605084180831909, "sampling/importance_sampling_ratio/max": 1.9698779582977295, "sampling/importance_sampling_ratio/mean": 0.9994018077850342, "sampling/importance_sampling_ratio/min": 0.254190057516098, "sampling/sampling_logp_difference/max": 1.3696730136871338, "sampling/sampling_logp_difference/mean": 0.012812647968530655, "step": 48 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1839.0, "completions/max_terminated_length": 1839.0, "completions/mean_length": 334.0625, "completions/mean_terminated_length": 334.0625, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.20998823642730713, "epoch": 0.08672566371681416, "frac_reward_zero_std": 1.0, "grad_norm": 0.007604234504877887, "kl": 0.0007747411727905273, "learning_rate": 2.1238938053097344e-07, "loss": 0.0, "num_tokens": 2196202.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9998580813407898, "sampling/importance_sampling_ratio/min": 0.48498210310935974, "sampling/sampling_logp_difference/max": 0.9877235889434814, "sampling/sampling_logp_difference/mean": 0.013541841879487038, "step": 49 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1096.0, "completions/max_terminated_length": 1096.0, "completions/mean_length": 403.5625, "completions/mean_terminated_length": 403.5625, "completions/min_length": 142.0, "completions/min_terminated_length": 142.0, "entropy": 0.22196082770824432, "epoch": 0.08849557522123894, "frac_reward_zero_std": 0.25, "grad_norm": 1.5334853815272902, "kl": 0.0008985198801383376, "learning_rate": 2.1681415929203538e-07, "loss": 0.0034, "num_tokens": 2232510.0, "reward": 0.125, "reward_std": 0.5, "rewards/decision_reward_func/mean": 0.125, "rewards/decision_reward_func/std": 1.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9993022084236145, "sampling/importance_sampling_ratio/min": 0.3858293294906616, "sampling/sampling_logp_difference/max": 0.9523601531982422, "sampling/sampling_logp_difference/mean": 0.014577478170394897, "step": 50 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1603.0, "completions/max_terminated_length": 1603.0, "completions/mean_length": 497.421875, "completions/mean_terminated_length": 497.421875, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "entropy": 0.19308751821517944, "epoch": 0.09026548672566372, "frac_reward_zero_std": 0.75, "grad_norm": 0.6047708643779135, "kl": 0.0007994978222995996, "learning_rate": 2.2123893805309735e-07, "loss": 0.0195, "num_tokens": 2274697.0, "reward": 0.40625, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.40625, "rewards/decision_reward_func/std": 0.9209855198860168, "sampling/importance_sampling_ratio/max": 1.8980576992034912, "sampling/importance_sampling_ratio/mean": 0.9999922513961792, "sampling/importance_sampling_ratio/min": 0.47910451889038086, "sampling/sampling_logp_difference/max": 0.7358365058898926, "sampling/sampling_logp_difference/mean": 0.012818237766623497, "step": 51 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1016.0, "completions/max_terminated_length": 1016.0, "completions/mean_length": 306.8125, "completions/mean_terminated_length": 306.8125, "completions/min_length": 79.0, "completions/min_terminated_length": 79.0, "entropy": 0.2865665555000305, "epoch": 0.0920353982300885, "frac_reward_zero_std": 0.75, "grad_norm": 1.8063704389245203, "kl": 0.0007336522685363889, "learning_rate": 2.2566371681415928e-07, "loss": 0.0816, "num_tokens": 2308653.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.6977208852767944, "sampling/importance_sampling_ratio/mean": 1.0000689029693604, "sampling/importance_sampling_ratio/min": 0.37044668197631836, "sampling/sampling_logp_difference/max": 0.9930458068847656, "sampling/sampling_logp_difference/mean": 0.015615614131093025, "step": 52 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2768.0, "completions/max_terminated_length": 2768.0, "completions/mean_length": 598.09375, "completions/mean_terminated_length": 598.09375, "completions/min_length": 177.0, "completions/min_terminated_length": 177.0, "entropy": 0.3492634892463684, "epoch": 0.09380530973451327, "frac_reward_zero_std": 0.75, "grad_norm": 0.5413264379881133, "kl": 0.0006001094589009881, "learning_rate": 2.3008849557522122e-07, "loss": 0.0045, "num_tokens": 2359267.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.847443699836731, "sampling/importance_sampling_ratio/mean": 0.9996719360351562, "sampling/importance_sampling_ratio/min": 0.37462541460990906, "sampling/sampling_logp_difference/max": 0.9818286895751953, "sampling/sampling_logp_difference/mean": 0.01566195674240589, "step": 53 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1638.0, "completions/max_terminated_length": 1638.0, "completions/mean_length": 435.1875, "completions/mean_terminated_length": 435.1875, "completions/min_length": 189.0, "completions/min_terminated_length": 189.0, "entropy": 0.28503769636154175, "epoch": 0.09557522123893805, "frac_reward_zero_std": 0.75, "grad_norm": 0.6471804303735597, "kl": 0.0007370144012384117, "learning_rate": 2.345132743362832e-07, "loss": 0.0179, "num_tokens": 2398735.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.969364881515503, "sampling/importance_sampling_ratio/mean": 0.9994869232177734, "sampling/importance_sampling_ratio/min": 0.2171490490436554, "sampling/sampling_logp_difference/max": 1.5271713733673096, "sampling/sampling_logp_difference/mean": 0.01456733699887991, "step": 54 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2796.0, "completions/max_terminated_length": 2796.0, "completions/mean_length": 465.515625, "completions/mean_terminated_length": 465.515625, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "entropy": 0.1642337143421173, "epoch": 0.09734513274336283, "frac_reward_zero_std": 0.5, "grad_norm": 1.1082545046625927, "kl": 0.0006122483173385262, "learning_rate": 2.3893805309734513e-07, "loss": -0.0092, "num_tokens": 2439248.0, "reward": 0.875, "reward_std": 0.34156501293182373, "rewards/decision_reward_func/mean": 0.875, "rewards/decision_reward_func/std": 0.48795005679130554, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9999763369560242, "sampling/importance_sampling_ratio/min": 0.33114874362945557, "sampling/sampling_logp_difference/max": 1.1051876544952393, "sampling/sampling_logp_difference/mean": 0.01028063427656889, "step": 55 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1386.0, "completions/max_terminated_length": 1386.0, "completions/mean_length": 411.9375, "completions/mean_terminated_length": 411.9375, "completions/min_length": 84.0, "completions/min_terminated_length": 84.0, "entropy": 0.1891794204711914, "epoch": 0.09911504424778761, "frac_reward_zero_std": 0.5, "grad_norm": 1.5086095010592302, "kl": 0.0006497810827568173, "learning_rate": 2.4336283185840704e-07, "loss": 0.0686, "num_tokens": 2478428.0, "reward": 0.9375, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.9375, "rewards/decision_reward_func/std": 0.35073620080947876, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9999169111251831, "sampling/importance_sampling_ratio/min": 0.4161491394042969, "sampling/sampling_logp_difference/max": 0.8767116069793701, "sampling/sampling_logp_difference/mean": 0.011923547834157944, "step": 56 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 980.0, "completions/max_terminated_length": 980.0, "completions/mean_length": 238.046875, "completions/mean_terminated_length": 238.046875, "completions/min_length": 117.0, "completions/min_terminated_length": 117.0, "entropy": 0.20001797378063202, "epoch": 0.10088495575221239, "frac_reward_zero_std": 0.75, "grad_norm": 1.6912113100888564, "kl": 0.0008351489086635411, "learning_rate": 2.47787610619469e-07, "loss": 0.0332, "num_tokens": 2503391.0, "reward": 0.9375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.9375, "rewards/decision_reward_func/std": 0.35073620080947876, "sampling/importance_sampling_ratio/max": 1.795984148979187, "sampling/importance_sampling_ratio/mean": 1.0001823902130127, "sampling/importance_sampling_ratio/min": 0.4335770606994629, "sampling/sampling_logp_difference/max": 0.8356857299804688, "sampling/sampling_logp_difference/mean": 0.014987820759415627, "step": 57 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1168.0, "completions/max_terminated_length": 1168.0, "completions/mean_length": 417.8125, "completions/mean_terminated_length": 417.8125, "completions/min_length": 66.0, "completions/min_terminated_length": 66.0, "entropy": 0.15066847205162048, "epoch": 0.10265486725663717, "frac_reward_zero_std": 0.5, "grad_norm": 1.243226875616891, "kl": 0.0017589430790394545, "learning_rate": 2.5221238938053097e-07, "loss": 0.0144, "num_tokens": 2541075.0, "reward": 0.5625, "reward_std": 0.44091323018074036, "rewards/decision_reward_func/mean": 0.5625, "rewards/decision_reward_func/std": 0.8333333730697632, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0004302263259888, "sampling/importance_sampling_ratio/min": 0.3906334936618805, "sampling/sampling_logp_difference/max": 0.9399855136871338, "sampling/sampling_logp_difference/mean": 0.012056403793394566, "step": 58 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2421.0, "completions/max_terminated_length": 2421.0, "completions/mean_length": 567.234375, "completions/mean_terminated_length": 567.234375, "completions/min_length": 189.0, "completions/min_terminated_length": 189.0, "entropy": 0.41222190856933594, "epoch": 0.10442477876106195, "frac_reward_zero_std": 0.75, "grad_norm": 0.5946549809486513, "kl": 0.0006937803700566292, "learning_rate": 2.5663716814159294e-07, "loss": -0.0079, "num_tokens": 2591346.0, "reward": 0.9375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.9375, "rewards/decision_reward_func/std": 0.35073620080947876, "sampling/importance_sampling_ratio/max": 1.661634087562561, "sampling/importance_sampling_ratio/mean": 1.0000920295715332, "sampling/importance_sampling_ratio/min": 0.37228742241859436, "sampling/sampling_logp_difference/max": 0.9880890846252441, "sampling/sampling_logp_difference/mean": 0.01795772649347782, "step": 59 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1811.0, "completions/max_terminated_length": 1811.0, "completions/mean_length": 609.4375, "completions/mean_terminated_length": 609.4375, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "entropy": 0.23915576934814453, "epoch": 0.10619469026548672, "frac_reward_zero_std": 0.5, "grad_norm": 0.9084410609407709, "kl": 0.0006229827413335443, "learning_rate": 2.6106194690265485e-07, "loss": 0.0337, "num_tokens": 2641086.0, "reward": 0.15625, "reward_std": 0.3723389506340027, "rewards/decision_reward_func/mean": 0.15625, "rewards/decision_reward_func/std": 0.9955257177352905, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9997997879981995, "sampling/importance_sampling_ratio/min": 0.31529611349105835, "sampling/sampling_logp_difference/max": 1.4047510623931885, "sampling/sampling_logp_difference/mean": 0.012308374047279358, "step": 60 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2784.0, "completions/max_terminated_length": 2784.0, "completions/mean_length": 748.671875, "completions/mean_terminated_length": 748.671875, "completions/min_length": 139.0, "completions/min_terminated_length": 139.0, "entropy": 0.25317561626434326, "epoch": 0.1079646017699115, "frac_reward_zero_std": 0.5, "grad_norm": 0.8286260165661845, "kl": 0.0006717374781146646, "learning_rate": 2.654867256637168e-07, "loss": -0.0076, "num_tokens": 2702777.0, "reward": 0.90625, "reward_std": 0.29578250646591187, "rewards/decision_reward_func/mean": 0.90625, "rewards/decision_reward_func/std": 0.42608407139778137, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0000386238098145, "sampling/importance_sampling_ratio/min": 0.3692662715911865, "sampling/sampling_logp_difference/max": 1.5779037475585938, "sampling/sampling_logp_difference/mean": 0.013244293630123138, "step": 61 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 909.0, "completions/max_terminated_length": 909.0, "completions/mean_length": 215.703125, "completions/mean_terminated_length": 215.703125, "completions/min_length": 68.0, "completions/min_terminated_length": 68.0, "entropy": 0.17351937294006348, "epoch": 0.10973451327433628, "frac_reward_zero_std": 0.75, "grad_norm": 1.7151927663200275, "kl": 0.0008890872122719884, "learning_rate": 2.6991150442477873e-07, "loss": -0.0016, "num_tokens": 2726550.0, "reward": 0.875, "reward_std": 0.22360679507255554, "rewards/decision_reward_func/mean": 0.875, "rewards/decision_reward_func/std": 0.48795005679130554, "sampling/importance_sampling_ratio/max": 1.7325658798217773, "sampling/importance_sampling_ratio/mean": 1.000410795211792, "sampling/importance_sampling_ratio/min": 0.2201519012451172, "sampling/sampling_logp_difference/max": 1.5134375095367432, "sampling/sampling_logp_difference/mean": 0.013493449427187443, "step": 62 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2074.0, "completions/max_terminated_length": 2074.0, "completions/mean_length": 362.578125, "completions/mean_terminated_length": 362.578125, "completions/min_length": 72.0, "completions/min_terminated_length": 72.0, "entropy": 0.1751943975687027, "epoch": 0.11150442477876106, "frac_reward_zero_std": 0.5, "grad_norm": 1.144724600962361, "kl": 0.0006459906580857933, "learning_rate": 2.743362831858407e-07, "loss": -0.0392, "num_tokens": 2760811.0, "reward": 0.40625, "reward_std": 0.29578250646591187, "rewards/decision_reward_func/mean": 0.40625, "rewards/decision_reward_func/std": 0.9209855198860168, "sampling/importance_sampling_ratio/max": 1.503711462020874, "sampling/importance_sampling_ratio/mean": 1.0001411437988281, "sampling/importance_sampling_ratio/min": 0.5593056678771973, "sampling/sampling_logp_difference/max": 0.5810590982437134, "sampling/sampling_logp_difference/mean": 0.010295527055859566, "step": 63 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2265.0, "completions/max_terminated_length": 2265.0, "completions/mean_length": 571.15625, "completions/mean_terminated_length": 571.15625, "completions/min_length": 90.0, "completions/min_terminated_length": 90.0, "entropy": 0.290374219417572, "epoch": 0.11327433628318584, "frac_reward_zero_std": 0.5, "grad_norm": 1.023348852353797, "kl": 0.000625660119112581, "learning_rate": 2.787610619469026e-07, "loss": 0.021, "num_tokens": 2809413.0, "reward": 0.78125, "reward_std": 0.375, "rewards/decision_reward_func/mean": 0.78125, "rewards/decision_reward_func/std": 0.6291528940200806, "sampling/importance_sampling_ratio/max": 1.7520549297332764, "sampling/importance_sampling_ratio/mean": 0.9997479915618896, "sampling/importance_sampling_ratio/min": 0.3191077709197998, "sampling/sampling_logp_difference/max": 1.1422264575958252, "sampling/sampling_logp_difference/mean": 0.01507401093840599, "step": 64 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3888.0, "completions/max_terminated_length": 3888.0, "completions/mean_length": 557.828125, "completions/mean_terminated_length": 557.828125, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "entropy": 0.24068549275398254, "epoch": 0.11504424778761062, "frac_reward_zero_std": 0.5, "grad_norm": 1.2565904884929364, "kl": 0.0006250995211303234, "learning_rate": 2.8318584070796457e-07, "loss": -0.1062, "num_tokens": 2855194.0, "reward": 0.875, "reward_std": 0.3265564441680908, "rewards/decision_reward_func/mean": 0.875, "rewards/decision_reward_func/std": 0.48795005679130554, "sampling/importance_sampling_ratio/max": 1.8774393796920776, "sampling/importance_sampling_ratio/mean": 0.9996137619018555, "sampling/importance_sampling_ratio/min": 0.2822783589363098, "sampling/sampling_logp_difference/max": 1.2648615837097168, "sampling/sampling_logp_difference/mean": 0.013791462406516075, "step": 65 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 330.0, "completions/max_terminated_length": 330.0, "completions/mean_length": 171.796875, "completions/mean_terminated_length": 171.796875, "completions/min_length": 78.0, "completions/min_terminated_length": 78.0, "entropy": 0.13566967844963074, "epoch": 0.1168141592920354, "frac_reward_zero_std": 1.0, "grad_norm": 0.014650144266790528, "kl": 0.0010740563739091158, "learning_rate": 2.8761061946902654e-07, "loss": 0.0, "num_tokens": 2875869.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.8329052925109863, "sampling/importance_sampling_ratio/mean": 1.000007152557373, "sampling/importance_sampling_ratio/min": 0.3306329846382141, "sampling/sampling_logp_difference/max": 1.1067463159561157, "sampling/sampling_logp_difference/mean": 0.012571284547448158, "step": 66 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1251.0, "completions/max_terminated_length": 1251.0, "completions/mean_length": 349.109375, "completions/mean_terminated_length": 349.109375, "completions/min_length": 88.0, "completions/min_terminated_length": 88.0, "entropy": 0.24395202100276947, "epoch": 0.11858407079646018, "frac_reward_zero_std": 0.5, "grad_norm": 1.5187090920829092, "kl": 0.000820152519736439, "learning_rate": 2.920353982300885e-07, "loss": 0.0162, "num_tokens": 2907940.0, "reward": 0.0, "reward_std": 0.34156501293182373, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9996822476387024, "sampling/importance_sampling_ratio/min": 0.4968234598636627, "sampling/sampling_logp_difference/max": 0.9919958114624023, "sampling/sampling_logp_difference/mean": 0.014948000200092793, "step": 67 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1365.0, "completions/max_terminated_length": 1365.0, "completions/mean_length": 442.390625, "completions/mean_terminated_length": 442.390625, "completions/min_length": 88.0, "completions/min_terminated_length": 88.0, "entropy": 0.21964870393276215, "epoch": 0.12035398230088495, "frac_reward_zero_std": 0.75, "grad_norm": 1.1034872829469509, "kl": 0.0006966128712520003, "learning_rate": 2.9646017699115047e-07, "loss": -0.0157, "num_tokens": 2945949.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.7654799222946167, "sampling/importance_sampling_ratio/mean": 0.9997416138648987, "sampling/importance_sampling_ratio/min": 0.2165154367685318, "sampling/sampling_logp_difference/max": 1.5300934314727783, "sampling/sampling_logp_difference/mean": 0.012757010757923126, "step": 68 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1618.0, "completions/max_terminated_length": 1618.0, "completions/mean_length": 479.390625, "completions/mean_terminated_length": 479.390625, "completions/min_length": 147.0, "completions/min_terminated_length": 147.0, "entropy": 0.22673365473747253, "epoch": 0.12212389380530973, "frac_reward_zero_std": 0.25, "grad_norm": 1.4199490736977556, "kl": 0.0006706893327645957, "learning_rate": 3.008849557522124e-07, "loss": 0.0259, "num_tokens": 2987222.0, "reward": 0.8125, "reward_std": 0.47360679507255554, "rewards/decision_reward_func/mean": 0.8125, "rewards/decision_reward_func/std": 0.5875696539878845, "sampling/importance_sampling_ratio/max": 1.6774829626083374, "sampling/importance_sampling_ratio/mean": 0.9993652701377869, "sampling/importance_sampling_ratio/min": 0.2822788953781128, "sampling/sampling_logp_difference/max": 1.264859676361084, "sampling/sampling_logp_difference/mean": 0.013190586119890213, "step": 69 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1610.0, "completions/max_terminated_length": 1610.0, "completions/mean_length": 287.453125, "completions/mean_terminated_length": 287.453125, "completions/min_length": 94.0, "completions/min_terminated_length": 94.0, "entropy": 0.20361319184303284, "epoch": 0.12389380530973451, "frac_reward_zero_std": 0.75, "grad_norm": 0.9443621454965982, "kl": 0.0007153426995500922, "learning_rate": 3.053097345132743e-07, "loss": 0.0076, "num_tokens": 3017475.0, "reward": 0.84375, "reward_std": 0.23935678601264954, "rewards/decision_reward_func/mean": 0.84375, "rewards/decision_reward_func/std": 0.5409794449806213, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9997707009315491, "sampling/importance_sampling_ratio/min": 0.4362325370311737, "sampling/sampling_logp_difference/max": 0.8295798301696777, "sampling/sampling_logp_difference/mean": 0.012878675013780594, "step": 70 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 548.0, "completions/max_terminated_length": 548.0, "completions/mean_length": 237.96875, "completions/mean_terminated_length": 237.96875, "completions/min_length": 83.0, "completions/min_terminated_length": 83.0, "entropy": 0.28585290908813477, "epoch": 0.1256637168141593, "frac_reward_zero_std": 1.0, "grad_norm": 0.007929233251760768, "kl": 0.0009601502097211778, "learning_rate": 3.0973451327433626e-07, "loss": 0.0, "num_tokens": 3042929.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.754317283630371, "sampling/importance_sampling_ratio/mean": 0.9995336532592773, "sampling/importance_sampling_ratio/min": 0.33729657530784607, "sampling/sampling_logp_difference/max": 1.0867927074432373, "sampling/sampling_logp_difference/mean": 0.016787230968475342, "step": 71 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1423.0, "completions/max_terminated_length": 1423.0, "completions/mean_length": 436.640625, "completions/mean_terminated_length": 436.640625, "completions/min_length": 85.0, "completions/min_terminated_length": 85.0, "entropy": 0.2524449825286865, "epoch": 0.12743362831858407, "frac_reward_zero_std": 0.5, "grad_norm": 1.825534667787052, "kl": 0.0007924112142063677, "learning_rate": 3.141592920353982e-07, "loss": 0.0111, "num_tokens": 3081258.0, "reward": 0.6875, "reward_std": 0.3811737596988678, "rewards/decision_reward_func/mean": 0.6875, "rewards/decision_reward_func/std": 0.7319250702857971, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9998154640197754, "sampling/importance_sampling_ratio/min": 0.459695965051651, "sampling/sampling_logp_difference/max": 1.0292840003967285, "sampling/sampling_logp_difference/mean": 0.013040476478636265, "step": 72 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 763.0, "completions/max_terminated_length": 763.0, "completions/mean_length": 325.90625, "completions/mean_terminated_length": 325.90625, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "entropy": 0.11315730214118958, "epoch": 0.12920353982300886, "frac_reward_zero_std": 0.5, "grad_norm": 1.3984641172518533, "kl": 0.0007189067546278238, "learning_rate": 3.1858407079646014e-07, "loss": 0.0021, "num_tokens": 3111844.0, "reward": 0.90625, "reward_std": 0.29578250646591187, "rewards/decision_reward_func/mean": 0.90625, "rewards/decision_reward_func/std": 0.42608407139778137, "sampling/importance_sampling_ratio/max": 1.6088857650756836, "sampling/importance_sampling_ratio/mean": 0.9999055862426758, "sampling/importance_sampling_ratio/min": 0.40597549080848694, "sampling/sampling_logp_difference/max": 0.9014625549316406, "sampling/sampling_logp_difference/mean": 0.00909288227558136, "step": 73 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1711.0, "completions/max_terminated_length": 1711.0, "completions/mean_length": 530.453125, "completions/mean_terminated_length": 530.453125, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "entropy": 0.21332836151123047, "epoch": 0.13097345132743363, "frac_reward_zero_std": 0.5, "grad_norm": 1.1242642769672335, "kl": 0.000585697591304779, "learning_rate": 3.230088495575221e-07, "loss": 0.0268, "num_tokens": 3155777.0, "reward": 0.0625, "reward_std": 0.5081988573074341, "rewards/decision_reward_func/mean": 0.0625, "rewards/decision_reward_func/std": 1.0059348344802856, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0002063512802124, "sampling/importance_sampling_ratio/min": 0.3643275797367096, "sampling/sampling_logp_difference/max": 1.4898386001586914, "sampling/sampling_logp_difference/mean": 0.012951428070664406, "step": 74 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2216.0, "completions/max_terminated_length": 2216.0, "completions/mean_length": 484.84375, "completions/mean_terminated_length": 484.84375, "completions/min_length": 193.0, "completions/min_terminated_length": 193.0, "entropy": 0.2695532441139221, "epoch": 0.13274336283185842, "frac_reward_zero_std": 0.75, "grad_norm": 1.0694235312800968, "kl": 0.0010280660353600979, "learning_rate": 3.2743362831858407e-07, "loss": -0.0329, "num_tokens": 3197287.0, "reward": 0.03125, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.03125, "rewards/decision_reward_func/std": 1.0074130296707153, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9998297691345215, "sampling/importance_sampling_ratio/min": 0.29952558875083923, "sampling/sampling_logp_difference/max": 1.2055554389953613, "sampling/sampling_logp_difference/mean": 0.015185708180069923, "step": 75 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2657.0, "completions/max_terminated_length": 2657.0, "completions/mean_length": 431.65625, "completions/mean_terminated_length": 431.65625, "completions/min_length": 121.0, "completions/min_terminated_length": 121.0, "entropy": 0.17254739999771118, "epoch": 0.13451327433628318, "frac_reward_zero_std": 0.75, "grad_norm": 1.043075787784136, "kl": 0.000818369269836694, "learning_rate": 3.3185840707964603e-07, "loss": -0.0009, "num_tokens": 3236001.0, "reward": 0.46875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9999213218688965, "sampling/importance_sampling_ratio/min": 0.3163447082042694, "sampling/sampling_logp_difference/max": 1.1509227752685547, "sampling/sampling_logp_difference/mean": 0.011583102867007256, "step": 76 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 488.0, "completions/max_terminated_length": 488.0, "completions/mean_length": 249.328125, "completions/mean_terminated_length": 249.328125, "completions/min_length": 94.0, "completions/min_terminated_length": 94.0, "entropy": 0.22489547729492188, "epoch": 0.13628318584070798, "frac_reward_zero_std": 1.0, "grad_norm": 0.009059690028313958, "kl": 0.0009721596143208444, "learning_rate": 3.36283185840708e-07, "loss": 0.0, "num_tokens": 3261270.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5520451068878174, "sampling/importance_sampling_ratio/mean": 0.999252200126648, "sampling/importance_sampling_ratio/min": 0.4397726356983185, "sampling/sampling_logp_difference/max": 0.8214974403381348, "sampling/sampling_logp_difference/mean": 0.015483392402529716, "step": 77 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1547.0, "completions/max_terminated_length": 1547.0, "completions/mean_length": 292.640625, "completions/mean_terminated_length": 292.640625, "completions/min_length": 121.0, "completions/min_terminated_length": 121.0, "entropy": 0.22683309018611908, "epoch": 0.13805309734513274, "frac_reward_zero_std": 1.0, "grad_norm": 0.008915577700038669, "kl": 0.000888474693056196, "learning_rate": 3.4070796460176986e-07, "loss": 0.0, "num_tokens": 3294415.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.7769200801849365, "sampling/importance_sampling_ratio/mean": 1.0000243186950684, "sampling/importance_sampling_ratio/min": 0.19193445146083832, "sampling/sampling_logp_difference/max": 1.6506013870239258, "sampling/sampling_logp_difference/mean": 0.014255237765610218, "step": 78 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2898.0, "completions/max_terminated_length": 2898.0, "completions/mean_length": 415.84375, "completions/mean_terminated_length": 415.84375, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "entropy": 0.17334607243537903, "epoch": 0.13982300884955753, "frac_reward_zero_std": 1.0, "grad_norm": 0.007475704450083569, "kl": 0.0008850749582052231, "learning_rate": 3.451327433628318e-07, "loss": 0.0, "num_tokens": 3331157.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.000443696975708, "sampling/importance_sampling_ratio/min": 0.3152752220630646, "sampling/sampling_logp_difference/max": 1.1543092727661133, "sampling/sampling_logp_difference/mean": 0.011908318847417831, "step": 79 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1174.0, "completions/max_terminated_length": 1174.0, "completions/mean_length": 338.34375, "completions/mean_terminated_length": 338.34375, "completions/min_length": 186.0, "completions/min_terminated_length": 186.0, "entropy": 0.26490890979766846, "epoch": 0.1415929203539823, "frac_reward_zero_std": 0.75, "grad_norm": 1.2740888410443463, "kl": 0.0009336735820397735, "learning_rate": 3.495575221238938e-07, "loss": 0.0033, "num_tokens": 3365323.0, "reward": 0.46875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9995031952857971, "sampling/importance_sampling_ratio/min": 0.5823237299919128, "sampling/sampling_logp_difference/max": 0.9607470035552979, "sampling/sampling_logp_difference/mean": 0.015547128394246101, "step": 80 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2166.0, "completions/max_terminated_length": 2166.0, "completions/mean_length": 551.15625, "completions/mean_terminated_length": 551.15625, "completions/min_length": 82.0, "completions/min_terminated_length": 82.0, "entropy": 0.21206071972846985, "epoch": 0.1433628318584071, "frac_reward_zero_std": 0.25, "grad_norm": 0.9455540070144629, "kl": 0.0011630583321675658, "learning_rate": 3.5398230088495575e-07, "loss": 0.0302, "num_tokens": 3411877.0, "reward": 0.875, "reward_std": 0.42078250646591187, "rewards/decision_reward_func/mean": 0.875, "rewards/decision_reward_func/std": 0.48795005679130554, "sampling/importance_sampling_ratio/max": 1.6143853664398193, "sampling/importance_sampling_ratio/mean": 0.9994951486587524, "sampling/importance_sampling_ratio/min": 0.4782344698905945, "sampling/sampling_logp_difference/max": 0.7376542091369629, "sampling/sampling_logp_difference/mean": 0.012303833849728107, "step": 81 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1359.0, "completions/max_terminated_length": 1359.0, "completions/mean_length": 351.765625, "completions/mean_terminated_length": 351.765625, "completions/min_length": 149.0, "completions/min_terminated_length": 149.0, "entropy": 0.18017110228538513, "epoch": 0.14513274336283186, "frac_reward_zero_std": 0.75, "grad_norm": 0.8191903637291607, "kl": 0.0006687000277452171, "learning_rate": 3.5840707964601767e-07, "loss": -0.0146, "num_tokens": 3446822.0, "reward": 0.46875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 1.6463134288787842, "sampling/importance_sampling_ratio/mean": 0.999599814414978, "sampling/importance_sampling_ratio/min": 0.38535386323928833, "sampling/sampling_logp_difference/max": 0.9535932540893555, "sampling/sampling_logp_difference/mean": 0.011458972468972206, "step": 82 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1643.0, "completions/max_terminated_length": 1643.0, "completions/mean_length": 393.171875, "completions/mean_terminated_length": 393.171875, "completions/min_length": 94.0, "completions/min_terminated_length": 94.0, "entropy": 0.2238372266292572, "epoch": 0.14690265486725665, "frac_reward_zero_std": 0.75, "grad_norm": 1.1111800569453678, "kl": 0.0011417513014748693, "learning_rate": 3.6283185840707963e-07, "loss": 0.1133, "num_tokens": 3484161.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9998175501823425, "sampling/importance_sampling_ratio/min": 0.2505515217781067, "sampling/sampling_logp_difference/max": 1.384090781211853, "sampling/sampling_logp_difference/mean": 0.012754121795296669, "step": 83 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2902.0, "completions/max_terminated_length": 2902.0, "completions/mean_length": 526.59375, "completions/mean_terminated_length": 526.59375, "completions/min_length": 191.0, "completions/min_terminated_length": 191.0, "entropy": 0.24529121816158295, "epoch": 0.1486725663716814, "frac_reward_zero_std": 0.75, "grad_norm": 0.9093994124510995, "kl": 0.0007546417182311416, "learning_rate": 3.672566371681416e-07, "loss": -0.0049, "num_tokens": 3528999.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0003714561462402, "sampling/importance_sampling_ratio/min": 0.3014521896839142, "sampling/sampling_logp_difference/max": 1.199143886566162, "sampling/sampling_logp_difference/mean": 0.012965135276317596, "step": 84 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2793.0, "completions/max_terminated_length": 2793.0, "completions/mean_length": 702.90625, "completions/mean_terminated_length": 702.90625, "completions/min_length": 84.0, "completions/min_terminated_length": 84.0, "entropy": 0.23195774853229523, "epoch": 0.1504424778761062, "frac_reward_zero_std": 0.25, "grad_norm": 0.8826618181186215, "kl": 0.000703387544490397, "learning_rate": 3.7168141592920356e-07, "loss": 0.0063, "num_tokens": 3584609.0, "reward": 0.15625, "reward_std": 0.48935678601264954, "rewards/decision_reward_func/mean": 0.15625, "rewards/decision_reward_func/std": 0.9955257177352905, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9999510645866394, "sampling/importance_sampling_ratio/min": 0.5260633230209351, "sampling/sampling_logp_difference/max": 0.7454228401184082, "sampling/sampling_logp_difference/mean": 0.012401481159031391, "step": 85 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 835.0, "completions/max_terminated_length": 835.0, "completions/mean_length": 273.4375, "completions/mean_terminated_length": 273.4375, "completions/min_length": 82.0, "completions/min_terminated_length": 82.0, "entropy": 0.1699049025774002, "epoch": 0.15221238938053097, "frac_reward_zero_std": 0.75, "grad_norm": 1.263621441069793, "kl": 0.0009017777629196644, "learning_rate": 3.761061946902654e-07, "loss": -0.0011, "num_tokens": 3612237.0, "reward": 0.8125, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.8125, "rewards/decision_reward_func/std": 0.5875696539878845, "sampling/importance_sampling_ratio/max": 1.752611756324768, "sampling/importance_sampling_ratio/mean": 1.0004552602767944, "sampling/importance_sampling_ratio/min": 0.3842329680919647, "sampling/sampling_logp_difference/max": 0.9565062522888184, "sampling/sampling_logp_difference/mean": 0.013492307625710964, "step": 86 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1134.0, "completions/max_terminated_length": 1134.0, "completions/mean_length": 354.609375, "completions/mean_terminated_length": 354.609375, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "entropy": 0.18146733939647675, "epoch": 0.15398230088495576, "frac_reward_zero_std": 0.5, "grad_norm": 1.4418957521153861, "kl": 0.0008802321972325444, "learning_rate": 3.805309734513274e-07, "loss": 0.0727, "num_tokens": 3644756.0, "reward": 0.90625, "reward_std": 0.29578250646591187, "rewards/decision_reward_func/mean": 0.90625, "rewards/decision_reward_func/std": 0.42608407139778137, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9999129772186279, "sampling/importance_sampling_ratio/min": 0.5329746007919312, "sampling/sampling_logp_difference/max": 0.7572096586227417, "sampling/sampling_logp_difference/mean": 0.01333337090909481, "step": 87 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1110.0, "completions/max_terminated_length": 1110.0, "completions/mean_length": 336.671875, "completions/mean_terminated_length": 336.671875, "completions/min_length": 80.0, "completions/min_terminated_length": 80.0, "entropy": 0.2430601567029953, "epoch": 0.15575221238938053, "frac_reward_zero_std": 0.5, "grad_norm": 5.604506240976347, "kl": 0.000949793728068471, "learning_rate": 3.8495575221238935e-07, "loss": 0.0301, "num_tokens": 3677199.0, "reward": 0.78125, "reward_std": 0.375, "rewards/decision_reward_func/mean": 0.78125, "rewards/decision_reward_func/std": 0.6291528940200806, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9993593096733093, "sampling/importance_sampling_ratio/min": 0.3052644729614258, "sampling/sampling_logp_difference/max": 1.1865768432617188, "sampling/sampling_logp_difference/mean": 0.016456013545393944, "step": 88 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2125.0, "completions/max_terminated_length": 2125.0, "completions/mean_length": 277.359375, "completions/mean_terminated_length": 277.359375, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.15917323529720306, "epoch": 0.15752212389380532, "frac_reward_zero_std": 1.0, "grad_norm": 0.008665508136286715, "kl": 0.0006423669401556253, "learning_rate": 3.893805309734513e-07, "loss": 0.0, "num_tokens": 3705078.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.000389814376831, "sampling/importance_sampling_ratio/min": 0.29754090309143066, "sampling/sampling_logp_difference/max": 1.2122036218643188, "sampling/sampling_logp_difference/mean": 0.011544113978743553, "step": 89 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 797.0, "completions/max_terminated_length": 797.0, "completions/mean_length": 306.203125, "completions/mean_terminated_length": 306.203125, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "entropy": 0.22574955224990845, "epoch": 0.1592920353982301, "frac_reward_zero_std": 0.5, "grad_norm": 1.4488217483003243, "kl": 0.0009163507493212819, "learning_rate": 3.938053097345133e-07, "loss": -0.0345, "num_tokens": 3738371.0, "reward": -0.0625, "reward_std": 0.3265564441680908, "rewards/decision_reward_func/mean": -0.0625, "rewards/decision_reward_func/std": 1.0059348344802856, "sampling/importance_sampling_ratio/max": 1.6147265434265137, "sampling/importance_sampling_ratio/mean": 0.9999816417694092, "sampling/importance_sampling_ratio/min": 0.3758661150932312, "sampling/sampling_logp_difference/max": 0.9785223007202148, "sampling/sampling_logp_difference/mean": 0.014325115829706192, "step": 90 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2803.0, "completions/max_terminated_length": 2803.0, "completions/mean_length": 677.015625, "completions/mean_terminated_length": 677.015625, "completions/min_length": 175.0, "completions/min_terminated_length": 175.0, "entropy": 0.3318648934364319, "epoch": 0.16106194690265488, "frac_reward_zero_std": 0.75, "grad_norm": 0.5481839878352643, "kl": 0.0006326149450615048, "learning_rate": 3.982300884955752e-07, "loss": 0.0461, "num_tokens": 3798132.0, "reward": 0.53125, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.53125, "rewards/decision_reward_func/std": 0.8539125919342041, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9999533891677856, "sampling/importance_sampling_ratio/min": 0.44408610463142395, "sampling/sampling_logp_difference/max": 0.8117368221282959, "sampling/sampling_logp_difference/mean": 0.01463503297418356, "step": 91 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3974.0, "completions/max_terminated_length": 3974.0, "completions/mean_length": 889.421875, "completions/mean_terminated_length": 889.421875, "completions/min_length": 180.0, "completions/min_terminated_length": 180.0, "entropy": 0.18088531494140625, "epoch": 0.16283185840707964, "frac_reward_zero_std": 0.25, "grad_norm": 0.8760414815541836, "kl": 0.0005927161546424031, "learning_rate": 4.0265486725663716e-07, "loss": 0.1158, "num_tokens": 3868415.0, "reward": 0.3125, "reward_std": 0.551956295967102, "rewards/decision_reward_func/mean": 0.3125, "rewards/decision_reward_func/std": 0.9574271440505981, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0000123977661133, "sampling/importance_sampling_ratio/min": 0.31737804412841797, "sampling/sampling_logp_difference/max": 1.1476616859436035, "sampling/sampling_logp_difference/mean": 0.01050669513642788, "step": 92 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1103.0, "completions/max_terminated_length": 1103.0, "completions/mean_length": 363.9375, "completions/mean_terminated_length": 363.9375, "completions/min_length": 85.0, "completions/min_terminated_length": 85.0, "entropy": 0.19485661387443542, "epoch": 0.16460176991150444, "frac_reward_zero_std": 0.5, "grad_norm": 1.4941165562888763, "kl": 0.0012931008823215961, "learning_rate": 4.0707964601769913e-07, "loss": -0.0627, "num_tokens": 3902763.0, "reward": 0.40625, "reward_std": 0.497555673122406, "rewards/decision_reward_func/mean": 0.40625, "rewards/decision_reward_func/std": 0.9209855198860168, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9992245435714722, "sampling/importance_sampling_ratio/min": 0.40574905276298523, "sampling/sampling_logp_difference/max": 0.9020204544067383, "sampling/sampling_logp_difference/mean": 0.015026378445327282, "step": 93 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1348.0, "completions/max_terminated_length": 1348.0, "completions/mean_length": 269.609375, "completions/mean_terminated_length": 269.609375, "completions/min_length": 71.0, "completions/min_terminated_length": 71.0, "entropy": 0.23403656482696533, "epoch": 0.1663716814159292, "frac_reward_zero_std": 1.0, "grad_norm": 0.013725204913860065, "kl": 0.0012350908946245909, "learning_rate": 4.1150442477876104e-07, "loss": 0.0, "num_tokens": 3929874.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.648939847946167, "sampling/importance_sampling_ratio/mean": 1.0002479553222656, "sampling/importance_sampling_ratio/min": 0.323281466960907, "sampling/sampling_logp_difference/max": 1.1292319297790527, "sampling/sampling_logp_difference/mean": 0.014906010590493679, "step": 94 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2740.0, "completions/max_terminated_length": 2740.0, "completions/mean_length": 611.859375, "completions/mean_terminated_length": 611.859375, "completions/min_length": 169.0, "completions/min_terminated_length": 169.0, "entropy": 0.31611254811286926, "epoch": 0.168141592920354, "frac_reward_zero_std": 0.25, "grad_norm": 1.1094521166006825, "kl": 0.0008185418555513024, "learning_rate": 4.1592920353982295e-07, "loss": -0.0076, "num_tokens": 3981209.0, "reward": 0.84375, "reward_std": 0.46656501293182373, "rewards/decision_reward_func/mean": 0.84375, "rewards/decision_reward_func/std": 0.5409794449806213, "sampling/importance_sampling_ratio/max": 1.6463165283203125, "sampling/importance_sampling_ratio/mean": 0.9997063279151917, "sampling/importance_sampling_ratio/min": 0.2800421416759491, "sampling/sampling_logp_difference/max": 1.2728151082992554, "sampling/sampling_logp_difference/mean": 0.015651041641831398, "step": 95 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1574.0, "completions/max_terminated_length": 1574.0, "completions/mean_length": 403.828125, "completions/mean_terminated_length": 403.828125, "completions/min_length": 89.0, "completions/min_terminated_length": 89.0, "entropy": 0.14483006298542023, "epoch": 0.16991150442477876, "frac_reward_zero_std": 0.5, "grad_norm": 1.3186611449083738, "kl": 0.0007074165041558444, "learning_rate": 4.203539823008849e-07, "loss": 0.1098, "num_tokens": 4016494.0, "reward": 0.09375, "reward_std": 0.47978055477142334, "rewards/decision_reward_func/mean": 0.09375, "rewards/decision_reward_func/std": 1.003466248512268, "sampling/importance_sampling_ratio/max": 1.8135946989059448, "sampling/importance_sampling_ratio/mean": 0.9997037053108215, "sampling/importance_sampling_ratio/min": 0.4462626874446869, "sampling/sampling_logp_difference/max": 0.8068475723266602, "sampling/sampling_logp_difference/mean": 0.009387536905705929, "step": 96 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 572.0, "completions/max_terminated_length": 572.0, "completions/mean_length": 268.171875, "completions/mean_terminated_length": 268.171875, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "entropy": 0.1764664202928543, "epoch": 0.17168141592920355, "frac_reward_zero_std": 0.5, "grad_norm": 1.6990801406285923, "kl": 0.0009431242942810059, "learning_rate": 4.247787610619469e-07, "loss": 0.0182, "num_tokens": 4042649.0, "reward": 0.09375, "reward_std": 0.34860679507255554, "rewards/decision_reward_func/mean": 0.09375, "rewards/decision_reward_func/std": 1.003466248512268, "sampling/importance_sampling_ratio/max": 1.8056833744049072, "sampling/importance_sampling_ratio/mean": 1.000286340713501, "sampling/importance_sampling_ratio/min": 0.14961208403110504, "sampling/sampling_logp_difference/max": 1.8997094631195068, "sampling/sampling_logp_difference/mean": 0.01237904466688633, "step": 97 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2362.0, "completions/max_terminated_length": 2362.0, "completions/mean_length": 601.328125, "completions/mean_terminated_length": 601.328125, "completions/min_length": 178.0, "completions/min_terminated_length": 178.0, "entropy": 0.31892770528793335, "epoch": 0.17345132743362832, "frac_reward_zero_std": 0.75, "grad_norm": 0.9788787641076008, "kl": 0.0008473815396428108, "learning_rate": 4.2920353982300885e-07, "loss": -0.0097, "num_tokens": 4094030.0, "reward": -0.03125, "reward_std": 0.125, "rewards/decision_reward_func/mean": -0.03125, "rewards/decision_reward_func/std": 1.0074130296707153, "sampling/importance_sampling_ratio/max": 1.8055801391601562, "sampling/importance_sampling_ratio/mean": 1.0000791549682617, "sampling/importance_sampling_ratio/min": 0.3906334936618805, "sampling/sampling_logp_difference/max": 0.9399855136871338, "sampling/sampling_logp_difference/mean": 0.015339460223913193, "step": 98 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1910.0, "completions/max_terminated_length": 1910.0, "completions/mean_length": 375.90625, "completions/mean_terminated_length": 375.90625, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "entropy": 0.20001444220542908, "epoch": 0.1752212389380531, "frac_reward_zero_std": 0.5, "grad_norm": 1.2433020182207413, "kl": 0.0011453324696049094, "learning_rate": 4.3362831858407076e-07, "loss": 0.0025, "num_tokens": 4129848.0, "reward": 0.84375, "reward_std": 0.34860679507255554, "rewards/decision_reward_func/mean": 0.84375, "rewards/decision_reward_func/std": 0.5409794449806213, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9994040727615356, "sampling/importance_sampling_ratio/min": 0.46054357290267944, "sampling/sampling_logp_difference/max": 0.7753478288650513, "sampling/sampling_logp_difference/mean": 0.013390690088272095, "step": 99 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2613.0, "completions/max_terminated_length": 2613.0, "completions/mean_length": 409.28125, "completions/mean_terminated_length": 409.28125, "completions/min_length": 69.0, "completions/min_terminated_length": 69.0, "entropy": 0.22055740654468536, "epoch": 0.17699115044247787, "frac_reward_zero_std": 1.0, "grad_norm": 0.013863443807471981, "kl": 0.0015936002600938082, "learning_rate": 4.380530973451327e-07, "loss": 0.0, "num_tokens": 4171434.0, "reward": 0.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9999790191650391, "sampling/importance_sampling_ratio/min": 0.30521613359451294, "sampling/sampling_logp_difference/max": 1.1867351531982422, "sampling/sampling_logp_difference/mean": 0.016347704455256462, "step": 100 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2603.0, "completions/max_terminated_length": 2603.0, "completions/mean_length": 832.515625, "completions/mean_terminated_length": 832.515625, "completions/min_length": 208.0, "completions/min_terminated_length": 208.0, "entropy": 0.3073654770851135, "epoch": 0.17876106194690267, "frac_reward_zero_std": 0.75, "grad_norm": 0.6130323114450102, "kl": 0.0007668838370591402, "learning_rate": 4.424778761061947e-07, "loss": -0.0365, "num_tokens": 4236075.0, "reward": 0.53125, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.53125, "rewards/decision_reward_func/std": 0.8539125919342041, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.999839186668396, "sampling/importance_sampling_ratio/min": 0.32089194655418396, "sampling/sampling_logp_difference/max": 3.018908739089966, "sampling/sampling_logp_difference/mean": 0.013598902150988579, "step": 101 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1551.0, "completions/max_terminated_length": 1551.0, "completions/mean_length": 498.59375, "completions/mean_terminated_length": 498.59375, "completions/min_length": 75.0, "completions/min_terminated_length": 75.0, "entropy": 0.1965767741203308, "epoch": 0.18053097345132743, "frac_reward_zero_std": 0.25, "grad_norm": 1.258223230345156, "kl": 0.000957610085606575, "learning_rate": 4.469026548672566e-07, "loss": -0.0155, "num_tokens": 4277473.0, "reward": 0.125, "reward_std": 0.47360679507255554, "rewards/decision_reward_func/mean": 0.125, "rewards/decision_reward_func/std": 1.0, "sampling/importance_sampling_ratio/max": 1.646347165107727, "sampling/importance_sampling_ratio/mean": 1.0000998973846436, "sampling/importance_sampling_ratio/min": 0.37227314710617065, "sampling/sampling_logp_difference/max": 0.9881274700164795, "sampling/sampling_logp_difference/mean": 0.011962941847741604, "step": 102 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 363.0, "completions/max_terminated_length": 363.0, "completions/mean_length": 192.328125, "completions/mean_terminated_length": 192.328125, "completions/min_length": 82.0, "completions/min_terminated_length": 82.0, "entropy": 0.2148052304983139, "epoch": 0.18230088495575222, "frac_reward_zero_std": 0.75, "grad_norm": 1.448346125084078, "kl": 0.000932082359213382, "learning_rate": 4.5132743362831857e-07, "loss": 0.0129, "num_tokens": 4300454.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.7256935834884644, "sampling/importance_sampling_ratio/mean": 1.0000355243682861, "sampling/importance_sampling_ratio/min": 0.5098016262054443, "sampling/sampling_logp_difference/max": 0.6737335920333862, "sampling/sampling_logp_difference/mean": 0.01414698176085949, "step": 103 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2597.0, "completions/max_terminated_length": 2597.0, "completions/mean_length": 387.59375, "completions/mean_terminated_length": 387.59375, "completions/min_length": 152.0, "completions/min_terminated_length": 152.0, "entropy": 0.2227656990289688, "epoch": 0.184070796460177, "frac_reward_zero_std": 0.75, "grad_norm": 1.021778639250057, "kl": 0.0009607725660316646, "learning_rate": 4.557522123893805e-07, "loss": 0.0239, "num_tokens": 4335052.0, "reward": 0.28125, "reward_std": 0.2561737596988678, "rewards/decision_reward_func/mean": 0.28125, "rewards/decision_reward_func/std": 0.9672207236289978, "sampling/importance_sampling_ratio/max": 1.7507588863372803, "sampling/importance_sampling_ratio/mean": 0.9998894333839417, "sampling/importance_sampling_ratio/min": 0.40205395221710205, "sampling/sampling_logp_difference/max": 0.9111690521240234, "sampling/sampling_logp_difference/mean": 0.013367307372391224, "step": 104 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2577.0, "completions/max_terminated_length": 2577.0, "completions/mean_length": 566.734375, "completions/mean_terminated_length": 566.734375, "completions/min_length": 155.0, "completions/min_terminated_length": 155.0, "entropy": 0.30700552463531494, "epoch": 0.18584070796460178, "frac_reward_zero_std": 0.5, "grad_norm": 0.8155960060741572, "kl": 0.0007343885954469442, "learning_rate": 4.6017699115044245e-07, "loss": -0.0369, "num_tokens": 4386331.0, "reward": 0.1875, "reward_std": 0.3811737596988678, "rewards/decision_reward_func/mean": 0.1875, "rewards/decision_reward_func/std": 0.9900296926498413, "sampling/importance_sampling_ratio/max": 1.8474441766738892, "sampling/importance_sampling_ratio/mean": 1.0004281997680664, "sampling/importance_sampling_ratio/min": 0.38971108198165894, "sampling/sampling_logp_difference/max": 0.9423496127128601, "sampling/sampling_logp_difference/mean": 0.01432417519390583, "step": 105 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4127.0, "completions/max_terminated_length": 4127.0, "completions/mean_length": 882.828125, "completions/mean_terminated_length": 882.828125, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "entropy": 0.3231211304664612, "epoch": 0.18761061946902655, "frac_reward_zero_std": 0.75, "grad_norm": 0.5093231607852426, "kl": 0.0010395616991445422, "learning_rate": 4.646017699115044e-07, "loss": 0.0399, "num_tokens": 4455968.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.6550264358520508, "sampling/importance_sampling_ratio/mean": 0.9997809529304504, "sampling/importance_sampling_ratio/min": 0.3647230863571167, "sampling/sampling_logp_difference/max": 1.0086169242858887, "sampling/sampling_logp_difference/mean": 0.015093929134309292, "step": 106 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1449.0, "completions/max_terminated_length": 1449.0, "completions/mean_length": 393.703125, "completions/mean_terminated_length": 393.703125, "completions/min_length": 129.0, "completions/min_terminated_length": 129.0, "entropy": 0.20166689157485962, "epoch": 0.18938053097345134, "frac_reward_zero_std": 0.75, "grad_norm": 0.8573668838413586, "kl": 0.000771928287576884, "learning_rate": 4.690265486725664e-07, "loss": -0.0396, "num_tokens": 4492909.0, "reward": 0.59375, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.59375, "rewards/decision_reward_func/std": 0.8110105991363525, "sampling/importance_sampling_ratio/max": 1.7676914930343628, "sampling/importance_sampling_ratio/mean": 0.9999195337295532, "sampling/importance_sampling_ratio/min": 0.5096383094787598, "sampling/sampling_logp_difference/max": 0.674053966999054, "sampling/sampling_logp_difference/mean": 0.01164277084171772, "step": 107 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1734.0, "completions/max_terminated_length": 1734.0, "completions/mean_length": 576.625, "completions/mean_terminated_length": 576.625, "completions/min_length": 255.0, "completions/min_terminated_length": 255.0, "entropy": 0.18424072861671448, "epoch": 0.1911504424778761, "frac_reward_zero_std": 0.25, "grad_norm": 1.215391681256017, "kl": 0.0007743529858998954, "learning_rate": 4.734513274336283e-07, "loss": 0.0437, "num_tokens": 4540709.0, "reward": 0.78125, "reward_std": 0.519389271736145, "rewards/decision_reward_func/mean": 0.78125, "rewards/decision_reward_func/std": 0.6291528940200806, "sampling/importance_sampling_ratio/max": 1.8675520420074463, "sampling/importance_sampling_ratio/mean": 0.9997106790542603, "sampling/importance_sampling_ratio/min": 0.18618011474609375, "sampling/sampling_logp_difference/max": 1.6810407638549805, "sampling/sampling_logp_difference/mean": 0.011773797683417797, "step": 108 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1084.0, "completions/max_terminated_length": 1084.0, "completions/mean_length": 444.546875, "completions/mean_terminated_length": 444.546875, "completions/min_length": 205.0, "completions/min_terminated_length": 205.0, "entropy": 0.24579569697380066, "epoch": 0.1929203539823009, "frac_reward_zero_std": 0.5, "grad_norm": 1.1173591838667396, "kl": 0.0009365356527268887, "learning_rate": 4.778761061946903e-07, "loss": 0.0398, "num_tokens": 4579688.0, "reward": 0.09375, "reward_std": 0.4101392924785614, "rewards/decision_reward_func/mean": 0.09375, "rewards/decision_reward_func/std": 1.003466248512268, "sampling/importance_sampling_ratio/max": 1.7433515787124634, "sampling/importance_sampling_ratio/mean": 0.9996940493583679, "sampling/importance_sampling_ratio/min": 0.30852165818214417, "sampling/sampling_logp_difference/max": 1.1759631633758545, "sampling/sampling_logp_difference/mean": 0.01403791643679142, "step": 109 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2267.0, "completions/max_terminated_length": 2267.0, "completions/mean_length": 682.453125, "completions/mean_terminated_length": 682.453125, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "entropy": 0.13279536366462708, "epoch": 0.19469026548672566, "frac_reward_zero_std": 0.5, "grad_norm": 0.8085087951744054, "kl": 0.001161522581242025, "learning_rate": 4.823008849557521e-07, "loss": 0.0385, "num_tokens": 4633493.0, "reward": 0.21875, "reward_std": 0.375, "rewards/decision_reward_func/mean": 0.21875, "rewards/decision_reward_func/std": 0.983494758605957, "sampling/importance_sampling_ratio/max": 1.880149006843567, "sampling/importance_sampling_ratio/mean": 0.9999022483825684, "sampling/importance_sampling_ratio/min": 0.13973769545555115, "sampling/sampling_logp_difference/max": 1.9679882526397705, "sampling/sampling_logp_difference/mean": 0.010621700435876846, "step": 110 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1725.0, "completions/max_terminated_length": 1725.0, "completions/mean_length": 651.203125, "completions/mean_terminated_length": 651.203125, "completions/min_length": 219.0, "completions/min_terminated_length": 219.0, "entropy": 0.25171321630477905, "epoch": 0.19646017699115045, "frac_reward_zero_std": 0.25, "grad_norm": 1.238596569851713, "kl": 0.0008315599407069385, "learning_rate": 4.867256637168141e-07, "loss": 0.0097, "num_tokens": 4689266.0, "reward": -0.375, "reward_std": 0.42078250646591187, "rewards/decision_reward_func/mean": -0.375, "rewards/decision_reward_func/std": 0.934353232383728, "sampling/importance_sampling_ratio/max": 1.8835588693618774, "sampling/importance_sampling_ratio/mean": 0.9994617700576782, "sampling/importance_sampling_ratio/min": 0.2191912680864334, "sampling/sampling_logp_difference/max": 1.517810583114624, "sampling/sampling_logp_difference/mean": 0.013609644025564194, "step": 111 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2611.0, "completions/max_terminated_length": 2611.0, "completions/mean_length": 430.3125, "completions/mean_terminated_length": 430.3125, "completions/min_length": 64.0, "completions/min_terminated_length": 64.0, "entropy": 0.24077115952968597, "epoch": 0.19823008849557522, "frac_reward_zero_std": 0.5, "grad_norm": 1.1268698853400243, "kl": 0.0011600591242313385, "learning_rate": 4.91150442477876e-07, "loss": -0.0861, "num_tokens": 4735382.0, "reward": 0.8125, "reward_std": 0.3943893015384674, "rewards/decision_reward_func/mean": 0.8125, "rewards/decision_reward_func/std": 0.5875696539878845, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.000234842300415, "sampling/importance_sampling_ratio/min": 0.1480797976255417, "sampling/sampling_logp_difference/max": 1.9100040197372437, "sampling/sampling_logp_difference/mean": 0.014914069324731827, "step": 112 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2417.0, "completions/max_terminated_length": 2417.0, "completions/mean_length": 389.015625, "completions/mean_terminated_length": 389.015625, "completions/min_length": 134.0, "completions/min_terminated_length": 134.0, "entropy": 0.23051947355270386, "epoch": 0.2, "frac_reward_zero_std": 0.75, "grad_norm": 0.9871111011770654, "kl": 0.0009966930374503136, "learning_rate": 4.95575221238938e-07, "loss": 0.017, "num_tokens": 4774631.0, "reward": -0.21875, "reward_std": 0.2561737596988678, "rewards/decision_reward_func/mean": -0.21875, "rewards/decision_reward_func/std": 0.983494758605957, "sampling/importance_sampling_ratio/max": 1.5760064125061035, "sampling/importance_sampling_ratio/mean": 0.9996190071105957, "sampling/importance_sampling_ratio/min": 0.19423316419124603, "sampling/sampling_logp_difference/max": 1.6386959552764893, "sampling/sampling_logp_difference/mean": 0.014792338013648987, "step": 113 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 739.0, "completions/max_terminated_length": 739.0, "completions/mean_length": 285.34375, "completions/mean_terminated_length": 285.34375, "completions/min_length": 171.0, "completions/min_terminated_length": 171.0, "entropy": 0.23119781911373138, "epoch": 0.20176991150442478, "frac_reward_zero_std": 0.75, "grad_norm": 1.4810959314443315, "kl": 0.001130079384893179, "learning_rate": 5e-07, "loss": 0.0023, "num_tokens": 4805725.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.000246286392212, "sampling/importance_sampling_ratio/min": 0.4000387191772461, "sampling/sampling_logp_difference/max": 0.916193962097168, "sampling/sampling_logp_difference/mean": 0.014414334669709206, "step": 114 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1593.0, "completions/max_terminated_length": 1593.0, "completions/mean_length": 445.90625, "completions/mean_terminated_length": 445.90625, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "entropy": 0.20531050860881805, "epoch": 0.20353982300884957, "frac_reward_zero_std": 0.5, "grad_norm": 1.5339806872182415, "kl": 0.0013238202081993222, "learning_rate": 5.044247787610619e-07, "loss": 0.0482, "num_tokens": 4845367.0, "reward": 0.78125, "reward_std": 0.4101392924785614, "rewards/decision_reward_func/mean": 0.78125, "rewards/decision_reward_func/std": 0.6291528940200806, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.000162959098816, "sampling/importance_sampling_ratio/min": 0.3527084290981293, "sampling/sampling_logp_difference/max": 1.0421135425567627, "sampling/sampling_logp_difference/mean": 0.013347551226615906, "step": 115 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1971.0, "completions/max_terminated_length": 1971.0, "completions/mean_length": 521.765625, "completions/mean_terminated_length": 521.765625, "completions/min_length": 200.0, "completions/min_terminated_length": 200.0, "entropy": 0.30813366174697876, "epoch": 0.20530973451327433, "frac_reward_zero_std": 0.0, "grad_norm": 1.6040293089050963, "kl": 0.001264748745597899, "learning_rate": 5.088495575221239e-07, "loss": -0.0006, "num_tokens": 4893176.0, "reward": 0.5625, "reward_std": 0.6143567562103271, "rewards/decision_reward_func/mean": 0.5625, "rewards/decision_reward_func/std": 0.8333333730697632, "sampling/importance_sampling_ratio/max": 1.8693820238113403, "sampling/importance_sampling_ratio/mean": 1.0001510381698608, "sampling/importance_sampling_ratio/min": 0.254190057516098, "sampling/sampling_logp_difference/max": 1.3696730136871338, "sampling/sampling_logp_difference/mean": 0.016262924298644066, "step": 116 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1402.0, "completions/max_terminated_length": 1402.0, "completions/mean_length": 653.640625, "completions/mean_terminated_length": 653.640625, "completions/min_length": 227.0, "completions/min_terminated_length": 227.0, "entropy": 0.19307413697242737, "epoch": 0.20707964601769913, "frac_reward_zero_std": 0.5, "grad_norm": 0.9139596148900515, "kl": 0.0007146270363591611, "learning_rate": 5.132743362831859e-07, "loss": 0.0103, "num_tokens": 4946705.0, "reward": 0.03125, "reward_std": 0.3723389506340027, "rewards/decision_reward_func/mean": 0.03125, "rewards/decision_reward_func/std": 1.0074130296707153, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0000927448272705, "sampling/importance_sampling_ratio/min": 0.1184450089931488, "sampling/sampling_logp_difference/max": 2.1333065032958984, "sampling/sampling_logp_difference/mean": 0.011228544637560844, "step": 117 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2975.0, "completions/max_terminated_length": 2975.0, "completions/mean_length": 561.734375, "completions/mean_terminated_length": 561.734375, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "entropy": 0.20774435997009277, "epoch": 0.2088495575221239, "frac_reward_zero_std": 0.75, "grad_norm": 0.599153733918498, "kl": 0.001053126878105104, "learning_rate": 5.176991150442478e-07, "loss": 0.0301, "num_tokens": 4993584.0, "reward": 0.90625, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.90625, "rewards/decision_reward_func/std": 0.42608407139778137, "sampling/importance_sampling_ratio/max": 1.6283549070358276, "sampling/importance_sampling_ratio/mean": 1.0000214576721191, "sampling/importance_sampling_ratio/min": 0.40129029750823975, "sampling/sampling_logp_difference/max": 0.9130702018737793, "sampling/sampling_logp_difference/mean": 0.01271618902683258, "step": 118 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1812.0, "completions/max_terminated_length": 1812.0, "completions/mean_length": 433.359375, "completions/mean_terminated_length": 433.359375, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "entropy": 0.17372867465019226, "epoch": 0.21061946902654868, "frac_reward_zero_std": 0.75, "grad_norm": 1.0947750257417554, "kl": 0.0016448991373181343, "learning_rate": 5.221238938053097e-07, "loss": 0.1261, "num_tokens": 5030535.0, "reward": 0.1875, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.1875, "rewards/decision_reward_func/std": 0.9900296926498413, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9996919631958008, "sampling/importance_sampling_ratio/min": 0.17111872136592865, "sampling/sampling_logp_difference/max": 1.7653976678848267, "sampling/sampling_logp_difference/mean": 0.013387931510806084, "step": 119 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1937.0, "completions/max_terminated_length": 1937.0, "completions/mean_length": 596.953125, "completions/mean_terminated_length": 596.953125, "completions/min_length": 183.0, "completions/min_terminated_length": 183.0, "entropy": 0.2903949022293091, "epoch": 0.21238938053097345, "frac_reward_zero_std": 0.25, "grad_norm": 1.1417698065347255, "kl": 0.0008459882810711861, "learning_rate": 5.265486725663717e-07, "loss": -0.0976, "num_tokens": 5080596.0, "reward": 0.3125, "reward_std": 0.6525881886482239, "rewards/decision_reward_func/mean": 0.3125, "rewards/decision_reward_func/std": 0.9574271440505981, "sampling/importance_sampling_ratio/max": 1.86408269405365, "sampling/importance_sampling_ratio/mean": 0.9996562004089355, "sampling/importance_sampling_ratio/min": 0.3969646096229553, "sampling/sampling_logp_difference/max": 0.9239082336425781, "sampling/sampling_logp_difference/mean": 0.014434611424803734, "step": 120 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4154.0, "completions/max_terminated_length": 4154.0, "completions/mean_length": 630.640625, "completions/mean_terminated_length": 630.640625, "completions/min_length": 139.0, "completions/min_terminated_length": 139.0, "entropy": 0.22384238243103027, "epoch": 0.21415929203539824, "frac_reward_zero_std": 0.75, "grad_norm": 0.5688223400654816, "kl": 0.0011210363591089845, "learning_rate": 5.309734513274336e-07, "loss": 0.0467, "num_tokens": 5135069.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0005090236663818, "sampling/importance_sampling_ratio/min": 0.18063604831695557, "sampling/sampling_logp_difference/max": 1.711271047592163, "sampling/sampling_logp_difference/mean": 0.01375030167400837, "step": 121 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 854.0, "completions/max_terminated_length": 854.0, "completions/mean_length": 335.5625, "completions/mean_terminated_length": 335.5625, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "entropy": 0.17122839391231537, "epoch": 0.215929203539823, "frac_reward_zero_std": 0.25, "grad_norm": 1.8663976738598715, "kl": 0.001356611493974924, "learning_rate": 5.353982300884956e-07, "loss": 0.047, "num_tokens": 5169473.0, "reward": 0.03125, "reward_std": 0.5959457159042358, "rewards/decision_reward_func/mean": 0.03125, "rewards/decision_reward_func/std": 1.0074130296707153, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0000332593917847, "sampling/importance_sampling_ratio/min": 0.27826112508773804, "sampling/sampling_logp_difference/max": 1.2791953086853027, "sampling/sampling_logp_difference/mean": 0.012417839840054512, "step": 122 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 272.0, "completions/max_terminated_length": 272.0, "completions/mean_length": 200.234375, "completions/mean_terminated_length": 200.234375, "completions/min_length": 144.0, "completions/min_terminated_length": 144.0, "entropy": 0.14779315888881683, "epoch": 0.2176991150442478, "frac_reward_zero_std": 1.0, "grad_norm": 0.01945464167761527, "kl": 0.001425448339432478, "learning_rate": 5.398230088495575e-07, "loss": 0.0, "num_tokens": 5191920.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.8800947666168213, "sampling/importance_sampling_ratio/mean": 1.0004053115844727, "sampling/importance_sampling_ratio/min": 0.35932207107543945, "sampling/sampling_logp_difference/max": 1.023536205291748, "sampling/sampling_logp_difference/mean": 0.012222045101225376, "step": 123 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 373.0, "completions/max_terminated_length": 373.0, "completions/mean_length": 193.625, "completions/mean_terminated_length": 193.625, "completions/min_length": 73.0, "completions/min_terminated_length": 73.0, "entropy": 0.11597263067960739, "epoch": 0.21946902654867256, "frac_reward_zero_std": 1.0, "grad_norm": 0.012593511823444398, "kl": 0.0010812411783263087, "learning_rate": 5.442477876106194e-07, "loss": 0.0, "num_tokens": 5214424.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.9007587432861328, "sampling/importance_sampling_ratio/mean": 1.000012993812561, "sampling/importance_sampling_ratio/min": 0.24108436703681946, "sampling/sampling_logp_difference/max": 1.4226083755493164, "sampling/sampling_logp_difference/mean": 0.011629834771156311, "step": 124 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2553.0, "completions/max_terminated_length": 2553.0, "completions/mean_length": 486.234375, "completions/mean_terminated_length": 486.234375, "completions/min_length": 173.0, "completions/min_terminated_length": 173.0, "entropy": 0.15629956126213074, "epoch": 0.22123893805309736, "frac_reward_zero_std": 0.75, "grad_norm": 0.8628896619300983, "kl": 0.0008848098223097622, "learning_rate": 5.486725663716814e-07, "loss": -0.0208, "num_tokens": 5255415.0, "reward": 0.875, "reward_std": 0.22360679507255554, "rewards/decision_reward_func/mean": 0.875, "rewards/decision_reward_func/std": 0.48795005679130554, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0000648498535156, "sampling/importance_sampling_ratio/min": 0.600456953048706, "sampling/sampling_logp_difference/max": 0.9509494304656982, "sampling/sampling_logp_difference/mean": 0.010130148380994797, "step": 125 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1680.0, "completions/max_terminated_length": 1680.0, "completions/mean_length": 484.65625, "completions/mean_terminated_length": 484.65625, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "entropy": 0.13430476188659668, "epoch": 0.22300884955752212, "frac_reward_zero_std": 0.5, "grad_norm": 1.0601781574559874, "kl": 0.001086753443814814, "learning_rate": 5.530973451327434e-07, "loss": 0.0557, "num_tokens": 5296113.0, "reward": 0.1875, "reward_std": 0.40311288833618164, "rewards/decision_reward_func/mean": 0.1875, "rewards/decision_reward_func/std": 0.9900296926498413, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9999409317970276, "sampling/importance_sampling_ratio/min": 0.3153219223022461, "sampling/sampling_logp_difference/max": 1.227081537246704, "sampling/sampling_logp_difference/mean": 0.009715702384710312, "step": 126 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 843.0, "completions/max_terminated_length": 843.0, "completions/mean_length": 216.125, "completions/mean_terminated_length": 216.125, "completions/min_length": 78.0, "completions/min_terminated_length": 78.0, "entropy": 0.1639169454574585, "epoch": 0.2247787610619469, "frac_reward_zero_std": 0.75, "grad_norm": 2.1764613698957085, "kl": 0.0016570291481912136, "learning_rate": 5.575221238938052e-07, "loss": 0.1318, "num_tokens": 5324857.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.8481957912445068, "sampling/importance_sampling_ratio/mean": 1.0006189346313477, "sampling/importance_sampling_ratio/min": 0.30428364872932434, "sampling/sampling_logp_difference/max": 1.1897950172424316, "sampling/sampling_logp_difference/mean": 0.01152008306235075, "step": 127 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1796.0, "completions/max_terminated_length": 1796.0, "completions/mean_length": 525.484375, "completions/mean_terminated_length": 525.484375, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "entropy": 0.17199614644050598, "epoch": 0.22654867256637168, "frac_reward_zero_std": 0.75, "grad_norm": 0.8548585163722257, "kl": 0.0012140540638938546, "learning_rate": 5.619469026548672e-07, "loss": -0.0124, "num_tokens": 5369832.0, "reward": 0.25, "reward_std": 0.25819888710975647, "rewards/decision_reward_func/mean": 0.25, "rewards/decision_reward_func/std": 0.9759001135826111, "sampling/importance_sampling_ratio/max": 1.8492265939712524, "sampling/importance_sampling_ratio/mean": 0.9992969036102295, "sampling/importance_sampling_ratio/min": 0.30253133177757263, "sampling/sampling_logp_difference/max": 1.195570468902588, "sampling/sampling_logp_difference/mean": 0.012387236580252647, "step": 128 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1644.0, "completions/max_terminated_length": 1644.0, "completions/mean_length": 633.6875, "completions/mean_terminated_length": 633.6875, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "entropy": 0.16110070049762726, "epoch": 0.22831858407079647, "frac_reward_zero_std": 0.5, "grad_norm": 0.8589017639253999, "kl": 0.0010817721486091614, "learning_rate": 5.663716814159291e-07, "loss": -0.0118, "num_tokens": 5421540.0, "reward": 0.46875, "reward_std": 0.5143726468086243, "rewards/decision_reward_func/mean": 0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0002080202102661, "sampling/importance_sampling_ratio/min": 0.1871904581785202, "sampling/sampling_logp_difference/max": 1.675628662109375, "sampling/sampling_logp_difference/mean": 0.009875915944576263, "step": 129 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3448.0, "completions/max_terminated_length": 3448.0, "completions/mean_length": 652.84375, "completions/mean_terminated_length": 652.84375, "completions/min_length": 154.0, "completions/min_terminated_length": 154.0, "entropy": 0.19036948680877686, "epoch": 0.23008849557522124, "frac_reward_zero_std": 0.75, "grad_norm": 0.5417529661112552, "kl": 0.0009122653864324093, "learning_rate": 5.707964601769911e-07, "loss": 0.0063, "num_tokens": 5473690.0, "reward": 0.46875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9998577237129211, "sampling/importance_sampling_ratio/min": 0.3358350992202759, "sampling/sampling_logp_difference/max": 1.091135025024414, "sampling/sampling_logp_difference/mean": 0.011601174250245094, "step": 130 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1317.0, "completions/max_terminated_length": 1317.0, "completions/mean_length": 298.171875, "completions/mean_terminated_length": 298.171875, "completions/min_length": 72.0, "completions/min_terminated_length": 72.0, "entropy": 0.15611150860786438, "epoch": 0.23185840707964603, "frac_reward_zero_std": 0.75, "grad_norm": 1.3022368521283376, "kl": 0.0017910553142428398, "learning_rate": 5.752212389380531e-07, "loss": -0.1091, "num_tokens": 5502485.0, "reward": 0.125, "reward_std": 0.22360679507255554, "rewards/decision_reward_func/mean": 0.125, "rewards/decision_reward_func/std": 1.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0000312328338623, "sampling/importance_sampling_ratio/min": 0.12477816641330719, "sampling/sampling_logp_difference/max": 2.0812177658081055, "sampling/sampling_logp_difference/mean": 0.012516917660832405, "step": 131 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2189.0, "completions/max_terminated_length": 2189.0, "completions/mean_length": 502.609375, "completions/mean_terminated_length": 502.609375, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "entropy": 0.28236153721809387, "epoch": 0.2336283185840708, "frac_reward_zero_std": 1.0, "grad_norm": 0.012741491711618966, "kl": 0.0012938741128891706, "learning_rate": 5.79646017699115e-07, "loss": 0.0, "num_tokens": 5546700.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.8180153369903564, "sampling/importance_sampling_ratio/mean": 1.0001598596572876, "sampling/importance_sampling_ratio/min": 0.4764645993709564, "sampling/sampling_logp_difference/max": 0.7413618564605713, "sampling/sampling_logp_difference/mean": 0.014386776834726334, "step": 132 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1156.0, "completions/max_terminated_length": 1156.0, "completions/mean_length": 368.078125, "completions/mean_terminated_length": 368.078125, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "entropy": 0.2475200742483139, "epoch": 0.23539823008849559, "frac_reward_zero_std": 0.75, "grad_norm": 1.1406165871542426, "kl": 0.0008853284525685012, "learning_rate": 5.84070796460177e-07, "loss": 0.0576, "num_tokens": 5583985.0, "reward": 0.9375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.9375, "rewards/decision_reward_func/std": 0.35073620080947876, "sampling/importance_sampling_ratio/max": 1.6977168321609497, "sampling/importance_sampling_ratio/mean": 0.9996869564056396, "sampling/importance_sampling_ratio/min": 0.6162475943565369, "sampling/sampling_logp_difference/max": 0.5292843580245972, "sampling/sampling_logp_difference/mean": 0.012121238745748997, "step": 133 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3138.0, "completions/max_terminated_length": 3138.0, "completions/mean_length": 512.140625, "completions/mean_terminated_length": 512.140625, "completions/min_length": 182.0, "completions/min_terminated_length": 182.0, "entropy": 0.23217667639255524, "epoch": 0.23716814159292035, "frac_reward_zero_std": 1.0, "grad_norm": 0.027872855897225747, "kl": 0.0010980168590322137, "learning_rate": 5.88495575221239e-07, "loss": 0.0, "num_tokens": 5628570.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.629650592803955, "sampling/importance_sampling_ratio/mean": 1.0001583099365234, "sampling/importance_sampling_ratio/min": 0.5102017521858215, "sampling/sampling_logp_difference/max": 0.6729490756988525, "sampling/sampling_logp_difference/mean": 0.010724234394729137, "step": 134 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2111.0, "completions/max_terminated_length": 2111.0, "completions/mean_length": 435.3125, "completions/mean_terminated_length": 435.3125, "completions/min_length": 71.0, "completions/min_terminated_length": 71.0, "entropy": 0.1887589991092682, "epoch": 0.23893805309734514, "frac_reward_zero_std": 0.75, "grad_norm": 9.998483795552128, "kl": 0.015115800313651562, "learning_rate": 5.929203539823009e-07, "loss": 0.0345, "num_tokens": 5667710.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.9937227964401245, "sampling/importance_sampling_ratio/mean": 1.0000815391540527, "sampling/importance_sampling_ratio/min": 0.0008686393848620355, "sampling/sampling_logp_difference/max": 7.048582553863525, "sampling/sampling_logp_difference/mean": 0.013089445419609547, "step": 135 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 459.0, "completions/max_terminated_length": 459.0, "completions/mean_length": 256.75, "completions/mean_terminated_length": 256.75, "completions/min_length": 121.0, "completions/min_terminated_length": 121.0, "entropy": 0.1401287317276001, "epoch": 0.2407079646017699, "frac_reward_zero_std": 1.0, "grad_norm": 0.017239327299716993, "kl": 0.0012948656221851707, "learning_rate": 5.973451327433628e-07, "loss": 0.0, "num_tokens": 5693966.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.7520533800125122, "sampling/importance_sampling_ratio/mean": 0.9992884397506714, "sampling/importance_sampling_ratio/min": 0.4899369478225708, "sampling/sampling_logp_difference/max": 0.7134785652160645, "sampling/sampling_logp_difference/mean": 0.011936522088944912, "step": 136 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2134.0, "completions/max_terminated_length": 2134.0, "completions/mean_length": 628.6875, "completions/mean_terminated_length": 628.6875, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "entropy": 0.2261354625225067, "epoch": 0.2424778761061947, "frac_reward_zero_std": 0.25, "grad_norm": 1.2308124954657063, "kl": 0.001205621985718608, "learning_rate": 6.017699115044248e-07, "loss": -0.0464, "num_tokens": 5744234.0, "reward": 0.40625, "reward_std": 0.5986068248748779, "rewards/decision_reward_func/mean": 0.40625, "rewards/decision_reward_func/std": 0.9209855198860168, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0005642175674438, "sampling/importance_sampling_ratio/min": 0.20108090341091156, "sampling/sampling_logp_difference/max": 1.6040480136871338, "sampling/sampling_logp_difference/mean": 0.013065203092992306, "step": 137 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3983.0, "completions/max_terminated_length": 3983.0, "completions/mean_length": 452.140625, "completions/mean_terminated_length": 452.140625, "completions/min_length": 82.0, "completions/min_terminated_length": 82.0, "entropy": 0.1468367576599121, "epoch": 0.24424778761061947, "frac_reward_zero_std": 0.75, "grad_norm": 1.7019858577102491, "kl": 0.001276213675737381, "learning_rate": 6.061946902654867e-07, "loss": -0.0743, "num_tokens": 5784995.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.00044846534729, "sampling/importance_sampling_ratio/min": 0.4086010456085205, "sampling/sampling_logp_difference/max": 1.0081098079681396, "sampling/sampling_logp_difference/mean": 0.011268842034041882, "step": 138 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3278.0, "completions/max_terminated_length": 3278.0, "completions/mean_length": 451.90625, "completions/mean_terminated_length": 451.90625, "completions/min_length": 81.0, "completions/min_terminated_length": 81.0, "entropy": 0.1322592794895172, "epoch": 0.24601769911504426, "frac_reward_zero_std": 0.75, "grad_norm": 0.681249395771476, "kl": 0.0012429154012352228, "learning_rate": 6.106194690265486e-07, "loss": 0.1284, "num_tokens": 5823965.0, "reward": 0.09375, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.09375, "rewards/decision_reward_func/std": 1.003466248512268, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9996567964553833, "sampling/importance_sampling_ratio/min": 0.43966370820999146, "sampling/sampling_logp_difference/max": 0.8217451572418213, "sampling/sampling_logp_difference/mean": 0.009683975949883461, "step": 139 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3712.0, "completions/max_terminated_length": 3712.0, "completions/mean_length": 600.46875, "completions/mean_terminated_length": 600.46875, "completions/min_length": 120.0, "completions/min_terminated_length": 120.0, "entropy": 0.18576234579086304, "epoch": 0.24778761061946902, "frac_reward_zero_std": 1.0, "grad_norm": 0.012437932169589947, "kl": 0.0015970903914421797, "learning_rate": 6.150442477876105e-07, "loss": 0.0, "num_tokens": 5872155.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0001471042633057, "sampling/importance_sampling_ratio/min": 0.1942329853773117, "sampling/sampling_logp_difference/max": 1.6386969089508057, "sampling/sampling_logp_difference/mean": 0.012365680187940598, "step": 140 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1996.0, "completions/max_terminated_length": 1996.0, "completions/mean_length": 594.4375, "completions/mean_terminated_length": 594.4375, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "entropy": 0.2772740125656128, "epoch": 0.24955752212389382, "frac_reward_zero_std": 0.5, "grad_norm": 0.9695997417214254, "kl": 0.001695783226750791, "learning_rate": 6.194690265486725e-07, "loss": -0.0465, "num_tokens": 5920311.0, "reward": 0.21875, "reward_std": 0.4101392924785614, "rewards/decision_reward_func/mean": 0.21875, "rewards/decision_reward_func/std": 0.983494758605957, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9997611045837402, "sampling/importance_sampling_ratio/min": 0.3981722295284271, "sampling/sampling_logp_difference/max": 0.9208706617355347, "sampling/sampling_logp_difference/mean": 0.014532621949911118, "step": 141 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2561.0, "completions/max_terminated_length": 2561.0, "completions/mean_length": 633.21875, "completions/mean_terminated_length": 633.21875, "completions/min_length": 196.0, "completions/min_terminated_length": 196.0, "entropy": 0.3401173949241638, "epoch": 0.2513274336283186, "frac_reward_zero_std": 0.5, "grad_norm": 0.7997179875423444, "kl": 0.0013617710210382938, "learning_rate": 6.238938053097345e-07, "loss": 0.0241, "num_tokens": 5973253.0, "reward": 0.875, "reward_std": 0.3265564441680908, "rewards/decision_reward_func/mean": 0.875, "rewards/decision_reward_func/std": 0.48795005679130554, "sampling/importance_sampling_ratio/max": 1.6322418451309204, "sampling/importance_sampling_ratio/mean": 0.999986469745636, "sampling/importance_sampling_ratio/min": 0.3530274033546448, "sampling/sampling_logp_difference/max": 1.0412095785140991, "sampling/sampling_logp_difference/mean": 0.016559287905693054, "step": 142 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1495.0, "completions/max_terminated_length": 1495.0, "completions/mean_length": 292.703125, "completions/mean_terminated_length": 292.703125, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "entropy": 0.1716068834066391, "epoch": 0.25309734513274335, "frac_reward_zero_std": 0.75, "grad_norm": 1.4403969249177426, "kl": 0.0011674973648041487, "learning_rate": 6.283185840707964e-07, "loss": -0.1592, "num_tokens": 6002018.0, "reward": 0.875, "reward_std": 0.22360679507255554, "rewards/decision_reward_func/mean": 0.875, "rewards/decision_reward_func/std": 0.48795005679130554, "sampling/importance_sampling_ratio/max": 1.939858078956604, "sampling/importance_sampling_ratio/mean": 1.0000256299972534, "sampling/importance_sampling_ratio/min": 0.4005155563354492, "sampling/sampling_logp_difference/max": 0.915002703666687, "sampling/sampling_logp_difference/mean": 0.011238176375627518, "step": 143 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1196.0, "completions/max_terminated_length": 1196.0, "completions/mean_length": 318.40625, "completions/mean_terminated_length": 318.40625, "completions/min_length": 71.0, "completions/min_terminated_length": 71.0, "entropy": 0.20271864533424377, "epoch": 0.25486725663716814, "frac_reward_zero_std": 0.5, "grad_norm": 1.5141891163677066, "kl": 0.0019937718752771616, "learning_rate": 6.327433628318584e-07, "loss": -0.0155, "num_tokens": 6030876.0, "reward": -0.125, "reward_std": 0.42078250646591187, "rewards/decision_reward_func/mean": -0.125, "rewards/decision_reward_func/std": 1.0, "sampling/importance_sampling_ratio/max": 1.6478666067123413, "sampling/importance_sampling_ratio/mean": 0.9997513294219971, "sampling/importance_sampling_ratio/min": 0.3789888620376587, "sampling/sampling_logp_difference/max": 0.9702484607696533, "sampling/sampling_logp_difference/mean": 0.013580543920397758, "step": 144 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 779.0, "completions/max_terminated_length": 779.0, "completions/mean_length": 327.390625, "completions/mean_terminated_length": 327.390625, "completions/min_length": 161.0, "completions/min_terminated_length": 161.0, "entropy": 0.16514067351818085, "epoch": 0.25663716814159293, "frac_reward_zero_std": 0.75, "grad_norm": 1.175993132527975, "kl": 0.0018191073322668672, "learning_rate": 6.371681415929203e-07, "loss": 0.052, "num_tokens": 6062245.0, "reward": 0.4375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.4375, "rewards/decision_reward_func/std": 0.9063270092010498, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9998762607574463, "sampling/importance_sampling_ratio/min": 0.1707066148519516, "sampling/sampling_logp_difference/max": 1.7678089141845703, "sampling/sampling_logp_difference/mean": 0.01231642346829176, "step": 145 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1278.0, "completions/max_terminated_length": 1278.0, "completions/mean_length": 256.453125, "completions/mean_terminated_length": 256.453125, "completions/min_length": 73.0, "completions/min_terminated_length": 73.0, "entropy": 0.23827554285526276, "epoch": 0.2584070796460177, "frac_reward_zero_std": 0.5, "grad_norm": 2.376842887543031, "kl": 0.002266048453748226, "learning_rate": 6.415929203539822e-07, "loss": 0.146, "num_tokens": 6091170.0, "reward": 0.53125, "reward_std": 0.3723389506340027, "rewards/decision_reward_func/mean": 0.53125, "rewards/decision_reward_func/std": 0.8539125919342041, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0004348754882812, "sampling/importance_sampling_ratio/min": 0.3929966390132904, "sampling/sampling_logp_difference/max": 1.000737190246582, "sampling/sampling_logp_difference/mean": 0.016682550311088562, "step": 146 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1848.0, "completions/max_terminated_length": 1848.0, "completions/mean_length": 467.1875, "completions/mean_terminated_length": 467.1875, "completions/min_length": 119.0, "completions/min_terminated_length": 119.0, "entropy": 0.22012630105018616, "epoch": 0.26017699115044246, "frac_reward_zero_std": 0.5, "grad_norm": 1.0401191826220277, "kl": 0.0015268856659531593, "learning_rate": 6.460176991150442e-07, "loss": 0.0862, "num_tokens": 6138158.0, "reward": 0.625, "reward_std": 0.36435678601264954, "rewards/decision_reward_func/mean": 0.625, "rewards/decision_reward_func/std": 0.7867957949638367, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9999053478240967, "sampling/importance_sampling_ratio/min": 0.40633997321128845, "sampling/sampling_logp_difference/max": 1.1271946430206299, "sampling/sampling_logp_difference/mean": 0.01353778038173914, "step": 147 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3327.0, "completions/max_terminated_length": 3327.0, "completions/mean_length": 722.546875, "completions/mean_terminated_length": 722.546875, "completions/min_length": 185.0, "completions/min_terminated_length": 185.0, "entropy": 0.2738112211227417, "epoch": 0.26194690265486725, "frac_reward_zero_std": 0.25, "grad_norm": 1.1166242373633863, "kl": 0.0015998302260413766, "learning_rate": 6.504424778761062e-07, "loss": -0.2254, "num_tokens": 6197281.0, "reward": 0.3125, "reward_std": 0.5847553014755249, "rewards/decision_reward_func/mean": 0.3125, "rewards/decision_reward_func/std": 0.9574271440505981, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9999792575836182, "sampling/importance_sampling_ratio/min": 0.23739440739154816, "sampling/sampling_logp_difference/max": 1.4380323886871338, "sampling/sampling_logp_difference/mean": 0.013761061243712902, "step": 148 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1010.0, "completions/max_terminated_length": 1010.0, "completions/mean_length": 371.234375, "completions/mean_terminated_length": 371.234375, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "entropy": 0.18916088342666626, "epoch": 0.26371681415929205, "frac_reward_zero_std": 0.5, "grad_norm": 2.075255952011213, "kl": 0.001520076533779502, "learning_rate": 6.548672566371681e-07, "loss": 0.0178, "num_tokens": 6231616.0, "reward": 0.71875, "reward_std": 0.38319888710975647, "rewards/decision_reward_func/mean": 0.71875, "rewards/decision_reward_func/std": 0.7007648944854736, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0001811981201172, "sampling/importance_sampling_ratio/min": 0.47560635209083557, "sampling/sampling_logp_difference/max": 0.7505574226379395, "sampling/sampling_logp_difference/mean": 0.011506482027471066, "step": 149 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2659.0, "completions/max_terminated_length": 2659.0, "completions/mean_length": 697.03125, "completions/mean_terminated_length": 697.03125, "completions/min_length": 84.0, "completions/min_terminated_length": 84.0, "entropy": 0.19323575496673584, "epoch": 0.26548672566371684, "frac_reward_zero_std": 0.25, "grad_norm": 1.0544898356517713, "kl": 0.001579253003001213, "learning_rate": 6.592920353982301e-07, "loss": 0.1294, "num_tokens": 6288802.0, "reward": 0.65625, "reward_std": 0.5061737298965454, "rewards/decision_reward_func/mean": 0.65625, "rewards/decision_reward_func/std": 0.7605084180831909, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9999755620956421, "sampling/importance_sampling_ratio/min": 0.435323566198349, "sampling/sampling_logp_difference/max": 1.1894290447235107, "sampling/sampling_logp_difference/mean": 0.011228494346141815, "step": 150 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2371.0, "completions/max_terminated_length": 2371.0, "completions/mean_length": 546.75, "completions/mean_terminated_length": 546.75, "completions/min_length": 157.0, "completions/min_terminated_length": 157.0, "entropy": 0.24357496201992035, "epoch": 0.2672566371681416, "frac_reward_zero_std": 1.0, "grad_norm": 0.04553171073858, "kl": 0.0013180330861359835, "learning_rate": 6.637168141592921e-07, "loss": 0.0, "num_tokens": 6334178.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.999897837638855, "sampling/importance_sampling_ratio/min": 0.00772626930847764, "sampling/sampling_logp_difference/max": 4.863129138946533, "sampling/sampling_logp_difference/mean": 0.0129692442715168, "step": 151 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 903.0, "completions/max_terminated_length": 903.0, "completions/mean_length": 344.078125, "completions/mean_terminated_length": 344.078125, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "entropy": 0.25318944454193115, "epoch": 0.26902654867256637, "frac_reward_zero_std": 0.5, "grad_norm": 1.4899535004911064, "kl": 0.0017984609585255384, "learning_rate": 6.68141592920354e-07, "loss": -0.0403, "num_tokens": 6367911.0, "reward": 0.59375, "reward_std": 0.34860679507255554, "rewards/decision_reward_func/mean": 0.59375, "rewards/decision_reward_func/std": 0.8110105991363525, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.000117301940918, "sampling/importance_sampling_ratio/min": 0.39126816391944885, "sampling/sampling_logp_difference/max": 0.9383621215820312, "sampling/sampling_logp_difference/mean": 0.014593098312616348, "step": 152 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2999.0, "completions/max_terminated_length": 2999.0, "completions/mean_length": 718.078125, "completions/mean_terminated_length": 718.078125, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "entropy": 0.19968779385089874, "epoch": 0.27079646017699116, "frac_reward_zero_std": 0.25, "grad_norm": 1.08463255800823, "kl": 0.0013808272778987885, "learning_rate": 6.72566371681416e-07, "loss": 0.0244, "num_tokens": 6426572.0, "reward": 0.4375, "reward_std": 0.42078250646591187, "rewards/decision_reward_func/mean": 0.4375, "rewards/decision_reward_func/std": 0.9063270092010498, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.000115156173706, "sampling/importance_sampling_ratio/min": 0.3346235752105713, "sampling/sampling_logp_difference/max": 1.0947489738464355, "sampling/sampling_logp_difference/mean": 0.01158864889293909, "step": 153 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2523.0, "completions/max_terminated_length": 2523.0, "completions/mean_length": 807.53125, "completions/mean_terminated_length": 807.53125, "completions/min_length": 135.0, "completions/min_terminated_length": 135.0, "entropy": 0.2077145129442215, "epoch": 0.27256637168141595, "frac_reward_zero_std": 0.25, "grad_norm": 0.9006854057592405, "kl": 0.0012506239581853151, "learning_rate": 6.769911504424779e-07, "loss": 0.0205, "num_tokens": 6491422.0, "reward": 0.40625, "reward_std": 0.676956295967102, "rewards/decision_reward_func/mean": 0.40625, "rewards/decision_reward_func/std": 0.9209855198860168, "sampling/importance_sampling_ratio/max": 1.8718515634536743, "sampling/importance_sampling_ratio/mean": 0.9996528625488281, "sampling/importance_sampling_ratio/min": 0.39604607224464417, "sampling/sampling_logp_difference/max": 0.9262247085571289, "sampling/sampling_logp_difference/mean": 0.011283830739557743, "step": 154 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 876.0, "completions/max_terminated_length": 876.0, "completions/mean_length": 317.09375, "completions/mean_terminated_length": 317.09375, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "entropy": 0.20301923155784607, "epoch": 0.2743362831858407, "frac_reward_zero_std": 0.25, "grad_norm": 2.2750542755642065, "kl": 0.0021835805382579565, "learning_rate": 6.814159292035397e-07, "loss": 0.0671, "num_tokens": 6521204.0, "reward": 0.46875, "reward_std": 0.375, "rewards/decision_reward_func/mean": 0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 1.9876788854599, "sampling/importance_sampling_ratio/mean": 1.0007274150848389, "sampling/importance_sampling_ratio/min": 0.4601758122444153, "sampling/sampling_logp_difference/max": 0.776146650314331, "sampling/sampling_logp_difference/mean": 0.014119230210781097, "step": 155 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1505.0, "completions/max_terminated_length": 1505.0, "completions/mean_length": 538.03125, "completions/mean_terminated_length": 538.03125, "completions/min_length": 200.0, "completions/min_terminated_length": 200.0, "entropy": 0.23711633682250977, "epoch": 0.2761061946902655, "frac_reward_zero_std": 0.5, "grad_norm": 0.9157112148472482, "kl": 0.0015474995598196983, "learning_rate": 6.858407079646017e-07, "loss": 0.0379, "num_tokens": 6566486.0, "reward": -0.09375, "reward_std": 0.4515564441680908, "rewards/decision_reward_func/mean": -0.09375, "rewards/decision_reward_func/std": 1.003466248512268, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9997793436050415, "sampling/importance_sampling_ratio/min": 0.48101890087127686, "sampling/sampling_logp_difference/max": 0.898421049118042, "sampling/sampling_logp_difference/mean": 0.012340676970779896, "step": 156 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2347.0, "completions/max_terminated_length": 2347.0, "completions/mean_length": 434.59375, "completions/mean_terminated_length": 434.59375, "completions/min_length": 139.0, "completions/min_terminated_length": 139.0, "entropy": 0.28111714124679565, "epoch": 0.2778761061946903, "frac_reward_zero_std": 0.75, "grad_norm": 0.9958473904546873, "kl": 0.0016564913094043732, "learning_rate": 6.902654867256636e-07, "loss": -0.0723, "num_tokens": 6606476.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.8417409658432007, "sampling/importance_sampling_ratio/mean": 1.0000019073486328, "sampling/importance_sampling_ratio/min": 0.32853180170059204, "sampling/sampling_logp_difference/max": 1.1131216287612915, "sampling/sampling_logp_difference/mean": 0.014261005446314812, "step": 157 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1056.0, "completions/max_terminated_length": 1056.0, "completions/mean_length": 454.03125, "completions/mean_terminated_length": 454.03125, "completions/min_length": 153.0, "completions/min_terminated_length": 153.0, "entropy": 0.17274272441864014, "epoch": 0.27964601769911507, "frac_reward_zero_std": 0.25, "grad_norm": 1.5625069886032132, "kl": 0.0014737172750756145, "learning_rate": 6.946902654867256e-07, "loss": 0.0879, "num_tokens": 6646238.0, "reward": 0.6875, "reward_std": 0.5879635810852051, "rewards/decision_reward_func/mean": 0.6875, "rewards/decision_reward_func/std": 0.7319250702857971, "sampling/importance_sampling_ratio/max": 1.9879258871078491, "sampling/importance_sampling_ratio/mean": 0.9997170567512512, "sampling/importance_sampling_ratio/min": 0.31634554266929626, "sampling/sampling_logp_difference/max": 1.1509201526641846, "sampling/sampling_logp_difference/mean": 0.010963189415633678, "step": 158 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1408.0, "completions/max_terminated_length": 1408.0, "completions/mean_length": 434.15625, "completions/mean_terminated_length": 434.15625, "completions/min_length": 151.0, "completions/min_terminated_length": 151.0, "entropy": 0.30112990736961365, "epoch": 0.2814159292035398, "frac_reward_zero_std": 0.5, "grad_norm": 0.9835946965478544, "kl": 0.0013485566014423966, "learning_rate": 6.991150442477876e-07, "loss": -0.0317, "num_tokens": 6685080.0, "reward": 0.9375, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.9375, "rewards/decision_reward_func/std": 0.35073620080947876, "sampling/importance_sampling_ratio/max": 1.6846824884414673, "sampling/importance_sampling_ratio/mean": 1.0002710819244385, "sampling/importance_sampling_ratio/min": 0.5038020610809326, "sampling/sampling_logp_difference/max": 0.6855719089508057, "sampling/sampling_logp_difference/mean": 0.014277322217822075, "step": 159 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 554.0, "completions/max_terminated_length": 554.0, "completions/mean_length": 236.234375, "completions/mean_terminated_length": 236.234375, "completions/min_length": 94.0, "completions/min_terminated_length": 94.0, "entropy": 0.34300661087036133, "epoch": 0.2831858407079646, "frac_reward_zero_std": 0.75, "grad_norm": 1.215459805621387, "kl": 0.002192323561757803, "learning_rate": 7.035398230088495e-07, "loss": 0.0099, "num_tokens": 6711607.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9999967813491821, "sampling/importance_sampling_ratio/min": 0.5460063219070435, "sampling/sampling_logp_difference/max": 0.872962474822998, "sampling/sampling_logp_difference/mean": 0.01824253983795643, "step": 160 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1614.0, "completions/max_terminated_length": 1614.0, "completions/mean_length": 480.625, "completions/mean_terminated_length": 480.625, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "entropy": 0.25046253204345703, "epoch": 0.2849557522123894, "frac_reward_zero_std": 0.5, "grad_norm": 1.1680527328880923, "kl": 0.0016994598554447293, "learning_rate": 7.079646017699115e-07, "loss": 0.1014, "num_tokens": 6753055.0, "reward": 0.59375, "reward_std": 0.47978055477142334, "rewards/decision_reward_func/mean": 0.59375, "rewards/decision_reward_func/std": 0.8110105991363525, "sampling/importance_sampling_ratio/max": 1.6088902950286865, "sampling/importance_sampling_ratio/mean": 0.9998564720153809, "sampling/importance_sampling_ratio/min": 0.32852938771247864, "sampling/sampling_logp_difference/max": 1.1131290197372437, "sampling/sampling_logp_difference/mean": 0.013483662158250809, "step": 161 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2888.0, "completions/max_terminated_length": 2888.0, "completions/mean_length": 617.390625, "completions/mean_terminated_length": 617.390625, "completions/min_length": 68.0, "completions/min_terminated_length": 68.0, "entropy": 0.1582278311252594, "epoch": 0.2867256637168142, "frac_reward_zero_std": 1.0, "grad_norm": 0.015904350135180594, "kl": 0.0015389202162623405, "learning_rate": 7.123893805309734e-07, "loss": 0.0, "num_tokens": 6803512.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.999871015548706, "sampling/importance_sampling_ratio/min": 0.1630757600069046, "sampling/sampling_logp_difference/max": 1.8135404586791992, "sampling/sampling_logp_difference/mean": 0.011236883699893951, "step": 162 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2931.0, "completions/max_terminated_length": 2931.0, "completions/mean_length": 641.4375, "completions/mean_terminated_length": 641.4375, "completions/min_length": 185.0, "completions/min_terminated_length": 185.0, "entropy": 0.18636351823806763, "epoch": 0.2884955752212389, "frac_reward_zero_std": 0.5, "grad_norm": 0.9960395238481153, "kl": 0.0012530905660241842, "learning_rate": 7.168141592920353e-07, "loss": 0.1732, "num_tokens": 6857508.0, "reward": 0.375, "reward_std": 0.34156501293182373, "rewards/decision_reward_func/mean": 0.375, "rewards/decision_reward_func/std": 0.934353232383728, "sampling/importance_sampling_ratio/max": 1.8587719202041626, "sampling/importance_sampling_ratio/mean": 1.0000814199447632, "sampling/importance_sampling_ratio/min": 0.19480688869953156, "sampling/sampling_logp_difference/max": 1.6357464790344238, "sampling/sampling_logp_difference/mean": 0.010559570975601673, "step": 163 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 5000.0, "completions/max_terminated_length": 2313.0, "completions/mean_length": 667.0, "completions/mean_terminated_length": 527.2257690429688, "completions/min_length": 118.0, "completions/min_terminated_length": 118.0, "entropy": 0.1355215609073639, "epoch": 0.2902654867256637, "frac_reward_zero_std": 0.5, "grad_norm": 0.7892101003638963, "kl": 0.0010617325315251946, "learning_rate": 7.212389380530973e-07, "loss": 0.2793, "num_tokens": 6909428.0, "reward": 0.8125, "reward_std": 0.36435678601264954, "rewards/decision_reward_func/mean": 0.8125, "rewards/decision_reward_func/std": 0.5875696539878845, "sampling/importance_sampling_ratio/max": 1.6056840419769287, "sampling/importance_sampling_ratio/mean": 1.0001463890075684, "sampling/importance_sampling_ratio/min": 0.604169487953186, "sampling/sampling_logp_difference/max": 0.5039005279541016, "sampling/sampling_logp_difference/mean": 0.008925529196858406, "step": 164 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 693.0, "completions/max_terminated_length": 693.0, "completions/mean_length": 277.875, "completions/mean_terminated_length": 277.875, "completions/min_length": 73.0, "completions/min_terminated_length": 73.0, "entropy": 0.3091489374637604, "epoch": 0.2920353982300885, "frac_reward_zero_std": 0.5, "grad_norm": 1.5933646861972175, "kl": 0.002190070692449808, "learning_rate": 7.256637168141593e-07, "loss": 0.1237, "num_tokens": 6939740.0, "reward": 0.5, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9997755289077759, "sampling/importance_sampling_ratio/min": 0.5540900230407715, "sampling/sampling_logp_difference/max": 1.1345248222351074, "sampling/sampling_logp_difference/mean": 0.01612423174083233, "step": 165 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3559.0, "completions/max_terminated_length": 3559.0, "completions/mean_length": 483.0, "completions/mean_terminated_length": 483.0, "completions/min_length": 146.0, "completions/min_terminated_length": 146.0, "entropy": 0.2685645818710327, "epoch": 0.2938053097345133, "frac_reward_zero_std": 0.75, "grad_norm": 0.8441386115925541, "kl": 0.002231655642390251, "learning_rate": 7.300884955752212e-07, "loss": -0.0644, "num_tokens": 6982284.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.7481248378753662, "sampling/importance_sampling_ratio/mean": 1.0004262924194336, "sampling/importance_sampling_ratio/min": 0.39861005544662476, "sampling/sampling_logp_difference/max": 0.919771671295166, "sampling/sampling_logp_difference/mean": 0.012786097824573517, "step": 166 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1978.0, "completions/max_terminated_length": 1978.0, "completions/mean_length": 602.8125, "completions/mean_terminated_length": 602.8125, "completions/min_length": 81.0, "completions/min_terminated_length": 81.0, "entropy": 0.14699102938175201, "epoch": 0.29557522123893804, "frac_reward_zero_std": 0.5, "grad_norm": 0.8625227681253774, "kl": 0.0013770672958344221, "learning_rate": 7.345132743362832e-07, "loss": 0.0424, "num_tokens": 7030480.0, "reward": 0.0625, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.0625, "rewards/decision_reward_func/std": 1.0059348344802856, "sampling/importance_sampling_ratio/max": 1.958638310432434, "sampling/importance_sampling_ratio/mean": 0.9996402263641357, "sampling/importance_sampling_ratio/min": 0.3134310841560364, "sampling/sampling_logp_difference/max": 1.1601758003234863, "sampling/sampling_logp_difference/mean": 0.009145397692918777, "step": 167 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2733.0, "completions/max_terminated_length": 2733.0, "completions/mean_length": 395.390625, "completions/mean_terminated_length": 395.390625, "completions/min_length": 80.0, "completions/min_terminated_length": 80.0, "entropy": 0.18308311700820923, "epoch": 0.2973451327433628, "frac_reward_zero_std": 0.5, "grad_norm": 1.661186778441191, "kl": 0.002992842346429825, "learning_rate": 7.389380530973452e-07, "loss": 0.1092, "num_tokens": 7065417.0, "reward": 0.59375, "reward_std": 0.34860679507255554, "rewards/decision_reward_func/mean": 0.59375, "rewards/decision_reward_func/std": 0.8110105991363525, "sampling/importance_sampling_ratio/max": 1.6185369491577148, "sampling/importance_sampling_ratio/mean": 1.0004504919052124, "sampling/importance_sampling_ratio/min": 0.47913265228271484, "sampling/sampling_logp_difference/max": 0.7357778549194336, "sampling/sampling_logp_difference/mean": 0.012548895552754402, "step": 168 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 454.0, "completions/max_terminated_length": 454.0, "completions/mean_length": 205.234375, "completions/mean_terminated_length": 205.234375, "completions/min_length": 94.0, "completions/min_terminated_length": 94.0, "entropy": 0.2058759480714798, "epoch": 0.2991150442477876, "frac_reward_zero_std": 1.0, "grad_norm": 0.05423445624660464, "kl": 0.0035194961819797754, "learning_rate": 7.433628318584071e-07, "loss": 0.0, "num_tokens": 7089080.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.00025475025177, "sampling/importance_sampling_ratio/min": 0.3125382363796234, "sampling/sampling_logp_difference/max": 1.1630284786224365, "sampling/sampling_logp_difference/mean": 0.01277989149093628, "step": 169 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2115.0, "completions/max_terminated_length": 2115.0, "completions/mean_length": 554.203125, "completions/mean_terminated_length": 554.203125, "completions/min_length": 190.0, "completions/min_terminated_length": 190.0, "entropy": 0.30327367782592773, "epoch": 0.3008849557522124, "frac_reward_zero_std": 0.5, "grad_norm": 0.8749551554096359, "kl": 0.003365672891959548, "learning_rate": 7.477876106194691e-07, "loss": -0.0237, "num_tokens": 7138661.0, "reward": 0.875, "reward_std": 0.3265564441680908, "rewards/decision_reward_func/mean": 0.875, "rewards/decision_reward_func/std": 0.48795005679130554, "sampling/importance_sampling_ratio/max": 1.8332279920578003, "sampling/importance_sampling_ratio/mean": 1.0001275539398193, "sampling/importance_sampling_ratio/min": 0.4821145832538605, "sampling/sampling_logp_difference/max": 0.7295734882354736, "sampling/sampling_logp_difference/mean": 0.014116200618445873, "step": 170 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1582.0, "completions/max_terminated_length": 1582.0, "completions/mean_length": 409.421875, "completions/mean_terminated_length": 409.421875, "completions/min_length": 167.0, "completions/min_terminated_length": 167.0, "entropy": 0.2373097836971283, "epoch": 0.30265486725663715, "frac_reward_zero_std": 0.75, "grad_norm": 0.9209701948583413, "kl": 0.0026466373819857836, "learning_rate": 7.522123893805308e-07, "loss": -0.0249, "num_tokens": 7180544.0, "reward": 0.875, "reward_std": 0.22360679507255554, "rewards/decision_reward_func/mean": 0.875, "rewards/decision_reward_func/std": 0.48795005679130554, "sampling/importance_sampling_ratio/max": 1.7598704099655151, "sampling/importance_sampling_ratio/mean": 1.0002739429473877, "sampling/importance_sampling_ratio/min": 0.09319400042295456, "sampling/sampling_logp_difference/max": 2.3730719089508057, "sampling/sampling_logp_difference/mean": 0.013561483472585678, "step": 171 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1735.0, "completions/max_terminated_length": 1735.0, "completions/mean_length": 421.921875, "completions/mean_terminated_length": 421.921875, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.20718565583229065, "epoch": 0.30442477876106194, "frac_reward_zero_std": 0.25, "grad_norm": 1.2938845102672456, "kl": 0.004027503542602062, "learning_rate": 7.566371681415928e-07, "loss": 0.072, "num_tokens": 7223339.0, "reward": 0.375, "reward_std": 0.47360679507255554, "rewards/decision_reward_func/mean": 0.375, "rewards/decision_reward_func/std": 0.934353232383728, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9998713731765747, "sampling/importance_sampling_ratio/min": 0.43162885308265686, "sampling/sampling_logp_difference/max": 0.9713039398193359, "sampling/sampling_logp_difference/mean": 0.01408354565501213, "step": 172 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 322.0, "completions/max_terminated_length": 322.0, "completions/mean_length": 179.203125, "completions/mean_terminated_length": 179.203125, "completions/min_length": 79.0, "completions/min_terminated_length": 79.0, "entropy": 0.1343413144350052, "epoch": 0.30619469026548674, "frac_reward_zero_std": 1.0, "grad_norm": 0.03870840756168, "kl": 0.002666051499545574, "learning_rate": 7.610619469026548e-07, "loss": 0.0, "num_tokens": 7243912.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0005197525024414, "sampling/importance_sampling_ratio/min": 0.4161675274372101, "sampling/sampling_logp_difference/max": 0.8766673803329468, "sampling/sampling_logp_difference/mean": 0.011971770785748959, "step": 173 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1339.0, "completions/max_terminated_length": 1339.0, "completions/mean_length": 391.84375, "completions/mean_terminated_length": 391.84375, "completions/min_length": 117.0, "completions/min_terminated_length": 117.0, "entropy": 0.24773770570755005, "epoch": 0.30796460176991153, "frac_reward_zero_std": 0.75, "grad_norm": 1.0162971473202225, "kl": 0.0018944480689242482, "learning_rate": 7.654867256637167e-07, "loss": -0.0086, "num_tokens": 7279182.0, "reward": 0.46875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 1.81521475315094, "sampling/importance_sampling_ratio/mean": 1.0001611709594727, "sampling/importance_sampling_ratio/min": 0.5546994209289551, "sampling/sampling_logp_difference/max": 0.5962038040161133, "sampling/sampling_logp_difference/mean": 0.013876217417418957, "step": 174 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1475.0, "completions/max_terminated_length": 1475.0, "completions/mean_length": 259.234375, "completions/mean_terminated_length": 259.234375, "completions/min_length": 92.0, "completions/min_terminated_length": 92.0, "entropy": 0.12422733008861542, "epoch": 0.30973451327433627, "frac_reward_zero_std": 0.75, "grad_norm": 1.5435306226527223, "kl": 0.002644417341798544, "learning_rate": 7.699115044247787e-07, "loss": 0.1465, "num_tokens": 7303709.0, "reward": 0.40625, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.40625, "rewards/decision_reward_func/std": 0.9209855198860168, "sampling/importance_sampling_ratio/max": 1.4606136083602905, "sampling/importance_sampling_ratio/mean": 0.9999499917030334, "sampling/importance_sampling_ratio/min": 0.44167324900627136, "sampling/sampling_logp_difference/max": 0.8171849250793457, "sampling/sampling_logp_difference/mean": 0.009831338189542294, "step": 175 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2334.0, "completions/max_terminated_length": 2334.0, "completions/mean_length": 482.734375, "completions/mean_terminated_length": 482.734375, "completions/min_length": 131.0, "completions/min_terminated_length": 131.0, "entropy": 0.26821938157081604, "epoch": 0.31150442477876106, "frac_reward_zero_std": 0.5, "grad_norm": 1.085912841560708, "kl": 0.0022123786620795727, "learning_rate": 7.743362831858407e-07, "loss": -0.0463, "num_tokens": 7346220.0, "reward": 0.8125, "reward_std": 0.36435678601264954, "rewards/decision_reward_func/mean": 0.8125, "rewards/decision_reward_func/std": 0.5875696539878845, "sampling/importance_sampling_ratio/max": 1.8181695938110352, "sampling/importance_sampling_ratio/mean": 0.9999743700027466, "sampling/importance_sampling_ratio/min": 0.4236069917678833, "sampling/sampling_logp_difference/max": 0.8589491844177246, "sampling/sampling_logp_difference/mean": 0.012869884259998798, "step": 176 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 657.0, "completions/max_terminated_length": 657.0, "completions/mean_length": 228.8125, "completions/mean_terminated_length": 228.8125, "completions/min_length": 76.0, "completions/min_terminated_length": 76.0, "entropy": 0.17372316122055054, "epoch": 0.31327433628318585, "frac_reward_zero_std": 0.5, "grad_norm": 2.4335621657573108, "kl": 0.002545151161029935, "learning_rate": 7.787610619469026e-07, "loss": 0.0529, "num_tokens": 7373040.0, "reward": 0.6875, "reward_std": 0.3811737596988678, "rewards/decision_reward_func/mean": 0.6875, "rewards/decision_reward_func/std": 0.7319250702857971, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9996956586837769, "sampling/importance_sampling_ratio/min": 0.43335384130477905, "sampling/sampling_logp_difference/max": 0.8362007141113281, "sampling/sampling_logp_difference/mean": 0.013754649087786674, "step": 177 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2637.0, "completions/max_terminated_length": 2637.0, "completions/mean_length": 550.71875, "completions/mean_terminated_length": 550.71875, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "entropy": 0.282806396484375, "epoch": 0.31504424778761064, "frac_reward_zero_std": 0.5, "grad_norm": 1.1804620442276592, "kl": 0.0038575464859604836, "learning_rate": 7.831858407079646e-07, "loss": -0.0567, "num_tokens": 7418638.0, "reward": 0.75, "reward_std": 0.44091323018074036, "rewards/decision_reward_func/mean": 0.75, "rewards/decision_reward_func/std": 0.6666666865348816, "sampling/importance_sampling_ratio/max": 1.8780512809753418, "sampling/importance_sampling_ratio/mean": 0.9995101094245911, "sampling/importance_sampling_ratio/min": 0.4161491394042969, "sampling/sampling_logp_difference/max": 0.8767116069793701, "sampling/sampling_logp_difference/mean": 0.014844506978988647, "step": 178 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1451.0, "completions/max_terminated_length": 1451.0, "completions/mean_length": 360.140625, "completions/mean_terminated_length": 360.140625, "completions/min_length": 79.0, "completions/min_terminated_length": 79.0, "entropy": 0.13333824276924133, "epoch": 0.3168141592920354, "frac_reward_zero_std": 1.0, "grad_norm": 0.0193146511806456, "kl": 0.0018906821496784687, "learning_rate": 7.876106194690266e-07, "loss": 0.0, "num_tokens": 7452215.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9999326467514038, "sampling/importance_sampling_ratio/min": 0.41653361916542053, "sampling/sampling_logp_difference/max": 0.875788152217865, "sampling/sampling_logp_difference/mean": 0.010643227025866508, "step": 179 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1069.0, "completions/max_terminated_length": 1069.0, "completions/mean_length": 380.84375, "completions/mean_terminated_length": 380.84375, "completions/min_length": 134.0, "completions/min_terminated_length": 134.0, "entropy": 0.26458853483200073, "epoch": 0.3185840707964602, "frac_reward_zero_std": 0.25, "grad_norm": 1.3644806608496978, "kl": 0.001673161517828703, "learning_rate": 7.920353982300884e-07, "loss": -0.0369, "num_tokens": 7488701.0, "reward": 0.25, "reward_std": 0.5081988573074341, "rewards/decision_reward_func/mean": 0.25, "rewards/decision_reward_func/std": 0.9759001135826111, "sampling/importance_sampling_ratio/max": 1.7170958518981934, "sampling/importance_sampling_ratio/mean": 1.0002449750900269, "sampling/importance_sampling_ratio/min": 0.4308478832244873, "sampling/sampling_logp_difference/max": 0.8420002460479736, "sampling/sampling_logp_difference/mean": 0.014070939272642136, "step": 180 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2005.0, "completions/max_terminated_length": 2005.0, "completions/mean_length": 417.15625, "completions/mean_terminated_length": 417.15625, "completions/min_length": 79.0, "completions/min_terminated_length": 79.0, "entropy": 0.211936816573143, "epoch": 0.32035398230088497, "frac_reward_zero_std": 0.5, "grad_norm": 1.1886699050045932, "kl": 0.0022995867766439915, "learning_rate": 7.964601769911504e-07, "loss": 0.0786, "num_tokens": 7524599.0, "reward": 0.4375, "reward_std": 0.3265564441680908, "rewards/decision_reward_func/mean": 0.4375, "rewards/decision_reward_func/std": 0.9063270092010498, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9997132420539856, "sampling/importance_sampling_ratio/min": 0.20032647252082825, "sampling/sampling_logp_difference/max": 1.607806921005249, "sampling/sampling_logp_difference/mean": 0.01486996840685606, "step": 181 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2499.0, "completions/max_terminated_length": 2499.0, "completions/mean_length": 613.84375, "completions/mean_terminated_length": 613.84375, "completions/min_length": 213.0, "completions/min_terminated_length": 213.0, "entropy": 0.23976843059062958, "epoch": 0.32212389380530976, "frac_reward_zero_std": 0.0, "grad_norm": 1.2408278484880753, "kl": 0.0020338715985417366, "learning_rate": 8.008849557522124e-07, "loss": -0.1481, "num_tokens": 7576605.0, "reward": 0.25, "reward_std": 0.6707825064659119, "rewards/decision_reward_func/mean": 0.25, "rewards/decision_reward_func/std": 0.9759001135826111, "sampling/importance_sampling_ratio/max": 1.6977006196975708, "sampling/importance_sampling_ratio/mean": 0.9999163150787354, "sampling/importance_sampling_ratio/min": 0.3682785928249359, "sampling/sampling_logp_difference/max": 0.9989156723022461, "sampling/sampling_logp_difference/mean": 0.013010426424443722, "step": 182 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1100.0, "completions/max_terminated_length": 1100.0, "completions/mean_length": 354.921875, "completions/mean_terminated_length": 354.921875, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "entropy": 0.1405547857284546, "epoch": 0.3238938053097345, "frac_reward_zero_std": 0.75, "grad_norm": 1.0805096260653977, "kl": 0.0031493990682065487, "learning_rate": 8.053097345132743e-07, "loss": -0.0791, "num_tokens": 7610120.0, "reward": 0.21875, "reward_std": 0.2561737596988678, "rewards/decision_reward_func/mean": 0.21875, "rewards/decision_reward_func/std": 0.983494758605957, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9997509121894836, "sampling/importance_sampling_ratio/min": 0.005257735028862953, "sampling/sampling_logp_difference/max": 5.2480549812316895, "sampling/sampling_logp_difference/mean": 0.010545278899371624, "step": 183 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1074.0, "completions/max_terminated_length": 1074.0, "completions/mean_length": 289.765625, "completions/mean_terminated_length": 289.765625, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "entropy": 0.1653425097465515, "epoch": 0.3256637168141593, "frac_reward_zero_std": 0.75, "grad_norm": 1.2440935261452013, "kl": 0.0021059922873973846, "learning_rate": 8.097345132743363e-07, "loss": 0.0685, "num_tokens": 7638889.0, "reward": 0.875, "reward_std": 0.22360679507255554, "rewards/decision_reward_func/mean": 0.875, "rewards/decision_reward_func/std": 0.48795005679130554, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9996868371963501, "sampling/importance_sampling_ratio/min": 0.44640055298805237, "sampling/sampling_logp_difference/max": 0.8757655620574951, "sampling/sampling_logp_difference/mean": 0.01179768517613411, "step": 184 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1252.0, "completions/max_terminated_length": 1252.0, "completions/mean_length": 345.75, "completions/mean_terminated_length": 345.75, "completions/min_length": 69.0, "completions/min_terminated_length": 69.0, "entropy": 0.22807544469833374, "epoch": 0.3274336283185841, "frac_reward_zero_std": 0.75, "grad_norm": 1.121496792374849, "kl": 0.002439849078655243, "learning_rate": 8.141592920353983e-07, "loss": -0.0039, "num_tokens": 7671033.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.6088849306106567, "sampling/importance_sampling_ratio/mean": 0.999803364276886, "sampling/importance_sampling_ratio/min": 0.20600874722003937, "sampling/sampling_logp_difference/max": 1.5798366069793701, "sampling/sampling_logp_difference/mean": 0.013862546533346176, "step": 185 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 595.0, "completions/max_terminated_length": 595.0, "completions/mean_length": 205.765625, "completions/mean_terminated_length": 205.765625, "completions/min_length": 62.0, "completions/min_terminated_length": 62.0, "entropy": 0.16033005714416504, "epoch": 0.3292035398230089, "frac_reward_zero_std": 0.75, "grad_norm": 1.376908066062954, "kl": 0.0033895899541676044, "learning_rate": 8.185840707964602e-07, "loss": -0.0631, "num_tokens": 7695354.0, "reward": 0.53125, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.53125, "rewards/decision_reward_func/std": 0.8539125919342041, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9996504783630371, "sampling/importance_sampling_ratio/min": 0.20395119488239288, "sampling/sampling_logp_difference/max": 1.589874505996704, "sampling/sampling_logp_difference/mean": 0.014564632438123226, "step": 186 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 359.0, "completions/max_terminated_length": 359.0, "completions/mean_length": 203.28125, "completions/mean_terminated_length": 203.28125, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "entropy": 0.23442362248897552, "epoch": 0.3309734513274336, "frac_reward_zero_std": 1.0, "grad_norm": 0.08915760119874058, "kl": 0.004916157107800245, "learning_rate": 8.230088495575221e-07, "loss": 0.0001, "num_tokens": 7718780.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.8374145030975342, "sampling/importance_sampling_ratio/mean": 0.9992566108703613, "sampling/importance_sampling_ratio/min": 0.3954724073410034, "sampling/sampling_logp_difference/max": 0.9276742935180664, "sampling/sampling_logp_difference/mean": 0.015395093709230423, "step": 187 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2671.0, "completions/max_terminated_length": 2671.0, "completions/mean_length": 548.421875, "completions/mean_terminated_length": 548.421875, "completions/min_length": 142.0, "completions/min_terminated_length": 142.0, "entropy": 0.208042174577713, "epoch": 0.3327433628318584, "frac_reward_zero_std": 0.75, "grad_norm": 0.8349409585831095, "kl": 0.0019054000731557608, "learning_rate": 8.274336283185839e-07, "loss": -0.0563, "num_tokens": 7764871.0, "reward": 0.5625, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.5625, "rewards/decision_reward_func/std": 0.8333333730697632, "sampling/importance_sampling_ratio/max": 1.9672428369522095, "sampling/importance_sampling_ratio/mean": 1.000666618347168, "sampling/importance_sampling_ratio/min": 0.5291227102279663, "sampling/sampling_logp_difference/max": 0.6766330003738403, "sampling/sampling_logp_difference/mean": 0.012018510140478611, "step": 188 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 353.0, "completions/max_terminated_length": 353.0, "completions/mean_length": 196.3125, "completions/mean_terminated_length": 196.3125, "completions/min_length": 84.0, "completions/min_terminated_length": 84.0, "entropy": 0.12341512739658356, "epoch": 0.3345132743362832, "frac_reward_zero_std": 0.75, "grad_norm": 1.6115993711157834, "kl": 0.0023949691094458103, "learning_rate": 8.318584070796459e-07, "loss": 0.0563, "num_tokens": 7787707.0, "reward": 0.65625, "reward_std": 0.23935678601264954, "rewards/decision_reward_func/mean": 0.65625, "rewards/decision_reward_func/std": 0.7605084180831909, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0002516508102417, "sampling/importance_sampling_ratio/min": 0.34007471799850464, "sampling/sampling_logp_difference/max": 1.078589916229248, "sampling/sampling_logp_difference/mean": 0.011371126398444176, "step": 189 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 5000.0, "completions/max_terminated_length": 1712.0, "completions/mean_length": 729.46875, "completions/mean_terminated_length": 661.6825561523438, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "entropy": 0.2599371075630188, "epoch": 0.336283185840708, "frac_reward_zero_std": 0.5, "grad_norm": 0.7998415693616772, "kl": 0.0020317775197327137, "learning_rate": 8.362831858407079e-07, "loss": 0.164, "num_tokens": 7845609.0, "reward": 0.65625, "reward_std": 0.47978055477142334, "rewards/decision_reward_func/mean": 0.65625, "rewards/decision_reward_func/std": 0.7605084180831909, "sampling/importance_sampling_ratio/max": 1.9894132614135742, "sampling/importance_sampling_ratio/mean": 1.000138282775879, "sampling/importance_sampling_ratio/min": 0.07503779232501984, "sampling/sampling_logp_difference/max": 2.5897634029388428, "sampling/sampling_logp_difference/mean": 0.013344532810151577, "step": 190 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 964.0, "completions/max_terminated_length": 964.0, "completions/mean_length": 378.6875, "completions/mean_terminated_length": 378.6875, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "entropy": 0.15199874341487885, "epoch": 0.3380530973451327, "frac_reward_zero_std": 0.5, "grad_norm": 2.3287763253379676, "kl": 0.002543312031775713, "learning_rate": 8.407079646017698e-07, "loss": 0.0468, "num_tokens": 7880037.0, "reward": 0.8125, "reward_std": 0.36435678601264954, "rewards/decision_reward_func/mean": 0.8125, "rewards/decision_reward_func/std": 0.5875696539878845, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.000150442123413, "sampling/importance_sampling_ratio/min": 0.48238104581832886, "sampling/sampling_logp_difference/max": 0.7329144477844238, "sampling/sampling_logp_difference/mean": 0.011152156628668308, "step": 191 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2578.0, "completions/max_terminated_length": 2578.0, "completions/mean_length": 611.265625, "completions/mean_terminated_length": 611.265625, "completions/min_length": 182.0, "completions/min_terminated_length": 182.0, "entropy": 0.18009495735168457, "epoch": 0.3398230088495575, "frac_reward_zero_std": 0.5, "grad_norm": 0.8805760270731827, "kl": 0.002695451956242323, "learning_rate": 8.451327433628318e-07, "loss": -0.1037, "num_tokens": 7931366.0, "reward": 0.84375, "reward_std": 0.34860679507255554, "rewards/decision_reward_func/mean": 0.84375, "rewards/decision_reward_func/std": 0.5409794449806213, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0001177787780762, "sampling/importance_sampling_ratio/min": 0.2623376250267029, "sampling/sampling_logp_difference/max": 1.3381229639053345, "sampling/sampling_logp_difference/mean": 0.011747724376618862, "step": 192 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 880.0, "completions/max_terminated_length": 880.0, "completions/mean_length": 261.453125, "completions/mean_terminated_length": 261.453125, "completions/min_length": 75.0, "completions/min_terminated_length": 75.0, "entropy": 0.157416433095932, "epoch": 0.3415929203539823, "frac_reward_zero_std": 0.75, "grad_norm": 1.5991658087503708, "kl": 0.0037445039488375187, "learning_rate": 8.495575221238938e-07, "loss": -0.1252, "num_tokens": 7959699.0, "reward": 0.03125, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.03125, "rewards/decision_reward_func/std": 1.0074130296707153, "sampling/importance_sampling_ratio/max": 1.939391851425171, "sampling/importance_sampling_ratio/mean": 0.9999671578407288, "sampling/importance_sampling_ratio/min": 0.46061867475509644, "sampling/sampling_logp_difference/max": 0.7751847505569458, "sampling/sampling_logp_difference/mean": 0.012065252289175987, "step": 193 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1868.0, "completions/max_terminated_length": 1868.0, "completions/mean_length": 444.65625, "completions/mean_terminated_length": 444.65625, "completions/min_length": 77.0, "completions/min_terminated_length": 77.0, "entropy": 0.19185349345207214, "epoch": 0.3433628318584071, "frac_reward_zero_std": 0.75, "grad_norm": 1.0812327631940553, "kl": 0.0025550995487719774, "learning_rate": 8.539823008849557e-07, "loss": 0.061, "num_tokens": 7998925.0, "reward": 0.125, "reward_std": 0.22360679507255554, "rewards/decision_reward_func/mean": 0.125, "rewards/decision_reward_func/std": 1.0, "sampling/importance_sampling_ratio/max": 1.743949055671692, "sampling/importance_sampling_ratio/mean": 0.9997782111167908, "sampling/importance_sampling_ratio/min": 0.27208149433135986, "sampling/sampling_logp_difference/max": 1.3016536235809326, "sampling/sampling_logp_difference/mean": 0.012031259015202522, "step": 194 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1709.0, "completions/max_terminated_length": 1709.0, "completions/mean_length": 394.34375, "completions/mean_terminated_length": 394.34375, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "entropy": 0.1715793013572693, "epoch": 0.34513274336283184, "frac_reward_zero_std": 0.75, "grad_norm": 1.0344543283425076, "kl": 0.004649247042834759, "learning_rate": 8.584070796460177e-07, "loss": -0.0766, "num_tokens": 8033955.0, "reward": 0.625, "reward_std": 0.22360679507255554, "rewards/decision_reward_func/mean": 0.625, "rewards/decision_reward_func/std": 0.7867957949638367, "sampling/importance_sampling_ratio/max": 1.8237683773040771, "sampling/importance_sampling_ratio/mean": 0.9994395971298218, "sampling/importance_sampling_ratio/min": 0.08262058347463608, "sampling/sampling_logp_difference/max": 2.4934964179992676, "sampling/sampling_logp_difference/mean": 0.012767571955919266, "step": 195 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1557.0, "completions/max_terminated_length": 1557.0, "completions/mean_length": 374.59375, "completions/mean_terminated_length": 374.59375, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "entropy": 0.19166353344917297, "epoch": 0.34690265486725663, "frac_reward_zero_std": 0.5, "grad_norm": 1.7335910919291089, "kl": 0.003463435685262084, "learning_rate": 8.628318584070797e-07, "loss": -0.0696, "num_tokens": 8068569.0, "reward": 0.71875, "reward_std": 0.38319888710975647, "rewards/decision_reward_func/mean": 0.71875, "rewards/decision_reward_func/std": 0.7007648944854736, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9997660517692566, "sampling/importance_sampling_ratio/min": 0.5127838253974915, "sampling/sampling_logp_difference/max": 0.714803159236908, "sampling/sampling_logp_difference/mean": 0.011144580319523811, "step": 196 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 431.0, "completions/max_terminated_length": 431.0, "completions/mean_length": 193.671875, "completions/mean_terminated_length": 193.671875, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "entropy": 0.22175905108451843, "epoch": 0.3486725663716814, "frac_reward_zero_std": 1.0, "grad_norm": 0.029259605290606314, "kl": 0.0035678634885698557, "learning_rate": 8.672566371681415e-07, "loss": 0.0, "num_tokens": 8092004.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9991790652275085, "sampling/importance_sampling_ratio/min": 0.4940502941608429, "sampling/sampling_logp_difference/max": 0.7324227094650269, "sampling/sampling_logp_difference/mean": 0.012926574796438217, "step": 197 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 462.0, "completions/max_terminated_length": 462.0, "completions/mean_length": 183.953125, "completions/mean_terminated_length": 183.953125, "completions/min_length": 81.0, "completions/min_terminated_length": 81.0, "entropy": 0.1503394991159439, "epoch": 0.3504424778761062, "frac_reward_zero_std": 0.75, "grad_norm": 1.9390511363072107, "kl": 0.0037315944209694862, "learning_rate": 8.716814159292035e-07, "loss": -0.0512, "num_tokens": 8113841.0, "reward": 0.53125, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.53125, "rewards/decision_reward_func/std": 0.8539125919342041, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0003222227096558, "sampling/importance_sampling_ratio/min": 0.4596952497959137, "sampling/sampling_logp_difference/max": 0.7771915197372437, "sampling/sampling_logp_difference/mean": 0.011560607701539993, "step": 198 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1750.0, "completions/max_terminated_length": 1750.0, "completions/mean_length": 541.234375, "completions/mean_terminated_length": 541.234375, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "entropy": 0.2299424260854721, "epoch": 0.35221238938053095, "frac_reward_zero_std": 0.5, "grad_norm": 1.3079928215643064, "kl": 0.0028617731295526028, "learning_rate": 8.761061946902655e-07, "loss": -0.0158, "num_tokens": 8159408.0, "reward": 0.6875, "reward_std": 0.42898139357566833, "rewards/decision_reward_func/mean": 0.6875, "rewards/decision_reward_func/std": 0.7319250702857971, "sampling/importance_sampling_ratio/max": 1.819759488105774, "sampling/importance_sampling_ratio/mean": 1.0001378059387207, "sampling/importance_sampling_ratio/min": 0.3962882161140442, "sampling/sampling_logp_difference/max": 0.925613522529602, "sampling/sampling_logp_difference/mean": 0.01198701560497284, "step": 199 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2028.0, "completions/max_terminated_length": 2028.0, "completions/mean_length": 458.53125, "completions/mean_terminated_length": 458.53125, "completions/min_length": 148.0, "completions/min_terminated_length": 148.0, "entropy": 0.36783725023269653, "epoch": 0.35398230088495575, "frac_reward_zero_std": 0.75, "grad_norm": 0.772773210473566, "kl": 0.0031918566673994064, "learning_rate": 8.805309734513274e-07, "loss": 0.0167, "num_tokens": 8206434.0, "reward": 0.5625, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.5625, "rewards/decision_reward_func/std": 0.8333333730697632, "sampling/importance_sampling_ratio/max": 1.6715614795684814, "sampling/importance_sampling_ratio/mean": 1.0001349449157715, "sampling/importance_sampling_ratio/min": 0.48821890354156494, "sampling/sampling_logp_difference/max": 0.7169914245605469, "sampling/sampling_logp_difference/mean": 0.016995549201965332, "step": 200 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1907.0, "completions/max_terminated_length": 1907.0, "completions/mean_length": 906.078125, "completions/mean_terminated_length": 906.078125, "completions/min_length": 275.0, "completions/min_terminated_length": 275.0, "entropy": 0.29588305950164795, "epoch": 0.35575221238938054, "frac_reward_zero_std": 0.5, "grad_norm": 0.5997192810130185, "kl": 0.0019689842592924833, "learning_rate": 8.849557522123894e-07, "loss": 0.0221, "num_tokens": 8274359.0, "reward": -0.125, "reward_std": 0.36435678601264954, "rewards/decision_reward_func/mean": -0.125, "rewards/decision_reward_func/std": 1.0, "sampling/importance_sampling_ratio/max": 1.9885562658309937, "sampling/importance_sampling_ratio/mean": 0.9998965263366699, "sampling/importance_sampling_ratio/min": 0.2961016595363617, "sampling/sampling_logp_difference/max": 1.2170524597167969, "sampling/sampling_logp_difference/mean": 0.013122190721333027, "step": 201 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1652.0, "completions/max_terminated_length": 1652.0, "completions/mean_length": 541.375, "completions/mean_terminated_length": 541.375, "completions/min_length": 160.0, "completions/min_terminated_length": 160.0, "entropy": 0.283604234457016, "epoch": 0.35752212389380533, "frac_reward_zero_std": 0.75, "grad_norm": 0.6115133521782922, "kl": 0.0034805000759661198, "learning_rate": 8.893805309734513e-07, "loss": -0.0064, "num_tokens": 8321855.0, "reward": 0.09375, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.09375, "rewards/decision_reward_func/std": 1.003466248512268, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.00025475025177, "sampling/importance_sampling_ratio/min": 0.6130207777023315, "sampling/sampling_logp_difference/max": 0.96729576587677, "sampling/sampling_logp_difference/mean": 0.014091209508478642, "step": 202 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2153.0, "completions/max_terminated_length": 2153.0, "completions/mean_length": 593.421875, "completions/mean_terminated_length": 593.421875, "completions/min_length": 186.0, "completions/min_terminated_length": 186.0, "entropy": 0.22805637121200562, "epoch": 0.35929203539823007, "frac_reward_zero_std": 0.25, "grad_norm": 0.9460903081097992, "kl": 0.0035057151690125465, "learning_rate": 8.938053097345132e-07, "loss": 0.0495, "num_tokens": 8369754.0, "reward": -0.03125, "reward_std": 0.519389271736145, "rewards/decision_reward_func/mean": -0.03125, "rewards/decision_reward_func/std": 1.0074130296707153, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0002315044403076, "sampling/importance_sampling_ratio/min": 0.3776979446411133, "sampling/sampling_logp_difference/max": 0.9736604690551758, "sampling/sampling_logp_difference/mean": 0.012657871469855309, "step": 203 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1433.0, "completions/max_terminated_length": 1433.0, "completions/mean_length": 528.5, "completions/mean_terminated_length": 528.5, "completions/min_length": 145.0, "completions/min_terminated_length": 145.0, "entropy": 0.2609961926937103, "epoch": 0.36106194690265486, "frac_reward_zero_std": 0.25, "grad_norm": 1.3855439764231048, "kl": 0.007439417764544487, "learning_rate": 8.982300884955752e-07, "loss": 0.0343, "num_tokens": 8416922.0, "reward": 0.46875, "reward_std": 0.625, "rewards/decision_reward_func/mean": 0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 1.671035885810852, "sampling/importance_sampling_ratio/mean": 0.9996469020843506, "sampling/importance_sampling_ratio/min": 0.004132171627134085, "sampling/sampling_logp_difference/max": 5.488952159881592, "sampling/sampling_logp_difference/mean": 0.013202858157455921, "step": 204 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2515.0, "completions/max_terminated_length": 2515.0, "completions/mean_length": 739.703125, "completions/mean_terminated_length": 739.703125, "completions/min_length": 251.0, "completions/min_terminated_length": 251.0, "entropy": 0.2115984708070755, "epoch": 0.36283185840707965, "frac_reward_zero_std": 0.25, "grad_norm": 0.9157363949726249, "kl": 0.0027517015114426613, "learning_rate": 9.026548672566371e-07, "loss": -0.0079, "num_tokens": 8477207.0, "reward": 0.25, "reward_std": 0.5765564441680908, "rewards/decision_reward_func/mean": 0.25, "rewards/decision_reward_func/std": 0.9759001135826111, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0002868175506592, "sampling/importance_sampling_ratio/min": 0.3052161633968353, "sampling/sampling_logp_difference/max": 1.2463932037353516, "sampling/sampling_logp_difference/mean": 0.011302687227725983, "step": 205 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1199.0, "completions/max_terminated_length": 1199.0, "completions/mean_length": 343.375, "completions/mean_terminated_length": 343.375, "completions/min_length": 81.0, "completions/min_terminated_length": 81.0, "entropy": 0.1864355206489563, "epoch": 0.36460176991150445, "frac_reward_zero_std": 0.75, "grad_norm": 1.092339515475785, "kl": 0.004906933754682541, "learning_rate": 9.07079646017699e-07, "loss": 0.0091, "num_tokens": 8509583.0, "reward": 0.40625, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.40625, "rewards/decision_reward_func/std": 0.9209855198860168, "sampling/importance_sampling_ratio/max": 1.8597286939620972, "sampling/importance_sampling_ratio/mean": 1.0002615451812744, "sampling/importance_sampling_ratio/min": 0.10824179649353027, "sampling/sampling_logp_difference/max": 2.2233877182006836, "sampling/sampling_logp_difference/mean": 0.01300247386097908, "step": 206 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 887.0, "completions/max_terminated_length": 887.0, "completions/mean_length": 346.046875, "completions/mean_terminated_length": 346.046875, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "entropy": 0.23395447432994843, "epoch": 0.3663716814159292, "frac_reward_zero_std": 0.5, "grad_norm": 1.3139124655728542, "kl": 0.005922802723944187, "learning_rate": 9.11504424778761e-07, "loss": 0.1252, "num_tokens": 8541826.0, "reward": 0.875, "reward_std": 0.34156501293182373, "rewards/decision_reward_func/mean": 0.875, "rewards/decision_reward_func/std": 0.48795005679130554, "sampling/importance_sampling_ratio/max": 1.8363536596298218, "sampling/importance_sampling_ratio/mean": 1.0000998973846436, "sampling/importance_sampling_ratio/min": 0.43085721135139465, "sampling/sampling_logp_difference/max": 0.8419785499572754, "sampling/sampling_logp_difference/mean": 0.013699382543563843, "step": 207 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1776.0, "completions/max_terminated_length": 1776.0, "completions/mean_length": 671.703125, "completions/mean_terminated_length": 671.703125, "completions/min_length": 129.0, "completions/min_terminated_length": 129.0, "entropy": 0.24440288543701172, "epoch": 0.368141592920354, "frac_reward_zero_std": 0.5, "grad_norm": 0.7534167765627644, "kl": 0.0038180463016033173, "learning_rate": 9.159292035398229e-07, "loss": 0.0305, "num_tokens": 8595151.0, "reward": 0.65625, "reward_std": 0.4597553312778473, "rewards/decision_reward_func/mean": 0.65625, "rewards/decision_reward_func/std": 0.7605084180831909, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0003528594970703, "sampling/importance_sampling_ratio/min": 0.4161541759967804, "sampling/sampling_logp_difference/max": 0.8766994476318359, "sampling/sampling_logp_difference/mean": 0.013492150232195854, "step": 208 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 476.0, "completions/max_terminated_length": 476.0, "completions/mean_length": 172.28125, "completions/mean_terminated_length": 172.28125, "completions/min_length": 74.0, "completions/min_terminated_length": 74.0, "entropy": 0.142439067363739, "epoch": 0.36991150442477877, "frac_reward_zero_std": 0.75, "grad_norm": 2.157249177292624, "kl": 0.005076379980891943, "learning_rate": 9.203539823008849e-07, "loss": -0.0756, "num_tokens": 8615249.0, "reward": 0.53125, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.53125, "rewards/decision_reward_func/std": 0.8539125919342041, "sampling/importance_sampling_ratio/max": 1.698088526725769, "sampling/importance_sampling_ratio/mean": 1.0000438690185547, "sampling/importance_sampling_ratio/min": 0.5883660316467285, "sampling/sampling_logp_difference/max": 0.5304059982299805, "sampling/sampling_logp_difference/mean": 0.011075222864747047, "step": 209 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 5000.0, "completions/max_terminated_length": 1852.0, "completions/mean_length": 733.21875, "completions/mean_terminated_length": 595.5806274414062, "completions/min_length": 85.0, "completions/min_terminated_length": 85.0, "entropy": 0.3009476065635681, "epoch": 0.37168141592920356, "frac_reward_zero_std": 0.75, "grad_norm": 0.4063124860431282, "kl": 0.004093495197594166, "learning_rate": 9.247787610619469e-07, "loss": -0.0113, "num_tokens": 8674527.0, "reward": 0.46875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9998041391372681, "sampling/importance_sampling_ratio/min": 0.09945426881313324, "sampling/sampling_logp_difference/max": 2.3080573081970215, "sampling/sampling_logp_difference/mean": 0.013135841116309166, "step": 210 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1686.0, "completions/max_terminated_length": 1686.0, "completions/mean_length": 437.953125, "completions/mean_terminated_length": 437.953125, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "entropy": 0.3641192615032196, "epoch": 0.3734513274336283, "frac_reward_zero_std": 0.75, "grad_norm": 0.6928393774000238, "kl": 0.005903189070522785, "learning_rate": 9.292035398230088e-07, "loss": -0.0172, "num_tokens": 8714540.0, "reward": 0.09375, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.09375, "rewards/decision_reward_func/std": 1.003466248512268, "sampling/importance_sampling_ratio/max": 1.9441547393798828, "sampling/importance_sampling_ratio/mean": 1.0001696348190308, "sampling/importance_sampling_ratio/min": 0.319449782371521, "sampling/sampling_logp_difference/max": 1.1411552429199219, "sampling/sampling_logp_difference/mean": 0.015952803194522858, "step": 211 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2497.0, "completions/max_terminated_length": 2497.0, "completions/mean_length": 340.125, "completions/mean_terminated_length": 340.125, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "entropy": 0.22673434019088745, "epoch": 0.3752212389380531, "frac_reward_zero_std": 0.75, "grad_norm": 0.9403504088215111, "kl": 0.004019421525299549, "learning_rate": 9.336283185840708e-07, "loss": -0.0483, "num_tokens": 8748308.0, "reward": 0.03125, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.03125, "rewards/decision_reward_func/std": 1.0074130296707153, "sampling/importance_sampling_ratio/max": 1.697697401046753, "sampling/importance_sampling_ratio/mean": 0.9993868470191956, "sampling/importance_sampling_ratio/min": 0.5321142673492432, "sampling/sampling_logp_difference/max": 0.630897045135498, "sampling/sampling_logp_difference/mean": 0.0111259575933218, "step": 212 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 943.0, "completions/max_terminated_length": 943.0, "completions/mean_length": 327.578125, "completions/mean_terminated_length": 327.578125, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "entropy": 0.35163745284080505, "epoch": 0.3769911504424779, "frac_reward_zero_std": 0.5, "grad_norm": 1.399517143595708, "kl": 0.005812834948301315, "learning_rate": 9.380530973451328e-07, "loss": 0.0159, "num_tokens": 8781929.0, "reward": 0.8125, "reward_std": 0.36435678601264954, "rewards/decision_reward_func/mean": 0.8125, "rewards/decision_reward_func/std": 0.5875696539878845, "sampling/importance_sampling_ratio/max": 1.691361665725708, "sampling/importance_sampling_ratio/mean": 1.0002861022949219, "sampling/importance_sampling_ratio/min": 0.5404882431030273, "sampling/sampling_logp_difference/max": 0.615282416343689, "sampling/sampling_logp_difference/mean": 0.01623188890516758, "step": 213 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 286.0, "completions/max_terminated_length": 286.0, "completions/mean_length": 172.984375, "completions/mean_terminated_length": 172.984375, "completions/min_length": 85.0, "completions/min_terminated_length": 85.0, "entropy": 0.14862021803855896, "epoch": 0.3787610619469027, "frac_reward_zero_std": 1.0, "grad_norm": 0.04109766785975544, "kl": 0.005391256883740425, "learning_rate": 9.424778761061947e-07, "loss": 0.0001, "num_tokens": 8802856.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.775957703590393, "sampling/importance_sampling_ratio/mean": 0.999342679977417, "sampling/importance_sampling_ratio/min": 0.4428269863128662, "sampling/sampling_logp_difference/max": 0.8145761489868164, "sampling/sampling_logp_difference/mean": 0.012859176844358444, "step": 214 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 811.0, "completions/max_terminated_length": 811.0, "completions/mean_length": 221.046875, "completions/mean_terminated_length": 221.046875, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "entropy": 0.19737035036087036, "epoch": 0.3805309734513274, "frac_reward_zero_std": 1.0, "grad_norm": 0.03193114577490339, "kl": 0.00482964375987649, "learning_rate": 9.469026548672566e-07, "loss": 0.0, "num_tokens": 8827419.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.6207282543182373, "sampling/importance_sampling_ratio/mean": 0.9994890689849854, "sampling/importance_sampling_ratio/min": 0.4360135793685913, "sampling/sampling_logp_difference/max": 0.8300819396972656, "sampling/sampling_logp_difference/mean": 0.012343251146376133, "step": 215 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1734.0, "completions/max_terminated_length": 1734.0, "completions/mean_length": 514.9375, "completions/mean_terminated_length": 514.9375, "completions/min_length": 80.0, "completions/min_terminated_length": 80.0, "entropy": 0.18890772759914398, "epoch": 0.3823008849557522, "frac_reward_zero_std": 0.5, "grad_norm": 1.0257625405041908, "kl": 0.004021645523607731, "learning_rate": 9.513274336283185e-07, "loss": -0.029, "num_tokens": 8871799.0, "reward": 0.53125, "reward_std": 0.5061737298965454, "rewards/decision_reward_func/mean": 0.53125, "rewards/decision_reward_func/std": 0.8539125919342041, "sampling/importance_sampling_ratio/max": 1.9442577362060547, "sampling/importance_sampling_ratio/mean": 1.0004653930664062, "sampling/importance_sampling_ratio/min": 0.31591761112213135, "sampling/sampling_logp_difference/max": 1.1522737741470337, "sampling/sampling_logp_difference/mean": 0.011563598178327084, "step": 216 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 506.0, "completions/max_terminated_length": 506.0, "completions/mean_length": 267.328125, "completions/mean_terminated_length": 267.328125, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "entropy": 0.11982627958059311, "epoch": 0.384070796460177, "frac_reward_zero_std": 1.0, "grad_norm": 0.025010324447661304, "kl": 0.003134357277303934, "learning_rate": 9.557522123893805e-07, "loss": 0.0, "num_tokens": 8899292.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.581999659538269, "sampling/importance_sampling_ratio/mean": 0.9999656677246094, "sampling/importance_sampling_ratio/min": 0.41625165939331055, "sampling/sampling_logp_difference/max": 0.8764653205871582, "sampling/sampling_logp_difference/mean": 0.008051315322518349, "step": 217 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1939.0, "completions/max_terminated_length": 1939.0, "completions/mean_length": 463.921875, "completions/mean_terminated_length": 463.921875, "completions/min_length": 119.0, "completions/min_terminated_length": 119.0, "entropy": 0.2790055274963379, "epoch": 0.3858407079646018, "frac_reward_zero_std": 0.5, "grad_norm": 1.0706510698632021, "kl": 0.004723875783383846, "learning_rate": 9.601769911504426e-07, "loss": -0.0144, "num_tokens": 8939255.0, "reward": 0.8125, "reward_std": 0.36435678601264954, "rewards/decision_reward_func/mean": 0.8125, "rewards/decision_reward_func/std": 0.5875696539878845, "sampling/importance_sampling_ratio/max": 1.910963535308838, "sampling/importance_sampling_ratio/mean": 1.0001851320266724, "sampling/importance_sampling_ratio/min": 0.5061306357383728, "sampling/sampling_logp_difference/max": 0.6809604167938232, "sampling/sampling_logp_difference/mean": 0.013315819203853607, "step": 218 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 776.0, "completions/max_terminated_length": 776.0, "completions/mean_length": 309.703125, "completions/mean_terminated_length": 309.703125, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "entropy": 0.25548022985458374, "epoch": 0.38761061946902653, "frac_reward_zero_std": 0.5, "grad_norm": 1.6072562443341416, "kl": 0.00571413291618228, "learning_rate": 9.646017699115042e-07, "loss": 0.0095, "num_tokens": 8970020.0, "reward": 0.65625, "reward_std": 0.47978055477142334, "rewards/decision_reward_func/mean": 0.65625, "rewards/decision_reward_func/std": 0.7605084180831909, "sampling/importance_sampling_ratio/max": 1.6626795530319214, "sampling/importance_sampling_ratio/mean": 0.9997296929359436, "sampling/importance_sampling_ratio/min": 0.4909794330596924, "sampling/sampling_logp_difference/max": 0.711353063583374, "sampling/sampling_logp_difference/mean": 0.014648938551545143, "step": 219 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3204.0, "completions/max_terminated_length": 3204.0, "completions/mean_length": 735.515625, "completions/mean_terminated_length": 735.515625, "completions/min_length": 133.0, "completions/min_terminated_length": 133.0, "entropy": 0.3314078450202942, "epoch": 0.3893805309734513, "frac_reward_zero_std": 0.0, "grad_norm": 1.142412312345563, "kl": 0.004277537111192942, "learning_rate": 9.690265486725663e-07, "loss": 0.0199, "num_tokens": 9028053.0, "reward": 0.34375, "reward_std": 0.6223389506340027, "rewards/decision_reward_func/mean": 0.34375, "rewards/decision_reward_func/std": 0.9464847445487976, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0001829862594604, "sampling/importance_sampling_ratio/min": 0.4775368273258209, "sampling/sampling_logp_difference/max": 1.754638671875, "sampling/sampling_logp_difference/mean": 0.015724822878837585, "step": 220 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2954.0, "completions/max_terminated_length": 2954.0, "completions/mean_length": 772.125, "completions/mean_terminated_length": 772.125, "completions/min_length": 77.0, "completions/min_terminated_length": 77.0, "entropy": 0.27522385120391846, "epoch": 0.3911504424778761, "frac_reward_zero_std": 0.75, "grad_norm": 0.46722417849046, "kl": 0.0052696820348501205, "learning_rate": 9.734513274336282e-07, "loss": -0.0361, "num_tokens": 9089197.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0001323223114014, "sampling/importance_sampling_ratio/min": 0.2694660723209381, "sampling/sampling_logp_difference/max": 1.3113127946853638, "sampling/sampling_logp_difference/mean": 0.012968253344297409, "step": 221 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 5000.0, "completions/max_terminated_length": 3671.0, "completions/mean_length": 829.421875, "completions/mean_terminated_length": 694.8870849609375, "completions/min_length": 78.0, "completions/min_terminated_length": 78.0, "entropy": 0.19309642910957336, "epoch": 0.3929203539823009, "frac_reward_zero_std": 0.5, "grad_norm": 0.8778082121810614, "kl": 0.004176372196525335, "learning_rate": 9.778761061946902e-07, "loss": 0.0127, "num_tokens": 9153384.0, "reward": 0.375, "reward_std": 0.34156501293182373, "rewards/decision_reward_func/mean": 0.375, "rewards/decision_reward_func/std": 0.934353232383728, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0001963376998901, "sampling/importance_sampling_ratio/min": 0.2554081082344055, "sampling/sampling_logp_difference/max": 1.364892601966858, "sampling/sampling_logp_difference/mean": 0.01235814206302166, "step": 222 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3008.0, "completions/max_terminated_length": 3008.0, "completions/mean_length": 396.875, "completions/mean_terminated_length": 396.875, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "entropy": 0.3782493472099304, "epoch": 0.39469026548672564, "frac_reward_zero_std": 0.5, "grad_norm": 1.0749828792703966, "kl": 0.0047070980072021484, "learning_rate": 9.82300884955752e-07, "loss": -0.0773, "num_tokens": 9191296.0, "reward": 0.90625, "reward_std": 0.29578250646591187, "rewards/decision_reward_func/mean": 0.90625, "rewards/decision_reward_func/std": 0.42608407139778137, "sampling/importance_sampling_ratio/max": 1.7217216491699219, "sampling/importance_sampling_ratio/mean": 1.0004417896270752, "sampling/importance_sampling_ratio/min": 0.40878215432167053, "sampling/sampling_logp_difference/max": 0.8945728540420532, "sampling/sampling_logp_difference/mean": 0.01634475588798523, "step": 223 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2017.0, "completions/max_terminated_length": 2017.0, "completions/mean_length": 435.421875, "completions/mean_terminated_length": 435.421875, "completions/min_length": 67.0, "completions/min_terminated_length": 67.0, "entropy": 0.2289847582578659, "epoch": 0.39646017699115044, "frac_reward_zero_std": 0.75, "grad_norm": 0.906826729177233, "kl": 0.005832117050886154, "learning_rate": 9.867256637168142e-07, "loss": -0.0038, "num_tokens": 9230731.0, "reward": 0.0625, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.0625, "rewards/decision_reward_func/std": 1.0059348344802856, "sampling/importance_sampling_ratio/max": 1.651212215423584, "sampling/importance_sampling_ratio/mean": 0.9999698400497437, "sampling/importance_sampling_ratio/min": 0.4756048619747162, "sampling/sampling_logp_difference/max": 0.7431678771972656, "sampling/sampling_logp_difference/mean": 0.012325853109359741, "step": 224 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 409.0, "completions/max_terminated_length": 409.0, "completions/mean_length": 212.40625, "completions/mean_terminated_length": 212.40625, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "entropy": 0.22619274258613586, "epoch": 0.39823008849557523, "frac_reward_zero_std": 1.0, "grad_norm": 0.02604529759767604, "kl": 0.005297327414155006, "learning_rate": 9.91150442477876e-07, "loss": 0.0001, "num_tokens": 9254469.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9999696612358093, "sampling/importance_sampling_ratio/min": 0.4309347867965698, "sampling/sampling_logp_difference/max": 0.9278104305267334, "sampling/sampling_logp_difference/mean": 0.013578859157860279, "step": 225 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1589.0, "completions/max_terminated_length": 1589.0, "completions/mean_length": 669.421875, "completions/mean_terminated_length": 669.421875, "completions/min_length": 80.0, "completions/min_terminated_length": 80.0, "entropy": 0.18515875935554504, "epoch": 0.4, "frac_reward_zero_std": 0.5, "grad_norm": 0.9114966413872505, "kl": 0.0036076661199331284, "learning_rate": 9.95575221238938e-07, "loss": 0.0535, "num_tokens": 9307280.0, "reward": 0.59375, "reward_std": 0.34860679507255554, "rewards/decision_reward_func/mean": 0.59375, "rewards/decision_reward_func/std": 0.8110105991363525, "sampling/importance_sampling_ratio/max": 1.5746040344238281, "sampling/importance_sampling_ratio/mean": 0.9997274875640869, "sampling/importance_sampling_ratio/min": 0.4600610136985779, "sampling/sampling_logp_difference/max": 0.7763961553573608, "sampling/sampling_logp_difference/mean": 0.010419302619993687, "step": 226 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2127.0, "completions/max_terminated_length": 2127.0, "completions/mean_length": 472.125, "completions/mean_terminated_length": 472.125, "completions/min_length": 129.0, "completions/min_terminated_length": 129.0, "entropy": 0.30778488516807556, "epoch": 0.40176991150442476, "frac_reward_zero_std": 0.75, "grad_norm": 0.8527886064587072, "kl": 0.004933054558932781, "learning_rate": 1e-06, "loss": -0.0134, "num_tokens": 9348264.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.939113974571228, "sampling/importance_sampling_ratio/mean": 0.9995436668395996, "sampling/importance_sampling_ratio/min": 0.5835635662078857, "sampling/sampling_logp_difference/max": 0.6622312068939209, "sampling/sampling_logp_difference/mean": 0.015096191316843033, "step": 227 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1121.0, "completions/max_terminated_length": 1121.0, "completions/mean_length": 503.296875, "completions/mean_terminated_length": 503.296875, "completions/min_length": 155.0, "completions/min_terminated_length": 155.0, "entropy": 0.3045428395271301, "epoch": 0.40353982300884955, "frac_reward_zero_std": 0.0, "grad_norm": 1.4385489660549282, "kl": 0.00474332831799984, "learning_rate": 9.999994035998135e-07, "loss": 0.0703, "num_tokens": 9391963.0, "reward": 0.71875, "reward_std": 0.6601393222808838, "rewards/decision_reward_func/mean": 0.71875, "rewards/decision_reward_func/std": 0.7007648944854736, "sampling/importance_sampling_ratio/max": 1.8910765647888184, "sampling/importance_sampling_ratio/mean": 1.00005042552948, "sampling/importance_sampling_ratio/min": 0.4868103861808777, "sampling/sampling_logp_difference/max": 0.7198805809020996, "sampling/sampling_logp_difference/mean": 0.014491120353341103, "step": 228 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 268.0, "completions/max_terminated_length": 268.0, "completions/mean_length": 168.875, "completions/mean_terminated_length": 168.875, "completions/min_length": 94.0, "completions/min_terminated_length": 94.0, "entropy": 0.13001316785812378, "epoch": 0.40530973451327434, "frac_reward_zero_std": 1.0, "grad_norm": 0.04992759452310381, "kl": 0.005523878149688244, "learning_rate": 9.99997614400677e-07, "loss": 0.0001, "num_tokens": 9412259.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9991714954376221, "sampling/importance_sampling_ratio/min": 0.48889055848121643, "sampling/sampling_logp_difference/max": 0.7172503471374512, "sampling/sampling_logp_difference/mean": 0.012274734675884247, "step": 229 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1524.0, "completions/max_terminated_length": 1524.0, "completions/mean_length": 452.765625, "completions/mean_terminated_length": 452.765625, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "entropy": 0.170053631067276, "epoch": 0.40707964601769914, "frac_reward_zero_std": 0.5, "grad_norm": 1.0868115428039313, "kl": 0.006508652586489916, "learning_rate": 9.999946324068587e-07, "loss": 0.0413, "num_tokens": 9451332.0, "reward": 0.53125, "reward_std": 0.5143726468086243, "rewards/decision_reward_func/mean": 0.53125, "rewards/decision_reward_func/std": 0.8539125919342041, "sampling/importance_sampling_ratio/max": 1.6489005088806152, "sampling/importance_sampling_ratio/mean": 0.9997778534889221, "sampling/importance_sampling_ratio/min": 0.5171679258346558, "sampling/sampling_logp_difference/max": 0.6593875885009766, "sampling/sampling_logp_difference/mean": 0.011356865987181664, "step": 230 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2209.0, "completions/max_terminated_length": 2209.0, "completions/mean_length": 487.21875, "completions/mean_terminated_length": 487.21875, "completions/min_length": 83.0, "completions/min_terminated_length": 83.0, "entropy": 0.2796103358268738, "epoch": 0.4088495575221239, "frac_reward_zero_std": 0.25, "grad_norm": 1.1695914912736824, "kl": 0.005334988236427307, "learning_rate": 9.999904576254724e-07, "loss": -0.0627, "num_tokens": 9494434.0, "reward": 0.28125, "reward_std": 0.5827301740646362, "rewards/decision_reward_func/mean": 0.28125, "rewards/decision_reward_func/std": 0.9672207236289978, "sampling/importance_sampling_ratio/max": 1.7919862270355225, "sampling/importance_sampling_ratio/mean": 0.9998686909675598, "sampling/importance_sampling_ratio/min": 0.512783944606781, "sampling/sampling_logp_difference/max": 0.6679006814956665, "sampling/sampling_logp_difference/mean": 0.01402884442359209, "step": 231 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2758.0, "completions/max_terminated_length": 2758.0, "completions/mean_length": 528.71875, "completions/mean_terminated_length": 528.71875, "completions/min_length": 154.0, "completions/min_terminated_length": 154.0, "entropy": 0.20487293601036072, "epoch": 0.41061946902654867, "frac_reward_zero_std": 0.75, "grad_norm": 0.9967513653459517, "kl": 0.0041511282324790955, "learning_rate": 9.999850900664773e-07, "loss": 0.1356, "num_tokens": 9540080.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9998377561569214, "sampling/importance_sampling_ratio/min": 0.6207963228225708, "sampling/sampling_logp_difference/max": 0.7263727188110352, "sampling/sampling_logp_difference/mean": 0.010950752533972263, "step": 232 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1126.0, "completions/max_terminated_length": 1126.0, "completions/mean_length": 344.4375, "completions/mean_terminated_length": 344.4375, "completions/min_length": 82.0, "completions/min_terminated_length": 82.0, "entropy": 0.16642557084560394, "epoch": 0.41238938053097346, "frac_reward_zero_std": 0.5, "grad_norm": 1.227266110330366, "kl": 0.005534435156732798, "learning_rate": 9.999785297426788e-07, "loss": -0.0075, "num_tokens": 9571836.0, "reward": 0.15625, "reward_std": 0.375, "rewards/decision_reward_func/mean": 0.15625, "rewards/decision_reward_func/std": 0.9955257177352905, "sampling/importance_sampling_ratio/max": 1.6378382444381714, "sampling/importance_sampling_ratio/mean": 1.000166654586792, "sampling/importance_sampling_ratio/min": 0.4950157701969147, "sampling/sampling_logp_difference/max": 0.7031656503677368, "sampling/sampling_logp_difference/mean": 0.011384994722902775, "step": 233 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1545.0, "completions/max_terminated_length": 1545.0, "completions/mean_length": 668.59375, "completions/mean_terminated_length": 668.59375, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "entropy": 0.23501507937908173, "epoch": 0.41415929203539825, "frac_reward_zero_std": 0.0, "grad_norm": 1.2885601758076681, "kl": 0.0032865270040929317, "learning_rate": 9.999707766697265e-07, "loss": -0.0021, "num_tokens": 9626354.0, "reward": 0.84375, "reward_std": 0.5457825064659119, "rewards/decision_reward_func/mean": 0.84375, "rewards/decision_reward_func/std": 0.5409794449806213, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0002360343933105, "sampling/importance_sampling_ratio/min": 0.31635382771492004, "sampling/sampling_logp_difference/max": 1.1508939266204834, "sampling/sampling_logp_difference/mean": 0.012331748381257057, "step": 234 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 909.0, "completions/max_terminated_length": 909.0, "completions/mean_length": 340.09375, "completions/mean_terminated_length": 340.09375, "completions/min_length": 77.0, "completions/min_terminated_length": 77.0, "entropy": 0.1572047472000122, "epoch": 0.415929203539823, "frac_reward_zero_std": 0.5, "grad_norm": 1.4489276054463438, "kl": 0.0035781720653176308, "learning_rate": 9.999618308661168e-07, "loss": -0.1569, "num_tokens": 9658792.0, "reward": -0.09375, "reward_std": 0.47978055477142334, "rewards/decision_reward_func/mean": -0.09375, "rewards/decision_reward_func/std": 1.003466248512268, "sampling/importance_sampling_ratio/max": 1.8075003623962402, "sampling/importance_sampling_ratio/mean": 1.000446081161499, "sampling/importance_sampling_ratio/min": 0.5016233921051025, "sampling/sampling_logp_difference/max": 0.6899056434631348, "sampling/sampling_logp_difference/mean": 0.010497696697711945, "step": 235 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1203.0, "completions/max_terminated_length": 1203.0, "completions/mean_length": 284.15625, "completions/mean_terminated_length": 284.15625, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "entropy": 0.16873982548713684, "epoch": 0.4176991150442478, "frac_reward_zero_std": 0.75, "grad_norm": 1.1494484197412922, "kl": 0.004987464752048254, "learning_rate": 9.999516923531906e-07, "loss": 0.0066, "num_tokens": 9688354.0, "reward": 0.9375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.9375, "rewards/decision_reward_func/std": 0.35073620080947876, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9997761249542236, "sampling/importance_sampling_ratio/min": 0.2797534465789795, "sampling/sampling_logp_difference/max": 1.2738466262817383, "sampling/sampling_logp_difference/mean": 0.011232834309339523, "step": 236 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2351.0, "completions/max_terminated_length": 2351.0, "completions/mean_length": 419.265625, "completions/mean_terminated_length": 419.265625, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "entropy": 0.20896151661872864, "epoch": 0.4194690265486726, "frac_reward_zero_std": 0.5, "grad_norm": 1.258055788087232, "kl": 0.0044655390083789825, "learning_rate": 9.99940361155134e-07, "loss": -0.0333, "num_tokens": 9724163.0, "reward": 0.875, "reward_std": 0.3265564441680908, "rewards/decision_reward_func/mean": 0.875, "rewards/decision_reward_func/std": 0.48795005679130554, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9995315074920654, "sampling/importance_sampling_ratio/min": 0.47485029697418213, "sampling/sampling_logp_difference/max": 0.769787073135376, "sampling/sampling_logp_difference/mean": 0.012783035635948181, "step": 237 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2192.0, "completions/max_terminated_length": 2192.0, "completions/mean_length": 298.421875, "completions/mean_terminated_length": 298.421875, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.14590927958488464, "epoch": 0.42123893805309737, "frac_reward_zero_std": 0.75, "grad_norm": 1.2046051675611065, "kl": 0.006410642061382532, "learning_rate": 9.99927837298979e-07, "loss": -0.0489, "num_tokens": 9754334.0, "reward": 0.5625, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.5625, "rewards/decision_reward_func/std": 0.8333333730697632, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0000205039978027, "sampling/importance_sampling_ratio/min": 0.4748923182487488, "sampling/sampling_logp_difference/max": 0.8144280910491943, "sampling/sampling_logp_difference/mean": 0.008721040561795235, "step": 238 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3612.0, "completions/max_terminated_length": 3612.0, "completions/mean_length": 471.875, "completions/mean_terminated_length": 471.875, "completions/min_length": 94.0, "completions/min_terminated_length": 94.0, "entropy": 0.23433303833007812, "epoch": 0.4230088495575221, "frac_reward_zero_std": 0.5, "grad_norm": 1.177808762545866, "kl": 0.00575688760727644, "learning_rate": 9.999141208146027e-07, "loss": -0.0079, "num_tokens": 9794758.0, "reward": 0.5625, "reward_std": 0.3265564441680908, "rewards/decision_reward_func/mean": 0.5625, "rewards/decision_reward_func/std": 0.8333333730697632, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.000319242477417, "sampling/importance_sampling_ratio/min": 0.5278641581535339, "sampling/sampling_logp_difference/max": 0.844780445098877, "sampling/sampling_logp_difference/mean": 0.013401404023170471, "step": 239 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1019.0, "completions/max_terminated_length": 1019.0, "completions/mean_length": 391.015625, "completions/mean_terminated_length": 391.015625, "completions/min_length": 135.0, "completions/min_terminated_length": 135.0, "entropy": 0.3013729453086853, "epoch": 0.4247787610619469, "frac_reward_zero_std": 0.75, "grad_norm": 0.9114344910624455, "kl": 0.004888847470283508, "learning_rate": 9.99899211734727e-07, "loss": 0.0227, "num_tokens": 9832023.0, "reward": 0.625, "reward_std": 0.22360679507255554, "rewards/decision_reward_func/mean": 0.625, "rewards/decision_reward_func/std": 0.7867957949638367, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.00018310546875, "sampling/importance_sampling_ratio/min": 0.5164895057678223, "sampling/sampling_logp_difference/max": 1.0100163221359253, "sampling/sampling_logp_difference/mean": 0.015662696212530136, "step": 240 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2480.0, "completions/max_terminated_length": 2480.0, "completions/mean_length": 486.78125, "completions/mean_terminated_length": 486.78125, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "entropy": 0.3640768527984619, "epoch": 0.4265486725663717, "frac_reward_zero_std": 0.0, "grad_norm": 1.3673288559504178, "kl": 0.005445628426969051, "learning_rate": 9.998831100949186e-07, "loss": -0.0902, "num_tokens": 9873417.0, "reward": 0.0, "reward_std": 0.5915650129318237, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 1.6958982944488525, "sampling/importance_sampling_ratio/mean": 1.0000355243682861, "sampling/importance_sampling_ratio/min": 0.4765286445617676, "sampling/sampling_logp_difference/max": 0.741227388381958, "sampling/sampling_logp_difference/mean": 0.017477478832006454, "step": 241 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2480.0, "completions/max_terminated_length": 2480.0, "completions/mean_length": 573.453125, "completions/mean_terminated_length": 573.453125, "completions/min_length": 72.0, "completions/min_terminated_length": 72.0, "entropy": 0.2605985403060913, "epoch": 0.4283185840707965, "frac_reward_zero_std": 1.0, "grad_norm": 0.012838900809320587, "kl": 0.004675053060054779, "learning_rate": 9.998658159335901e-07, "loss": 0.0, "num_tokens": 9924294.0, "reward": 0.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9992914795875549, "sampling/importance_sampling_ratio/min": 0.5363118052482605, "sampling/sampling_logp_difference/max": 1.2172951698303223, "sampling/sampling_logp_difference/mean": 0.013394870795309544, "step": 242 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3233.0, "completions/max_terminated_length": 3233.0, "completions/mean_length": 461.484375, "completions/mean_terminated_length": 461.484375, "completions/min_length": 86.0, "completions/min_terminated_length": 86.0, "entropy": 0.12216609716415405, "epoch": 0.4300884955752212, "frac_reward_zero_std": 0.75, "grad_norm": 0.954197307572884, "kl": 0.004707361571490765, "learning_rate": 9.998473292919985e-07, "loss": 0.0247, "num_tokens": 9965205.0, "reward": 0.0625, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.0625, "rewards/decision_reward_func/std": 1.0059348344802856, "sampling/importance_sampling_ratio/max": 1.7521675825119019, "sampling/importance_sampling_ratio/mean": 0.9998599290847778, "sampling/importance_sampling_ratio/min": 0.4774567782878876, "sampling/sampling_logp_difference/max": 0.7392816543579102, "sampling/sampling_logp_difference/mean": 0.009530564770102501, "step": 243 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2628.0, "completions/max_terminated_length": 2628.0, "completions/mean_length": 692.0625, "completions/mean_terminated_length": 692.0625, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "entropy": 0.1859404444694519, "epoch": 0.431858407079646, "frac_reward_zero_std": 0.25, "grad_norm": 2.4376465026829477, "kl": 0.004119847901165485, "learning_rate": 9.998276502142454e-07, "loss": 0.086, "num_tokens": 10024361.0, "reward": -0.28125, "reward_std": 0.5281128883361816, "rewards/decision_reward_func/mean": -0.28125, "rewards/decision_reward_func/std": 0.9672207236289978, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9997283816337585, "sampling/importance_sampling_ratio/min": 0.38532912731170654, "sampling/sampling_logp_difference/max": 0.9536573886871338, "sampling/sampling_logp_difference/mean": 0.011350534856319427, "step": 244 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1803.0, "completions/max_terminated_length": 1803.0, "completions/mean_length": 350.984375, "completions/mean_terminated_length": 350.984375, "completions/min_length": 70.0, "completions/min_terminated_length": 70.0, "entropy": 0.12020367383956909, "epoch": 0.4336283185840708, "frac_reward_zero_std": 0.75, "grad_norm": 0.9322166084957804, "kl": 0.004303342662751675, "learning_rate": 9.99806778747277e-07, "loss": 0.0136, "num_tokens": 10055912.0, "reward": 0.65625, "reward_std": 0.23935678601264954, "rewards/decision_reward_func/mean": 0.65625, "rewards/decision_reward_func/std": 0.7605084180831909, "sampling/importance_sampling_ratio/max": 1.8200910091400146, "sampling/importance_sampling_ratio/mean": 1.000126600265503, "sampling/importance_sampling_ratio/min": 0.5905990600585938, "sampling/sampling_logp_difference/max": 0.5988864898681641, "sampling/sampling_logp_difference/mean": 0.009075840935111046, "step": 245 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1882.0, "completions/max_terminated_length": 1882.0, "completions/mean_length": 444.34375, "completions/mean_terminated_length": 444.34375, "completions/min_length": 129.0, "completions/min_terminated_length": 129.0, "entropy": 0.22033430635929108, "epoch": 0.4353982300884956, "frac_reward_zero_std": 0.5, "grad_norm": 1.2647385390073633, "kl": 0.005501553416252136, "learning_rate": 9.997847149408844e-07, "loss": 0.0295, "num_tokens": 10094974.0, "reward": 0.59375, "reward_std": 0.4101392924785614, "rewards/decision_reward_func/mean": 0.59375, "rewards/decision_reward_func/std": 0.8110105991363525, "sampling/importance_sampling_ratio/max": 1.9879121780395508, "sampling/importance_sampling_ratio/mean": 1.000058650970459, "sampling/importance_sampling_ratio/min": 0.47875964641571045, "sampling/sampling_logp_difference/max": 0.7365565896034241, "sampling/sampling_logp_difference/mean": 0.013632528483867645, "step": 246 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2777.0, "completions/max_terminated_length": 2777.0, "completions/mean_length": 481.171875, "completions/mean_terminated_length": 481.171875, "completions/min_length": 63.0, "completions/min_terminated_length": 63.0, "entropy": 0.1847860962152481, "epoch": 0.43716814159292033, "frac_reward_zero_std": 0.5, "grad_norm": 1.1076686736674717, "kl": 0.004583002999424934, "learning_rate": 9.997614588477033e-07, "loss": 0.031, "num_tokens": 10135833.0, "reward": 0.25, "reward_std": 0.3811737596988678, "rewards/decision_reward_func/mean": 0.25, "rewards/decision_reward_func/std": 0.9759001135826111, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0000940561294556, "sampling/importance_sampling_ratio/min": 0.41096746921539307, "sampling/sampling_logp_difference/max": 0.8892412185668945, "sampling/sampling_logp_difference/mean": 0.011974984779953957, "step": 247 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3057.0, "completions/max_terminated_length": 3057.0, "completions/mean_length": 594.75, "completions/mean_terminated_length": 594.75, "completions/min_length": 119.0, "completions/min_terminated_length": 119.0, "entropy": 0.18086057901382446, "epoch": 0.4389380530973451, "frac_reward_zero_std": 0.25, "grad_norm": 1.096550710504024, "kl": 0.004791299346834421, "learning_rate": 9.997370105232132e-07, "loss": 0.0164, "num_tokens": 10184681.0, "reward": 0.0, "reward_std": 0.42078250646591187, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0002553462982178, "sampling/importance_sampling_ratio/min": 0.4453330934047699, "sampling/sampling_logp_difference/max": 0.8089327812194824, "sampling/sampling_logp_difference/mean": 0.011728532612323761, "step": 248 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2360.0, "completions/max_terminated_length": 2360.0, "completions/mean_length": 570.578125, "completions/mean_terminated_length": 570.578125, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "entropy": 0.22292925417423248, "epoch": 0.4407079646017699, "frac_reward_zero_std": 0.5, "grad_norm": 0.7907254609058868, "kl": 0.003868166357278824, "learning_rate": 9.99711370025738e-07, "loss": 0.0208, "num_tokens": 10232446.0, "reward": 0.21875, "reward_std": 0.4101392924785614, "rewards/decision_reward_func/mean": 0.21875, "rewards/decision_reward_func/std": 0.983494758605957, "sampling/importance_sampling_ratio/max": 1.5775072574615479, "sampling/importance_sampling_ratio/mean": 0.9998824596405029, "sampling/importance_sampling_ratio/min": 0.4077136814594269, "sampling/sampling_logp_difference/max": 0.8971900939941406, "sampling/sampling_logp_difference/mean": 0.0110623212531209, "step": 249 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1859.0, "completions/max_terminated_length": 1859.0, "completions/mean_length": 464.84375, "completions/mean_terminated_length": 464.84375, "completions/min_length": 127.0, "completions/min_terminated_length": 127.0, "entropy": 0.2809881567955017, "epoch": 0.4424778761061947, "frac_reward_zero_std": 0.25, "grad_norm": 1.5516165223633, "kl": 0.00607761787250638, "learning_rate": 9.99684537416446e-07, "loss": -0.1438, "num_tokens": 10278404.0, "reward": 0.15625, "reward_std": 0.5457825064659119, "rewards/decision_reward_func/mean": 0.15625, "rewards/decision_reward_func/std": 0.9955257177352905, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9996684789657593, "sampling/importance_sampling_ratio/min": 0.3722674548625946, "sampling/sampling_logp_difference/max": 0.988142728805542, "sampling/sampling_logp_difference/mean": 0.015020671300590038, "step": 250 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 774.0, "completions/max_terminated_length": 774.0, "completions/mean_length": 272.75, "completions/mean_terminated_length": 272.75, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "entropy": 0.15499556064605713, "epoch": 0.44424778761061945, "frac_reward_zero_std": 0.75, "grad_norm": 1.2981171454863263, "kl": 0.005207214504480362, "learning_rate": 9.996565127593489e-07, "loss": -0.0235, "num_tokens": 10309124.0, "reward": 0.59375, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.59375, "rewards/decision_reward_func/std": 0.8110105991363525, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9998096823692322, "sampling/importance_sampling_ratio/min": 0.4807486832141876, "sampling/sampling_logp_difference/max": 0.9487627744674683, "sampling/sampling_logp_difference/mean": 0.010623264126479626, "step": 251 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 271.0, "completions/max_terminated_length": 271.0, "completions/mean_length": 157.5, "completions/mean_terminated_length": 157.5, "completions/min_length": 62.0, "completions/min_terminated_length": 62.0, "entropy": 0.14062917232513428, "epoch": 0.44601769911504424, "frac_reward_zero_std": 1.0, "grad_norm": 0.10313218410730836, "kl": 0.010691756382584572, "learning_rate": 9.996272961213022e-07, "loss": 0.0001, "num_tokens": 10330372.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.9938220977783203, "sampling/importance_sampling_ratio/mean": 1.0011367797851562, "sampling/importance_sampling_ratio/min": 0.32394370436668396, "sampling/sampling_logp_difference/max": 1.127185583114624, "sampling/sampling_logp_difference/mean": 0.011647550389170647, "step": 252 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1827.0, "completions/max_terminated_length": 1827.0, "completions/mean_length": 380.421875, "completions/mean_terminated_length": 380.421875, "completions/min_length": 152.0, "completions/min_terminated_length": 152.0, "entropy": 0.1982000470161438, "epoch": 0.44778761061946903, "frac_reward_zero_std": 0.75, "grad_norm": 0.8772838236577051, "kl": 0.00663679838180542, "learning_rate": 9.995968875720051e-07, "loss": 0.0455, "num_tokens": 10365711.0, "reward": 0.375, "reward_std": 0.22360679507255554, "rewards/decision_reward_func/mean": 0.375, "rewards/decision_reward_func/std": 0.934353232383728, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9998120665550232, "sampling/importance_sampling_ratio/min": 0.18541958928108215, "sampling/sampling_logp_difference/max": 1.685133934020996, "sampling/sampling_logp_difference/mean": 0.012058432213962078, "step": 253 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1571.0, "completions/max_terminated_length": 1571.0, "completions/mean_length": 322.46875, "completions/mean_terminated_length": 322.46875, "completions/min_length": 145.0, "completions/min_terminated_length": 145.0, "entropy": 0.21152693033218384, "epoch": 0.4495575221238938, "frac_reward_zero_std": 0.75, "grad_norm": 1.315012467270106, "kl": 0.00522711593657732, "learning_rate": 9.995652871840006e-07, "loss": 0.0365, "num_tokens": 10396925.0, "reward": 0.9375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.9375, "rewards/decision_reward_func/std": 0.35073620080947876, "sampling/importance_sampling_ratio/max": 1.9397406578063965, "sampling/importance_sampling_ratio/mean": 1.0003541707992554, "sampling/importance_sampling_ratio/min": 0.3467358648777008, "sampling/sampling_logp_difference/max": 1.0591919422149658, "sampling/sampling_logp_difference/mean": 0.012692416086792946, "step": 254 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 695.0, "completions/max_terminated_length": 695.0, "completions/mean_length": 253.1875, "completions/mean_terminated_length": 253.1875, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "entropy": 0.25905999541282654, "epoch": 0.45132743362831856, "frac_reward_zero_std": 0.5, "grad_norm": 1.7247664899033959, "kl": 0.006584254093468189, "learning_rate": 9.995324950326745e-07, "loss": -0.0204, "num_tokens": 10422889.0, "reward": 0.8125, "reward_std": 0.36435678601264954, "rewards/decision_reward_func/mean": 0.8125, "rewards/decision_reward_func/std": 0.5875696539878845, "sampling/importance_sampling_ratio/max": 1.5973844528198242, "sampling/importance_sampling_ratio/mean": 1.000280737876892, "sampling/importance_sampling_ratio/min": 0.503616213798523, "sampling/sampling_logp_difference/max": 0.6859407424926758, "sampling/sampling_logp_difference/mean": 0.01665029115974903, "step": 255 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1251.0, "completions/max_terminated_length": 1251.0, "completions/mean_length": 338.4375, "completions/mean_terminated_length": 338.4375, "completions/min_length": 72.0, "completions/min_terminated_length": 72.0, "entropy": 0.15474480390548706, "epoch": 0.45309734513274336, "frac_reward_zero_std": 0.75, "grad_norm": 1.263921253127358, "kl": 0.006079681217670441, "learning_rate": 9.994985111962555e-07, "loss": -0.0138, "num_tokens": 10454741.0, "reward": 0.53125, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.53125, "rewards/decision_reward_func/std": 0.8539125919342041, "sampling/importance_sampling_ratio/max": 1.948596715927124, "sampling/importance_sampling_ratio/mean": 0.9998730421066284, "sampling/importance_sampling_ratio/min": 0.47965747117996216, "sampling/sampling_logp_difference/max": 0.7346830368041992, "sampling/sampling_logp_difference/mean": 0.01146610639989376, "step": 256 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2110.0, "completions/max_terminated_length": 2110.0, "completions/mean_length": 573.765625, "completions/mean_terminated_length": 573.765625, "completions/min_length": 169.0, "completions/min_terminated_length": 169.0, "entropy": 0.13780876994132996, "epoch": 0.45486725663716815, "frac_reward_zero_std": 0.5, "grad_norm": 0.8760485191257505, "kl": 0.0043847025372087955, "learning_rate": 9.994633357558158e-07, "loss": 0.0153, "num_tokens": 10501462.0, "reward": 0.5625, "reward_std": 0.49553054571151733, "rewards/decision_reward_func/mean": 0.5625, "rewards/decision_reward_func/std": 0.8333333730697632, "sampling/importance_sampling_ratio/max": 1.755411148071289, "sampling/importance_sampling_ratio/mean": 1.0000338554382324, "sampling/importance_sampling_ratio/min": 0.531851589679718, "sampling/sampling_logp_difference/max": 0.6313908100128174, "sampling/sampling_logp_difference/mean": 0.008243165910243988, "step": 257 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 471.0, "completions/max_terminated_length": 471.0, "completions/mean_length": 268.515625, "completions/mean_terminated_length": 268.515625, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "entropy": 0.23254260420799255, "epoch": 0.45663716814159294, "frac_reward_zero_std": 1.0, "grad_norm": 0.03507715718832432, "kl": 0.008519462309777737, "learning_rate": 9.994269687952698e-07, "loss": 0.0001, "num_tokens": 10529591.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.9357554912567139, "sampling/importance_sampling_ratio/mean": 0.9991658329963684, "sampling/importance_sampling_ratio/min": 0.4582422375679016, "sampling/sampling_logp_difference/max": 0.7803573608398438, "sampling/sampling_logp_difference/mean": 0.014997686259448528, "step": 258 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1852.0, "completions/max_terminated_length": 1852.0, "completions/mean_length": 305.90625, "completions/mean_terminated_length": 305.90625, "completions/min_length": 83.0, "completions/min_terminated_length": 83.0, "entropy": 0.17524272203445435, "epoch": 0.4584070796460177, "frac_reward_zero_std": 0.75, "grad_norm": 1.7555659184753745, "kl": 0.006742289289832115, "learning_rate": 9.993894104013746e-07, "loss": 0.014, "num_tokens": 10572017.0, "reward": 0.5625, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.5625, "rewards/decision_reward_func/std": 0.8333333730697632, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.000298023223877, "sampling/importance_sampling_ratio/min": 0.1917005181312561, "sampling/sampling_logp_difference/max": 1.9634878635406494, "sampling/sampling_logp_difference/mean": 0.011404123157262802, "step": 259 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2090.0, "completions/max_terminated_length": 2090.0, "completions/mean_length": 642.703125, "completions/mean_terminated_length": 642.703125, "completions/min_length": 159.0, "completions/min_terminated_length": 159.0, "entropy": 0.23945535719394684, "epoch": 0.46017699115044247, "frac_reward_zero_std": 0.5, "grad_norm": 0.6624267620875083, "kl": 0.005442580208182335, "learning_rate": 9.993506606637296e-07, "loss": 0.0621, "num_tokens": 10624302.0, "reward": 0.34375, "reward_std": 0.4597553312778473, "rewards/decision_reward_func/mean": 0.34375, "rewards/decision_reward_func/std": 0.9464847445487976, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9999839067459106, "sampling/importance_sampling_ratio/min": 0.47325387597084045, "sampling/sampling_logp_difference/max": 0.748123288154602, "sampling/sampling_logp_difference/mean": 0.012176629155874252, "step": 260 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2919.0, "completions/max_terminated_length": 2919.0, "completions/mean_length": 568.9375, "completions/mean_terminated_length": 568.9375, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "entropy": 0.2287563681602478, "epoch": 0.46194690265486726, "frac_reward_zero_std": 0.75, "grad_norm": 0.7877108423439778, "kl": 0.006636826787143946, "learning_rate": 9.993107196747758e-07, "loss": -0.0027, "num_tokens": 10671082.0, "reward": 0.78125, "reward_std": 0.2561737596988678, "rewards/decision_reward_func/mean": 0.78125, "rewards/decision_reward_func/std": 0.6291528940200806, "sampling/importance_sampling_ratio/max": 1.7604458332061768, "sampling/importance_sampling_ratio/mean": 0.9996521472930908, "sampling/importance_sampling_ratio/min": 0.44379737973213196, "sampling/sampling_logp_difference/max": 0.812387228012085, "sampling/sampling_logp_difference/mean": 0.013848816975951195, "step": 261 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3545.0, "completions/max_terminated_length": 3545.0, "completions/mean_length": 698.234375, "completions/mean_terminated_length": 698.234375, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.15046873688697815, "epoch": 0.46371681415929206, "frac_reward_zero_std": 0.5, "grad_norm": 1.495015080481482, "kl": 0.00650195823982358, "learning_rate": 9.99269587529797e-07, "loss": -0.033, "num_tokens": 10727865.0, "reward": 0.0625, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.0625, "rewards/decision_reward_func/std": 1.0059348344802856, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0001826286315918, "sampling/importance_sampling_ratio/min": 0.14807982742786407, "sampling/sampling_logp_difference/max": 1.9100037813186646, "sampling/sampling_logp_difference/mean": 0.008978975005447865, "step": 262 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2752.0, "completions/max_terminated_length": 2752.0, "completions/mean_length": 845.96875, "completions/mean_terminated_length": 845.96875, "completions/min_length": 207.0, "completions/min_terminated_length": 207.0, "entropy": 0.23145925998687744, "epoch": 0.4654867256637168, "frac_reward_zero_std": 0.75, "grad_norm": 0.4542336655311234, "kl": 0.005094888154417276, "learning_rate": 9.99227264326918e-07, "loss": -0.0238, "num_tokens": 10794407.0, "reward": -0.09375, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": -0.09375, "rewards/decision_reward_func/std": 1.003466248512268, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9999438524246216, "sampling/importance_sampling_ratio/min": 0.46952179074287415, "sampling/sampling_logp_difference/max": 1.7117395401000977, "sampling/sampling_logp_difference/mean": 0.011501516215503216, "step": 263 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3022.0, "completions/max_terminated_length": 3022.0, "completions/mean_length": 963.984375, "completions/mean_terminated_length": 963.984375, "completions/min_length": 174.0, "completions/min_terminated_length": 174.0, "entropy": 0.1951884925365448, "epoch": 0.4672566371681416, "frac_reward_zero_std": 0.25, "grad_norm": 0.5299784027319835, "kl": 0.0042163291946053505, "learning_rate": 9.991837501671048e-07, "loss": -0.0211, "num_tokens": 10867718.0, "reward": 0.34375, "reward_std": 0.5809217691421509, "rewards/decision_reward_func/mean": 0.34375, "rewards/decision_reward_func/std": 0.9464847445487976, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9997589588165283, "sampling/importance_sampling_ratio/min": 0.008697095327079296, "sampling/sampling_logp_difference/max": 4.7447662353515625, "sampling/sampling_logp_difference/mean": 0.009495781734585762, "step": 264 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1196.0, "completions/max_terminated_length": 1196.0, "completions/mean_length": 373.671875, "completions/mean_terminated_length": 373.671875, "completions/min_length": 139.0, "completions/min_terminated_length": 139.0, "entropy": 0.138139545917511, "epoch": 0.4690265486725664, "frac_reward_zero_std": 0.75, "grad_norm": 1.2040360205500007, "kl": 0.009060091339051723, "learning_rate": 9.991390451541648e-07, "loss": -0.0181, "num_tokens": 10902177.0, "reward": 0.125, "reward_std": 0.22360679507255554, "rewards/decision_reward_func/mean": 0.125, "rewards/decision_reward_func/std": 1.0, "sampling/importance_sampling_ratio/max": 1.6899785995483398, "sampling/importance_sampling_ratio/mean": 1.0000088214874268, "sampling/importance_sampling_ratio/min": 0.18904471397399902, "sampling/sampling_logp_difference/max": 1.665771722793579, "sampling/sampling_logp_difference/mean": 0.009717348963022232, "step": 265 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 699.0, "completions/max_terminated_length": 699.0, "completions/mean_length": 217.15625, "completions/mean_terminated_length": 217.15625, "completions/min_length": 83.0, "completions/min_terminated_length": 83.0, "entropy": 0.14752766489982605, "epoch": 0.47079646017699117, "frac_reward_zero_std": 1.0, "grad_norm": 0.050150487110867815, "kl": 0.010790691711008549, "learning_rate": 9.990931493947465e-07, "loss": 0.0001, "num_tokens": 10925835.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9993729591369629, "sampling/importance_sampling_ratio/min": 0.3675483465194702, "sampling/sampling_logp_difference/max": 1.1000807285308838, "sampling/sampling_logp_difference/mean": 0.01257230807095766, "step": 266 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1740.0, "completions/max_terminated_length": 1740.0, "completions/mean_length": 656.703125, "completions/mean_terminated_length": 656.703125, "completions/min_length": 182.0, "completions/min_terminated_length": 182.0, "entropy": 0.17602857947349548, "epoch": 0.4725663716814159, "frac_reward_zero_std": 0.0, "grad_norm": 1.0894071692688272, "kl": 0.005736640188843012, "learning_rate": 9.990460629983388e-07, "loss": 0.0598, "num_tokens": 10978728.0, "reward": -0.125, "reward_std": 0.79929518699646, "rewards/decision_reward_func/mean": -0.125, "rewards/decision_reward_func/std": 1.0, "sampling/importance_sampling_ratio/max": 1.8890631198883057, "sampling/importance_sampling_ratio/mean": 0.999754011631012, "sampling/importance_sampling_ratio/min": 0.34798887372016907, "sampling/sampling_logp_difference/max": 1.0555846691131592, "sampling/sampling_logp_difference/mean": 0.010623082518577576, "step": 267 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1704.0, "completions/max_terminated_length": 1704.0, "completions/mean_length": 781.78125, "completions/mean_terminated_length": 781.78125, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "entropy": 0.3119831681251526, "epoch": 0.4743362831858407, "frac_reward_zero_std": 0.25, "grad_norm": 0.8756128578793853, "kl": 0.00497186416760087, "learning_rate": 9.98997786077271e-07, "loss": -0.0822, "num_tokens": 11042106.0, "reward": 0.0, "reward_std": 0.6991121172904968, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 1.6922791004180908, "sampling/importance_sampling_ratio/mean": 0.9999416470527649, "sampling/importance_sampling_ratio/min": 0.21932797133922577, "sampling/sampling_logp_difference/max": 1.5171871185302734, "sampling/sampling_logp_difference/mean": 0.013723069801926613, "step": 268 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 917.0, "completions/max_terminated_length": 917.0, "completions/mean_length": 384.796875, "completions/mean_terminated_length": 384.796875, "completions/min_length": 84.0, "completions/min_terminated_length": 84.0, "entropy": 0.12295544147491455, "epoch": 0.4761061946902655, "frac_reward_zero_std": 0.75, "grad_norm": 1.0659393088865086, "kl": 0.006256372667849064, "learning_rate": 9.989483187467125e-07, "loss": 0.0054, "num_tokens": 11079789.0, "reward": 0.28125, "reward_std": 0.2561737596988678, "rewards/decision_reward_func/mean": 0.28125, "rewards/decision_reward_func/std": 0.9672207236289978, "sampling/importance_sampling_ratio/max": 1.8298978805541992, "sampling/importance_sampling_ratio/mean": 0.9999754428863525, "sampling/importance_sampling_ratio/min": 0.45907407999038696, "sampling/sampling_logp_difference/max": 0.7785437107086182, "sampling/sampling_logp_difference/mean": 0.009125998243689537, "step": 269 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1519.0, "completions/max_terminated_length": 1519.0, "completions/mean_length": 471.578125, "completions/mean_terminated_length": 471.578125, "completions/min_length": 205.0, "completions/min_terminated_length": 205.0, "entropy": 0.17768463492393494, "epoch": 0.4778761061946903, "frac_reward_zero_std": 0.5, "grad_norm": 0.8945283166750596, "kl": 0.006092969328165054, "learning_rate": 9.988976611246728e-07, "loss": 0.0062, "num_tokens": 11119266.0, "reward": 0.875, "reward_std": 0.3265564441680908, "rewards/decision_reward_func/mean": 0.875, "rewards/decision_reward_func/std": 0.48795005679130554, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0001188516616821, "sampling/importance_sampling_ratio/min": 0.36833488941192627, "sampling/sampling_logp_difference/max": 1.3387887477874756, "sampling/sampling_logp_difference/mean": 0.010035617277026176, "step": 270 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1346.0, "completions/max_terminated_length": 1346.0, "completions/mean_length": 394.90625, "completions/mean_terminated_length": 394.90625, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "entropy": 0.13202810287475586, "epoch": 0.479646017699115, "frac_reward_zero_std": 0.75, "grad_norm": 0.9422622692353794, "kl": 0.00863722525537014, "learning_rate": 9.988458133320008e-07, "loss": -0.0036, "num_tokens": 11154252.0, "reward": 0.90625, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.90625, "rewards/decision_reward_func/std": 0.42608407139778137, "sampling/importance_sampling_ratio/max": 1.7253721952438354, "sampling/importance_sampling_ratio/mean": 0.9998292922973633, "sampling/importance_sampling_ratio/min": 0.5087507963180542, "sampling/sampling_logp_difference/max": 0.6757969856262207, "sampling/sampling_logp_difference/mean": 0.010302330367267132, "step": 271 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1209.0, "completions/max_terminated_length": 1209.0, "completions/mean_length": 261.609375, "completions/mean_terminated_length": 261.609375, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "entropy": 0.21637436747550964, "epoch": 0.4814159292035398, "frac_reward_zero_std": 0.75, "grad_norm": 1.7830206374040702, "kl": 0.009296851232647896, "learning_rate": 9.987927754923843e-07, "loss": 0.0378, "num_tokens": 11185395.0, "reward": 0.90625, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.90625, "rewards/decision_reward_func/std": 0.42608407139778137, "sampling/importance_sampling_ratio/max": 1.8093221187591553, "sampling/importance_sampling_ratio/mean": 0.9997571706771851, "sampling/importance_sampling_ratio/min": 0.4769737422466278, "sampling/sampling_logp_difference/max": 0.7402938604354858, "sampling/sampling_logp_difference/mean": 0.014612831175327301, "step": 272 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 508.0, "completions/max_terminated_length": 508.0, "completions/mean_length": 221.515625, "completions/mean_terminated_length": 221.515625, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "entropy": 0.13294382393360138, "epoch": 0.4831858407079646, "frac_reward_zero_std": 0.75, "grad_norm": 4.243110120859659, "kl": 0.009196151047945023, "learning_rate": 9.987385477323506e-07, "loss": 0.0657, "num_tokens": 11211300.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.6990346908569336, "sampling/importance_sampling_ratio/mean": 1.0003771781921387, "sampling/importance_sampling_ratio/min": 0.4159530997276306, "sampling/sampling_logp_difference/max": 0.8771827220916748, "sampling/sampling_logp_difference/mean": 0.011125761084258556, "step": 273 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2093.0, "completions/max_terminated_length": 2093.0, "completions/mean_length": 421.21875, "completions/mean_terminated_length": 421.21875, "completions/min_length": 185.0, "completions/min_terminated_length": 185.0, "entropy": 0.1351892054080963, "epoch": 0.4849557522123894, "frac_reward_zero_std": 0.75, "grad_norm": 0.7398393169424258, "kl": 0.012169597670435905, "learning_rate": 9.986831301812655e-07, "loss": -0.0167, "num_tokens": 11249554.0, "reward": 0.90625, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.90625, "rewards/decision_reward_func/std": 0.42608407139778137, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.000317096710205, "sampling/importance_sampling_ratio/min": 0.373901903629303, "sampling/sampling_logp_difference/max": 0.9837617874145508, "sampling/sampling_logp_difference/mean": 0.009750946424901485, "step": 274 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 277.0, "completions/max_terminated_length": 277.0, "completions/mean_length": 180.09375, "completions/mean_terminated_length": 180.09375, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "entropy": 0.11648742854595184, "epoch": 0.48672566371681414, "frac_reward_zero_std": 1.0, "grad_norm": 0.054983827731434766, "kl": 0.009736943989992142, "learning_rate": 9.98626522971333e-07, "loss": 0.0001, "num_tokens": 11271320.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.8125369548797607, "sampling/importance_sampling_ratio/mean": 0.9996104836463928, "sampling/importance_sampling_ratio/min": 0.31732073426246643, "sampling/sampling_logp_difference/max": 1.1478421688079834, "sampling/sampling_logp_difference/mean": 0.011495426297187805, "step": 275 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 567.0, "completions/max_terminated_length": 567.0, "completions/mean_length": 254.28125, "completions/mean_terminated_length": 254.28125, "completions/min_length": 92.0, "completions/min_terminated_length": 92.0, "entropy": 0.3037741184234619, "epoch": 0.48849557522123893, "frac_reward_zero_std": 0.75, "grad_norm": 1.3999569963532275, "kl": 0.009985791519284248, "learning_rate": 9.985687262375956e-07, "loss": 0.0013, "num_tokens": 11306714.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0004408359527588, "sampling/importance_sampling_ratio/min": 0.6045938730239868, "sampling/sampling_logp_difference/max": 0.7120962142944336, "sampling/sampling_logp_difference/mean": 0.017715346068143845, "step": 276 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 322.0, "completions/max_terminated_length": 322.0, "completions/mean_length": 199.53125, "completions/mean_terminated_length": 199.53125, "completions/min_length": 121.0, "completions/min_terminated_length": 121.0, "entropy": 0.11616982519626617, "epoch": 0.4902654867256637, "frac_reward_zero_std": 0.75, "grad_norm": 1.2784864756365708, "kl": 0.02533717267215252, "learning_rate": 9.985097401179333e-07, "loss": -0.0259, "num_tokens": 11328252.0, "reward": 0.46875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 1.9057680368423462, "sampling/importance_sampling_ratio/mean": 0.9997613430023193, "sampling/importance_sampling_ratio/min": 0.25310593843460083, "sampling/sampling_logp_difference/max": 1.3739471435546875, "sampling/sampling_logp_difference/mean": 0.009998338297009468, "step": 277 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2054.0, "completions/max_terminated_length": 2054.0, "completions/mean_length": 552.203125, "completions/mean_terminated_length": 552.203125, "completions/min_length": 127.0, "completions/min_terminated_length": 127.0, "entropy": 0.17685937881469727, "epoch": 0.4920353982300885, "frac_reward_zero_std": 0.5, "grad_norm": 1.0358260701595456, "kl": 0.010021710768342018, "learning_rate": 9.98449564753063e-07, "loss": -0.0191, "num_tokens": 11375177.0, "reward": 0.34375, "reward_std": 0.4597553312778473, "rewards/decision_reward_func/mean": 0.34375, "rewards/decision_reward_func/std": 0.9464847445487976, "sampling/importance_sampling_ratio/max": 1.9441547393798828, "sampling/importance_sampling_ratio/mean": 0.9998010993003845, "sampling/importance_sampling_ratio/min": 0.36549416184425354, "sampling/sampling_logp_difference/max": 1.006505012512207, "sampling/sampling_logp_difference/mean": 0.0112505704164505, "step": 278 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1165.0, "completions/max_terminated_length": 1165.0, "completions/mean_length": 303.703125, "completions/mean_terminated_length": 303.703125, "completions/min_length": 161.0, "completions/min_terminated_length": 161.0, "entropy": 0.17026452720165253, "epoch": 0.49380530973451325, "frac_reward_zero_std": 1.0, "grad_norm": 0.03860034223259007, "kl": 0.009697364643216133, "learning_rate": 9.98388200286539e-07, "loss": 0.0001, "num_tokens": 11404518.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.000622034072876, "sampling/importance_sampling_ratio/min": 0.4835500717163086, "sampling/sampling_logp_difference/max": 0.8470497131347656, "sampling/sampling_logp_difference/mean": 0.011559177190065384, "step": 279 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 460.0, "completions/max_terminated_length": 460.0, "completions/mean_length": 194.984375, "completions/mean_terminated_length": 194.984375, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "entropy": 0.1362389177083969, "epoch": 0.49557522123893805, "frac_reward_zero_std": 0.75, "grad_norm": 1.6689610040161855, "kl": 0.01827118545770645, "learning_rate": 9.98325646864753e-07, "loss": 0.0301, "num_tokens": 11426053.0, "reward": 0.4375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.4375, "rewards/decision_reward_func/std": 0.9063270092010498, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0003249645233154, "sampling/importance_sampling_ratio/min": 0.4033818542957306, "sampling/sampling_logp_difference/max": 0.907871663570404, "sampling/sampling_logp_difference/mean": 0.011498728767037392, "step": 280 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1070.0, "completions/max_terminated_length": 1070.0, "completions/mean_length": 449.65625, "completions/mean_terminated_length": 449.65625, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "entropy": 0.12879258394241333, "epoch": 0.49734513274336284, "frac_reward_zero_std": 0.5, "grad_norm": 1.2208721948260324, "kl": 0.010300518944859505, "learning_rate": 9.98261904636932e-07, "loss": 0.0412, "num_tokens": 11465087.0, "reward": 0.40625, "reward_std": 0.34860679507255554, "rewards/decision_reward_func/mean": 0.40625, "rewards/decision_reward_func/std": 0.9209855198860168, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.999481737613678, "sampling/importance_sampling_ratio/min": 0.3408910036087036, "sampling/sampling_logp_difference/max": 1.0761924982070923, "sampling/sampling_logp_difference/mean": 0.011025318875908852, "step": 281 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1832.0, "completions/max_terminated_length": 1832.0, "completions/mean_length": 680.328125, "completions/mean_terminated_length": 680.328125, "completions/min_length": 223.0, "completions/min_terminated_length": 223.0, "entropy": 0.2792157530784607, "epoch": 0.49911504424778763, "frac_reward_zero_std": 0.25, "grad_norm": 1.0964896454673854, "kl": 0.007364148274064064, "learning_rate": 9.9819697375514e-07, "loss": -0.0388, "num_tokens": 11528612.0, "reward": 0.03125, "reward_std": 0.6424696445465088, "rewards/decision_reward_func/mean": 0.03125, "rewards/decision_reward_func/std": 1.0074130296707153, "sampling/importance_sampling_ratio/max": 1.9519240856170654, "sampling/importance_sampling_ratio/mean": 0.9999142289161682, "sampling/importance_sampling_ratio/min": 0.254190057516098, "sampling/sampling_logp_difference/max": 1.3696730136871338, "sampling/sampling_logp_difference/mean": 0.014089124277234077, "step": 282 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4356.0, "completions/max_terminated_length": 4356.0, "completions/mean_length": 779.015625, "completions/mean_terminated_length": 779.015625, "completions/min_length": 198.0, "completions/min_terminated_length": 198.0, "entropy": 0.2960992157459259, "epoch": 0.5008849557522124, "frac_reward_zero_std": 0.25, "grad_norm": 1.2545355539947702, "kl": 0.007914695888757706, "learning_rate": 9.981308543742756e-07, "loss": 0.0279, "num_tokens": 11592149.0, "reward": 0.3125, "reward_std": 0.5, "rewards/decision_reward_func/mean": 0.3125, "rewards/decision_reward_func/std": 0.9574271440505981, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.000016450881958, "sampling/importance_sampling_ratio/min": 0.34091272950172424, "sampling/sampling_logp_difference/max": 1.4383389949798584, "sampling/sampling_logp_difference/mean": 0.014335853978991508, "step": 283 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 5000.0, "completions/max_terminated_length": 2147.0, "completions/mean_length": 534.75, "completions/mean_terminated_length": 463.873046875, "completions/min_length": 128.0, "completions/min_terminated_length": 128.0, "entropy": 0.15541291236877441, "epoch": 0.5026548672566372, "frac_reward_zero_std": 0.5, "grad_norm": 0.9594687004885091, "kl": 0.010883474722504616, "learning_rate": 9.980635466520736e-07, "loss": 0.4606, "num_tokens": 11638821.0, "reward": 0.9375, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.9375, "rewards/decision_reward_func/std": 0.35073620080947876, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0000563859939575, "sampling/importance_sampling_ratio/min": 0.21919138729572296, "sampling/sampling_logp_difference/max": 1.5178101062774658, "sampling/sampling_logp_difference/mean": 0.010257912799715996, "step": 284 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4208.0, "completions/max_terminated_length": 4208.0, "completions/mean_length": 411.734375, "completions/mean_terminated_length": 411.734375, "completions/min_length": 135.0, "completions/min_terminated_length": 135.0, "entropy": 0.25683048367500305, "epoch": 0.504424778761062, "frac_reward_zero_std": 0.5, "grad_norm": 1.3520236817533786, "kl": 0.008014000952243805, "learning_rate": 9.979950507491033e-07, "loss": -0.0127, "num_tokens": 11680852.0, "reward": 0.90625, "reward_std": 0.29578250646591187, "rewards/decision_reward_func/mean": 0.90625, "rewards/decision_reward_func/std": 0.42608407139778137, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0002806186676025, "sampling/importance_sampling_ratio/min": 0.2822794020175934, "sampling/sampling_logp_difference/max": 1.2648578882217407, "sampling/sampling_logp_difference/mean": 0.012026640586555004, "step": 285 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1530.0, "completions/max_terminated_length": 1530.0, "completions/mean_length": 607.671875, "completions/mean_terminated_length": 607.671875, "completions/min_length": 198.0, "completions/min_terminated_length": 198.0, "entropy": 0.12263226509094238, "epoch": 0.5061946902654867, "frac_reward_zero_std": 0.25, "grad_norm": 1.0029041963149465, "kl": 0.009896899573504925, "learning_rate": 9.979253668287685e-07, "loss": -0.021, "num_tokens": 11731007.0, "reward": 0.84375, "reward_std": 0.46656501293182373, "rewards/decision_reward_func/mean": 0.84375, "rewards/decision_reward_func/std": 0.5409794449806213, "sampling/importance_sampling_ratio/max": 1.915545105934143, "sampling/importance_sampling_ratio/mean": 0.9998985528945923, "sampling/importance_sampling_ratio/min": 0.3284125328063965, "sampling/sampling_logp_difference/max": 1.1134847402572632, "sampling/sampling_logp_difference/mean": 0.008629858493804932, "step": 286 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2800.0, "completions/max_terminated_length": 2800.0, "completions/mean_length": 824.578125, "completions/mean_terminated_length": 824.578125, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "entropy": 0.14986814558506012, "epoch": 0.5079646017699115, "frac_reward_zero_std": 0.25, "grad_norm": 0.8361532366505241, "kl": 0.007227468304336071, "learning_rate": 9.978544950573073e-07, "loss": 0.0121, "num_tokens": 11793268.0, "reward": 0.3125, "reward_std": 0.75, "rewards/decision_reward_func/mean": 0.3125, "rewards/decision_reward_func/std": 0.9574271440505981, "sampling/importance_sampling_ratio/max": 1.6320658922195435, "sampling/importance_sampling_ratio/mean": 1.0000710487365723, "sampling/importance_sampling_ratio/min": 0.405645489692688, "sampling/sampling_logp_difference/max": 0.9022756814956665, "sampling/sampling_logp_difference/mean": 0.009264053776860237, "step": 287 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2460.0, "completions/max_terminated_length": 2460.0, "completions/mean_length": 529.0, "completions/mean_terminated_length": 529.0, "completions/min_length": 86.0, "completions/min_terminated_length": 86.0, "entropy": 0.13563679158687592, "epoch": 0.5097345132743363, "frac_reward_zero_std": 0.75, "grad_norm": 0.7421037147818133, "kl": 0.010230718180537224, "learning_rate": 9.977824356037915e-07, "loss": -0.007, "num_tokens": 11836772.0, "reward": 0.40625, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.40625, "rewards/decision_reward_func/std": 0.9209855198860168, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9999891519546509, "sampling/importance_sampling_ratio/min": 0.10850759595632553, "sampling/sampling_logp_difference/max": 2.220935106277466, "sampling/sampling_logp_difference/mean": 0.010670032352209091, "step": 288 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2162.0, "completions/max_terminated_length": 2162.0, "completions/mean_length": 631.078125, "completions/mean_terminated_length": 631.078125, "completions/min_length": 121.0, "completions/min_terminated_length": 121.0, "entropy": 0.14442600309848785, "epoch": 0.511504424778761, "frac_reward_zero_std": 0.5, "grad_norm": 1.1025815863584985, "kl": 0.009865526109933853, "learning_rate": 9.97709188640126e-07, "loss": -0.0392, "num_tokens": 11889657.0, "reward": 0.625, "reward_std": 0.42078250646591187, "rewards/decision_reward_func/mean": 0.625, "rewards/decision_reward_func/std": 0.7867957949638367, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.000139832496643, "sampling/importance_sampling_ratio/min": 0.47453203797340393, "sampling/sampling_logp_difference/max": 1.163041353225708, "sampling/sampling_logp_difference/mean": 0.009098293259739876, "step": 289 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2532.0, "completions/max_terminated_length": 2532.0, "completions/mean_length": 625.859375, "completions/mean_terminated_length": 625.859375, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "entropy": 0.2431594729423523, "epoch": 0.5132743362831859, "frac_reward_zero_std": 0.5, "grad_norm": 0.9361632048031646, "kl": 0.011102709919214249, "learning_rate": 9.976347543410486e-07, "loss": 0.0289, "num_tokens": 11946704.0, "reward": 0.84375, "reward_std": 0.3723389506340027, "rewards/decision_reward_func/mean": 0.84375, "rewards/decision_reward_func/std": 0.5409794449806213, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9998801350593567, "sampling/importance_sampling_ratio/min": 0.0005892282351851463, "sampling/sampling_logp_difference/max": 7.436697006225586, "sampling/sampling_logp_difference/mean": 0.013465146534144878, "step": 290 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2502.0, "completions/max_terminated_length": 2502.0, "completions/mean_length": 506.15625, "completions/mean_terminated_length": 506.15625, "completions/min_length": 80.0, "completions/min_terminated_length": 80.0, "entropy": 0.21128441393375397, "epoch": 0.5150442477876106, "frac_reward_zero_std": 0.5, "grad_norm": 1.1268282876466285, "kl": 0.011778272688388824, "learning_rate": 9.975591328841304e-07, "loss": 0.0746, "num_tokens": 11990506.0, "reward": 0.25, "reward_std": 0.3811737596988678, "rewards/decision_reward_func/mean": 0.25, "rewards/decision_reward_func/std": 0.9759001135826111, "sampling/importance_sampling_ratio/max": 1.928079605102539, "sampling/importance_sampling_ratio/mean": 0.9999392032623291, "sampling/importance_sampling_ratio/min": 0.40911245346069336, "sampling/sampling_logp_difference/max": 0.8937652111053467, "sampling/sampling_logp_difference/mean": 0.011802196502685547, "step": 291 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2490.0, "completions/max_terminated_length": 2490.0, "completions/mean_length": 525.140625, "completions/mean_terminated_length": 525.140625, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "entropy": 0.352263867855072, "epoch": 0.5168141592920354, "frac_reward_zero_std": 0.25, "grad_norm": 1.1247006767339436, "kl": 0.0126701844856143, "learning_rate": 9.974823244497737e-07, "loss": 0.0324, "num_tokens": 12035715.0, "reward": -0.03125, "reward_std": 0.5281128883361816, "rewards/decision_reward_func/mean": -0.03125, "rewards/decision_reward_func/std": 1.0074130296707153, "sampling/importance_sampling_ratio/max": 1.6463134288787842, "sampling/importance_sampling_ratio/mean": 1.0002052783966064, "sampling/importance_sampling_ratio/min": 0.6067672967910767, "sampling/sampling_logp_difference/max": 0.49960994720458984, "sampling/sampling_logp_difference/mean": 0.016241956502199173, "step": 292 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2492.0, "completions/max_terminated_length": 2492.0, "completions/mean_length": 745.9375, "completions/mean_terminated_length": 745.9375, "completions/min_length": 147.0, "completions/min_terminated_length": 147.0, "entropy": 0.1272304803133011, "epoch": 0.5185840707964602, "frac_reward_zero_std": 0.5, "grad_norm": 0.7615398341673056, "kl": 0.008814724162220955, "learning_rate": 9.974043292212127e-07, "loss": -0.0569, "num_tokens": 12094095.0, "reward": 0.03125, "reward_std": 0.48935678601264954, "rewards/decision_reward_func/mean": 0.03125, "rewards/decision_reward_func/std": 1.0074130296707153, "sampling/importance_sampling_ratio/max": 1.8623818159103394, "sampling/importance_sampling_ratio/mean": 1.0001580715179443, "sampling/importance_sampling_ratio/min": 0.41106587648391724, "sampling/sampling_logp_difference/max": 0.8890018463134766, "sampling/sampling_logp_difference/mean": 0.007972916588187218, "step": 293 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1739.0, "completions/max_terminated_length": 1739.0, "completions/mean_length": 535.078125, "completions/mean_terminated_length": 535.078125, "completions/min_length": 200.0, "completions/min_terminated_length": 200.0, "entropy": 0.37493422627449036, "epoch": 0.5203539823008849, "frac_reward_zero_std": 0.5, "grad_norm": 1.0725972680996103, "kl": 0.009139752015471458, "learning_rate": 9.97325147384513e-07, "loss": 0.0152, "num_tokens": 12161652.0, "reward": 0.59375, "reward_std": 0.4101392924785614, "rewards/decision_reward_func/mean": 0.59375, "rewards/decision_reward_func/std": 0.8110105991363525, "sampling/importance_sampling_ratio/max": 1.6375312805175781, "sampling/importance_sampling_ratio/mean": 1.0003275871276855, "sampling/importance_sampling_ratio/min": 0.5092880725860596, "sampling/sampling_logp_difference/max": 0.6747415065765381, "sampling/sampling_logp_difference/mean": 0.016074657440185547, "step": 294 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 617.0, "completions/max_terminated_length": 617.0, "completions/mean_length": 238.84375, "completions/mean_terminated_length": 238.84375, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "entropy": 0.10852019488811493, "epoch": 0.5221238938053098, "frac_reward_zero_std": 1.0, "grad_norm": 0.08008653913469914, "kl": 0.017791396006941795, "learning_rate": 9.97244779128571e-07, "loss": 0.0002, "num_tokens": 12186250.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0002869367599487, "sampling/importance_sampling_ratio/min": 0.517909049987793, "sampling/sampling_logp_difference/max": 0.9517800807952881, "sampling/sampling_logp_difference/mean": 0.00971977598965168, "step": 295 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 5000.0, "completions/max_terminated_length": 3921.0, "completions/mean_length": 703.265625, "completions/mean_terminated_length": 635.0635375976562, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "entropy": 0.10994003713130951, "epoch": 0.5238938053097345, "frac_reward_zero_std": 0.5, "grad_norm": 0.6797167730653914, "kl": 0.012237188406288624, "learning_rate": 9.971632246451127e-07, "loss": 0.118, "num_tokens": 12241659.0, "reward": 0.84375, "reward_std": 0.34860679507255554, "rewards/decision_reward_func/mean": 0.84375, "rewards/decision_reward_func/std": 0.5409794449806213, "sampling/importance_sampling_ratio/max": 1.941283106803894, "sampling/importance_sampling_ratio/mean": 0.9992427825927734, "sampling/importance_sampling_ratio/min": 0.5172837376594543, "sampling/sampling_logp_difference/max": 0.6633491516113281, "sampling/sampling_logp_difference/mean": 0.008548062294721603, "step": 296 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2759.0, "completions/max_terminated_length": 2759.0, "completions/mean_length": 671.125, "completions/mean_terminated_length": 671.125, "completions/min_length": 118.0, "completions/min_terminated_length": 118.0, "entropy": 0.17750373482704163, "epoch": 0.5256637168141592, "frac_reward_zero_std": 0.25, "grad_norm": 1.0435780027455468, "kl": 0.009927651844918728, "learning_rate": 9.970804841286953e-07, "loss": 0.075, "num_tokens": 12296051.0, "reward": 0.375, "reward_std": 0.644389271736145, "rewards/decision_reward_func/mean": 0.375, "rewards/decision_reward_func/std": 0.934353232383728, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9994956254959106, "sampling/importance_sampling_ratio/min": 0.3591991662979126, "sampling/sampling_logp_difference/max": 1.0238783359527588, "sampling/sampling_logp_difference/mean": 0.010681826621294022, "step": 297 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 460.0, "completions/max_terminated_length": 460.0, "completions/mean_length": 182.421875, "completions/mean_terminated_length": 182.421875, "completions/min_length": 88.0, "completions/min_terminated_length": 88.0, "entropy": 0.13422825932502747, "epoch": 0.5274336283185841, "frac_reward_zero_std": 1.0, "grad_norm": 0.10897359838777211, "kl": 0.02298707887530327, "learning_rate": 9.96996557776704e-07, "loss": 0.0002, "num_tokens": 12316766.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.944155216217041, "sampling/importance_sampling_ratio/mean": 0.999586820602417, "sampling/importance_sampling_ratio/min": 0.5091027021408081, "sampling/sampling_logp_difference/max": 0.6751055717468262, "sampling/sampling_logp_difference/mean": 0.013117527589201927, "step": 298 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4939.0, "completions/max_terminated_length": 4939.0, "completions/mean_length": 787.390625, "completions/mean_terminated_length": 787.390625, "completions/min_length": 215.0, "completions/min_terminated_length": 215.0, "entropy": 0.2797994911670685, "epoch": 0.5292035398230088, "frac_reward_zero_std": 0.25, "grad_norm": 0.9605206882941023, "kl": 0.01125551201403141, "learning_rate": 9.969114457893539e-07, "loss": 0.13, "num_tokens": 12380151.0, "reward": 0.53125, "reward_std": 0.6373475193977356, "rewards/decision_reward_func/mean": 0.53125, "rewards/decision_reward_func/std": 0.8539125919342041, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0002079010009766, "sampling/importance_sampling_ratio/min": 0.29841703176498413, "sampling/sampling_logp_difference/max": 1.2092633247375488, "sampling/sampling_logp_difference/mean": 0.013682771474123001, "step": 299 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1906.0, "completions/max_terminated_length": 1906.0, "completions/mean_length": 747.015625, "completions/mean_terminated_length": 747.015625, "completions/min_length": 146.0, "completions/min_terminated_length": 146.0, "entropy": 0.2424364686012268, "epoch": 0.5309734513274337, "frac_reward_zero_std": 0.25, "grad_norm": 1.2136729366307772, "kl": 0.008399413898587227, "learning_rate": 9.96825148369688e-07, "loss": -0.0288, "num_tokens": 12442216.0, "reward": 0.59375, "reward_std": 0.6601393222808838, "rewards/decision_reward_func/mean": 0.59375, "rewards/decision_reward_func/std": 0.8110105991363525, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0001249313354492, "sampling/importance_sampling_ratio/min": 0.3163878321647644, "sampling/sampling_logp_difference/max": 1.1531651020050049, "sampling/sampling_logp_difference/mean": 0.012584454379975796, "step": 300 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2450.0, "completions/max_terminated_length": 2450.0, "completions/mean_length": 440.328125, "completions/mean_terminated_length": 440.328125, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "entropy": 0.23205266892910004, "epoch": 0.5327433628318584, "frac_reward_zero_std": 1.0, "grad_norm": 0.039605910061623194, "kl": 0.01647520251572132, "learning_rate": 9.967376657235778e-07, "loss": 0.0001, "num_tokens": 12482957.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.7394620180130005, "sampling/importance_sampling_ratio/mean": 1.0002424716949463, "sampling/importance_sampling_ratio/min": 0.3885059058666229, "sampling/sampling_logp_difference/max": 0.9454469084739685, "sampling/sampling_logp_difference/mean": 0.013255949132144451, "step": 301 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1310.0, "completions/max_terminated_length": 1310.0, "completions/mean_length": 529.28125, "completions/mean_terminated_length": 529.28125, "completions/min_length": 120.0, "completions/min_terminated_length": 120.0, "entropy": 0.284460186958313, "epoch": 0.5345132743362832, "frac_reward_zero_std": 0.5, "grad_norm": 1.3604811972006157, "kl": 0.011310039088129997, "learning_rate": 9.966489980597217e-07, "loss": 0.0756, "num_tokens": 12529055.0, "reward": 0.71875, "reward_std": 0.38319888710975647, "rewards/decision_reward_func/mean": 0.71875, "rewards/decision_reward_func/std": 0.7007648944854736, "sampling/importance_sampling_ratio/max": 1.6598358154296875, "sampling/importance_sampling_ratio/mean": 1.0001940727233887, "sampling/importance_sampling_ratio/min": 0.5661828517913818, "sampling/sampling_logp_difference/max": 0.5688381195068359, "sampling/sampling_logp_difference/mean": 0.014394586905837059, "step": 302 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1773.0, "completions/max_terminated_length": 1773.0, "completions/mean_length": 431.78125, "completions/mean_terminated_length": 431.78125, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "entropy": 0.27201589941978455, "epoch": 0.536283185840708, "frac_reward_zero_std": 0.75, "grad_norm": 1.3544324493465876, "kl": 0.011836733669042587, "learning_rate": 9.965591455896455e-07, "loss": 0.0816, "num_tokens": 12569153.0, "reward": 0.84375, "reward_std": 0.23935678601264954, "rewards/decision_reward_func/mean": 0.84375, "rewards/decision_reward_func/std": 0.5409794449806213, "sampling/importance_sampling_ratio/max": 1.6233726739883423, "sampling/importance_sampling_ratio/mean": 1.000396728515625, "sampling/importance_sampling_ratio/min": 0.5362957715988159, "sampling/sampling_logp_difference/max": 0.6230695247650146, "sampling/sampling_logp_difference/mean": 0.014330549165606499, "step": 303 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2089.0, "completions/max_terminated_length": 2089.0, "completions/mean_length": 510.078125, "completions/mean_terminated_length": 510.078125, "completions/min_length": 168.0, "completions/min_terminated_length": 168.0, "entropy": 0.14797988533973694, "epoch": 0.5380530973451327, "frac_reward_zero_std": 0.5, "grad_norm": 1.074140948822202, "kl": 0.011660570278763771, "learning_rate": 9.964681085277011e-07, "loss": -0.0961, "num_tokens": 12611622.0, "reward": 0.625, "reward_std": 0.42078250646591187, "rewards/decision_reward_func/mean": 0.625, "rewards/decision_reward_func/std": 0.7867957949638367, "sampling/importance_sampling_ratio/max": 1.622308373451233, "sampling/importance_sampling_ratio/mean": 0.9994902610778809, "sampling/importance_sampling_ratio/min": 0.397103488445282, "sampling/sampling_logp_difference/max": 0.9235583543777466, "sampling/sampling_logp_difference/mean": 0.009174825623631477, "step": 304 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 688.0, "completions/max_terminated_length": 688.0, "completions/mean_length": 307.765625, "completions/mean_terminated_length": 307.765625, "completions/min_length": 150.0, "completions/min_terminated_length": 150.0, "entropy": 0.15618465840816498, "epoch": 0.5398230088495575, "frac_reward_zero_std": 0.75, "grad_norm": 1.3728201511150802, "kl": 0.0104716457426548, "learning_rate": 9.96375887091067e-07, "loss": 0.0559, "num_tokens": 12641543.0, "reward": 0.875, "reward_std": 0.22360679507255554, "rewards/decision_reward_func/mean": 0.875, "rewards/decision_reward_func/std": 0.48795005679130554, "sampling/importance_sampling_ratio/max": 1.8187601566314697, "sampling/importance_sampling_ratio/mean": 1.0002257823944092, "sampling/importance_sampling_ratio/min": 0.2952064275741577, "sampling/sampling_logp_difference/max": 1.2200803756713867, "sampling/sampling_logp_difference/mean": 0.01298457756638527, "step": 305 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2445.0, "completions/max_terminated_length": 2445.0, "completions/mean_length": 573.828125, "completions/mean_terminated_length": 573.828125, "completions/min_length": 119.0, "completions/min_terminated_length": 119.0, "entropy": 0.24914200603961945, "epoch": 0.5415929203539823, "frac_reward_zero_std": 0.75, "grad_norm": 0.6615659102351309, "kl": 0.00677898246794939, "learning_rate": 9.962824814997464e-07, "loss": 0.0044, "num_tokens": 12689372.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.988453984260559, "sampling/importance_sampling_ratio/mean": 0.9999870657920837, "sampling/importance_sampling_ratio/min": 0.47600531578063965, "sampling/sampling_logp_difference/max": 0.7423262596130371, "sampling/sampling_logp_difference/mean": 0.012273651547729969, "step": 306 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2917.0, "completions/max_terminated_length": 2917.0, "completions/mean_length": 711.203125, "completions/mean_terminated_length": 711.203125, "completions/min_length": 89.0, "completions/min_terminated_length": 89.0, "entropy": 0.21183796226978302, "epoch": 0.5433628318584071, "frac_reward_zero_std": 0.5, "grad_norm": 0.9651032865216309, "kl": 0.00942046195268631, "learning_rate": 9.961878919765677e-07, "loss": 0.0514, "num_tokens": 12745401.0, "reward": 0.53125, "reward_std": 0.3723389506340027, "rewards/decision_reward_func/mean": 0.53125, "rewards/decision_reward_func/std": 0.8539125919342041, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9998499155044556, "sampling/importance_sampling_ratio/min": 0.377798467874527, "sampling/sampling_logp_difference/max": 0.9733943939208984, "sampling/sampling_logp_difference/mean": 0.012394905090332031, "step": 307 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1500.0, "completions/max_terminated_length": 1500.0, "completions/mean_length": 527.34375, "completions/mean_terminated_length": 527.34375, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "entropy": 0.18508289754390717, "epoch": 0.5451327433628319, "frac_reward_zero_std": 0.5, "grad_norm": 1.1837164984860453, "kl": 0.015924129635095596, "learning_rate": 9.96092118747184e-07, "loss": 0.1054, "num_tokens": 12789679.0, "reward": 0.75, "reward_std": 0.42078250646591187, "rewards/decision_reward_func/mean": 0.75, "rewards/decision_reward_func/std": 0.6666666865348816, "sampling/importance_sampling_ratio/max": 1.9660207033157349, "sampling/importance_sampling_ratio/mean": 1.0000438690185547, "sampling/importance_sampling_ratio/min": 0.3680047392845154, "sampling/sampling_logp_difference/max": 0.999659538269043, "sampling/sampling_logp_difference/mean": 0.012409013696014881, "step": 308 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2812.0, "completions/max_terminated_length": 2812.0, "completions/mean_length": 421.734375, "completions/mean_terminated_length": 421.734375, "completions/min_length": 79.0, "completions/min_terminated_length": 79.0, "entropy": 0.21359401941299438, "epoch": 0.5469026548672566, "frac_reward_zero_std": 1.0, "grad_norm": 0.02806654643833116, "kl": 0.008415073156356812, "learning_rate": 9.959951620400718e-07, "loss": 0.0001, "num_tokens": 12828958.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.9656606912612915, "sampling/importance_sampling_ratio/mean": 1.0000615119934082, "sampling/importance_sampling_ratio/min": 0.37465327978134155, "sampling/sampling_logp_difference/max": 0.9817543029785156, "sampling/sampling_logp_difference/mean": 0.011181121692061424, "step": 309 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2194.0, "completions/max_terminated_length": 2194.0, "completions/mean_length": 451.75, "completions/mean_terminated_length": 451.75, "completions/min_length": 83.0, "completions/min_terminated_length": 83.0, "entropy": 0.2212270349264145, "epoch": 0.5486725663716814, "frac_reward_zero_std": 0.5, "grad_norm": 1.2432668757950096, "kl": 0.01307586394250393, "learning_rate": 9.95897022086531e-07, "loss": 0.1522, "num_tokens": 12868222.0, "reward": 0.40625, "reward_std": 0.4101392924785614, "rewards/decision_reward_func/mean": 0.40625, "rewards/decision_reward_func/std": 0.9209855198860168, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.000258207321167, "sampling/importance_sampling_ratio/min": 0.49003100395202637, "sampling/sampling_logp_difference/max": 0.7132866382598877, "sampling/sampling_logp_difference/mean": 0.014409977942705154, "step": 310 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2857.0, "completions/max_terminated_length": 2857.0, "completions/mean_length": 820.0625, "completions/mean_terminated_length": 820.0625, "completions/min_length": 75.0, "completions/min_terminated_length": 75.0, "entropy": 0.2264469563961029, "epoch": 0.5504424778761062, "frac_reward_zero_std": 0.5, "grad_norm": 0.9104768899659973, "kl": 0.007818522863090038, "learning_rate": 9.957976991206845e-07, "loss": 0.0878, "num_tokens": 12933186.0, "reward": 0.0, "reward_std": 0.4472135901451111, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.000213384628296, "sampling/importance_sampling_ratio/min": 0.26041948795318604, "sampling/sampling_logp_difference/max": 1.3454614877700806, "sampling/sampling_logp_difference/mean": 0.012134440243244171, "step": 311 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1583.0, "completions/max_terminated_length": 1583.0, "completions/mean_length": 356.28125, "completions/mean_terminated_length": 356.28125, "completions/min_length": 120.0, "completions/min_terminated_length": 120.0, "entropy": 0.18492724001407623, "epoch": 0.552212389380531, "frac_reward_zero_std": 0.5, "grad_norm": 1.2473987255658339, "kl": 0.010419107973575592, "learning_rate": 9.956971933794773e-07, "loss": -0.044, "num_tokens": 12966324.0, "reward": 0.875, "reward_std": 0.3265564441680908, "rewards/decision_reward_func/mean": 0.875, "rewards/decision_reward_func/std": 0.48795005679130554, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9997034072875977, "sampling/importance_sampling_ratio/min": 0.47512000799179077, "sampling/sampling_logp_difference/max": 0.9381264448165894, "sampling/sampling_logp_difference/mean": 0.011368905194103718, "step": 312 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 421.0, "completions/max_terminated_length": 421.0, "completions/mean_length": 231.5625, "completions/mean_terminated_length": 231.5625, "completions/min_length": 157.0, "completions/min_terminated_length": 157.0, "entropy": 0.11903505027294159, "epoch": 0.5539823008849557, "frac_reward_zero_std": 1.0, "grad_norm": 0.07785932283450525, "kl": 0.013626226224005222, "learning_rate": 9.955955051026758e-07, "loss": 0.0001, "num_tokens": 12991576.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.7815014123916626, "sampling/importance_sampling_ratio/mean": 0.9996967315673828, "sampling/importance_sampling_ratio/min": 0.2422732561826706, "sampling/sampling_logp_difference/max": 1.4176890850067139, "sampling/sampling_logp_difference/mean": 0.011381693184375763, "step": 313 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1945.0, "completions/max_terminated_length": 1945.0, "completions/mean_length": 505.9375, "completions/mean_terminated_length": 505.9375, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "entropy": 0.23862990736961365, "epoch": 0.5557522123893806, "frac_reward_zero_std": 0.5, "grad_norm": 1.1686585540040413, "kl": 0.013976803049445152, "learning_rate": 9.954926345328678e-07, "loss": 0.0045, "num_tokens": 13035428.0, "reward": 0.78125, "reward_std": 0.375, "rewards/decision_reward_func/mean": 0.78125, "rewards/decision_reward_func/std": 0.6291528940200806, "sampling/importance_sampling_ratio/max": 1.839874029159546, "sampling/importance_sampling_ratio/mean": 1.0001561641693115, "sampling/importance_sampling_ratio/min": 0.4834783673286438, "sampling/sampling_logp_difference/max": 0.7267487049102783, "sampling/sampling_logp_difference/mean": 0.01270308904349804, "step": 314 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3119.0, "completions/max_terminated_length": 3119.0, "completions/mean_length": 949.84375, "completions/mean_terminated_length": 949.84375, "completions/min_length": 189.0, "completions/min_terminated_length": 189.0, "entropy": 0.39717620611190796, "epoch": 0.5575221238938053, "frac_reward_zero_std": 0.0, "grad_norm": 0.86199478020297, "kl": 0.01172365341335535, "learning_rate": 9.953885819154614e-07, "loss": 0.0256, "num_tokens": 13116346.0, "reward": 0.21875, "reward_std": 0.815913200378418, "rewards/decision_reward_func/mean": 0.21875, "rewards/decision_reward_func/std": 0.983494758605957, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0000624656677246, "sampling/importance_sampling_ratio/min": 0.38162150979042053, "sampling/sampling_logp_difference/max": 0.9633259773254395, "sampling/sampling_logp_difference/mean": 0.017032235860824585, "step": 315 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1073.0, "completions/max_terminated_length": 1073.0, "completions/mean_length": 331.09375, "completions/mean_terminated_length": 331.09375, "completions/min_length": 127.0, "completions/min_terminated_length": 127.0, "entropy": 0.1988530457019806, "epoch": 0.5592920353982301, "frac_reward_zero_std": 0.75, "grad_norm": 1.5089419410585059, "kl": 0.012759874574840069, "learning_rate": 9.952833474986846e-07, "loss": 0.0015, "num_tokens": 13147936.0, "reward": 0.65625, "reward_std": 0.23935678601264954, "rewards/decision_reward_func/mean": 0.65625, "rewards/decision_reward_func/std": 0.7605084180831909, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9998801350593567, "sampling/importance_sampling_ratio/min": 0.26049110293388367, "sampling/sampling_logp_difference/max": 1.3451865911483765, "sampling/sampling_logp_difference/mean": 0.013721762225031853, "step": 316 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1621.0, "completions/max_terminated_length": 1621.0, "completions/mean_length": 298.328125, "completions/mean_terminated_length": 298.328125, "completions/min_length": 90.0, "completions/min_terminated_length": 90.0, "entropy": 0.1756179928779602, "epoch": 0.5610619469026549, "frac_reward_zero_std": 0.75, "grad_norm": 1.3509063440987192, "kl": 0.015254872851073742, "learning_rate": 9.951769315335843e-07, "loss": -0.057, "num_tokens": 13176693.0, "reward": 0.65625, "reward_std": 0.23935678601264954, "rewards/decision_reward_func/mean": 0.65625, "rewards/decision_reward_func/std": 0.7605084180831909, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9994232654571533, "sampling/importance_sampling_ratio/min": 0.1696796864271164, "sampling/sampling_logp_difference/max": 1.7738428115844727, "sampling/sampling_logp_difference/mean": 0.01323879323899746, "step": 317 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2438.0, "completions/max_terminated_length": 2438.0, "completions/mean_length": 583.609375, "completions/mean_terminated_length": 583.609375, "completions/min_length": 78.0, "completions/min_terminated_length": 78.0, "entropy": 0.19607892632484436, "epoch": 0.5628318584070796, "frac_reward_zero_std": 0.5, "grad_norm": 1.0512415965047097, "kl": 0.013000204227864742, "learning_rate": 9.95069334274027e-07, "loss": 0.1006, "num_tokens": 13225180.0, "reward": 0.625, "reward_std": 0.42078250646591187, "rewards/decision_reward_func/mean": 0.625, "rewards/decision_reward_func/std": 0.7867957949638367, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9995521306991577, "sampling/importance_sampling_ratio/min": 0.2366308569908142, "sampling/sampling_logp_difference/max": 1.441253900527954, "sampling/sampling_logp_difference/mean": 0.012878907844424248, "step": 318 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1291.0, "completions/max_terminated_length": 1291.0, "completions/mean_length": 470.859375, "completions/mean_terminated_length": 470.859375, "completions/min_length": 85.0, "completions/min_terminated_length": 85.0, "entropy": 0.2554301917552948, "epoch": 0.5646017699115045, "frac_reward_zero_std": 0.5, "grad_norm": 1.1262096813693891, "kl": 0.009215278550982475, "learning_rate": 9.949605559766967e-07, "loss": -0.0965, "num_tokens": 13268403.0, "reward": 0.71875, "reward_std": 0.38319888710975647, "rewards/decision_reward_func/mean": 0.71875, "rewards/decision_reward_func/std": 0.7007648944854736, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0005323886871338, "sampling/importance_sampling_ratio/min": 0.09647966176271439, "sampling/sampling_logp_difference/max": 2.338423013687134, "sampling/sampling_logp_difference/mean": 0.013021775521337986, "step": 319 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1874.0, "completions/max_terminated_length": 1874.0, "completions/mean_length": 664.53125, "completions/mean_terminated_length": 664.53125, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "entropy": 0.2579624056816101, "epoch": 0.5663716814159292, "frac_reward_zero_std": 0.5, "grad_norm": 1.0281427651062698, "kl": 0.008488418534398079, "learning_rate": 9.94850596901095e-07, "loss": 0.0486, "num_tokens": 13323029.0, "reward": 0.8125, "reward_std": 0.36435678601264954, "rewards/decision_reward_func/mean": 0.8125, "rewards/decision_reward_func/std": 0.5875696539878845, "sampling/importance_sampling_ratio/max": 1.9099587202072144, "sampling/importance_sampling_ratio/mean": 0.9999596476554871, "sampling/importance_sampling_ratio/min": 0.503806471824646, "sampling/sampling_logp_difference/max": 0.6855630874633789, "sampling/sampling_logp_difference/mean": 0.012637540698051453, "step": 320 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2348.0, "completions/max_terminated_length": 2348.0, "completions/mean_length": 492.59375, "completions/mean_terminated_length": 492.59375, "completions/min_length": 83.0, "completions/min_terminated_length": 83.0, "entropy": 0.273328959941864, "epoch": 0.5681415929203539, "frac_reward_zero_std": 0.5, "grad_norm": 1.0248120027426015, "kl": 0.013111264444887638, "learning_rate": 9.947394573095402e-07, "loss": -0.0875, "num_tokens": 13369051.0, "reward": 0.5625, "reward_std": 0.3265564441680908, "rewards/decision_reward_func/mean": 0.5625, "rewards/decision_reward_func/std": 0.8333333730697632, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9996887445449829, "sampling/importance_sampling_ratio/min": 0.5058367848396301, "sampling/sampling_logp_difference/max": 0.9265488386154175, "sampling/sampling_logp_difference/mean": 0.01450757123529911, "step": 321 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 390.0, "completions/max_terminated_length": 390.0, "completions/mean_length": 238.703125, "completions/mean_terminated_length": 238.703125, "completions/min_length": 81.0, "completions/min_terminated_length": 81.0, "entropy": 0.11889506131410599, "epoch": 0.5699115044247788, "frac_reward_zero_std": 1.0, "grad_norm": 0.04730135509532141, "kl": 0.011112641543149948, "learning_rate": 9.94627137467167e-07, "loss": 0.0001, "num_tokens": 13393576.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.817976713180542, "sampling/importance_sampling_ratio/mean": 1.0001864433288574, "sampling/importance_sampling_ratio/min": 0.5218455791473389, "sampling/sampling_logp_difference/max": 0.6503835320472717, "sampling/sampling_logp_difference/mean": 0.009882821701467037, "step": 322 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3742.0, "completions/max_terminated_length": 3742.0, "completions/mean_length": 1082.640625, "completions/mean_terminated_length": 1082.640625, "completions/min_length": 73.0, "completions/min_terminated_length": 73.0, "entropy": 0.18591627478599548, "epoch": 0.5716814159292035, "frac_reward_zero_std": 0.25, "grad_norm": 0.6805962752480998, "kl": 0.009230997413396835, "learning_rate": 9.945136376419258e-07, "loss": 0.1167, "num_tokens": 13475729.0, "reward": 0.09375, "reward_std": 0.7129635810852051, "rewards/decision_reward_func/mean": 0.09375, "rewards/decision_reward_func/std": 1.003466248512268, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0000861883163452, "sampling/importance_sampling_ratio/min": 0.13716651499271393, "sampling/sampling_logp_difference/max": 1.9865596294403076, "sampling/sampling_logp_difference/mean": 0.01027169730514288, "step": 323 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2212.0, "completions/max_terminated_length": 2212.0, "completions/mean_length": 418.578125, "completions/mean_terminated_length": 418.578125, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "entropy": 0.34236496686935425, "epoch": 0.5734513274336284, "frac_reward_zero_std": 0.75, "grad_norm": 0.8787804026135553, "kl": 0.011660914868116379, "learning_rate": 9.943989581045819e-07, "loss": -0.0019, "num_tokens": 13516582.0, "reward": 0.84375, "reward_std": 0.23935678601264954, "rewards/decision_reward_func/mean": 0.84375, "rewards/decision_reward_func/std": 0.5409794449806213, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9995412826538086, "sampling/importance_sampling_ratio/min": 0.4462634325027466, "sampling/sampling_logp_difference/max": 0.8068459033966064, "sampling/sampling_logp_difference/mean": 0.016282036900520325, "step": 324 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2822.0, "completions/max_terminated_length": 2822.0, "completions/mean_length": 755.5625, "completions/mean_terminated_length": 755.5625, "completions/min_length": 71.0, "completions/min_terminated_length": 71.0, "entropy": 0.16003254055976868, "epoch": 0.5752212389380531, "frac_reward_zero_std": 0.25, "grad_norm": 1.0315982996563209, "kl": 0.009150151163339615, "learning_rate": 9.942830991287149e-07, "loss": 0.0196, "num_tokens": 13575242.0, "reward": 0.65625, "reward_std": 0.5986068248748779, "rewards/decision_reward_func/mean": 0.65625, "rewards/decision_reward_func/std": 0.7605084180831909, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.999913215637207, "sampling/importance_sampling_ratio/min": 0.011241849511861801, "sampling/sampling_logp_difference/max": 4.488111972808838, "sampling/sampling_logp_difference/mean": 0.010309720411896706, "step": 325 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 448.0, "completions/max_terminated_length": 448.0, "completions/mean_length": 171.796875, "completions/mean_terminated_length": 171.796875, "completions/min_length": 76.0, "completions/min_terminated_length": 76.0, "entropy": 0.11178592592477798, "epoch": 0.5769911504424778, "frac_reward_zero_std": 1.0, "grad_norm": 0.448101323205556, "kl": 0.026523999869823456, "learning_rate": 9.94166060990718e-07, "loss": 0.0002, "num_tokens": 13595101.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.000042200088501, "sampling/importance_sampling_ratio/min": 0.3722725808620453, "sampling/sampling_logp_difference/max": 2.6337647438049316, "sampling/sampling_logp_difference/mean": 0.01143827848136425, "step": 326 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 973.0, "completions/max_terminated_length": 973.0, "completions/mean_length": 329.46875, "completions/mean_terminated_length": 329.46875, "completions/min_length": 94.0, "completions/min_terminated_length": 94.0, "entropy": 0.24242663383483887, "epoch": 0.5787610619469027, "frac_reward_zero_std": 0.75, "grad_norm": 1.0369298656025472, "kl": 0.010288412682712078, "learning_rate": 9.940478439697972e-07, "loss": 0.0526, "num_tokens": 13627739.0, "reward": 0.8125, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.8125, "rewards/decision_reward_func/std": 0.5875696539878845, "sampling/importance_sampling_ratio/max": 1.8064672946929932, "sampling/importance_sampling_ratio/mean": 0.9998229742050171, "sampling/importance_sampling_ratio/min": 0.5044726133346558, "sampling/sampling_logp_difference/max": 0.684241771697998, "sampling/sampling_logp_difference/mean": 0.01445348933339119, "step": 327 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 724.0, "completions/max_terminated_length": 724.0, "completions/mean_length": 242.046875, "completions/mean_terminated_length": 242.046875, "completions/min_length": 69.0, "completions/min_terminated_length": 69.0, "entropy": 0.12605658173561096, "epoch": 0.5805309734513274, "frac_reward_zero_std": 0.75, "grad_norm": 1.6141131066609518, "kl": 0.009912240318953991, "learning_rate": 9.939284483479715e-07, "loss": 0.0443, "num_tokens": 13653150.0, "reward": 0.9375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.9375, "rewards/decision_reward_func/std": 0.35073620080947876, "sampling/importance_sampling_ratio/max": 1.6673932075500488, "sampling/importance_sampling_ratio/mean": 0.9998911619186401, "sampling/importance_sampling_ratio/min": 0.2681378722190857, "sampling/sampling_logp_difference/max": 1.3162540197372437, "sampling/sampling_logp_difference/mean": 0.011045117862522602, "step": 328 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1481.0, "completions/max_terminated_length": 1481.0, "completions/mean_length": 557.59375, "completions/mean_terminated_length": 557.59375, "completions/min_length": 164.0, "completions/min_terminated_length": 164.0, "entropy": 0.24696922302246094, "epoch": 0.5823008849557522, "frac_reward_zero_std": 0.25, "grad_norm": 1.1846285135694103, "kl": 0.013814765959978104, "learning_rate": 9.93807874410071e-07, "loss": 0.0634, "num_tokens": 13700916.0, "reward": 0.34375, "reward_std": 0.5281128883361816, "rewards/decision_reward_func/mean": 0.34375, "rewards/decision_reward_func/std": 0.9464847445487976, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9997166395187378, "sampling/importance_sampling_ratio/min": 0.4596972167491913, "sampling/sampling_logp_difference/max": 0.7771872282028198, "sampling/sampling_logp_difference/mean": 0.014028158038854599, "step": 329 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1967.0, "completions/max_terminated_length": 1967.0, "completions/mean_length": 444.75, "completions/mean_terminated_length": 444.75, "completions/min_length": 68.0, "completions/min_terminated_length": 68.0, "entropy": 0.3225674629211426, "epoch": 0.584070796460177, "frac_reward_zero_std": 0.5, "grad_norm": 1.360311798546827, "kl": 0.010874364525079727, "learning_rate": 9.936861224437372e-07, "loss": 0.0206, "num_tokens": 13742532.0, "reward": 0.75, "reward_std": 0.3811737596988678, "rewards/decision_reward_func/mean": 0.75, "rewards/decision_reward_func/std": 0.6666666865348816, "sampling/importance_sampling_ratio/max": 1.814037799835205, "sampling/importance_sampling_ratio/mean": 1.0001300573349, "sampling/importance_sampling_ratio/min": 0.32929152250289917, "sampling/sampling_logp_difference/max": 1.1108118295669556, "sampling/sampling_logp_difference/mean": 0.017508871853351593, "step": 330 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 728.0, "completions/max_terminated_length": 728.0, "completions/mean_length": 393.984375, "completions/mean_terminated_length": 393.984375, "completions/min_length": 184.0, "completions/min_terminated_length": 184.0, "entropy": 0.14725276827812195, "epoch": 0.5858407079646017, "frac_reward_zero_std": 0.5, "grad_norm": 1.1879498598258116, "kl": 0.009119031950831413, "learning_rate": 9.935631927394214e-07, "loss": 0.0132, "num_tokens": 13779235.0, "reward": 0.9375, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.9375, "rewards/decision_reward_func/std": 0.35073620080947876, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0000172853469849, "sampling/importance_sampling_ratio/min": 0.548204243183136, "sampling/sampling_logp_difference/max": 1.0744800567626953, "sampling/sampling_logp_difference/mean": 0.010520167648792267, "step": 331 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2687.0, "completions/max_terminated_length": 2687.0, "completions/mean_length": 542.03125, "completions/mean_terminated_length": 542.03125, "completions/min_length": 159.0, "completions/min_terminated_length": 159.0, "entropy": 0.26583829522132874, "epoch": 0.5876106194690266, "frac_reward_zero_std": 0.75, "grad_norm": 1.0244834936557765, "kl": 0.012551561929285526, "learning_rate": 9.934390855903852e-07, "loss": 0.1692, "num_tokens": 13827813.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.7332918643951416, "sampling/importance_sampling_ratio/mean": 0.9997817277908325, "sampling/importance_sampling_ratio/min": 0.3822780251502991, "sampling/sampling_logp_difference/max": 0.9616070985794067, "sampling/sampling_logp_difference/mean": 0.014081464149057865, "step": 332 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2393.0, "completions/max_terminated_length": 2393.0, "completions/mean_length": 719.359375, "completions/mean_terminated_length": 719.359375, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "entropy": 0.1774405539035797, "epoch": 0.5893805309734513, "frac_reward_zero_std": 0.5, "grad_norm": 0.9521658366444113, "kl": 0.010963380336761475, "learning_rate": 9.93313801292698e-07, "loss": -0.0255, "num_tokens": 13883772.0, "reward": 0.84375, "reward_std": 0.34860679507255554, "rewards/decision_reward_func/mean": 0.84375, "rewards/decision_reward_func/std": 0.5409794449806213, "sampling/importance_sampling_ratio/max": 1.987760066986084, "sampling/importance_sampling_ratio/mean": 1.0009305477142334, "sampling/importance_sampling_ratio/min": 0.11002536118030548, "sampling/sampling_logp_difference/max": 2.2070443630218506, "sampling/sampling_logp_difference/mean": 0.01233024150133133, "step": 333 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2163.0, "completions/max_terminated_length": 2163.0, "completions/mean_length": 490.828125, "completions/mean_terminated_length": 490.828125, "completions/min_length": 75.0, "completions/min_terminated_length": 75.0, "entropy": 0.18938961625099182, "epoch": 0.5911504424778761, "frac_reward_zero_std": 0.5, "grad_norm": 1.308763267842844, "kl": 0.010846241377294064, "learning_rate": 9.93187340145239e-07, "loss": -0.0509, "num_tokens": 13926337.0, "reward": 0.3125, "reward_std": 0.47360679507255554, "rewards/decision_reward_func/mean": 0.3125, "rewards/decision_reward_func/std": 0.9574271440505981, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.000192642211914, "sampling/importance_sampling_ratio/min": 0.019511757418513298, "sampling/sampling_logp_difference/max": 3.9367380142211914, "sampling/sampling_logp_difference/mean": 0.013916685245931149, "step": 334 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1271.0, "completions/max_terminated_length": 1271.0, "completions/mean_length": 327.484375, "completions/mean_terminated_length": 327.484375, "completions/min_length": 157.0, "completions/min_terminated_length": 157.0, "entropy": 0.1326397955417633, "epoch": 0.5929203539823009, "frac_reward_zero_std": 0.75, "grad_norm": 1.0452303310850333, "kl": 0.011152435094118118, "learning_rate": 9.93059702449693e-07, "loss": -0.1173, "num_tokens": 13957120.0, "reward": 0.75, "reward_std": 0.25819888710975647, "rewards/decision_reward_func/mean": 0.75, "rewards/decision_reward_func/std": 0.6666666865348816, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0002830028533936, "sampling/importance_sampling_ratio/min": 0.340989887714386, "sampling/sampling_logp_difference/max": 1.0759024620056152, "sampling/sampling_logp_difference/mean": 0.01105023454874754, "step": 335 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 414.0, "completions/max_terminated_length": 414.0, "completions/mean_length": 221.21875, "completions/mean_terminated_length": 221.21875, "completions/min_length": 65.0, "completions/min_terminated_length": 65.0, "entropy": 0.22370436787605286, "epoch": 0.5946902654867257, "frac_reward_zero_std": 0.75, "grad_norm": 1.2728473571169403, "kl": 0.017372138798236847, "learning_rate": 9.929308885105534e-07, "loss": 0.0282, "num_tokens": 13982462.0, "reward": -0.03125, "reward_std": 0.125, "rewards/decision_reward_func/mean": -0.03125, "rewards/decision_reward_func/std": 1.0074130296707153, "sampling/importance_sampling_ratio/max": 1.6805295944213867, "sampling/importance_sampling_ratio/mean": 0.9995006918907166, "sampling/importance_sampling_ratio/min": 0.6141542196273804, "sampling/sampling_logp_difference/max": 0.5191090106964111, "sampling/sampling_logp_difference/mean": 0.015102004632353783, "step": 336 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1053.0, "completions/max_terminated_length": 1053.0, "completions/mean_length": 251.578125, "completions/mean_terminated_length": 251.578125, "completions/min_length": 81.0, "completions/min_terminated_length": 81.0, "entropy": 0.21504616737365723, "epoch": 0.5964601769911504, "frac_reward_zero_std": 0.75, "grad_norm": 2.0389382114181727, "kl": 0.013353358954191208, "learning_rate": 9.928008986351186e-07, "loss": 0.2258, "num_tokens": 14009075.0, "reward": 0.9375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.9375, "rewards/decision_reward_func/std": 0.35073620080947876, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0000449419021606, "sampling/importance_sampling_ratio/min": 0.3985477089881897, "sampling/sampling_logp_difference/max": 0.9199280738830566, "sampling/sampling_logp_difference/mean": 0.014716576784849167, "step": 337 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 380.0, "completions/max_terminated_length": 380.0, "completions/mean_length": 153.5625, "completions/mean_terminated_length": 153.5625, "completions/min_length": 71.0, "completions/min_terminated_length": 71.0, "entropy": 0.1429508626461029, "epoch": 0.5982300884955752, "frac_reward_zero_std": 1.0, "grad_norm": 0.09993219114757654, "kl": 0.01585269346833229, "learning_rate": 9.926697331334924e-07, "loss": 0.0001, "num_tokens": 14029351.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.7964189052581787, "sampling/importance_sampling_ratio/mean": 1.000233769416809, "sampling/importance_sampling_ratio/min": 0.3985702395439148, "sampling/sampling_logp_difference/max": 0.9198715686798096, "sampling/sampling_logp_difference/mean": 0.013787386007606983, "step": 338 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 378.0, "completions/max_terminated_length": 378.0, "completions/mean_length": 195.5625, "completions/mean_terminated_length": 195.5625, "completions/min_length": 90.0, "completions/min_terminated_length": 90.0, "entropy": 0.1372258961200714, "epoch": 0.6, "frac_reward_zero_std": 1.0, "grad_norm": 0.062205105311795725, "kl": 0.019604679197072983, "learning_rate": 9.925373923185834e-07, "loss": 0.0002, "num_tokens": 14051307.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.7434173822402954, "sampling/importance_sampling_ratio/mean": 1.0005011558532715, "sampling/importance_sampling_ratio/min": 0.38532912731170654, "sampling/sampling_logp_difference/max": 0.9536573886871338, "sampling/sampling_logp_difference/mean": 0.012423742562532425, "step": 339 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 369.0, "completions/max_terminated_length": 369.0, "completions/mean_length": 191.515625, "completions/mean_terminated_length": 191.515625, "completions/min_length": 59.0, "completions/min_terminated_length": 59.0, "entropy": 0.10810220241546631, "epoch": 0.6017699115044248, "frac_reward_zero_std": 1.0, "grad_norm": 0.0849850452697075, "kl": 0.01524887140840292, "learning_rate": 9.92403876506104e-07, "loss": 0.0001, "num_tokens": 14074076.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0004429817199707, "sampling/importance_sampling_ratio/min": 0.4602620601654053, "sampling/sampling_logp_difference/max": 0.924720287322998, "sampling/sampling_logp_difference/mean": 0.010313036851584911, "step": 340 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 716.0, "completions/max_terminated_length": 716.0, "completions/mean_length": 252.125, "completions/mean_terminated_length": 252.125, "completions/min_length": 59.0, "completions/min_terminated_length": 59.0, "entropy": 0.2645533084869385, "epoch": 0.6035398230088496, "frac_reward_zero_std": 0.75, "grad_norm": 1.4704809307199154, "kl": 0.020931486040353775, "learning_rate": 9.922691860145696e-07, "loss": 0.05, "num_tokens": 14101396.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.8164737224578857, "sampling/importance_sampling_ratio/mean": 1.0003548860549927, "sampling/importance_sampling_ratio/min": 0.44175654649734497, "sampling/sampling_logp_difference/max": 0.8169963359832764, "sampling/sampling_logp_difference/mean": 0.016569077968597412, "step": 341 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 860.0, "completions/max_terminated_length": 860.0, "completions/mean_length": 256.625, "completions/mean_terminated_length": 256.625, "completions/min_length": 71.0, "completions/min_terminated_length": 71.0, "entropy": 0.29909226298332214, "epoch": 0.6053097345132743, "frac_reward_zero_std": 0.75, "grad_norm": 1.798389624588574, "kl": 0.03648889809846878, "learning_rate": 9.921333211652977e-07, "loss": -0.003, "num_tokens": 14129308.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.5882289409637451, "sampling/importance_sampling_ratio/mean": 0.9999880194664001, "sampling/importance_sampling_ratio/min": 0.025496045127511024, "sampling/sampling_logp_difference/max": 3.66923189163208, "sampling/sampling_logp_difference/mean": 0.017304111272096634, "step": 342 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1300.0, "completions/max_terminated_length": 1300.0, "completions/mean_length": 292.6875, "completions/mean_terminated_length": 292.6875, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "entropy": 0.13682976365089417, "epoch": 0.6070796460176991, "frac_reward_zero_std": 0.75, "grad_norm": 1.1878404701491212, "kl": 0.013012914918363094, "learning_rate": 9.919962822824083e-07, "loss": -0.0226, "num_tokens": 14158200.0, "reward": 0.875, "reward_std": 0.22360679507255554, "rewards/decision_reward_func/mean": 0.875, "rewards/decision_reward_func/std": 0.48795005679130554, "sampling/importance_sampling_ratio/max": 1.671028733253479, "sampling/importance_sampling_ratio/mean": 1.0004812479019165, "sampling/importance_sampling_ratio/min": 0.5128996968269348, "sampling/sampling_logp_difference/max": 0.6676750183105469, "sampling/sampling_logp_difference/mean": 0.01002402976155281, "step": 343 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 936.0, "completions/max_terminated_length": 936.0, "completions/mean_length": 319.15625, "completions/mean_terminated_length": 319.15625, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "entropy": 0.17892402410507202, "epoch": 0.6088495575221239, "frac_reward_zero_std": 0.5, "grad_norm": 1.417537890360123, "kl": 0.013483471237123013, "learning_rate": 9.918580696928205e-07, "loss": 0.0491, "num_tokens": 14190514.0, "reward": 0.84375, "reward_std": 0.34860679507255554, "rewards/decision_reward_func/mean": 0.84375, "rewards/decision_reward_func/std": 0.5409794449806213, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0000501871109009, "sampling/importance_sampling_ratio/min": 0.06522953510284424, "sampling/sampling_logp_difference/max": 2.7298429012298584, "sampling/sampling_logp_difference/mean": 0.013468679040670395, "step": 344 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1571.0, "completions/max_terminated_length": 1571.0, "completions/mean_length": 391.578125, "completions/mean_terminated_length": 391.578125, "completions/min_length": 80.0, "completions/min_terminated_length": 80.0, "entropy": 0.26512667536735535, "epoch": 0.6106194690265486, "frac_reward_zero_std": 0.75, "grad_norm": 1.2891112537965543, "kl": 0.016914675012230873, "learning_rate": 9.91718683726255e-07, "loss": 0.0171, "num_tokens": 14228295.0, "reward": 0.875, "reward_std": 0.22360679507255554, "rewards/decision_reward_func/mean": 0.875, "rewards/decision_reward_func/std": 0.48795005679130554, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.000093936920166, "sampling/importance_sampling_ratio/min": 0.3985431492328644, "sampling/sampling_logp_difference/max": 0.9518356323242188, "sampling/sampling_logp_difference/mean": 0.017220504581928253, "step": 345 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2248.0, "completions/max_terminated_length": 2248.0, "completions/mean_length": 576.953125, "completions/mean_terminated_length": 576.953125, "completions/min_length": 127.0, "completions/min_terminated_length": 127.0, "entropy": 0.23701117932796478, "epoch": 0.6123893805309735, "frac_reward_zero_std": 0.5, "grad_norm": 0.8364177832875549, "kl": 0.01401478610932827, "learning_rate": 9.915781247152308e-07, "loss": -0.0371, "num_tokens": 14276548.0, "reward": 0.1875, "reward_std": 0.3811737596988678, "rewards/decision_reward_func/mean": 0.1875, "rewards/decision_reward_func/std": 0.9900296926498413, "sampling/importance_sampling_ratio/max": 1.782973289489746, "sampling/importance_sampling_ratio/mean": 0.9997231960296631, "sampling/importance_sampling_ratio/min": 0.30391907691955566, "sampling/sampling_logp_difference/max": 1.1909937858581543, "sampling/sampling_logp_difference/mean": 0.013363107107579708, "step": 346 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 396.0, "completions/max_terminated_length": 396.0, "completions/mean_length": 179.015625, "completions/mean_terminated_length": 179.015625, "completions/min_length": 68.0, "completions/min_terminated_length": 68.0, "entropy": 0.2167108654975891, "epoch": 0.6141592920353982, "frac_reward_zero_std": 1.0, "grad_norm": 0.08467898036234706, "kl": 0.02094550058245659, "learning_rate": 9.914363929950657e-07, "loss": 0.0002, "num_tokens": 14298565.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.6139919757843018, "sampling/importance_sampling_ratio/mean": 0.9987689256668091, "sampling/importance_sampling_ratio/min": 0.47411179542541504, "sampling/sampling_logp_difference/max": 0.746312141418457, "sampling/sampling_logp_difference/mean": 0.014694158919155598, "step": 347 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 363.0, "completions/max_terminated_length": 363.0, "completions/mean_length": 216.171875, "completions/mean_terminated_length": 216.171875, "completions/min_length": 65.0, "completions/min_terminated_length": 65.0, "entropy": 0.30640822649002075, "epoch": 0.6159292035398231, "frac_reward_zero_std": 1.0, "grad_norm": 0.052084246697353044, "kl": 0.017480291426181793, "learning_rate": 9.91293488903875e-07, "loss": 0.0002, "num_tokens": 14323744.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.9441547393798828, "sampling/importance_sampling_ratio/mean": 0.9994668364524841, "sampling/importance_sampling_ratio/min": 0.5560150742530823, "sampling/sampling_logp_difference/max": 0.6648273468017578, "sampling/sampling_logp_difference/mean": 0.017224781215190887, "step": 348 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 592.0, "completions/max_terminated_length": 592.0, "completions/mean_length": 274.203125, "completions/mean_terminated_length": 274.203125, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "entropy": 0.2392423003911972, "epoch": 0.6176991150442478, "frac_reward_zero_std": 0.75, "grad_norm": 1.2604944381214376, "kl": 0.013026298023760319, "learning_rate": 9.91149412782571e-07, "loss": 0.0194, "num_tokens": 14352573.0, "reward": 0.46875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.000708818435669, "sampling/importance_sampling_ratio/min": 0.27824246883392334, "sampling/sampling_logp_difference/max": 1.2792623043060303, "sampling/sampling_logp_difference/mean": 0.014881562441587448, "step": 349 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1066.0, "completions/max_terminated_length": 1066.0, "completions/mean_length": 254.09375, "completions/mean_terminated_length": 254.09375, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "entropy": 0.15329664945602417, "epoch": 0.6194690265486725, "frac_reward_zero_std": 0.75, "grad_norm": 1.3502289271563663, "kl": 0.014477971009910107, "learning_rate": 9.910041649748612e-07, "loss": -0.1485, "num_tokens": 14377395.0, "reward": 0.75, "reward_std": 0.25819888710975647, "rewards/decision_reward_func/mean": 0.75, "rewards/decision_reward_func/std": 0.6666666865348816, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0001614093780518, "sampling/importance_sampling_ratio/min": 0.46053168177604675, "sampling/sampling_logp_difference/max": 0.8773808479309082, "sampling/sampling_logp_difference/mean": 0.012453576549887657, "step": 350 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3625.0, "completions/max_terminated_length": 3625.0, "completions/mean_length": 567.078125, "completions/mean_terminated_length": 567.078125, "completions/min_length": 94.0, "completions/min_terminated_length": 94.0, "entropy": 0.21548616886138916, "epoch": 0.6212389380530974, "frac_reward_zero_std": 0.75, "grad_norm": 0.9556264159211916, "kl": 0.012173894792795181, "learning_rate": 9.908577458272495e-07, "loss": -0.0292, "num_tokens": 14425320.0, "reward": 0.0625, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.0625, "rewards/decision_reward_func/std": 1.0059348344802856, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0003358125686646, "sampling/importance_sampling_ratio/min": 0.4372006058692932, "sampling/sampling_logp_difference/max": 0.8349273204803467, "sampling/sampling_logp_difference/mean": 0.013012798503041267, "step": 351 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2063.0, "completions/max_terminated_length": 2063.0, "completions/mean_length": 429.28125, "completions/mean_terminated_length": 429.28125, "completions/min_length": 139.0, "completions/min_terminated_length": 139.0, "entropy": 0.21009160578250885, "epoch": 0.6230088495575221, "frac_reward_zero_std": 0.75, "grad_norm": 0.9569497757507517, "kl": 0.011812455952167511, "learning_rate": 9.907101556890331e-07, "loss": -0.0353, "num_tokens": 14462170.0, "reward": 0.59375, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.59375, "rewards/decision_reward_func/std": 0.8110105991363525, "sampling/importance_sampling_ratio/max": 1.5973292589187622, "sampling/importance_sampling_ratio/mean": 0.9993290901184082, "sampling/importance_sampling_ratio/min": 0.22330020368099213, "sampling/sampling_logp_difference/max": 1.4992382526397705, "sampling/sampling_logp_difference/mean": 0.011548706330358982, "step": 352 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 923.0, "completions/max_terminated_length": 923.0, "completions/mean_length": 278.046875, "completions/mean_terminated_length": 278.046875, "completions/min_length": 64.0, "completions/min_terminated_length": 64.0, "entropy": 0.25298774242401123, "epoch": 0.6247787610619469, "frac_reward_zero_std": 0.5, "grad_norm": 2.0426691030022694, "kl": 0.01194531749933958, "learning_rate": 9.905613949123034e-07, "loss": 0.0966, "num_tokens": 14489949.0, "reward": 0.8125, "reward_std": 0.3943893015384674, "rewards/decision_reward_func/mean": 0.8125, "rewards/decision_reward_func/std": 0.5875696539878845, "sampling/importance_sampling_ratio/max": 1.953004002571106, "sampling/importance_sampling_ratio/mean": 0.9996095895767212, "sampling/importance_sampling_ratio/min": 0.4238704442977905, "sampling/sampling_logp_difference/max": 0.8583273887634277, "sampling/sampling_logp_difference/mean": 0.014975040219724178, "step": 353 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 675.0, "completions/max_terminated_length": 675.0, "completions/mean_length": 288.375, "completions/mean_terminated_length": 288.375, "completions/min_length": 149.0, "completions/min_terminated_length": 149.0, "entropy": 0.2105671912431717, "epoch": 0.6265486725663717, "frac_reward_zero_std": 0.5, "grad_norm": 1.5874993675026383, "kl": 0.012528583407402039, "learning_rate": 9.904114638519443e-07, "loss": 0.041, "num_tokens": 14520165.0, "reward": 0.90625, "reward_std": 0.29578250646591187, "rewards/decision_reward_func/mean": 0.90625, "rewards/decision_reward_func/std": 0.42608407139778137, "sampling/importance_sampling_ratio/max": 1.664654016494751, "sampling/importance_sampling_ratio/mean": 1.0000356435775757, "sampling/importance_sampling_ratio/min": 0.47478923201560974, "sampling/sampling_logp_difference/max": 0.7448842525482178, "sampling/sampling_logp_difference/mean": 0.013717800378799438, "step": 354 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 324.0, "completions/max_terminated_length": 324.0, "completions/mean_length": 197.484375, "completions/mean_terminated_length": 197.484375, "completions/min_length": 94.0, "completions/min_terminated_length": 94.0, "entropy": 0.13267138600349426, "epoch": 0.6283185840707964, "frac_reward_zero_std": 1.0, "grad_norm": 0.08725406084659827, "kl": 0.01387047953903675, "learning_rate": 9.902603628656311e-07, "loss": 0.0001, "num_tokens": 14542548.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0000417232513428, "sampling/importance_sampling_ratio/min": 0.34454336762428284, "sampling/sampling_logp_difference/max": 1.065535306930542, "sampling/sampling_logp_difference/mean": 0.012317731976509094, "step": 355 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1006.0, "completions/max_terminated_length": 1006.0, "completions/mean_length": 326.84375, "completions/mean_terminated_length": 326.84375, "completions/min_length": 82.0, "completions/min_terminated_length": 82.0, "entropy": 0.2077137678861618, "epoch": 0.6300884955752213, "frac_reward_zero_std": 0.75, "grad_norm": 1.71130363750837, "kl": 0.01125291083008051, "learning_rate": 9.901080923138308e-07, "loss": 0.0509, "num_tokens": 14586394.0, "reward": -0.5625, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": -0.5625, "rewards/decision_reward_func/std": 0.8333333730697632, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0002572536468506, "sampling/importance_sampling_ratio/min": 0.1258518099784851, "sampling/sampling_logp_difference/max": 2.072650194168091, "sampling/sampling_logp_difference/mean": 0.015072090551257133, "step": 356 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1782.0, "completions/max_terminated_length": 1782.0, "completions/mean_length": 571.28125, "completions/mean_terminated_length": 571.28125, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "entropy": 0.20199556648731232, "epoch": 0.631858407079646, "frac_reward_zero_std": 0.5, "grad_norm": 0.7609865860139005, "kl": 0.010100730694830418, "learning_rate": 9.899546525597997e-07, "loss": -0.0472, "num_tokens": 14633372.0, "reward": 0.90625, "reward_std": 0.29578250646591187, "rewards/decision_reward_func/mean": 0.90625, "rewards/decision_reward_func/std": 0.42608407139778137, "sampling/importance_sampling_ratio/max": 1.6293903589248657, "sampling/importance_sampling_ratio/mean": 1.0001381635665894, "sampling/importance_sampling_ratio/min": 0.481094092130661, "sampling/sampling_logp_difference/max": 0.7316924333572388, "sampling/sampling_logp_difference/mean": 0.011894459836184978, "step": 357 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 656.0, "completions/max_terminated_length": 656.0, "completions/mean_length": 222.671875, "completions/mean_terminated_length": 222.671875, "completions/min_length": 84.0, "completions/min_terminated_length": 84.0, "entropy": 0.14957398176193237, "epoch": 0.6336283185840708, "frac_reward_zero_std": 1.0, "grad_norm": 0.0720779516785909, "kl": 0.010819360613822937, "learning_rate": 9.898000439695843e-07, "loss": 0.0001, "num_tokens": 14657383.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9984450340270996, "sampling/importance_sampling_ratio/min": 0.19423478841781616, "sampling/sampling_logp_difference/max": 1.6386876106262207, "sampling/sampling_logp_difference/mean": 0.013828027062118053, "step": 358 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 583.0, "completions/max_terminated_length": 583.0, "completions/mean_length": 227.546875, "completions/mean_terminated_length": 227.546875, "completions/min_length": 74.0, "completions/min_terminated_length": 74.0, "entropy": 0.23751753568649292, "epoch": 0.6353982300884956, "frac_reward_zero_std": 0.75, "grad_norm": 1.3536541441142882, "kl": 0.01518835686147213, "learning_rate": 9.896442669120187e-07, "loss": -0.0506, "num_tokens": 14682138.0, "reward": 0.25, "reward_std": 0.25819888710975647, "rewards/decision_reward_func/mean": 0.25, "rewards/decision_reward_func/std": 0.9759001135826111, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9991243481636047, "sampling/importance_sampling_ratio/min": 0.14807982742786407, "sampling/sampling_logp_difference/max": 1.9100037813186646, "sampling/sampling_logp_difference/mean": 0.015778452157974243, "step": 359 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1299.0, "completions/max_terminated_length": 1299.0, "completions/mean_length": 251.421875, "completions/mean_terminated_length": 251.421875, "completions/min_length": 75.0, "completions/min_terminated_length": 75.0, "entropy": 0.20318467915058136, "epoch": 0.6371681415929203, "frac_reward_zero_std": 0.75, "grad_norm": 1.5696067740768211, "kl": 0.010682277381420135, "learning_rate": 9.894873217587245e-07, "loss": -0.0034, "num_tokens": 14711573.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9995542168617249, "sampling/importance_sampling_ratio/min": 0.4463290274143219, "sampling/sampling_logp_difference/max": 0.8066989183425903, "sampling/sampling_logp_difference/mean": 0.012852566316723824, "step": 360 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2715.0, "completions/max_terminated_length": 2715.0, "completions/mean_length": 524.5625, "completions/mean_terminated_length": 524.5625, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "entropy": 0.16243481636047363, "epoch": 0.6389380530973451, "frac_reward_zero_std": 0.25, "grad_norm": 1.2943939181504063, "kl": 0.007710786536335945, "learning_rate": 9.893292088841108e-07, "loss": 0.1142, "num_tokens": 14756665.0, "reward": 0.40625, "reward_std": 0.46656501293182373, "rewards/decision_reward_func/mean": 0.40625, "rewards/decision_reward_func/std": 0.9209855198860168, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9998276233673096, "sampling/importance_sampling_ratio/min": 0.4166608154773712, "sampling/sampling_logp_difference/max": 0.8908207416534424, "sampling/sampling_logp_difference/mean": 0.010801361873745918, "step": 361 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3196.0, "completions/max_terminated_length": 3196.0, "completions/mean_length": 564.984375, "completions/mean_terminated_length": 564.984375, "completions/min_length": 178.0, "completions/min_terminated_length": 178.0, "entropy": 0.30197763442993164, "epoch": 0.6407079646017699, "frac_reward_zero_std": 0.75, "grad_norm": 0.5629376959279275, "kl": 0.01049356535077095, "learning_rate": 9.891699286653712e-07, "loss": 0.0443, "num_tokens": 14807096.0, "reward": 0.0625, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.0625, "rewards/decision_reward_func/std": 1.0059348344802856, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0004420280456543, "sampling/importance_sampling_ratio/min": 0.31996339559555054, "sampling/sampling_logp_difference/max": 1.139548659324646, "sampling/sampling_logp_difference/mean": 0.016110684722661972, "step": 362 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2264.0, "completions/max_terminated_length": 2264.0, "completions/mean_length": 433.375, "completions/mean_terminated_length": 433.375, "completions/min_length": 71.0, "completions/min_terminated_length": 71.0, "entropy": 0.23928192257881165, "epoch": 0.6424778761061947, "frac_reward_zero_std": 0.5, "grad_norm": 1.1810062722898038, "kl": 0.011036327108740807, "learning_rate": 9.890094814824852e-07, "loss": 0.0066, "num_tokens": 14846752.0, "reward": 0.15625, "reward_std": 0.34860679507255554, "rewards/decision_reward_func/mean": 0.15625, "rewards/decision_reward_func/std": 0.9955257177352905, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9995739459991455, "sampling/importance_sampling_ratio/min": 0.3292270004749298, "sampling/sampling_logp_difference/max": 1.111007809638977, "sampling/sampling_logp_difference/mean": 0.013281682506203651, "step": 363 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2637.0, "completions/max_terminated_length": 2637.0, "completions/mean_length": 320.375, "completions/mean_terminated_length": 320.375, "completions/min_length": 92.0, "completions/min_terminated_length": 92.0, "entropy": 0.18028739094734192, "epoch": 0.6442477876106195, "frac_reward_zero_std": 0.75, "grad_norm": 1.4688830576255838, "kl": 0.00951947458088398, "learning_rate": 9.888478677182154e-07, "loss": -0.2192, "num_tokens": 14878760.0, "reward": 0.53125, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.53125, "rewards/decision_reward_func/std": 0.8539125919342041, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0002232789993286, "sampling/importance_sampling_ratio/min": 0.35683271288871765, "sampling/sampling_logp_difference/max": 1.0304882526397705, "sampling/sampling_logp_difference/mean": 0.012088914401829243, "step": 364 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1355.0, "completions/max_terminated_length": 1355.0, "completions/mean_length": 278.140625, "completions/mean_terminated_length": 278.140625, "completions/min_length": 67.0, "completions/min_terminated_length": 67.0, "entropy": 0.3358258008956909, "epoch": 0.6460176991150443, "frac_reward_zero_std": 0.75, "grad_norm": 1.207788145507444, "kl": 0.014012495055794716, "learning_rate": 9.886850877581078e-07, "loss": 0.047, "num_tokens": 14908929.0, "reward": -0.0625, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": -0.0625, "rewards/decision_reward_func/std": 1.0059348344802856, "sampling/importance_sampling_ratio/max": 1.9750064611434937, "sampling/importance_sampling_ratio/mean": 1.0006589889526367, "sampling/importance_sampling_ratio/min": 0.37914684414863586, "sampling/sampling_logp_difference/max": 0.9698317050933838, "sampling/sampling_logp_difference/mean": 0.01827915385365486, "step": 365 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 396.0, "completions/max_terminated_length": 396.0, "completions/mean_length": 185.75, "completions/mean_terminated_length": 185.75, "completions/min_length": 70.0, "completions/min_terminated_length": 70.0, "entropy": 0.23873525857925415, "epoch": 0.647787610619469, "frac_reward_zero_std": 1.0, "grad_norm": 0.04689283269134896, "kl": 0.013006167486310005, "learning_rate": 9.885211419904903e-07, "loss": 0.0001, "num_tokens": 14930641.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.7595782279968262, "sampling/importance_sampling_ratio/mean": 0.9998046159744263, "sampling/importance_sampling_ratio/min": 0.3320881426334381, "sampling/sampling_logp_difference/max": 1.102354884147644, "sampling/sampling_logp_difference/mean": 0.01442858949303627, "step": 366 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1535.0, "completions/max_terminated_length": 1535.0, "completions/mean_length": 366.84375, "completions/mean_terminated_length": 366.84375, "completions/min_length": 89.0, "completions/min_terminated_length": 89.0, "entropy": 0.24768343567848206, "epoch": 0.6495575221238938, "frac_reward_zero_std": 0.75, "grad_norm": 1.1925588528124242, "kl": 0.01074390672147274, "learning_rate": 9.883560308064722e-07, "loss": -0.0988, "num_tokens": 14966711.0, "reward": 0.71875, "reward_std": 0.2561737596988678, "rewards/decision_reward_func/mean": 0.71875, "rewards/decision_reward_func/std": 0.7007648944854736, "sampling/importance_sampling_ratio/max": 1.8623803853988647, "sampling/importance_sampling_ratio/mean": 1.0000330209732056, "sampling/importance_sampling_ratio/min": 0.48893657326698303, "sampling/sampling_logp_difference/max": 0.7155225276947021, "sampling/sampling_logp_difference/mean": 0.013629784807562828, "step": 367 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2040.0, "completions/max_terminated_length": 2040.0, "completions/mean_length": 478.125, "completions/mean_terminated_length": 478.125, "completions/min_length": 162.0, "completions/min_terminated_length": 162.0, "entropy": 0.21380802989006042, "epoch": 0.6513274336283186, "frac_reward_zero_std": 0.75, "grad_norm": 0.8798287984037418, "kl": 0.008279915899038315, "learning_rate": 9.881897545999429e-07, "loss": -0.0277, "num_tokens": 15008655.0, "reward": 0.90625, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.90625, "rewards/decision_reward_func/std": 0.42608407139778137, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0006177425384521, "sampling/importance_sampling_ratio/min": 0.4824385941028595, "sampling/sampling_logp_difference/max": 0.7438559532165527, "sampling/sampling_logp_difference/mean": 0.011781584471464157, "step": 368 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 982.0, "completions/max_terminated_length": 982.0, "completions/mean_length": 400.03125, "completions/mean_terminated_length": 400.03125, "completions/min_length": 134.0, "completions/min_terminated_length": 134.0, "entropy": 0.28701162338256836, "epoch": 0.6530973451327433, "frac_reward_zero_std": 0.5, "grad_norm": 1.2315538933934043, "kl": 0.009177477099001408, "learning_rate": 9.880223137675707e-07, "loss": 0.0908, "num_tokens": 15045649.0, "reward": 0.9375, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.9375, "rewards/decision_reward_func/std": 0.35073620080947876, "sampling/importance_sampling_ratio/max": 1.5940182209014893, "sampling/importance_sampling_ratio/mean": 0.9998616576194763, "sampling/importance_sampling_ratio/min": 0.20063836872577667, "sampling/sampling_logp_difference/max": 1.6062511205673218, "sampling/sampling_logp_difference/mean": 0.015226058661937714, "step": 369 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2103.0, "completions/max_terminated_length": 2103.0, "completions/mean_length": 721.65625, "completions/mean_terminated_length": 721.65625, "completions/min_length": 178.0, "completions/min_terminated_length": 178.0, "entropy": 0.30868470668792725, "epoch": 0.6548672566371682, "frac_reward_zero_std": 0.25, "grad_norm": 1.0624088660511393, "kl": 0.00980185903608799, "learning_rate": 9.87853708708803e-07, "loss": 0.1651, "num_tokens": 15107611.0, "reward": 0.78125, "reward_std": 0.48935678601264954, "rewards/decision_reward_func/mean": 0.78125, "rewards/decision_reward_func/std": 0.6291528940200806, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0003867149353027, "sampling/importance_sampling_ratio/min": 0.494928777217865, "sampling/sampling_logp_difference/max": 0.8811900615692139, "sampling/sampling_logp_difference/mean": 0.015322331339120865, "step": 370 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1235.0, "completions/max_terminated_length": 1235.0, "completions/mean_length": 430.796875, "completions/mean_terminated_length": 430.796875, "completions/min_length": 167.0, "completions/min_terminated_length": 167.0, "entropy": 0.19472968578338623, "epoch": 0.6566371681415929, "frac_reward_zero_std": 0.5, "grad_norm": 1.432506722294396, "kl": 0.009185334667563438, "learning_rate": 9.876839398258639e-07, "loss": -0.1575, "num_tokens": 15146702.0, "reward": 0.53125, "reward_std": 0.4629635810852051, "rewards/decision_reward_func/mean": 0.53125, "rewards/decision_reward_func/std": 0.8539125919342041, "sampling/importance_sampling_ratio/max": 1.8625503778457642, "sampling/importance_sampling_ratio/mean": 1.0002678632736206, "sampling/importance_sampling_ratio/min": 0.2752915322780609, "sampling/sampling_logp_difference/max": 1.2899246215820312, "sampling/sampling_logp_difference/mean": 0.011498003266751766, "step": 371 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 713.0, "completions/max_terminated_length": 713.0, "completions/mean_length": 263.53125, "completions/mean_terminated_length": 263.53125, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.2756379246711731, "epoch": 0.6584070796460177, "frac_reward_zero_std": 0.75, "grad_norm": 1.4041305414732632, "kl": 0.012788272462785244, "learning_rate": 9.875130075237543e-07, "loss": -0.043, "num_tokens": 15174256.0, "reward": 0.875, "reward_std": 0.22360679507255554, "rewards/decision_reward_func/mean": 0.875, "rewards/decision_reward_func/std": 0.48795005679130554, "sampling/importance_sampling_ratio/max": 1.938795566558838, "sampling/importance_sampling_ratio/mean": 1.000593900680542, "sampling/importance_sampling_ratio/min": 0.4785598814487457, "sampling/sampling_logp_difference/max": 0.7369740009307861, "sampling/sampling_logp_difference/mean": 0.015793636441230774, "step": 372 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2766.0, "completions/max_terminated_length": 2766.0, "completions/mean_length": 501.984375, "completions/mean_terminated_length": 501.984375, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "entropy": 0.3392029106616974, "epoch": 0.6601769911504425, "frac_reward_zero_std": 0.75, "grad_norm": 0.553602846225426, "kl": 0.009268195368349552, "learning_rate": 9.873409122102503e-07, "loss": -0.0109, "num_tokens": 15219423.0, "reward": 0.53125, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.53125, "rewards/decision_reward_func/std": 0.8539125919342041, "sampling/importance_sampling_ratio/max": 1.5468438863754272, "sampling/importance_sampling_ratio/mean": 0.9998815059661865, "sampling/importance_sampling_ratio/min": 0.6136589050292969, "sampling/sampling_logp_difference/max": 0.48831605911254883, "sampling/sampling_logp_difference/mean": 0.016094153746962547, "step": 373 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 840.0, "completions/max_terminated_length": 840.0, "completions/mean_length": 256.203125, "completions/mean_terminated_length": 256.203125, "completions/min_length": 74.0, "completions/min_terminated_length": 74.0, "entropy": 0.3132248818874359, "epoch": 0.6619469026548672, "frac_reward_zero_std": 0.75, "grad_norm": 1.333393820024105, "kl": 0.014019379392266273, "learning_rate": 9.87167654295903e-07, "loss": 0.015, "num_tokens": 15248764.0, "reward": 0.59375, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.59375, "rewards/decision_reward_func/std": 0.8110105991363525, "sampling/importance_sampling_ratio/max": 1.630755066871643, "sampling/importance_sampling_ratio/mean": 1.0001893043518066, "sampling/importance_sampling_ratio/min": 0.32394397258758545, "sampling/sampling_logp_difference/max": 1.1271847486495972, "sampling/sampling_logp_difference/mean": 0.018926570191979408, "step": 374 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4926.0, "completions/max_terminated_length": 4926.0, "completions/mean_length": 755.53125, "completions/mean_terminated_length": 755.53125, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "entropy": 0.262677937746048, "epoch": 0.6637168141592921, "frac_reward_zero_std": 0.5, "grad_norm": 0.8494869654112495, "kl": 0.008351258933544159, "learning_rate": 9.869932341940358e-07, "loss": -0.013, "num_tokens": 15309294.0, "reward": 0.40625, "reward_std": 0.47978055477142334, "rewards/decision_reward_func/mean": 0.40625, "rewards/decision_reward_func/std": 0.9209855198860168, "sampling/importance_sampling_ratio/max": 1.6620516777038574, "sampling/importance_sampling_ratio/mean": 1.0000134706497192, "sampling/importance_sampling_ratio/min": 0.08421832323074341, "sampling/sampling_logp_difference/max": 2.4743428230285645, "sampling/sampling_logp_difference/mean": 0.013966895639896393, "step": 375 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 5000.0, "completions/max_terminated_length": 3303.0, "completions/mean_length": 800.609375, "completions/mean_terminated_length": 733.9524536132812, "completions/min_length": 152.0, "completions/min_terminated_length": 152.0, "entropy": 0.17556585371494293, "epoch": 0.6654867256637168, "frac_reward_zero_std": 0.5, "grad_norm": 0.6807006851941164, "kl": 0.009180280379951, "learning_rate": 9.868176523207463e-07, "loss": 0.077, "num_tokens": 15371285.0, "reward": -0.03125, "reward_std": 0.3723389506340027, "rewards/decision_reward_func/mean": -0.03125, "rewards/decision_reward_func/std": 1.0074130296707153, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9996539950370789, "sampling/importance_sampling_ratio/min": 0.36981886625289917, "sampling/sampling_logp_difference/max": 1.3775503635406494, "sampling/sampling_logp_difference/mean": 0.010441625490784645, "step": 376 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1859.0, "completions/max_terminated_length": 1859.0, "completions/mean_length": 304.5625, "completions/mean_terminated_length": 304.5625, "completions/min_length": 67.0, "completions/min_terminated_length": 67.0, "entropy": 0.17261387407779694, "epoch": 0.6672566371681415, "frac_reward_zero_std": 0.75, "grad_norm": 1.2486949567430803, "kl": 0.015468006953597069, "learning_rate": 9.86640909094902e-07, "loss": 0.0891, "num_tokens": 15399769.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.8626768589019775, "sampling/importance_sampling_ratio/mean": 0.999647855758667, "sampling/importance_sampling_ratio/min": 0.4437774121761322, "sampling/sampling_logp_difference/max": 0.8124321699142456, "sampling/sampling_logp_difference/mean": 0.012622671201825142, "step": 377 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 728.0, "completions/max_terminated_length": 728.0, "completions/mean_length": 290.34375, "completions/mean_terminated_length": 290.34375, "completions/min_length": 88.0, "completions/min_terminated_length": 88.0, "entropy": 0.18054424226284027, "epoch": 0.6690265486725664, "frac_reward_zero_std": 1.0, "grad_norm": 0.04150537125946567, "kl": 0.01121203601360321, "learning_rate": 9.864630049381424e-07, "loss": 0.0001, "num_tokens": 15428511.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5821890830993652, "sampling/importance_sampling_ratio/mean": 0.9995861649513245, "sampling/importance_sampling_ratio/min": 0.3619888424873352, "sampling/sampling_logp_difference/max": 1.0161418914794922, "sampling/sampling_logp_difference/mean": 0.012018736451864243, "step": 378 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3234.0, "completions/max_terminated_length": 3234.0, "completions/mean_length": 443.671875, "completions/mean_terminated_length": 443.671875, "completions/min_length": 71.0, "completions/min_terminated_length": 71.0, "entropy": 0.2470538169145584, "epoch": 0.6707964601769911, "frac_reward_zero_std": 0.5, "grad_norm": 1.7483173601362934, "kl": 0.010239788331091404, "learning_rate": 9.862839402748753e-07, "loss": 0.3392, "num_tokens": 15467754.0, "reward": 0.90625, "reward_std": 0.29578250646591187, "rewards/decision_reward_func/mean": 0.90625, "rewards/decision_reward_func/std": 0.42608407139778137, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0000256299972534, "sampling/importance_sampling_ratio/min": 0.3528807759284973, "sampling/sampling_logp_difference/max": 1.0416250228881836, "sampling/sampling_logp_difference/mean": 0.014839019626379013, "step": 379 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1550.0, "completions/max_terminated_length": 1550.0, "completions/mean_length": 373.796875, "completions/mean_terminated_length": 373.796875, "completions/min_length": 60.0, "completions/min_terminated_length": 60.0, "entropy": 0.18259303271770477, "epoch": 0.672566371681416, "frac_reward_zero_std": 0.5, "grad_norm": 1.3149904667553791, "kl": 0.014294959604740143, "learning_rate": 9.861037155322776e-07, "loss": -0.0423, "num_tokens": 15502125.0, "reward": -0.25, "reward_std": 0.4472135901451111, "rewards/decision_reward_func/mean": -0.25, "rewards/decision_reward_func/std": 0.9759001135826111, "sampling/importance_sampling_ratio/max": 1.915751576423645, "sampling/importance_sampling_ratio/mean": 0.9997811317443848, "sampling/importance_sampling_ratio/min": 0.459757000207901, "sampling/sampling_logp_difference/max": 0.7770571708679199, "sampling/sampling_logp_difference/mean": 0.01282742153853178, "step": 380 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2147.0, "completions/max_terminated_length": 2147.0, "completions/mean_length": 410.015625, "completions/mean_terminated_length": 410.015625, "completions/min_length": 78.0, "completions/min_terminated_length": 78.0, "entropy": 0.21277083456516266, "epoch": 0.6743362831858407, "frac_reward_zero_std": 0.5, "grad_norm": 1.3968895595780355, "kl": 0.011500532738864422, "learning_rate": 9.859223311402936e-07, "loss": 0.049, "num_tokens": 15539150.0, "reward": 0.5, "reward_std": 0.34156501293182373, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.910426378250122, "sampling/importance_sampling_ratio/mean": 1.000016689300537, "sampling/importance_sampling_ratio/min": 0.5484616160392761, "sampling/sampling_logp_difference/max": 0.6473264694213867, "sampling/sampling_logp_difference/mean": 0.01337532140314579, "step": 381 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 523.0, "completions/max_terminated_length": 523.0, "completions/mean_length": 202.515625, "completions/mean_terminated_length": 202.515625, "completions/min_length": 78.0, "completions/min_terminated_length": 78.0, "entropy": 0.2012748122215271, "epoch": 0.6761061946902654, "frac_reward_zero_std": 0.5, "grad_norm": 2.546414375361651, "kl": 0.012247953563928604, "learning_rate": 9.85739787531634e-07, "loss": 0.0707, "num_tokens": 15562879.0, "reward": 0.875, "reward_std": 0.3265564441680908, "rewards/decision_reward_func/mean": 0.875, "rewards/decision_reward_func/std": 0.48795005679130554, "sampling/importance_sampling_ratio/max": 1.755321741104126, "sampling/importance_sampling_ratio/mean": 0.9993323087692261, "sampling/importance_sampling_ratio/min": 0.16336990892887115, "sampling/sampling_logp_difference/max": 1.8117382526397705, "sampling/sampling_logp_difference/mean": 0.01337825134396553, "step": 382 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1181.0, "completions/max_terminated_length": 1181.0, "completions/mean_length": 430.828125, "completions/mean_terminated_length": 430.828125, "completions/min_length": 76.0, "completions/min_terminated_length": 76.0, "entropy": 0.21200969815254211, "epoch": 0.6778761061946903, "frac_reward_zero_std": 0.75, "grad_norm": 0.8332265757566012, "kl": 0.01017652079463005, "learning_rate": 9.85556085141775e-07, "loss": -0.0313, "num_tokens": 15600788.0, "reward": 0.4375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.4375, "rewards/decision_reward_func/std": 0.9063270092010498, "sampling/importance_sampling_ratio/max": 1.6951402425765991, "sampling/importance_sampling_ratio/mean": 1.000274896621704, "sampling/importance_sampling_ratio/min": 0.07603034377098083, "sampling/sampling_logp_difference/max": 2.576622724533081, "sampling/sampling_logp_difference/mean": 0.012311737984418869, "step": 383 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4662.0, "completions/max_terminated_length": 4662.0, "completions/mean_length": 712.546875, "completions/mean_terminated_length": 712.546875, "completions/min_length": 67.0, "completions/min_terminated_length": 67.0, "entropy": 0.14502698183059692, "epoch": 0.679646017699115, "frac_reward_zero_std": 1.0, "grad_norm": 0.027876288304114063, "kl": 0.012645444832742214, "learning_rate": 9.853712244089572e-07, "loss": 0.0001, "num_tokens": 15657447.0, "reward": 0.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0006616115570068, "sampling/importance_sampling_ratio/min": 0.4300112724304199, "sampling/sampling_logp_difference/max": 0.8439438343048096, "sampling/sampling_logp_difference/mean": 0.01056058332324028, "step": 384 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 965.0, "completions/max_terminated_length": 965.0, "completions/mean_length": 282.40625, "completions/mean_terminated_length": 282.40625, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "entropy": 0.18914933502674103, "epoch": 0.6814159292035398, "frac_reward_zero_std": 0.75, "grad_norm": 1.267563552208288, "kl": 0.015264193527400494, "learning_rate": 9.851852057741844e-07, "loss": -0.0322, "num_tokens": 15688321.0, "reward": 0.875, "reward_std": 0.22360679507255554, "rewards/decision_reward_func/mean": 0.875, "rewards/decision_reward_func/std": 0.48795005679130554, "sampling/importance_sampling_ratio/max": 1.755321741104126, "sampling/importance_sampling_ratio/mean": 1.0002809762954712, "sampling/importance_sampling_ratio/min": 0.3985530436038971, "sampling/sampling_logp_difference/max": 0.919914722442627, "sampling/sampling_logp_difference/mean": 0.01284891739487648, "step": 385 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4997.0, "completions/max_terminated_length": 4997.0, "completions/mean_length": 687.84375, "completions/mean_terminated_length": 687.84375, "completions/min_length": 71.0, "completions/min_terminated_length": 71.0, "entropy": 0.16814549267292023, "epoch": 0.6831858407079646, "frac_reward_zero_std": 0.5, "grad_norm": 0.819117943689531, "kl": 0.00946043711155653, "learning_rate": 9.849980296812231e-07, "loss": 0.1985, "num_tokens": 15743047.0, "reward": 0.03125, "reward_std": 0.42516323924064636, "rewards/decision_reward_func/mean": 0.03125, "rewards/decision_reward_func/std": 1.0074130296707153, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0002700090408325, "sampling/importance_sampling_ratio/min": 0.3642270565032959, "sampling/sampling_logp_difference/max": 1.6484861373901367, "sampling/sampling_logp_difference/mean": 0.010347165167331696, "step": 386 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2750.0, "completions/max_terminated_length": 2750.0, "completions/mean_length": 650.15625, "completions/mean_terminated_length": 650.15625, "completions/min_length": 135.0, "completions/min_terminated_length": 135.0, "entropy": 0.23070669174194336, "epoch": 0.6849557522123894, "frac_reward_zero_std": 0.25, "grad_norm": 1.3896187369442379, "kl": 0.012162264436483383, "learning_rate": 9.848096965766002e-07, "loss": 0.1018, "num_tokens": 15795985.0, "reward": 0.53125, "reward_std": 0.6223389506340027, "rewards/decision_reward_func/mean": 0.53125, "rewards/decision_reward_func/std": 0.8539125919342041, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.000406265258789, "sampling/importance_sampling_ratio/min": 0.4234336018562317, "sampling/sampling_logp_difference/max": 2.120561361312866, "sampling/sampling_logp_difference/mean": 0.012703136540949345, "step": 387 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2636.0, "completions/max_terminated_length": 2636.0, "completions/mean_length": 358.109375, "completions/mean_terminated_length": 358.109375, "completions/min_length": 79.0, "completions/min_terminated_length": 79.0, "entropy": 0.20230448246002197, "epoch": 0.6867256637168142, "frac_reward_zero_std": 0.75, "grad_norm": 0.9589898195132823, "kl": 0.013702154159545898, "learning_rate": 9.846202069096038e-07, "loss": -0.0559, "num_tokens": 15830648.0, "reward": 0.90625, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.90625, "rewards/decision_reward_func/std": 0.42608407139778137, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.000317096710205, "sampling/importance_sampling_ratio/min": 0.5097154974937439, "sampling/sampling_logp_difference/max": 0.9375678300857544, "sampling/sampling_logp_difference/mean": 0.012061585672199726, "step": 388 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3523.0, "completions/max_terminated_length": 3523.0, "completions/mean_length": 370.015625, "completions/mean_terminated_length": 370.015625, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "entropy": 0.30893707275390625, "epoch": 0.6884955752212389, "frac_reward_zero_std": 0.5, "grad_norm": 1.3969593855222555, "kl": 0.013345591723918915, "learning_rate": 9.844295611322803e-07, "loss": -0.0911, "num_tokens": 15868889.0, "reward": 0.125, "reward_std": 0.36435678601264954, "rewards/decision_reward_func/mean": 0.125, "rewards/decision_reward_func/std": 1.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0000529289245605, "sampling/importance_sampling_ratio/min": 0.47059792280197144, "sampling/sampling_logp_difference/max": 1.129061222076416, "sampling/sampling_logp_difference/mean": 0.015513567253947258, "step": 389 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2938.0, "completions/max_terminated_length": 2938.0, "completions/mean_length": 560.921875, "completions/mean_terminated_length": 560.921875, "completions/min_length": 74.0, "completions/min_terminated_length": 74.0, "entropy": 0.2242632359266281, "epoch": 0.6902654867256637, "frac_reward_zero_std": 0.25, "grad_norm": 1.3374216448997203, "kl": 0.011544520035386086, "learning_rate": 9.842377596994344e-07, "loss": 0.0789, "num_tokens": 15917348.0, "reward": 0.8125, "reward_std": 0.4973389506340027, "rewards/decision_reward_func/mean": 0.8125, "rewards/decision_reward_func/std": 0.5875696539878845, "sampling/importance_sampling_ratio/max": 1.713798999786377, "sampling/importance_sampling_ratio/mean": 0.9998651146888733, "sampling/importance_sampling_ratio/min": 0.3722725808620453, "sampling/sampling_logp_difference/max": 0.9881290197372437, "sampling/sampling_logp_difference/mean": 0.013481417670845985, "step": 390 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 662.0, "completions/max_terminated_length": 662.0, "completions/mean_length": 274.40625, "completions/mean_terminated_length": 274.40625, "completions/min_length": 71.0, "completions/min_terminated_length": 71.0, "entropy": 0.21443676948547363, "epoch": 0.6920353982300885, "frac_reward_zero_std": 0.25, "grad_norm": 2.630487226522862, "kl": 0.02395898476243019, "learning_rate": 9.84044803068628e-07, "loss": 0.0789, "num_tokens": 15945646.0, "reward": 0.78125, "reward_std": 0.48935678601264954, "rewards/decision_reward_func/mean": 0.78125, "rewards/decision_reward_func/std": 0.6291528940200806, "sampling/importance_sampling_ratio/max": 1.6892169713974, "sampling/importance_sampling_ratio/mean": 0.9997508525848389, "sampling/importance_sampling_ratio/min": 0.2846265733242035, "sampling/sampling_logp_difference/max": 1.2565772533416748, "sampling/sampling_logp_difference/mean": 0.015896450728178024, "step": 391 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2035.0, "completions/max_terminated_length": 2035.0, "completions/mean_length": 505.71875, "completions/mean_terminated_length": 505.71875, "completions/min_length": 181.0, "completions/min_terminated_length": 181.0, "entropy": 0.28609830141067505, "epoch": 0.6938053097345133, "frac_reward_zero_std": 0.5, "grad_norm": 1.0282365593921183, "kl": 0.014391066506505013, "learning_rate": 9.838506917001784e-07, "loss": -0.0757, "num_tokens": 15988428.0, "reward": 0.5625, "reward_std": 0.3265564441680908, "rewards/decision_reward_func/mean": 0.5625, "rewards/decision_reward_func/std": 0.8333333730697632, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0002038478851318, "sampling/importance_sampling_ratio/min": 0.3368488848209381, "sampling/sampling_logp_difference/max": 1.088120937347412, "sampling/sampling_logp_difference/mean": 0.015157629735767841, "step": 392 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1699.0, "completions/max_terminated_length": 1699.0, "completions/mean_length": 299.296875, "completions/mean_terminated_length": 299.296875, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "entropy": 0.16830375790596008, "epoch": 0.695575221238938, "frac_reward_zero_std": 1.0, "grad_norm": 0.09441871773531267, "kl": 0.01799730211496353, "learning_rate": 9.836554260571577e-07, "loss": 0.0002, "num_tokens": 16017935.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.627285361289978, "sampling/importance_sampling_ratio/mean": 0.9996742606163025, "sampling/importance_sampling_ratio/min": 0.41789835691452026, "sampling/sampling_logp_difference/max": 0.8725171089172363, "sampling/sampling_logp_difference/mean": 0.010983328334987164, "step": 393 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2253.0, "completions/max_terminated_length": 2253.0, "completions/mean_length": 277.953125, "completions/mean_terminated_length": 277.953125, "completions/min_length": 90.0, "completions/min_terminated_length": 90.0, "entropy": 0.16114415228366852, "epoch": 0.6973451327433628, "frac_reward_zero_std": 0.75, "grad_norm": 1.3159064586046563, "kl": 0.02458821050822735, "learning_rate": 9.834590066053917e-07, "loss": 0.0145, "num_tokens": 16046764.0, "reward": 0.90625, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.90625, "rewards/decision_reward_func/std": 0.42608407139778137, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0000511407852173, "sampling/importance_sampling_ratio/min": 0.5287020802497864, "sampling/sampling_logp_difference/max": 0.726496696472168, "sampling/sampling_logp_difference/mean": 0.011962441727519035, "step": 394 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1939.0, "completions/max_terminated_length": 1939.0, "completions/mean_length": 317.359375, "completions/mean_terminated_length": 317.359375, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "entropy": 0.22434422373771667, "epoch": 0.6991150442477876, "frac_reward_zero_std": 0.5, "grad_norm": 1.6684125354156674, "kl": 0.020960889756679535, "learning_rate": 9.832614338134595e-07, "loss": 0.0779, "num_tokens": 16075907.0, "reward": 0.6875, "reward_std": 0.4577302038669586, "rewards/decision_reward_func/mean": 0.6875, "rewards/decision_reward_func/std": 0.7319250702857971, "sampling/importance_sampling_ratio/max": 1.7171355485916138, "sampling/importance_sampling_ratio/mean": 0.9998688101768494, "sampling/importance_sampling_ratio/min": 0.4562278687953949, "sampling/sampling_logp_difference/max": 0.7847628593444824, "sampling/sampling_logp_difference/mean": 0.015385664999485016, "step": 395 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 237.0, "completions/max_terminated_length": 237.0, "completions/mean_length": 165.859375, "completions/mean_terminated_length": 165.859375, "completions/min_length": 90.0, "completions/min_terminated_length": 90.0, "entropy": 0.10077399015426636, "epoch": 0.7008849557522124, "frac_reward_zero_std": 1.0, "grad_norm": 0.10457712170976724, "kl": 0.02222380042076111, "learning_rate": 9.8306270815269e-07, "loss": 0.0002, "num_tokens": 16096426.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.6574574708938599, "sampling/importance_sampling_ratio/mean": 1.0001741647720337, "sampling/importance_sampling_ratio/min": 0.30977651476860046, "sampling/sampling_logp_difference/max": 1.17190420627594, "sampling/sampling_logp_difference/mean": 0.010229747742414474, "step": 396 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 414.0, "completions/max_terminated_length": 414.0, "completions/mean_length": 176.390625, "completions/mean_terminated_length": 176.390625, "completions/min_length": 64.0, "completions/min_terminated_length": 64.0, "entropy": 0.21205547451972961, "epoch": 0.7026548672566372, "frac_reward_zero_std": 0.75, "grad_norm": 1.6541744314271525, "kl": 0.031553469598293304, "learning_rate": 9.828628300971638e-07, "loss": -0.0127, "num_tokens": 16118547.0, "reward": 0.46875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 1.857185959815979, "sampling/importance_sampling_ratio/mean": 0.9998997449874878, "sampling/importance_sampling_ratio/min": 0.47603273391723633, "sampling/sampling_logp_difference/max": 0.7422686219215393, "sampling/sampling_logp_difference/mean": 0.01476391963660717, "step": 397 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 531.0, "completions/max_terminated_length": 531.0, "completions/mean_length": 217.203125, "completions/mean_terminated_length": 217.203125, "completions/min_length": 94.0, "completions/min_terminated_length": 94.0, "entropy": 0.25072354078292847, "epoch": 0.7044247787610619, "frac_reward_zero_std": 0.75, "grad_norm": 2.0187287910593334, "kl": 0.021776171401143074, "learning_rate": 9.826618001237099e-07, "loss": -0.0736, "num_tokens": 16143296.0, "reward": 0.5625, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.5625, "rewards/decision_reward_func/std": 0.8333333730697632, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9998471736907959, "sampling/importance_sampling_ratio/min": 0.3793991506099701, "sampling/sampling_logp_difference/max": 0.9712977409362793, "sampling/sampling_logp_difference/mean": 0.017043638974428177, "step": 398 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1684.0, "completions/max_terminated_length": 1684.0, "completions/mean_length": 277.65625, "completions/mean_terminated_length": 277.65625, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.19806835055351257, "epoch": 0.7061946902654868, "frac_reward_zero_std": 0.5, "grad_norm": 1.8397901163934267, "kl": 0.017188621684908867, "learning_rate": 9.82459618711906e-07, "loss": 0.0054, "num_tokens": 16171978.0, "reward": 0.5625, "reward_std": 0.3943893015384674, "rewards/decision_reward_func/mean": 0.5625, "rewards/decision_reward_func/std": 0.8333333730697632, "sampling/importance_sampling_ratio/max": 1.9543622732162476, "sampling/importance_sampling_ratio/mean": 0.9999783635139465, "sampling/importance_sampling_ratio/min": 0.5723154544830322, "sampling/sampling_logp_difference/max": 0.6700639724731445, "sampling/sampling_logp_difference/mean": 0.012082374654710293, "step": 399 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 388.0, "completions/max_terminated_length": 388.0, "completions/mean_length": 178.390625, "completions/mean_terminated_length": 178.390625, "completions/min_length": 69.0, "completions/min_terminated_length": 69.0, "entropy": 0.19672617316246033, "epoch": 0.7079646017699115, "frac_reward_zero_std": 0.75, "grad_norm": 1.6628841955141533, "kl": 0.01761125773191452, "learning_rate": 9.822562863440755e-07, "loss": 0.016, "num_tokens": 16197635.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0007708072662354, "sampling/importance_sampling_ratio/min": 0.5239951610565186, "sampling/sampling_logp_difference/max": 0.8979504108428955, "sampling/sampling_logp_difference/mean": 0.013609852641820908, "step": 400 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1069.0, "completions/max_terminated_length": 1069.0, "completions/mean_length": 351.109375, "completions/mean_terminated_length": 351.109375, "completions/min_length": 133.0, "completions/min_terminated_length": 133.0, "entropy": 0.2183641642332077, "epoch": 0.7097345132743362, "frac_reward_zero_std": 0.75, "grad_norm": 1.0015984577332653, "kl": 0.016200590878725052, "learning_rate": 9.820518035052889e-07, "loss": 0.0087, "num_tokens": 16230954.0, "reward": 0.1875, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.1875, "rewards/decision_reward_func/std": 0.9900296926498413, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9995470643043518, "sampling/importance_sampling_ratio/min": 0.45822873711586, "sampling/sampling_logp_difference/max": 0.8086236715316772, "sampling/sampling_logp_difference/mean": 0.013386225327849388, "step": 401 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2918.0, "completions/max_terminated_length": 2918.0, "completions/mean_length": 240.34375, "completions/mean_terminated_length": 240.34375, "completions/min_length": 81.0, "completions/min_terminated_length": 81.0, "entropy": 0.14552384614944458, "epoch": 0.7115044247787611, "frac_reward_zero_std": 0.75, "grad_norm": 1.421226156922292, "kl": 0.01397843286395073, "learning_rate": 9.818461706833602e-07, "loss": -0.0031, "num_tokens": 16255840.0, "reward": 0.90625, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.90625, "rewards/decision_reward_func/std": 0.42608407139778137, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.000092625617981, "sampling/importance_sampling_ratio/min": 0.38532912731170654, "sampling/sampling_logp_difference/max": 0.9536573886871338, "sampling/sampling_logp_difference/mean": 0.010881468653678894, "step": 402 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1447.0, "completions/max_terminated_length": 1447.0, "completions/mean_length": 359.609375, "completions/mean_terminated_length": 359.609375, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "entropy": 0.2025105357170105, "epoch": 0.7132743362831858, "frac_reward_zero_std": 0.75, "grad_norm": 0.9600712112148655, "kl": 0.014710599556565285, "learning_rate": 9.816393883688475e-07, "loss": -0.0748, "num_tokens": 16289479.0, "reward": 0.9375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.9375, "rewards/decision_reward_func/std": 0.35073620080947876, "sampling/importance_sampling_ratio/max": 1.8008395433425903, "sampling/importance_sampling_ratio/mean": 1.0002391338348389, "sampling/importance_sampling_ratio/min": 0.35800760984420776, "sampling/sampling_logp_difference/max": 1.0272010564804077, "sampling/sampling_logp_difference/mean": 0.013284672051668167, "step": 403 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 300.0, "completions/max_terminated_length": 300.0, "completions/mean_length": 165.296875, "completions/mean_terminated_length": 165.296875, "completions/min_length": 66.0, "completions/min_terminated_length": 66.0, "entropy": 0.21070364117622375, "epoch": 0.7150442477876107, "frac_reward_zero_std": 0.75, "grad_norm": 2.195679156864984, "kl": 0.021444156765937805, "learning_rate": 9.814314570550505e-07, "loss": -0.0043, "num_tokens": 16315594.0, "reward": 0.4375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.4375, "rewards/decision_reward_func/std": 0.9063270092010498, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0003331899642944, "sampling/importance_sampling_ratio/min": 0.3985528349876404, "sampling/sampling_logp_difference/max": 0.9199151992797852, "sampling/sampling_logp_difference/mean": 0.01587303727865219, "step": 404 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1349.0, "completions/max_terminated_length": 1349.0, "completions/mean_length": 360.875, "completions/mean_terminated_length": 360.875, "completions/min_length": 129.0, "completions/min_terminated_length": 129.0, "entropy": 0.31480875611305237, "epoch": 0.7168141592920354, "frac_reward_zero_std": 1.0, "grad_norm": 0.05872088387382137, "kl": 0.015601933933794498, "learning_rate": 9.812223772380105e-07, "loss": 0.0001, "num_tokens": 16353938.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5940091609954834, "sampling/importance_sampling_ratio/mean": 0.9994897842407227, "sampling/importance_sampling_ratio/min": 0.3239595890045166, "sampling/sampling_logp_difference/max": 1.127136468887329, "sampling/sampling_logp_difference/mean": 0.017481831833720207, "step": 405 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1778.0, "completions/max_terminated_length": 1778.0, "completions/mean_length": 276.5, "completions/mean_terminated_length": 276.5, "completions/min_length": 79.0, "completions/min_terminated_length": 79.0, "entropy": 0.16573050618171692, "epoch": 0.7185840707964601, "frac_reward_zero_std": 0.75, "grad_norm": 1.7445620014439338, "kl": 0.030939366668462753, "learning_rate": 9.810121494165087e-07, "loss": 0.1416, "num_tokens": 16382066.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0004956722259521, "sampling/importance_sampling_ratio/min": 0.6063432693481445, "sampling/sampling_logp_difference/max": 0.8068802356719971, "sampling/sampling_logp_difference/mean": 0.01262989453971386, "step": 406 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1821.0, "completions/max_terminated_length": 1821.0, "completions/mean_length": 463.765625, "completions/mean_terminated_length": 463.765625, "completions/min_length": 85.0, "completions/min_terminated_length": 85.0, "entropy": 0.21868783235549927, "epoch": 0.720353982300885, "frac_reward_zero_std": 0.5, "grad_norm": 1.3873963334588708, "kl": 0.012951729819178581, "learning_rate": 9.808007740920645e-07, "loss": -0.0227, "num_tokens": 16421939.0, "reward": -0.5, "reward_std": 0.25, "rewards/decision_reward_func/mean": -0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0005403757095337, "sampling/importance_sampling_ratio/min": 0.3409053087234497, "sampling/sampling_logp_difference/max": 1.0761505365371704, "sampling/sampling_logp_difference/mean": 0.012595891952514648, "step": 407 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 694.0, "completions/max_terminated_length": 694.0, "completions/mean_length": 272.328125, "completions/mean_terminated_length": 272.328125, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "entropy": 0.1695495992898941, "epoch": 0.7221238938053097, "frac_reward_zero_std": 0.5, "grad_norm": 1.7224004313151264, "kl": 0.011198390275239944, "learning_rate": 9.80588251768935e-07, "loss": 0.0428, "num_tokens": 16450056.0, "reward": 0.90625, "reward_std": 0.29578250646591187, "rewards/decision_reward_func/mean": 0.90625, "rewards/decision_reward_func/std": 0.42608407139778137, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0004069805145264, "sampling/importance_sampling_ratio/min": 0.2243276685476303, "sampling/sampling_logp_difference/max": 1.49464750289917, "sampling/sampling_logp_difference/mean": 0.011562224477529526, "step": 408 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 5000.0, "completions/max_terminated_length": 964.0, "completions/mean_length": 377.34375, "completions/mean_terminated_length": 303.96826171875, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.24059130251407623, "epoch": 0.7238938053097345, "frac_reward_zero_std": 1.0, "grad_norm": 0.028185610419519908, "kl": 0.01371281873434782, "learning_rate": 9.803745829541137e-07, "loss": 0.0001, "num_tokens": 16486446.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.000351905822754, "sampling/importance_sampling_ratio/min": 0.3853738009929657, "sampling/sampling_logp_difference/max": 1.1979022026062012, "sampling/sampling_logp_difference/mean": 0.013200994580984116, "step": 409 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1052.0, "completions/max_terminated_length": 1052.0, "completions/mean_length": 373.8125, "completions/mean_terminated_length": 373.8125, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "entropy": 0.14997084438800812, "epoch": 0.7256637168141593, "frac_reward_zero_std": 0.75, "grad_norm": 1.1219706304142658, "kl": 0.014573777094483376, "learning_rate": 9.801597681573289e-07, "loss": 0.0541, "num_tokens": 16519842.0, "reward": 0.90625, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.90625, "rewards/decision_reward_func/std": 0.42608407139778137, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9997867345809937, "sampling/importance_sampling_ratio/min": 0.3184911012649536, "sampling/sampling_logp_difference/max": 1.1441607475280762, "sampling/sampling_logp_difference/mean": 0.011845771223306656, "step": 410 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 869.0, "completions/max_terminated_length": 869.0, "completions/mean_length": 301.59375, "completions/mean_terminated_length": 301.59375, "completions/min_length": 77.0, "completions/min_terminated_length": 77.0, "entropy": 0.1510602831840515, "epoch": 0.727433628318584, "frac_reward_zero_std": 0.5, "grad_norm": 1.3086002352845445, "kl": 0.015622087754309177, "learning_rate": 9.799438078910432e-07, "loss": -0.0451, "num_tokens": 16548680.0, "reward": 0.75, "reward_std": 0.42078250646591187, "rewards/decision_reward_func/mean": 0.75, "rewards/decision_reward_func/std": 0.6666666865348816, "sampling/importance_sampling_ratio/max": 1.6840492486953735, "sampling/importance_sampling_ratio/mean": 1.0000107288360596, "sampling/importance_sampling_ratio/min": 0.34171298146247864, "sampling/sampling_logp_difference/max": 1.0737841129302979, "sampling/sampling_logp_difference/mean": 0.011462142691016197, "step": 411 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1342.0, "completions/max_terminated_length": 1342.0, "completions/mean_length": 407.6875, "completions/mean_terminated_length": 407.6875, "completions/min_length": 156.0, "completions/min_terminated_length": 156.0, "entropy": 0.2473008930683136, "epoch": 0.7292035398230089, "frac_reward_zero_std": 0.75, "grad_norm": 0.8340980503181993, "kl": 0.010714706964790821, "learning_rate": 9.797267026704514e-07, "loss": -0.0048, "num_tokens": 16587044.0, "reward": 0.53125, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.53125, "rewards/decision_reward_func/std": 0.8539125919342041, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9999884366989136, "sampling/importance_sampling_ratio/min": 0.5020687580108643, "sampling/sampling_logp_difference/max": 0.9295365810394287, "sampling/sampling_logp_difference/mean": 0.01348821073770523, "step": 412 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 525.0, "completions/max_terminated_length": 525.0, "completions/mean_length": 257.609375, "completions/mean_terminated_length": 257.609375, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "entropy": 0.20970673859119415, "epoch": 0.7309734513274336, "frac_reward_zero_std": 1.0, "grad_norm": 0.05568744728047003, "kl": 0.017753563821315765, "learning_rate": 9.7950845301348e-07, "loss": 0.0002, "num_tokens": 16616315.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.000464677810669, "sampling/importance_sampling_ratio/min": 0.13142473995685577, "sampling/sampling_logp_difference/max": 2.0293209552764893, "sampling/sampling_logp_difference/mean": 0.013105915859341621, "step": 413 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3391.0, "completions/max_terminated_length": 3391.0, "completions/mean_length": 551.25, "completions/mean_terminated_length": 551.25, "completions/min_length": 155.0, "completions/min_terminated_length": 155.0, "entropy": 0.3271552324295044, "epoch": 0.7327433628318584, "frac_reward_zero_std": 0.75, "grad_norm": 0.8767594028779837, "kl": 0.013495013117790222, "learning_rate": 9.792890594407855e-07, "loss": 0.0026, "num_tokens": 16669419.0, "reward": 0.0625, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.0625, "rewards/decision_reward_func/std": 1.0059348344802856, "sampling/importance_sampling_ratio/max": 1.6690562963485718, "sampling/importance_sampling_ratio/mean": 1.0001423358917236, "sampling/importance_sampling_ratio/min": 0.5452139377593994, "sampling/sampling_logp_difference/max": 0.6065769195556641, "sampling/sampling_logp_difference/mean": 0.015598084777593613, "step": 414 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2221.0, "completions/max_terminated_length": 2221.0, "completions/mean_length": 410.046875, "completions/mean_terminated_length": 410.046875, "completions/min_length": 151.0, "completions/min_terminated_length": 151.0, "entropy": 0.2771804928779602, "epoch": 0.7345132743362832, "frac_reward_zero_std": 0.75, "grad_norm": 0.9268310137637527, "kl": 0.01602480560541153, "learning_rate": 9.790685224757532e-07, "loss": -0.0648, "num_tokens": 16712398.0, "reward": 0.9375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.9375, "rewards/decision_reward_func/std": 0.35073620080947876, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9999347925186157, "sampling/importance_sampling_ratio/min": 0.48236799240112305, "sampling/sampling_logp_difference/max": 0.7290480136871338, "sampling/sampling_logp_difference/mean": 0.014763117767870426, "step": 415 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2185.0, "completions/max_terminated_length": 2185.0, "completions/mean_length": 618.9375, "completions/mean_terminated_length": 618.9375, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "entropy": 0.2050457000732422, "epoch": 0.736283185840708, "frac_reward_zero_std": 0.75, "grad_norm": 0.6048331277141861, "kl": 0.014558937400579453, "learning_rate": 9.788468426444967e-07, "loss": 0.0317, "num_tokens": 16763882.0, "reward": 0.1875, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.1875, "rewards/decision_reward_func/std": 0.9900296926498413, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0004180669784546, "sampling/importance_sampling_ratio/min": 0.3941483795642853, "sampling/sampling_logp_difference/max": 0.931027889251709, "sampling/sampling_logp_difference/mean": 0.012745855376124382, "step": 416 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1212.0, "completions/max_terminated_length": 1212.0, "completions/mean_length": 412.9375, "completions/mean_terminated_length": 412.9375, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "entropy": 0.22474998235702515, "epoch": 0.7380530973451327, "frac_reward_zero_std": 0.75, "grad_norm": 0.8115612406637343, "kl": 0.015401540324091911, "learning_rate": 9.786240204758552e-07, "loss": -0.0076, "num_tokens": 16802422.0, "reward": 0.875, "reward_std": 0.22360679507255554, "rewards/decision_reward_func/mean": 0.875, "rewards/decision_reward_func/std": 0.48795005679130554, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9995291829109192, "sampling/importance_sampling_ratio/min": 0.42388641834259033, "sampling/sampling_logp_difference/max": 0.8582897186279297, "sampling/sampling_logp_difference/mean": 0.013247748836874962, "step": 417 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 977.0, "completions/max_terminated_length": 977.0, "completions/mean_length": 261.0, "completions/mean_terminated_length": 261.0, "completions/min_length": 77.0, "completions/min_terminated_length": 77.0, "entropy": 0.3226742744445801, "epoch": 0.7398230088495575, "frac_reward_zero_std": 1.0, "grad_norm": 0.12398395012847772, "kl": 0.024023059755563736, "learning_rate": 9.784000565013933e-07, "loss": 0.0002, "num_tokens": 16831286.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.746854543685913, "sampling/importance_sampling_ratio/mean": 0.9993400573730469, "sampling/importance_sampling_ratio/min": 0.3789941966533661, "sampling/sampling_logp_difference/max": 0.9702343940734863, "sampling/sampling_logp_difference/mean": 0.018782150000333786, "step": 418 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2152.0, "completions/max_terminated_length": 2152.0, "completions/mean_length": 402.84375, "completions/mean_terminated_length": 402.84375, "completions/min_length": 151.0, "completions/min_terminated_length": 151.0, "entropy": 0.25371700525283813, "epoch": 0.7415929203539823, "frac_reward_zero_std": 0.75, "grad_norm": 0.7969442903422961, "kl": 0.01650291681289673, "learning_rate": 9.781749512553998e-07, "loss": -0.0363, "num_tokens": 16870060.0, "reward": 0.4375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.4375, "rewards/decision_reward_func/std": 0.9063270092010498, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9999105930328369, "sampling/importance_sampling_ratio/min": 0.4762580990791321, "sampling/sampling_logp_difference/max": 0.8771946430206299, "sampling/sampling_logp_difference/mean": 0.014303840696811676, "step": 419 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1036.0, "completions/max_terminated_length": 1036.0, "completions/mean_length": 362.015625, "completions/mean_terminated_length": 362.015625, "completions/min_length": 131.0, "completions/min_terminated_length": 131.0, "entropy": 0.1400907039642334, "epoch": 0.7433628318584071, "frac_reward_zero_std": 0.75, "grad_norm": 0.9586514826989071, "kl": 0.018355652689933777, "learning_rate": 9.779487052748863e-07, "loss": -0.0101, "num_tokens": 16902749.0, "reward": 0.375, "reward_std": 0.22360679507255554, "rewards/decision_reward_func/mean": 0.375, "rewards/decision_reward_func/std": 0.934353232383728, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9999150633811951, "sampling/importance_sampling_ratio/min": 0.37231311202049255, "sampling/sampling_logp_difference/max": 0.9880200624465942, "sampling/sampling_logp_difference/mean": 0.010286588221788406, "step": 420 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 622.0, "completions/max_terminated_length": 622.0, "completions/mean_length": 189.796875, "completions/mean_terminated_length": 189.796875, "completions/min_length": 82.0, "completions/min_terminated_length": 82.0, "entropy": 0.14167192578315735, "epoch": 0.7451327433628319, "frac_reward_zero_std": 1.0, "grad_norm": 0.0941625253445374, "kl": 0.02602040208876133, "learning_rate": 9.777213190995847e-07, "loss": 0.0002, "num_tokens": 16924336.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.6988407373428345, "sampling/importance_sampling_ratio/mean": 0.9998562335968018, "sampling/importance_sampling_ratio/min": 0.49110618233680725, "sampling/sampling_logp_difference/max": 0.711094856262207, "sampling/sampling_logp_difference/mean": 0.013465171679854393, "step": 421 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 467.0, "completions/max_terminated_length": 467.0, "completions/mean_length": 260.46875, "completions/mean_terminated_length": 260.46875, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "entropy": 0.13782721757888794, "epoch": 0.7469026548672566, "frac_reward_zero_std": 0.75, "grad_norm": 1.4626125356704403, "kl": 0.015280244871973991, "learning_rate": 9.774927932719482e-07, "loss": 0.0186, "num_tokens": 16950590.0, "reward": 0.375, "reward_std": 0.22360679507255554, "rewards/decision_reward_func/mean": 0.375, "rewards/decision_reward_func/std": 0.934353232383728, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0003206729888916, "sampling/importance_sampling_ratio/min": 0.48238638043403625, "sampling/sampling_logp_difference/max": 0.8145129680633545, "sampling/sampling_logp_difference/mean": 0.010800705291330814, "step": 422 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2206.0, "completions/max_terminated_length": 2206.0, "completions/mean_length": 479.4375, "completions/mean_terminated_length": 479.4375, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "entropy": 0.14082644879817963, "epoch": 0.7486725663716814, "frac_reward_zero_std": 0.75, "grad_norm": 0.9404976235526497, "kl": 0.01646381989121437, "learning_rate": 9.77263128337148e-07, "loss": 0.0403, "num_tokens": 16990282.0, "reward": 0.75, "reward_std": 0.25819888710975647, "rewards/decision_reward_func/mean": 0.75, "rewards/decision_reward_func/std": 0.6666666865348816, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.999949038028717, "sampling/importance_sampling_ratio/min": 0.07699912041425705, "sampling/sampling_logp_difference/max": 2.5639612674713135, "sampling/sampling_logp_difference/mean": 0.011669103056192398, "step": 423 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 5000.0, "completions/max_terminated_length": 3443.0, "completions/mean_length": 556.8125, "completions/mean_terminated_length": 486.2857360839844, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "entropy": 0.14109674096107483, "epoch": 0.7504424778761062, "frac_reward_zero_std": 0.5, "grad_norm": 0.7892448323398299, "kl": 0.01084822416305542, "learning_rate": 9.770323248430727e-07, "loss": -0.0123, "num_tokens": 17035502.0, "reward": 0.46875, "reward_std": 0.5061737298965454, "rewards/decision_reward_func/mean": 0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 1.9451937675476074, "sampling/importance_sampling_ratio/mean": 0.9997051954269409, "sampling/importance_sampling_ratio/min": 0.28236865997314453, "sampling/sampling_logp_difference/max": 1.264541745185852, "sampling/sampling_logp_difference/mean": 0.008924733847379684, "step": 424 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1041.0, "completions/max_terminated_length": 1041.0, "completions/mean_length": 360.53125, "completions/mean_terminated_length": 360.53125, "completions/min_length": 154.0, "completions/min_terminated_length": 154.0, "entropy": 0.1576463282108307, "epoch": 0.7522123893805309, "frac_reward_zero_std": 0.25, "grad_norm": 1.6149629624758066, "kl": 0.01197238638997078, "learning_rate": 9.768003833403276e-07, "loss": 0.0717, "num_tokens": 17069280.0, "reward": 0.75, "reward_std": 0.5351393222808838, "rewards/decision_reward_func/mean": 0.75, "rewards/decision_reward_func/std": 0.6666666865348816, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9999859929084778, "sampling/importance_sampling_ratio/min": 0.13127148151397705, "sampling/sampling_logp_difference/max": 2.0304877758026123, "sampling/sampling_logp_difference/mean": 0.011847829446196556, "step": 425 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2782.0, "completions/max_terminated_length": 2782.0, "completions/mean_length": 733.65625, "completions/mean_terminated_length": 733.65625, "completions/min_length": 145.0, "completions/min_terminated_length": 145.0, "entropy": 0.276885449886322, "epoch": 0.7539823008849558, "frac_reward_zero_std": 0.25, "grad_norm": 0.9572840856291367, "kl": 0.015323720872402191, "learning_rate": 9.765673043822324e-07, "loss": -0.0422, "num_tokens": 17130474.0, "reward": 0.78125, "reward_std": 0.48935678601264954, "rewards/decision_reward_func/mean": 0.78125, "rewards/decision_reward_func/std": 0.6291528940200806, "sampling/importance_sampling_ratio/max": 1.7562862634658813, "sampling/importance_sampling_ratio/mean": 1.000009536743164, "sampling/importance_sampling_ratio/min": 0.48347559571266174, "sampling/sampling_logp_difference/max": 0.7267544269561768, "sampling/sampling_logp_difference/mean": 0.013221499510109425, "step": 426 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 314.0, "completions/max_terminated_length": 314.0, "completions/mean_length": 197.578125, "completions/mean_terminated_length": 197.578125, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.12849444150924683, "epoch": 0.7557522123893805, "frac_reward_zero_std": 0.75, "grad_norm": 1.4368642250254873, "kl": 0.02044854499399662, "learning_rate": 9.763330885248204e-07, "loss": -0.0395, "num_tokens": 17152319.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.6387522220611572, "sampling/importance_sampling_ratio/mean": 0.9999079704284668, "sampling/importance_sampling_ratio/min": 0.4967695474624634, "sampling/sampling_logp_difference/max": 0.6996290683746338, "sampling/sampling_logp_difference/mean": 0.011263299733400345, "step": 427 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 5000.0, "completions/max_terminated_length": 3625.0, "completions/mean_length": 876.640625, "completions/mean_terminated_length": 811.1905517578125, "completions/min_length": 163.0, "completions/min_terminated_length": 163.0, "entropy": 0.24497635662555695, "epoch": 0.7575221238938054, "frac_reward_zero_std": 0.5, "grad_norm": 0.6667530678512397, "kl": 0.017537204548716545, "learning_rate": 9.760977363268373e-07, "loss": -0.0001, "num_tokens": 17219704.0, "reward": 0.75, "reward_std": 0.3811737596988678, "rewards/decision_reward_func/mean": 0.75, "rewards/decision_reward_func/std": 0.6666666865348816, "sampling/importance_sampling_ratio/max": 1.8824337720870972, "sampling/importance_sampling_ratio/mean": 0.9996764063835144, "sampling/importance_sampling_ratio/min": 0.017040107399225235, "sampling/sampling_logp_difference/max": 4.072185516357422, "sampling/sampling_logp_difference/mean": 0.01255839690566063, "step": 428 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3436.0, "completions/max_terminated_length": 3436.0, "completions/mean_length": 733.8125, "completions/mean_terminated_length": 733.8125, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "entropy": 0.19446253776550293, "epoch": 0.7592920353982301, "frac_reward_zero_std": 0.75, "grad_norm": 0.5171110355592028, "kl": 0.014578867703676224, "learning_rate": 9.758612483497394e-07, "loss": 0.093, "num_tokens": 17277756.0, "reward": 0.65625, "reward_std": 0.23935678601264954, "rewards/decision_reward_func/mean": 0.65625, "rewards/decision_reward_func/std": 0.7605084180831909, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9998770952224731, "sampling/importance_sampling_ratio/min": 0.2896878719329834, "sampling/sampling_logp_difference/max": 1.2389512062072754, "sampling/sampling_logp_difference/mean": 0.011195329017937183, "step": 429 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2004.0, "completions/max_terminated_length": 2004.0, "completions/mean_length": 476.921875, "completions/mean_terminated_length": 476.921875, "completions/min_length": 145.0, "completions/min_terminated_length": 145.0, "entropy": 0.30603867769241333, "epoch": 0.7610619469026548, "frac_reward_zero_std": 0.5, "grad_norm": 1.0974191646167424, "kl": 0.01837025210261345, "learning_rate": 9.756236251576924e-07, "loss": -0.1056, "num_tokens": 17321079.0, "reward": -0.34375, "reward_std": 0.34860679507255554, "rewards/decision_reward_func/mean": -0.34375, "rewards/decision_reward_func/std": 0.9464847445487976, "sampling/importance_sampling_ratio/max": 1.9441672563552856, "sampling/importance_sampling_ratio/mean": 0.9999624490737915, "sampling/importance_sampling_ratio/min": 0.24628712236881256, "sampling/sampling_logp_difference/max": 1.4012572765350342, "sampling/sampling_logp_difference/mean": 0.013847127556800842, "step": 430 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1089.0, "completions/max_terminated_length": 1089.0, "completions/mean_length": 402.015625, "completions/mean_terminated_length": 402.015625, "completions/min_length": 74.0, "completions/min_terminated_length": 74.0, "entropy": 0.15806062519550323, "epoch": 0.7628318584070797, "frac_reward_zero_std": 0.75, "grad_norm": 0.7259208077579834, "kl": 0.017834410071372986, "learning_rate": 9.753848673175707e-07, "loss": 0.0007, "num_tokens": 17356280.0, "reward": 0.6875, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.6875, "rewards/decision_reward_func/std": 0.7319250702857971, "sampling/importance_sampling_ratio/max": 1.904421329498291, "sampling/importance_sampling_ratio/mean": 0.9999153017997742, "sampling/importance_sampling_ratio/min": 0.25980913639068604, "sampling/sampling_logp_difference/max": 1.3478080034255981, "sampling/sampling_logp_difference/mean": 0.01067222747951746, "step": 431 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 5000.0, "completions/max_terminated_length": 3935.0, "completions/mean_length": 598.15625, "completions/mean_terminated_length": 528.2857666015625, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "entropy": 0.3223878741264343, "epoch": 0.7646017699115044, "frac_reward_zero_std": 0.75, "grad_norm": 0.6838776618611873, "kl": 0.019372940063476562, "learning_rate": 9.751449753989546e-07, "loss": 0.1081, "num_tokens": 17407378.0, "reward": 0.65625, "reward_std": 0.23935678601264954, "rewards/decision_reward_func/mean": 0.65625, "rewards/decision_reward_func/std": 0.7605084180831909, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9999616146087646, "sampling/importance_sampling_ratio/min": 0.26672056317329407, "sampling/sampling_logp_difference/max": 1.3215537071228027, "sampling/sampling_logp_difference/mean": 0.01632527820765972, "step": 432 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1475.0, "completions/max_terminated_length": 1475.0, "completions/mean_length": 553.046875, "completions/mean_terminated_length": 553.046875, "completions/min_length": 200.0, "completions/min_terminated_length": 200.0, "entropy": 0.2686847448348999, "epoch": 0.7663716814159292, "frac_reward_zero_std": 0.5, "grad_norm": 0.8901191922181324, "kl": 0.011914161965250969, "learning_rate": 9.74903949974131e-07, "loss": 0.0106, "num_tokens": 17454309.0, "reward": -0.09375, "reward_std": 0.4515564441680908, "rewards/decision_reward_func/mean": -0.09375, "rewards/decision_reward_func/std": 1.003466248512268, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.999943733215332, "sampling/importance_sampling_ratio/min": 0.432391881942749, "sampling/sampling_logp_difference/max": 0.8384230136871338, "sampling/sampling_logp_difference/mean": 0.013745191507041454, "step": 433 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2481.0, "completions/max_terminated_length": 2481.0, "completions/mean_length": 461.0, "completions/mean_terminated_length": 461.0, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "entropy": 0.261181116104126, "epoch": 0.768141592920354, "frac_reward_zero_std": 1.0, "grad_norm": 0.03992125208774775, "kl": 0.01855924353003502, "learning_rate": 9.746617916180905e-07, "loss": 0.0001, "num_tokens": 17495013.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0003955364227295, "sampling/importance_sampling_ratio/min": 0.4374275803565979, "sampling/sampling_logp_difference/max": 1.2297992706298828, "sampling/sampling_logp_difference/mean": 0.0141875259578228, "step": 434 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2526.0, "completions/max_terminated_length": 2526.0, "completions/mean_length": 300.984375, "completions/mean_terminated_length": 300.984375, "completions/min_length": 90.0, "completions/min_terminated_length": 90.0, "entropy": 0.175477996468544, "epoch": 0.7699115044247787, "frac_reward_zero_std": 0.75, "grad_norm": 1.5217570498346458, "kl": 0.02007683366537094, "learning_rate": 9.744185009085256e-07, "loss": 0.0026, "num_tokens": 17524084.0, "reward": 0.53125, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.53125, "rewards/decision_reward_func/std": 0.8539125919342041, "sampling/importance_sampling_ratio/max": 1.8454229831695557, "sampling/importance_sampling_ratio/mean": 0.9999710321426392, "sampling/importance_sampling_ratio/min": 0.28705891966819763, "sampling/sampling_logp_difference/max": 1.248067855834961, "sampling/sampling_logp_difference/mean": 0.012873979285359383, "step": 435 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2035.0, "completions/max_terminated_length": 2035.0, "completions/mean_length": 579.109375, "completions/mean_terminated_length": 579.109375, "completions/min_length": 171.0, "completions/min_terminated_length": 171.0, "entropy": 0.2825455665588379, "epoch": 0.7716814159292036, "frac_reward_zero_std": 0.25, "grad_norm": 1.369484679665006, "kl": 0.01618799939751625, "learning_rate": 9.741740784258311e-07, "loss": -0.0164, "num_tokens": 17573179.0, "reward": 0.21875, "reward_std": 0.5457825064659119, "rewards/decision_reward_func/mean": 0.21875, "rewards/decision_reward_func/std": 0.983494758605957, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0000884532928467, "sampling/importance_sampling_ratio/min": 0.16337034106254578, "sampling/sampling_logp_difference/max": 1.8117356300354004, "sampling/sampling_logp_difference/mean": 0.014694714918732643, "step": 436 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 369.0, "completions/max_terminated_length": 369.0, "completions/mean_length": 225.765625, "completions/mean_terminated_length": 225.765625, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "entropy": 0.15016265213489532, "epoch": 0.7734513274336283, "frac_reward_zero_std": 1.0, "grad_norm": 0.08972287632352739, "kl": 0.025470344349741936, "learning_rate": 9.739285247531017e-07, "loss": 0.0002, "num_tokens": 17598460.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0002248287200928, "sampling/importance_sampling_ratio/min": 0.2886965274810791, "sampling/sampling_logp_difference/max": 1.2423791885375977, "sampling/sampling_logp_difference/mean": 0.011946619488298893, "step": 437 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1469.0, "completions/max_terminated_length": 1469.0, "completions/mean_length": 297.578125, "completions/mean_terminated_length": 297.578125, "completions/min_length": 88.0, "completions/min_terminated_length": 88.0, "entropy": 0.19839031994342804, "epoch": 0.7752212389380531, "frac_reward_zero_std": 0.75, "grad_norm": 0.8397310686633933, "kl": 0.020134732127189636, "learning_rate": 9.736818404761302e-07, "loss": -0.0313, "num_tokens": 17628433.0, "reward": 0.40625, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.40625, "rewards/decision_reward_func/std": 0.9209855198860168, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9996548295021057, "sampling/importance_sampling_ratio/min": 0.06711579859256744, "sampling/sampling_logp_difference/max": 2.701335906982422, "sampling/sampling_logp_difference/mean": 0.012997930869460106, "step": 438 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3671.0, "completions/max_terminated_length": 3671.0, "completions/mean_length": 505.359375, "completions/mean_terminated_length": 505.359375, "completions/min_length": 81.0, "completions/min_terminated_length": 81.0, "entropy": 0.14583268761634827, "epoch": 0.7769911504424779, "frac_reward_zero_std": 0.75, "grad_norm": 0.8506670978205387, "kl": 0.017080917954444885, "learning_rate": 9.734340261834066e-07, "loss": -0.0337, "num_tokens": 17671272.0, "reward": 0.4375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.4375, "rewards/decision_reward_func/std": 0.9063270092010498, "sampling/importance_sampling_ratio/max": 1.8012324571609497, "sampling/importance_sampling_ratio/mean": 1.00022554397583, "sampling/importance_sampling_ratio/min": 0.5134781002998352, "sampling/sampling_logp_difference/max": 0.666547954082489, "sampling/sampling_logp_difference/mean": 0.010502178221940994, "step": 439 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2845.0, "completions/max_terminated_length": 2845.0, "completions/mean_length": 445.234375, "completions/mean_terminated_length": 445.234375, "completions/min_length": 155.0, "completions/min_terminated_length": 155.0, "entropy": 0.18292751908302307, "epoch": 0.7787610619469026, "frac_reward_zero_std": 0.5, "grad_norm": 1.6435686789479544, "kl": 0.013177627697587013, "learning_rate": 9.73185082466117e-07, "loss": 0.3407, "num_tokens": 17708839.0, "reward": 0.78125, "reward_std": 0.375, "rewards/decision_reward_func/mean": 0.78125, "rewards/decision_reward_func/std": 0.6291528940200806, "sampling/importance_sampling_ratio/max": 1.697757363319397, "sampling/importance_sampling_ratio/mean": 1.0002026557922363, "sampling/importance_sampling_ratio/min": 0.4863984286785126, "sampling/sampling_logp_difference/max": 0.7207272052764893, "sampling/sampling_logp_difference/mean": 0.011251913383603096, "step": 440 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5000.0, "completions/max_terminated_length": 5000.0, "completions/mean_length": 709.1875, "completions/mean_terminated_length": 709.1875, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "entropy": 0.22844403982162476, "epoch": 0.7805309734513274, "frac_reward_zero_std": 0.75, "grad_norm": 0.6208180873802399, "kl": 0.014097400940954685, "learning_rate": 9.729350099181419e-07, "loss": 0.0515, "num_tokens": 17765667.0, "reward": 0.8125, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.8125, "rewards/decision_reward_func/std": 0.5875696539878845, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9997754096984863, "sampling/importance_sampling_ratio/min": 0.4305298626422882, "sampling/sampling_logp_difference/max": 0.8772019147872925, "sampling/sampling_logp_difference/mean": 0.012171953916549683, "step": 441 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1121.0, "completions/max_terminated_length": 1121.0, "completions/mean_length": 382.46875, "completions/mean_terminated_length": 382.46875, "completions/min_length": 167.0, "completions/min_terminated_length": 167.0, "entropy": 0.15237221121788025, "epoch": 0.7823008849557522, "frac_reward_zero_std": 1.0, "grad_norm": 0.05394738676826445, "kl": 0.017761651426553726, "learning_rate": 9.726838091360545e-07, "loss": 0.0002, "num_tokens": 17799985.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9999023079872131, "sampling/importance_sampling_ratio/min": 0.47910451889038086, "sampling/sampling_logp_difference/max": 0.7700576782226562, "sampling/sampling_logp_difference/mean": 0.010599812492728233, "step": 442 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1739.0, "completions/max_terminated_length": 1739.0, "completions/mean_length": 533.1875, "completions/mean_terminated_length": 533.1875, "completions/min_length": 142.0, "completions/min_terminated_length": 142.0, "entropy": 0.20799808204174042, "epoch": 0.784070796460177, "frac_reward_zero_std": 0.5, "grad_norm": 1.0722922806701143, "kl": 0.015913112089037895, "learning_rate": 9.724314807191196e-07, "loss": 0.1909, "num_tokens": 17844605.0, "reward": 0.59375, "reward_std": 0.4515564441680908, "rewards/decision_reward_func/mean": 0.59375, "rewards/decision_reward_func/std": 0.8110105991363525, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0003132820129395, "sampling/importance_sampling_ratio/min": 0.1518324315547943, "sampling/sampling_logp_difference/max": 1.8849778175354004, "sampling/sampling_logp_difference/mean": 0.012937791645526886, "step": 443 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1077.0, "completions/max_terminated_length": 1077.0, "completions/mean_length": 335.578125, "completions/mean_terminated_length": 335.578125, "completions/min_length": 140.0, "completions/min_terminated_length": 140.0, "entropy": 0.1886487603187561, "epoch": 0.7858407079646018, "frac_reward_zero_std": 0.75, "grad_norm": 1.0993179291071986, "kl": 0.013678735122084618, "learning_rate": 9.721780252692917e-07, "loss": -0.1489, "num_tokens": 17876610.0, "reward": 0.09375, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.09375, "rewards/decision_reward_func/std": 1.003466248512268, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.000415325164795, "sampling/importance_sampling_ratio/min": 0.2781907618045807, "sampling/sampling_logp_difference/max": 1.2794482707977295, "sampling/sampling_logp_difference/mean": 0.01220756396651268, "step": 444 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3022.0, "completions/max_terminated_length": 3022.0, "completions/mean_length": 565.96875, "completions/mean_terminated_length": 565.96875, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "entropy": 0.3471337556838989, "epoch": 0.7876106194690266, "frac_reward_zero_std": 0.75, "grad_norm": 0.7895915665755439, "kl": 0.02035902440547943, "learning_rate": 9.719234433912146e-07, "loss": 0.0696, "num_tokens": 17923344.0, "reward": 0.90625, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.90625, "rewards/decision_reward_func/std": 0.42608407139778137, "sampling/importance_sampling_ratio/max": 1.8623803853988647, "sampling/importance_sampling_ratio/mean": 0.9995938539505005, "sampling/importance_sampling_ratio/min": 0.439717173576355, "sampling/sampling_logp_difference/max": 0.8216235637664795, "sampling/sampling_logp_difference/mean": 0.015141831710934639, "step": 445 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 843.0, "completions/max_terminated_length": 843.0, "completions/mean_length": 360.828125, "completions/mean_terminated_length": 360.828125, "completions/min_length": 127.0, "completions/min_terminated_length": 127.0, "entropy": 0.133966326713562, "epoch": 0.7893805309734513, "frac_reward_zero_std": 0.5, "grad_norm": 1.2902108903808314, "kl": 0.020750487223267555, "learning_rate": 9.716677356922192e-07, "loss": -0.0618, "num_tokens": 17955749.0, "reward": 0.4375, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.4375, "rewards/decision_reward_func/std": 0.9063270092010498, "sampling/importance_sampling_ratio/max": 1.8718515634536743, "sampling/importance_sampling_ratio/mean": 0.9999099373817444, "sampling/importance_sampling_ratio/min": 0.3731269836425781, "sampling/sampling_logp_difference/max": 0.9858365058898926, "sampling/sampling_logp_difference/mean": 0.010008898563683033, "step": 446 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 534.0, "completions/max_terminated_length": 534.0, "completions/mean_length": 275.828125, "completions/mean_terminated_length": 275.828125, "completions/min_length": 137.0, "completions/min_terminated_length": 137.0, "entropy": 0.345963716506958, "epoch": 0.7911504424778761, "frac_reward_zero_std": 1.0, "grad_norm": 0.03392676216450743, "kl": 0.017359547317028046, "learning_rate": 9.714109027823216e-07, "loss": 0.0002, "num_tokens": 17984218.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.54673433303833, "sampling/importance_sampling_ratio/mean": 1.0003793239593506, "sampling/importance_sampling_ratio/min": 0.5583531856536865, "sampling/sampling_logp_difference/max": 0.5827635526657104, "sampling/sampling_logp_difference/mean": 0.016256965696811676, "step": 447 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2741.0, "completions/max_terminated_length": 2741.0, "completions/mean_length": 648.90625, "completions/mean_terminated_length": 648.90625, "completions/min_length": 171.0, "completions/min_terminated_length": 171.0, "entropy": 0.17916353046894073, "epoch": 0.7929203539823009, "frac_reward_zero_std": 0.5, "grad_norm": 0.946707217564148, "kl": 0.011661479249596596, "learning_rate": 9.711529452742229e-07, "loss": 0.1519, "num_tokens": 18036644.0, "reward": 0.71875, "reward_std": 0.4515564441680908, "rewards/decision_reward_func/mean": 0.71875, "rewards/decision_reward_func/std": 0.7007648944854736, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9994778037071228, "sampling/importance_sampling_ratio/min": 0.33245694637298584, "sampling/sampling_logp_difference/max": 1.1012449264526367, "sampling/sampling_logp_difference/mean": 0.010430063121020794, "step": 448 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 569.0, "completions/max_terminated_length": 569.0, "completions/mean_length": 224.625, "completions/mean_terminated_length": 224.625, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "entropy": 0.23862072825431824, "epoch": 0.7946902654867256, "frac_reward_zero_std": 1.0, "grad_norm": 0.07532883005478025, "kl": 0.023402484133839607, "learning_rate": 9.708938637833064e-07, "loss": 0.0002, "num_tokens": 18063628.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0002508163452148, "sampling/importance_sampling_ratio/min": 0.5791581273078918, "sampling/sampling_logp_difference/max": 0.9711465835571289, "sampling/sampling_logp_difference/mean": 0.012772807851433754, "step": 449 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2758.0, "completions/max_terminated_length": 2758.0, "completions/mean_length": 527.84375, "completions/mean_terminated_length": 527.84375, "completions/min_length": 148.0, "completions/min_terminated_length": 148.0, "entropy": 0.17505834996700287, "epoch": 0.7964601769911505, "frac_reward_zero_std": 0.75, "grad_norm": 0.8917235723301952, "kl": 0.01516976673156023, "learning_rate": 9.706336589276374e-07, "loss": 0.1428, "num_tokens": 18108978.0, "reward": -0.125, "reward_std": 0.22360679507255554, "rewards/decision_reward_func/mean": -0.125, "rewards/decision_reward_func/std": 1.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0002479553222656, "sampling/importance_sampling_ratio/min": 0.4129166305065155, "sampling/sampling_logp_difference/max": 0.8845095634460449, "sampling/sampling_logp_difference/mean": 0.010961286723613739, "step": 450 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1189.0, "completions/max_terminated_length": 1189.0, "completions/mean_length": 298.0625, "completions/mean_terminated_length": 298.0625, "completions/min_length": 156.0, "completions/min_terminated_length": 156.0, "entropy": 0.28212249279022217, "epoch": 0.7982300884955752, "frac_reward_zero_std": 0.75, "grad_norm": 1.5701772284452842, "kl": 0.016110116615891457, "learning_rate": 9.703723313279605e-07, "loss": 0.1628, "num_tokens": 18139350.0, "reward": 0.46875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 1.9625293016433716, "sampling/importance_sampling_ratio/mean": 0.9998340606689453, "sampling/importance_sampling_ratio/min": 0.48725593090057373, "sampling/sampling_logp_difference/max": 0.7189657688140869, "sampling/sampling_logp_difference/mean": 0.01432438101619482, "step": 451 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1660.0, "completions/max_terminated_length": 1660.0, "completions/mean_length": 315.96875, "completions/mean_terminated_length": 315.96875, "completions/min_length": 140.0, "completions/min_terminated_length": 140.0, "entropy": 0.2972191572189331, "epoch": 0.8, "frac_reward_zero_std": 0.75, "grad_norm": 1.2317379059491298, "kl": 0.02108466625213623, "learning_rate": 9.701098816076995e-07, "loss": 0.1005, "num_tokens": 18172532.0, "reward": 0.40625, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.40625, "rewards/decision_reward_func/std": 0.9209855198860168, "sampling/importance_sampling_ratio/max": 1.8576748371124268, "sampling/importance_sampling_ratio/mean": 1.0002095699310303, "sampling/importance_sampling_ratio/min": 0.47371339797973633, "sampling/sampling_logp_difference/max": 0.7471528053283691, "sampling/sampling_logp_difference/mean": 0.015264206565916538, "step": 452 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1423.0, "completions/max_terminated_length": 1423.0, "completions/mean_length": 410.046875, "completions/mean_terminated_length": 410.046875, "completions/min_length": 192.0, "completions/min_terminated_length": 192.0, "entropy": 0.25031429529190063, "epoch": 0.8017699115044248, "frac_reward_zero_std": 0.75, "grad_norm": 0.7777847799398381, "kl": 0.018834521993994713, "learning_rate": 9.698463103929541e-07, "loss": 0.0307, "num_tokens": 18209447.0, "reward": 0.90625, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.90625, "rewards/decision_reward_func/std": 0.42608407139778137, "sampling/importance_sampling_ratio/max": 1.91701340675354, "sampling/importance_sampling_ratio/mean": 1.0001859664916992, "sampling/importance_sampling_ratio/min": 0.5910211801528931, "sampling/sampling_logp_difference/max": 0.650768518447876, "sampling/sampling_logp_difference/mean": 0.012651356868445873, "step": 453 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3120.0, "completions/max_terminated_length": 3120.0, "completions/mean_length": 870.890625, "completions/mean_terminated_length": 870.890625, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "entropy": 0.16866523027420044, "epoch": 0.8035398230088495, "frac_reward_zero_std": 0.75, "grad_norm": 0.5150710213763287, "kl": 0.014326772652566433, "learning_rate": 9.695816183125003e-07, "loss": -0.037, "num_tokens": 18276272.0, "reward": 0.875, "reward_std": 0.22360679507255554, "rewards/decision_reward_func/mean": 0.875, "rewards/decision_reward_func/std": 0.48795005679130554, "sampling/importance_sampling_ratio/max": 1.9950884580612183, "sampling/importance_sampling_ratio/mean": 1.0000481605529785, "sampling/importance_sampling_ratio/min": 0.5807039141654968, "sampling/sampling_logp_difference/max": 0.6906883716583252, "sampling/sampling_logp_difference/mean": 0.009420694783329964, "step": 454 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 808.0, "completions/max_terminated_length": 808.0, "completions/mean_length": 330.09375, "completions/mean_terminated_length": 330.09375, "completions/min_length": 160.0, "completions/min_terminated_length": 160.0, "entropy": 0.1548394411802292, "epoch": 0.8053097345132744, "frac_reward_zero_std": 0.75, "grad_norm": 1.0237070438474407, "kl": 0.01795017346739769, "learning_rate": 9.693158059977877e-07, "loss": -0.0446, "num_tokens": 18308838.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9999642372131348, "sampling/importance_sampling_ratio/min": 0.35350170731544495, "sampling/sampling_logp_difference/max": 1.0398669242858887, "sampling/sampling_logp_difference/mean": 0.010861951857805252, "step": 455 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1410.0, "completions/max_terminated_length": 1410.0, "completions/mean_length": 490.875, "completions/mean_terminated_length": 490.875, "completions/min_length": 225.0, "completions/min_terminated_length": 225.0, "entropy": 0.2727452516555786, "epoch": 0.8070796460176991, "frac_reward_zero_std": 0.0, "grad_norm": 1.7749496961253324, "kl": 0.016890378668904305, "learning_rate": 9.690488740829383e-07, "loss": 0.1735, "num_tokens": 18352030.0, "reward": 0.8125, "reward_std": 0.5915650129318237, "rewards/decision_reward_func/mean": 0.8125, "rewards/decision_reward_func/std": 0.5875696539878845, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9994820952415466, "sampling/importance_sampling_ratio/min": 0.5484714508056641, "sampling/sampling_logp_difference/max": 0.7138428688049316, "sampling/sampling_logp_difference/mean": 0.013885889202356339, "step": 456 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2078.0, "completions/max_terminated_length": 2078.0, "completions/mean_length": 499.578125, "completions/mean_terminated_length": 499.578125, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "entropy": 0.15201109647750854, "epoch": 0.8088495575221238, "frac_reward_zero_std": 0.75, "grad_norm": 0.771082649322385, "kl": 0.01977832242846489, "learning_rate": 9.68780823204745e-07, "loss": 0.0619, "num_tokens": 18394003.0, "reward": 0.4375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.4375, "rewards/decision_reward_func/std": 0.9063270092010498, "sampling/importance_sampling_ratio/max": 1.9203048944473267, "sampling/importance_sampling_ratio/mean": 0.9999675750732422, "sampling/importance_sampling_ratio/min": 0.3767653703689575, "sampling/sampling_logp_difference/max": 0.9761326313018799, "sampling/sampling_logp_difference/mean": 0.010636713355779648, "step": 457 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 5000.0, "completions/max_terminated_length": 3532.0, "completions/mean_length": 607.421875, "completions/mean_terminated_length": 537.6984252929688, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "entropy": 0.14685705304145813, "epoch": 0.8106194690265487, "frac_reward_zero_std": 0.75, "grad_norm": 0.7850048838727758, "kl": 0.016145143657922745, "learning_rate": 9.685116540026701e-07, "loss": 0.0746, "num_tokens": 18442894.0, "reward": 0.78125, "reward_std": 0.2561737596988678, "rewards/decision_reward_func/mean": 0.78125, "rewards/decision_reward_func/std": 0.6291528940200806, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9999462366104126, "sampling/importance_sampling_ratio/min": 0.27285879850387573, "sampling/sampling_logp_difference/max": 1.2988009452819824, "sampling/sampling_logp_difference/mean": 0.009459404274821281, "step": 458 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1474.0, "completions/max_terminated_length": 1474.0, "completions/mean_length": 363.5625, "completions/mean_terminated_length": 363.5625, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "entropy": 0.2174839973449707, "epoch": 0.8123893805309734, "frac_reward_zero_std": 0.75, "grad_norm": 1.288587177694696, "kl": 0.01691630110144615, "learning_rate": 9.682413671188444e-07, "loss": 0.1429, "num_tokens": 18478674.0, "reward": 0.875, "reward_std": 0.22360679507255554, "rewards/decision_reward_func/mean": 0.875, "rewards/decision_reward_func/std": 0.48795005679130554, "sampling/importance_sampling_ratio/max": 1.6207339763641357, "sampling/importance_sampling_ratio/mean": 0.9998880624771118, "sampling/importance_sampling_ratio/min": 0.5552295446395874, "sampling/sampling_logp_difference/max": 0.5883736610412598, "sampling/sampling_logp_difference/mean": 0.01261388324201107, "step": 459 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2034.0, "completions/max_terminated_length": 2034.0, "completions/mean_length": 587.28125, "completions/mean_terminated_length": 587.28125, "completions/min_length": 135.0, "completions/min_terminated_length": 135.0, "entropy": 0.22939211130142212, "epoch": 0.8141592920353983, "frac_reward_zero_std": 0.25, "grad_norm": 1.049811180151682, "kl": 0.016246281564235687, "learning_rate": 9.679699631980637e-07, "loss": -0.0519, "num_tokens": 18525476.0, "reward": 0.4375, "reward_std": 0.7455304861068726, "rewards/decision_reward_func/mean": 0.4375, "rewards/decision_reward_func/std": 0.9063270092010498, "sampling/importance_sampling_ratio/max": 1.8723053932189941, "sampling/importance_sampling_ratio/mean": 0.9995712041854858, "sampling/importance_sampling_ratio/min": 0.052514370530843735, "sampling/sampling_logp_difference/max": 2.9466683864593506, "sampling/sampling_logp_difference/mean": 0.01141293440014124, "step": 460 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2885.0, "completions/max_terminated_length": 2885.0, "completions/mean_length": 570.15625, "completions/mean_terminated_length": 570.15625, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "entropy": 0.18083524703979492, "epoch": 0.815929203539823, "frac_reward_zero_std": 0.75, "grad_norm": 0.6484784102520742, "kl": 0.024283720180392265, "learning_rate": 9.6769744288779e-07, "loss": 0.1056, "num_tokens": 18571518.0, "reward": 0.65625, "reward_std": 0.23935678601264954, "rewards/decision_reward_func/mean": 0.65625, "rewards/decision_reward_func/std": 0.7605084180831909, "sampling/importance_sampling_ratio/max": 1.6976978778839111, "sampling/importance_sampling_ratio/mean": 0.9996762871742249, "sampling/importance_sampling_ratio/min": 0.3962823748588562, "sampling/sampling_logp_difference/max": 0.9256283044815063, "sampling/sampling_logp_difference/mean": 0.01143775787204504, "step": 461 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2307.0, "completions/max_terminated_length": 2307.0, "completions/mean_length": 890.203125, "completions/mean_terminated_length": 890.203125, "completions/min_length": 199.0, "completions/min_terminated_length": 199.0, "entropy": 0.3270219564437866, "epoch": 0.8176991150442477, "frac_reward_zero_std": 0.0, "grad_norm": 0.9540128828584432, "kl": 0.012963580898940563, "learning_rate": 9.674238068381478e-07, "loss": 0.1198, "num_tokens": 18638395.0, "reward": -0.1875, "reward_std": 0.8461624383926392, "rewards/decision_reward_func/mean": -0.1875, "rewards/decision_reward_func/std": 0.9900296926498413, "sampling/importance_sampling_ratio/max": 1.8809148073196411, "sampling/importance_sampling_ratio/mean": 1.000354528427124, "sampling/importance_sampling_ratio/min": 0.2624247372150421, "sampling/sampling_logp_difference/max": 1.3377909660339355, "sampling/sampling_logp_difference/mean": 0.013755712658166885, "step": 462 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 502.0, "completions/max_terminated_length": 502.0, "completions/mean_length": 254.390625, "completions/mean_terminated_length": 254.390625, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "entropy": 0.2501138150691986, "epoch": 0.8194690265486726, "frac_reward_zero_std": 1.0, "grad_norm": 0.05731526734443847, "kl": 0.02052447572350502, "learning_rate": 9.671490557019233e-07, "loss": 0.0002, "num_tokens": 18666516.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.8179765939712524, "sampling/importance_sampling_ratio/mean": 1.0000762939453125, "sampling/importance_sampling_ratio/min": 0.38532912731170654, "sampling/sampling_logp_difference/max": 0.9536573886871338, "sampling/sampling_logp_difference/mean": 0.01461197528988123, "step": 463 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 608.0, "completions/max_terminated_length": 608.0, "completions/mean_length": 258.203125, "completions/mean_terminated_length": 258.203125, "completions/min_length": 70.0, "completions/min_terminated_length": 70.0, "entropy": 0.12889191508293152, "epoch": 0.8212389380530973, "frac_reward_zero_std": 0.75, "grad_norm": 0.997187106410792, "kl": 0.019986048340797424, "learning_rate": 9.668731901345632e-07, "loss": -0.0559, "num_tokens": 18692385.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.9916552305221558, "sampling/importance_sampling_ratio/mean": 0.9997934103012085, "sampling/importance_sampling_ratio/min": 0.27287915349006653, "sampling/sampling_logp_difference/max": 1.298726201057434, "sampling/sampling_logp_difference/mean": 0.009985041804611683, "step": 464 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2517.0, "completions/max_terminated_length": 2517.0, "completions/mean_length": 493.640625, "completions/mean_terminated_length": 493.640625, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "entropy": 0.2487362027168274, "epoch": 0.8230088495575221, "frac_reward_zero_std": 0.75, "grad_norm": 1.1187358514830699, "kl": 0.01771361753344536, "learning_rate": 9.665962107941724e-07, "loss": -0.2072, "num_tokens": 18736842.0, "reward": 0.625, "reward_std": 0.22360679507255554, "rewards/decision_reward_func/mean": 0.625, "rewards/decision_reward_func/std": 0.7867957949638367, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9999238848686218, "sampling/importance_sampling_ratio/min": 0.44626033306121826, "sampling/sampling_logp_difference/max": 0.8068528175354004, "sampling/sampling_logp_difference/mean": 0.012701628729701042, "step": 465 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1052.0, "completions/max_terminated_length": 1052.0, "completions/mean_length": 337.84375, "completions/mean_terminated_length": 337.84375, "completions/min_length": 120.0, "completions/min_terminated_length": 120.0, "entropy": 0.2385561168193817, "epoch": 0.8247787610619469, "frac_reward_zero_std": 0.75, "grad_norm": 1.2245403814646685, "kl": 0.018328964710235596, "learning_rate": 9.663181183415131e-07, "loss": -0.0338, "num_tokens": 18770000.0, "reward": 0.65625, "reward_std": 0.23935678601264954, "rewards/decision_reward_func/mean": 0.65625, "rewards/decision_reward_func/std": 0.7605084180831909, "sampling/importance_sampling_ratio/max": 1.8718515634536743, "sampling/importance_sampling_ratio/mean": 0.9997444748878479, "sampling/importance_sampling_ratio/min": 0.2822934687137604, "sampling/sampling_logp_difference/max": 1.2648080587387085, "sampling/sampling_logp_difference/mean": 0.01442661602050066, "step": 466 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1323.0, "completions/max_terminated_length": 1323.0, "completions/mean_length": 424.609375, "completions/mean_terminated_length": 424.609375, "completions/min_length": 178.0, "completions/min_terminated_length": 178.0, "entropy": 0.34191426634788513, "epoch": 0.8265486725663717, "frac_reward_zero_std": 0.5, "grad_norm": 1.2531821144445354, "kl": 0.01747404783964157, "learning_rate": 9.660389134400033e-07, "loss": 0.03, "num_tokens": 18809671.0, "reward": -0.40625, "reward_std": 0.29578250646591187, "rewards/decision_reward_func/mean": -0.40625, "rewards/decision_reward_func/std": 0.9209855198860168, "sampling/importance_sampling_ratio/max": 1.8623849153518677, "sampling/importance_sampling_ratio/mean": 1.0000585317611694, "sampling/importance_sampling_ratio/min": 0.3981071412563324, "sampling/sampling_logp_difference/max": 0.9210340976715088, "sampling/sampling_logp_difference/mean": 0.01515053678303957, "step": 467 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 469.0, "completions/max_terminated_length": 469.0, "completions/mean_length": 247.4375, "completions/mean_terminated_length": 247.4375, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "entropy": 0.22452127933502197, "epoch": 0.8283185840707965, "frac_reward_zero_std": 1.0, "grad_norm": 0.07827066468999641, "kl": 0.02029763162136078, "learning_rate": 9.657585967557138e-07, "loss": 0.0002, "num_tokens": 18836275.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9998415112495422, "sampling/importance_sampling_ratio/min": 0.18762071430683136, "sampling/sampling_logp_difference/max": 1.6733328104019165, "sampling/sampling_logp_difference/mean": 0.01320183277130127, "step": 468 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3082.0, "completions/max_terminated_length": 3082.0, "completions/mean_length": 802.890625, "completions/mean_terminated_length": 802.890625, "completions/min_length": 221.0, "completions/min_terminated_length": 221.0, "entropy": 0.20217928290367126, "epoch": 0.8300884955752212, "frac_reward_zero_std": 0.5, "grad_norm": 0.7332908349275866, "kl": 0.010030319914221764, "learning_rate": 9.654771689573684e-07, "loss": 0.0105, "num_tokens": 18896940.0, "reward": 0.59375, "reward_std": 0.497555673122406, "rewards/decision_reward_func/mean": 0.59375, "rewards/decision_reward_func/std": 0.8110105991363525, "sampling/importance_sampling_ratio/max": 1.9693641662597656, "sampling/importance_sampling_ratio/mean": 1.0002100467681885, "sampling/importance_sampling_ratio/min": 0.07216832041740417, "sampling/sampling_logp_difference/max": 2.628754138946533, "sampling/sampling_logp_difference/mean": 0.009691549465060234, "step": 469 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1936.0, "completions/max_terminated_length": 1936.0, "completions/mean_length": 482.40625, "completions/mean_terminated_length": 482.40625, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "entropy": 0.19566302001476288, "epoch": 0.831858407079646, "frac_reward_zero_std": 0.5, "grad_norm": 1.0457741774290896, "kl": 0.012574524618685246, "learning_rate": 9.651946307163416e-07, "loss": 0.0428, "num_tokens": 18938422.0, "reward": 0.40625, "reward_std": 0.34860679507255554, "rewards/decision_reward_func/mean": 0.40625, "rewards/decision_reward_func/std": 0.9209855198860168, "sampling/importance_sampling_ratio/max": 1.8120815753936768, "sampling/importance_sampling_ratio/mean": 1.0003852844238281, "sampling/importance_sampling_ratio/min": 0.2491093873977661, "sampling/sampling_logp_difference/max": 1.3898632526397705, "sampling/sampling_logp_difference/mean": 0.011179201304912567, "step": 470 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 563.0, "completions/max_terminated_length": 563.0, "completions/mean_length": 246.140625, "completions/mean_terminated_length": 246.140625, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "entropy": 0.27302926778793335, "epoch": 0.8336283185840708, "frac_reward_zero_std": 1.0, "grad_norm": 0.05890057454539982, "kl": 0.02683423087000847, "learning_rate": 9.64910982706657e-07, "loss": 0.0003, "num_tokens": 18971039.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.6274622678756714, "sampling/importance_sampling_ratio/mean": 0.999690055847168, "sampling/importance_sampling_ratio/min": 0.22330127656459808, "sampling/sampling_logp_difference/max": 1.4992334842681885, "sampling/sampling_logp_difference/mean": 0.017249880358576775, "step": 471 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2039.0, "completions/max_terminated_length": 2039.0, "completions/mean_length": 515.609375, "completions/mean_terminated_length": 515.609375, "completions/min_length": 86.0, "completions/min_terminated_length": 86.0, "entropy": 0.15505681931972504, "epoch": 0.8353982300884956, "frac_reward_zero_std": 0.5, "grad_norm": 0.943109993604028, "kl": 0.018115878105163574, "learning_rate": 9.646262256049852e-07, "loss": 0.084, "num_tokens": 19012486.0, "reward": 0.8125, "reward_std": 0.36435678601264954, "rewards/decision_reward_func/mean": 0.8125, "rewards/decision_reward_func/std": 0.5875696539878845, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0000576972961426, "sampling/importance_sampling_ratio/min": 0.4189411997795105, "sampling/sampling_logp_difference/max": 0.8850901126861572, "sampling/sampling_logp_difference/mean": 0.010877094231545925, "step": 472 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1701.0, "completions/max_terminated_length": 1701.0, "completions/mean_length": 555.59375, "completions/mean_terminated_length": 555.59375, "completions/min_length": 271.0, "completions/min_terminated_length": 271.0, "entropy": 0.19889341294765472, "epoch": 0.8371681415929203, "frac_reward_zero_std": 0.5, "grad_norm": 0.7998764584559843, "kl": 0.015403350815176964, "learning_rate": 9.643403600906432e-07, "loss": -0.0968, "num_tokens": 19057740.0, "reward": 0.25, "reward_std": 0.44091323018074036, "rewards/decision_reward_func/mean": 0.25, "rewards/decision_reward_func/std": 0.9759001135826111, "sampling/importance_sampling_ratio/max": 1.6624536514282227, "sampling/importance_sampling_ratio/mean": 1.000143051147461, "sampling/importance_sampling_ratio/min": 0.5185490250587463, "sampling/sampling_logp_difference/max": 0.6567206382751465, "sampling/sampling_logp_difference/mean": 0.010367901995778084, "step": 473 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2628.0, "completions/max_terminated_length": 2628.0, "completions/mean_length": 428.203125, "completions/mean_terminated_length": 428.203125, "completions/min_length": 71.0, "completions/min_terminated_length": 71.0, "entropy": 0.32210421562194824, "epoch": 0.8389380530973451, "frac_reward_zero_std": 1.0, "grad_norm": 0.46891531309046863, "kl": 0.024751143530011177, "learning_rate": 9.640533868455918e-07, "loss": 0.0003, "num_tokens": 19098505.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5277607440948486, "sampling/importance_sampling_ratio/mean": 0.9998844861984253, "sampling/importance_sampling_ratio/min": 0.4926930069923401, "sampling/sampling_logp_difference/max": 0.7078690528869629, "sampling/sampling_logp_difference/mean": 0.014749579131603241, "step": 474 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2047.0, "completions/max_terminated_length": 2047.0, "completions/mean_length": 662.390625, "completions/mean_terminated_length": 662.390625, "completions/min_length": 176.0, "completions/min_terminated_length": 176.0, "entropy": 0.2399713546037674, "epoch": 0.8407079646017699, "frac_reward_zero_std": 0.25, "grad_norm": 1.0316196210403366, "kl": 0.016942881047725677, "learning_rate": 9.637653065544349e-07, "loss": -0.1199, "num_tokens": 19152594.0, "reward": -0.25, "reward_std": 0.551956295967102, "rewards/decision_reward_func/mean": -0.25, "rewards/decision_reward_func/std": 0.9759001135826111, "sampling/importance_sampling_ratio/max": 1.9902808666229248, "sampling/importance_sampling_ratio/mean": 1.0000816583633423, "sampling/importance_sampling_ratio/min": 0.4392775297164917, "sampling/sampling_logp_difference/max": 0.8226239085197449, "sampling/sampling_logp_difference/mean": 0.01251322403550148, "step": 475 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2853.0, "completions/max_terminated_length": 2853.0, "completions/mean_length": 632.078125, "completions/mean_terminated_length": 632.078125, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "entropy": 0.1951752007007599, "epoch": 0.8424778761061947, "frac_reward_zero_std": 0.75, "grad_norm": 0.7653851903617468, "kl": 0.01914290525019169, "learning_rate": 9.634761199044165e-07, "loss": 0.0072, "num_tokens": 19202999.0, "reward": 0.78125, "reward_std": 0.2561737596988678, "rewards/decision_reward_func/mean": 0.78125, "rewards/decision_reward_func/std": 0.6291528940200806, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.999297022819519, "sampling/importance_sampling_ratio/min": 0.4893636405467987, "sampling/sampling_logp_difference/max": 1.2540783882141113, "sampling/sampling_logp_difference/mean": 0.01116904802620411, "step": 476 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 5000.0, "completions/max_terminated_length": 4562.0, "completions/mean_length": 713.234375, "completions/mean_terminated_length": 645.1904907226562, "completions/min_length": 208.0, "completions/min_terminated_length": 208.0, "entropy": 0.26627299189567566, "epoch": 0.8442477876106195, "frac_reward_zero_std": 0.5, "grad_norm": 0.691277622599916, "kl": 0.018843863159418106, "learning_rate": 9.63185827585421e-07, "loss": 0.0543, "num_tokens": 19260966.0, "reward": 0.5, "reward_std": 0.34156501293182373, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.6646103858947754, "sampling/importance_sampling_ratio/mean": 0.9997841715812683, "sampling/importance_sampling_ratio/min": 0.2541903555393219, "sampling/sampling_logp_difference/max": 1.3696718215942383, "sampling/sampling_logp_difference/mean": 0.01238088309764862, "step": 477 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 5000.0, "completions/max_terminated_length": 3577.0, "completions/mean_length": 881.390625, "completions/mean_terminated_length": 816.0159301757812, "completions/min_length": 78.0, "completions/min_terminated_length": 78.0, "entropy": 0.15500886738300323, "epoch": 0.8460176991150442, "frac_reward_zero_std": 0.5, "grad_norm": 0.6586560782813836, "kl": 0.01524945255368948, "learning_rate": 9.628944302899695e-07, "loss": 0.1951, "num_tokens": 19328703.0, "reward": 0.46875, "reward_std": 0.42516323924064636, "rewards/decision_reward_func/mean": 0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9999462366104126, "sampling/importance_sampling_ratio/min": 0.14019155502319336, "sampling/sampling_logp_difference/max": 1.9647455215454102, "sampling/sampling_logp_difference/mean": 0.009003754705190659, "step": 478 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.078125, "completions/max_length": 5000.0, "completions/max_terminated_length": 4878.0, "completions/mean_length": 1525.078125, "completions/mean_terminated_length": 1230.59326171875, "completions/min_length": 297.0, "completions/min_terminated_length": 297.0, "entropy": 0.29825830459594727, "epoch": 0.8477876106194691, "frac_reward_zero_std": 0.0, "grad_norm": 0.5702362496688065, "kl": 0.01436569169163704, "learning_rate": 9.6260192871322e-07, "loss": 0.2081, "num_tokens": 19437844.0, "reward": 0.1875, "reward_std": 0.8283873796463013, "rewards/decision_reward_func/mean": 0.1875, "rewards/decision_reward_func/std": 0.9900296926498413, "sampling/importance_sampling_ratio/max": 1.946414828300476, "sampling/importance_sampling_ratio/mean": 1.000105381011963, "sampling/importance_sampling_ratio/min": 0.432391881942749, "sampling/sampling_logp_difference/max": 0.8384230136871338, "sampling/sampling_logp_difference/mean": 0.01222523394972086, "step": 479 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2326.0, "completions/max_terminated_length": 2326.0, "completions/mean_length": 496.109375, "completions/mean_terminated_length": 496.109375, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "entropy": 0.29266801476478577, "epoch": 0.8495575221238938, "frac_reward_zero_std": 0.5, "grad_norm": 1.235209401486965, "kl": 0.029322022572159767, "learning_rate": 9.623083235529646e-07, "loss": 0.06, "num_tokens": 19481451.0, "reward": 0.71875, "reward_std": 0.38319888710975647, "rewards/decision_reward_func/mean": 0.71875, "rewards/decision_reward_func/std": 0.7007648944854736, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9997217059135437, "sampling/importance_sampling_ratio/min": 0.5005985498428345, "sampling/sampling_logp_difference/max": 0.7146363258361816, "sampling/sampling_logp_difference/mean": 0.013745386153459549, "step": 480 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3177.0, "completions/max_terminated_length": 3177.0, "completions/mean_length": 668.859375, "completions/mean_terminated_length": 668.859375, "completions/min_length": 139.0, "completions/min_terminated_length": 139.0, "entropy": 0.23595350980758667, "epoch": 0.8513274336283185, "frac_reward_zero_std": 0.5, "grad_norm": 1.1130335108612126, "kl": 0.022035665810108185, "learning_rate": 9.620136155096275e-07, "loss": 0.2344, "num_tokens": 19533762.0, "reward": 0.90625, "reward_std": 0.29578250646591187, "rewards/decision_reward_func/mean": 0.90625, "rewards/decision_reward_func/std": 0.42608407139778137, "sampling/importance_sampling_ratio/max": 1.6280452013015747, "sampling/importance_sampling_ratio/mean": 0.9998583793640137, "sampling/importance_sampling_ratio/min": 0.22127090394496918, "sampling/sampling_logp_difference/max": 1.5083675384521484, "sampling/sampling_logp_difference/mean": 0.012651799246668816, "step": 481 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2190.0, "completions/max_terminated_length": 2190.0, "completions/mean_length": 393.3125, "completions/mean_terminated_length": 393.3125, "completions/min_length": 94.0, "completions/min_terminated_length": 94.0, "entropy": 0.22186538577079773, "epoch": 0.8530973451327434, "frac_reward_zero_std": 0.75, "grad_norm": 0.7232769066329207, "kl": 0.031558118760585785, "learning_rate": 9.617178052862649e-07, "loss": -0.0265, "num_tokens": 19569110.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.9607890844345093, "sampling/importance_sampling_ratio/mean": 0.9998571872711182, "sampling/importance_sampling_ratio/min": 0.5138288140296936, "sampling/sampling_logp_difference/max": 0.673346996307373, "sampling/sampling_logp_difference/mean": 0.012910446152091026, "step": 482 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2093.0, "completions/max_terminated_length": 2093.0, "completions/mean_length": 500.625, "completions/mean_terminated_length": 500.625, "completions/min_length": 140.0, "completions/min_terminated_length": 140.0, "entropy": 0.26491376757621765, "epoch": 0.8548672566371681, "frac_reward_zero_std": 0.5, "grad_norm": 1.137145737461921, "kl": 0.02350904420018196, "learning_rate": 9.614208935885614e-07, "loss": 0.0701, "num_tokens": 19611854.0, "reward": 0.84375, "reward_std": 0.3723389506340027, "rewards/decision_reward_func/mean": 0.84375, "rewards/decision_reward_func/std": 0.5409794449806213, "sampling/importance_sampling_ratio/max": 1.8709882497787476, "sampling/importance_sampling_ratio/mean": 0.9998334050178528, "sampling/importance_sampling_ratio/min": 0.3292112946510315, "sampling/sampling_logp_difference/max": 1.1110554933547974, "sampling/sampling_logp_difference/mean": 0.013426532037556171, "step": 483 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1361.0, "completions/max_terminated_length": 1361.0, "completions/mean_length": 370.1875, "completions/mean_terminated_length": 370.1875, "completions/min_length": 161.0, "completions/min_terminated_length": 161.0, "entropy": 0.3073211908340454, "epoch": 0.856637168141593, "frac_reward_zero_std": 1.0, "grad_norm": 0.05833056604992103, "kl": 0.02759537287056446, "learning_rate": 9.6112288112483e-07, "loss": 0.0003, "num_tokens": 19647418.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.600754737854004, "sampling/importance_sampling_ratio/mean": 1.000586748123169, "sampling/importance_sampling_ratio/min": 0.5483754873275757, "sampling/sampling_logp_difference/max": 0.6007950305938721, "sampling/sampling_logp_difference/mean": 0.013985146768391132, "step": 484 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 284.0, "completions/max_terminated_length": 284.0, "completions/mean_length": 203.828125, "completions/mean_terminated_length": 203.828125, "completions/min_length": 145.0, "completions/min_terminated_length": 145.0, "entropy": 0.15378624200820923, "epoch": 0.8584070796460177, "frac_reward_zero_std": 1.0, "grad_norm": 0.15353742056959785, "kl": 0.03410287946462631, "learning_rate": 9.608237686060097e-07, "loss": 0.0003, "num_tokens": 19670543.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.693957805633545, "sampling/importance_sampling_ratio/mean": 0.99981290102005, "sampling/importance_sampling_ratio/min": 0.5038071870803833, "sampling/sampling_logp_difference/max": 0.6855616569519043, "sampling/sampling_logp_difference/mean": 0.010947016067802906, "step": 485 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2414.0, "completions/max_terminated_length": 2414.0, "completions/mean_length": 575.828125, "completions/mean_terminated_length": 575.828125, "completions/min_length": 162.0, "completions/min_terminated_length": 162.0, "entropy": 0.1923430860042572, "epoch": 0.8601769911504424, "frac_reward_zero_std": 0.5, "grad_norm": 0.938797698967327, "kl": 0.025751130655407906, "learning_rate": 9.605235567456635e-07, "loss": 0.0807, "num_tokens": 19718356.0, "reward": 0.59375, "reward_std": 0.34860679507255554, "rewards/decision_reward_func/mean": 0.59375, "rewards/decision_reward_func/std": 0.8110105991363525, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0000979900360107, "sampling/importance_sampling_ratio/min": 0.2379385381937027, "sampling/sampling_logp_difference/max": 1.4357428550720215, "sampling/sampling_logp_difference/mean": 0.011801760643720627, "step": 486 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3578.0, "completions/max_terminated_length": 3578.0, "completions/mean_length": 845.140625, "completions/mean_terminated_length": 845.140625, "completions/min_length": 302.0, "completions/min_terminated_length": 302.0, "entropy": 0.20593641698360443, "epoch": 0.8619469026548673, "frac_reward_zero_std": 0.25, "grad_norm": 0.9922392885760689, "kl": 0.022639106959104538, "learning_rate": 9.602222462599766e-07, "loss": 0.1859, "num_tokens": 19782317.0, "reward": 0.75, "reward_std": 0.5351393222808838, "rewards/decision_reward_func/mean": 0.75, "rewards/decision_reward_func/std": 0.6666666865348816, "sampling/importance_sampling_ratio/max": 1.8720849752426147, "sampling/importance_sampling_ratio/mean": 0.9998831152915955, "sampling/importance_sampling_ratio/min": 0.1541758030653, "sampling/sampling_logp_difference/max": 1.869661808013916, "sampling/sampling_logp_difference/mean": 0.01093231700360775, "step": 487 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3043.0, "completions/max_terminated_length": 3043.0, "completions/mean_length": 1008.3125, "completions/mean_terminated_length": 1008.3125, "completions/min_length": 275.0, "completions/min_terminated_length": 275.0, "entropy": 0.2797500491142273, "epoch": 0.863716814159292, "frac_reward_zero_std": 0.25, "grad_norm": 0.7084575407213984, "kl": 0.02128446474671364, "learning_rate": 9.599198378677558e-07, "loss": -0.0204, "num_tokens": 19861409.0, "reward": -0.28125, "reward_std": 0.5061737298965454, "rewards/decision_reward_func/mean": -0.28125, "rewards/decision_reward_func/std": 0.9672207236289978, "sampling/importance_sampling_ratio/max": 1.9606797695159912, "sampling/importance_sampling_ratio/mean": 0.9999060034751892, "sampling/importance_sampling_ratio/min": 0.3408898115158081, "sampling/sampling_logp_difference/max": 1.0761959552764893, "sampling/sampling_logp_difference/mean": 0.012265896424651146, "step": 488 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2138.0, "completions/max_terminated_length": 2138.0, "completions/mean_length": 458.40625, "completions/mean_terminated_length": 458.40625, "completions/min_length": 252.0, "completions/min_terminated_length": 252.0, "entropy": 0.35442763566970825, "epoch": 0.8654867256637168, "frac_reward_zero_std": 0.5, "grad_norm": 1.4134250442203335, "kl": 0.02135457843542099, "learning_rate": 9.596163322904269e-07, "loss": 0.1598, "num_tokens": 19902955.0, "reward": 0.375, "reward_std": 0.3265564441680908, "rewards/decision_reward_func/mean": 0.375, "rewards/decision_reward_func/std": 0.934353232383728, "sampling/importance_sampling_ratio/max": 1.6207278966903687, "sampling/importance_sampling_ratio/mean": 0.9998576045036316, "sampling/importance_sampling_ratio/min": 0.382398784160614, "sampling/sampling_logp_difference/max": 0.9612913131713867, "sampling/sampling_logp_difference/mean": 0.014098809100687504, "step": 489 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1598.0, "completions/max_terminated_length": 1598.0, "completions/mean_length": 292.59375, "completions/mean_terminated_length": 292.59375, "completions/min_length": 160.0, "completions/min_terminated_length": 160.0, "entropy": 0.14853447675704956, "epoch": 0.8672566371681416, "frac_reward_zero_std": 1.0, "grad_norm": 0.08583425075891565, "kl": 0.028582872822880745, "learning_rate": 9.593117302520328e-07, "loss": 0.0003, "num_tokens": 19931089.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0011471509933472, "sampling/importance_sampling_ratio/min": 0.4828200340270996, "sampling/sampling_logp_difference/max": 1.4382712841033936, "sampling/sampling_logp_difference/mean": 0.010146915912628174, "step": 490 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4629.0, "completions/max_terminated_length": 4629.0, "completions/mean_length": 709.21875, "completions/mean_terminated_length": 709.21875, "completions/min_length": 135.0, "completions/min_terminated_length": 135.0, "entropy": 0.16501224040985107, "epoch": 0.8690265486725663, "frac_reward_zero_std": 0.75, "grad_norm": 0.5720321798762249, "kl": 0.03221043199300766, "learning_rate": 9.590060324792325e-07, "loss": -0.0831, "num_tokens": 19986319.0, "reward": 0.84375, "reward_std": 0.23935678601264954, "rewards/decision_reward_func/mean": 0.84375, "rewards/decision_reward_func/std": 0.5409794449806213, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9995157122612, "sampling/importance_sampling_ratio/min": 0.42442676424980164, "sampling/sampling_logp_difference/max": 1.0100072622299194, "sampling/sampling_logp_difference/mean": 0.011065034195780754, "step": 491 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2719.0, "completions/max_terminated_length": 2719.0, "completions/mean_length": 823.546875, "completions/mean_terminated_length": 823.546875, "completions/min_length": 162.0, "completions/min_terminated_length": 162.0, "entropy": 0.3087630867958069, "epoch": 0.8707964601769912, "frac_reward_zero_std": 0.5, "grad_norm": 0.8884748608140256, "kl": 0.023067547008395195, "learning_rate": 9.58699239701299e-07, "loss": 0.1091, "num_tokens": 20053090.0, "reward": -0.15625, "reward_std": 0.34860679507255554, "rewards/decision_reward_func/mean": -0.15625, "rewards/decision_reward_func/std": 0.9955257177352905, "sampling/importance_sampling_ratio/max": 1.6566325426101685, "sampling/importance_sampling_ratio/mean": 0.9997268915176392, "sampling/importance_sampling_ratio/min": 0.15512597560882568, "sampling/sampling_logp_difference/max": 1.8635177612304688, "sampling/sampling_logp_difference/mean": 0.014490077272057533, "step": 492 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4542.0, "completions/max_terminated_length": 4542.0, "completions/mean_length": 959.046875, "completions/mean_terminated_length": 959.046875, "completions/min_length": 331.0, "completions/min_terminated_length": 331.0, "entropy": 0.26736214756965637, "epoch": 0.8725663716814159, "frac_reward_zero_std": 0.25, "grad_norm": 0.7787810504208409, "kl": 0.015000334940850735, "learning_rate": 9.58391352650117e-07, "loss": 0.1045, "num_tokens": 20123989.0, "reward": 0.28125, "reward_std": 0.7561737298965454, "rewards/decision_reward_func/mean": 0.28125, "rewards/decision_reward_func/std": 0.9672207236289978, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0002920627593994, "sampling/importance_sampling_ratio/min": 0.33783572912216187, "sampling/sampling_logp_difference/max": 1.085195541381836, "sampling/sampling_logp_difference/mean": 0.011402166448533535, "step": 493 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1225.0, "completions/max_terminated_length": 1225.0, "completions/mean_length": 463.953125, "completions/mean_terminated_length": 463.953125, "completions/min_length": 176.0, "completions/min_terminated_length": 176.0, "entropy": 0.380501389503479, "epoch": 0.8743362831858407, "frac_reward_zero_std": 0.5, "grad_norm": 1.1763092238387642, "kl": 0.028266187757253647, "learning_rate": 9.580823720601823e-07, "loss": 0.1229, "num_tokens": 20167634.0, "reward": 0.15625, "reward_std": 0.4597553312778473, "rewards/decision_reward_func/mean": 0.15625, "rewards/decision_reward_func/std": 0.9955257177352905, "sampling/importance_sampling_ratio/max": 1.6364666223526, "sampling/importance_sampling_ratio/mean": 0.9998792409896851, "sampling/importance_sampling_ratio/min": 0.3372182846069336, "sampling/sampling_logp_difference/max": 1.0870248079299927, "sampling/sampling_logp_difference/mean": 0.017274251207709312, "step": 494 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 498.0, "completions/max_terminated_length": 498.0, "completions/mean_length": 229.25, "completions/mean_terminated_length": 229.25, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "entropy": 0.1531234085559845, "epoch": 0.8761061946902655, "frac_reward_zero_std": 1.0, "grad_norm": 0.07510834175019343, "kl": 0.02510974183678627, "learning_rate": 9.57772298668599e-07, "loss": 0.0002, "num_tokens": 20191202.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5011132955551147, "sampling/importance_sampling_ratio/mean": 0.9995670914649963, "sampling/importance_sampling_ratio/min": 0.4175656735897064, "sampling/sampling_logp_difference/max": 0.8733134269714355, "sampling/sampling_logp_difference/mean": 0.011092218570411205, "step": 495 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 435.0, "completions/max_terminated_length": 435.0, "completions/mean_length": 196.75, "completions/mean_terminated_length": 196.75, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "entropy": 0.16580455005168915, "epoch": 0.8778761061946903, "frac_reward_zero_std": 1.0, "grad_norm": 0.20261235415979437, "kl": 0.04775729775428772, "learning_rate": 9.57461133215079e-07, "loss": 0.0004, "num_tokens": 20214962.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.7033768892288208, "sampling/importance_sampling_ratio/mean": 1.0005569458007812, "sampling/importance_sampling_ratio/min": 0.529240071773529, "sampling/sampling_logp_difference/max": 0.6363131999969482, "sampling/sampling_logp_difference/mean": 0.013049274682998657, "step": 496 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 856.0, "completions/max_terminated_length": 856.0, "completions/mean_length": 277.671875, "completions/mean_terminated_length": 277.671875, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "entropy": 0.3723715543746948, "epoch": 0.879646017699115, "frac_reward_zero_std": 0.75, "grad_norm": 1.4652648708534521, "kl": 0.02884167805314064, "learning_rate": 9.57148876441938e-07, "loss": 0.0908, "num_tokens": 20245133.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.8738765716552734, "sampling/importance_sampling_ratio/mean": 0.9999635219573975, "sampling/importance_sampling_ratio/min": 0.6171298623085022, "sampling/sampling_logp_difference/max": 0.6280093193054199, "sampling/sampling_logp_difference/mean": 0.017529264092445374, "step": 497 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2385.0, "completions/max_terminated_length": 2385.0, "completions/mean_length": 622.8125, "completions/mean_terminated_length": 622.8125, "completions/min_length": 177.0, "completions/min_terminated_length": 177.0, "entropy": 0.26433929800987244, "epoch": 0.8814159292035398, "frac_reward_zero_std": 0.75, "grad_norm": 0.5486815240147482, "kl": 0.022004902362823486, "learning_rate": 9.568355290940966e-07, "loss": -0.12, "num_tokens": 20297569.0, "reward": 0.75, "reward_std": 0.25819888710975647, "rewards/decision_reward_func/mean": 0.75, "rewards/decision_reward_func/std": 0.6666666865348816, "sampling/importance_sampling_ratio/max": 1.5609887838363647, "sampling/importance_sampling_ratio/mean": 0.9994563460350037, "sampling/importance_sampling_ratio/min": 0.25790783762931824, "sampling/sampling_logp_difference/max": 1.35515296459198, "sampling/sampling_logp_difference/mean": 0.013392012566328049, "step": 498 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1927.0, "completions/max_terminated_length": 1927.0, "completions/mean_length": 494.8125, "completions/mean_terminated_length": 494.8125, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "entropy": 0.2631944417953491, "epoch": 0.8831858407079646, "frac_reward_zero_std": 1.0, "grad_norm": 0.038847610781952376, "kl": 0.023738548159599304, "learning_rate": 9.565210919190763e-07, "loss": 0.0002, "num_tokens": 20339557.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9994759559631348, "sampling/importance_sampling_ratio/min": 0.33048567175865173, "sampling/sampling_logp_difference/max": 1.107192039489746, "sampling/sampling_logp_difference/mean": 0.013555317185819149, "step": 499 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3569.0, "completions/max_terminated_length": 3569.0, "completions/mean_length": 666.21875, "completions/mean_terminated_length": 666.21875, "completions/min_length": 128.0, "completions/min_terminated_length": 128.0, "entropy": 0.32354748249053955, "epoch": 0.8849557522123894, "frac_reward_zero_std": 0.25, "grad_norm": 1.116566605525911, "kl": 0.01879994012415409, "learning_rate": 9.562055656669987e-07, "loss": 0.052, "num_tokens": 20396947.0, "reward": -0.03125, "reward_std": 0.519389271736145, "rewards/decision_reward_func/mean": -0.03125, "rewards/decision_reward_func/std": 1.0074130296707153, "sampling/importance_sampling_ratio/max": 1.5854055881500244, "sampling/importance_sampling_ratio/mean": 1.0000243186950684, "sampling/importance_sampling_ratio/min": 0.44626033306121826, "sampling/sampling_logp_difference/max": 0.8068528175354004, "sampling/sampling_logp_difference/mean": 0.013729454018175602, "step": 500 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 544.0, "completions/max_terminated_length": 544.0, "completions/mean_length": 238.546875, "completions/mean_terminated_length": 238.546875, "completions/min_length": 134.0, "completions/min_terminated_length": 134.0, "entropy": 0.2509828805923462, "epoch": 0.8867256637168142, "frac_reward_zero_std": 1.0, "grad_norm": 0.061299953142124566, "kl": 0.024477148428559303, "learning_rate": 9.558889510905835e-07, "loss": 0.0003, "num_tokens": 20424214.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.585755705833435, "sampling/importance_sampling_ratio/mean": 1.0004644393920898, "sampling/importance_sampling_ratio/min": 0.4728452265262604, "sampling/sampling_logp_difference/max": 0.7489871978759766, "sampling/sampling_logp_difference/mean": 0.012459663674235344, "step": 501 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 669.0, "completions/max_terminated_length": 669.0, "completions/mean_length": 255.125, "completions/mean_terminated_length": 255.125, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "entropy": 0.2042524218559265, "epoch": 0.8884955752212389, "frac_reward_zero_std": 0.5, "grad_norm": 2.0667686166486514, "kl": 0.020636608824133873, "learning_rate": 9.555712489451464e-07, "loss": 0.0914, "num_tokens": 20451886.0, "reward": 0.9375, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.9375, "rewards/decision_reward_func/std": 0.35073620080947876, "sampling/importance_sampling_ratio/max": 1.6307634115219116, "sampling/importance_sampling_ratio/mean": 1.0003743171691895, "sampling/importance_sampling_ratio/min": 0.5425347089767456, "sampling/sampling_logp_difference/max": 0.6115032434463501, "sampling/sampling_logp_difference/mean": 0.013254502788186073, "step": 502 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1898.0, "completions/max_terminated_length": 1898.0, "completions/mean_length": 453.109375, "completions/mean_terminated_length": 453.109375, "completions/min_length": 167.0, "completions/min_terminated_length": 167.0, "entropy": 0.17858102917671204, "epoch": 0.8902654867256637, "frac_reward_zero_std": 0.75, "grad_norm": 0.7310950438476714, "kl": 0.017071256414055824, "learning_rate": 9.55252459988598e-07, "loss": 0.0018, "num_tokens": 20491381.0, "reward": 0.65625, "reward_std": 0.23935678601264954, "rewards/decision_reward_func/mean": 0.65625, "rewards/decision_reward_func/std": 0.7605084180831909, "sampling/importance_sampling_ratio/max": 1.655517578125, "sampling/importance_sampling_ratio/mean": 0.9998915195465088, "sampling/importance_sampling_ratio/min": 0.2976139783859253, "sampling/sampling_logp_difference/max": 1.2119579315185547, "sampling/sampling_logp_difference/mean": 0.010301743634045124, "step": 503 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3984.0, "completions/max_terminated_length": 3984.0, "completions/mean_length": 892.8125, "completions/mean_terminated_length": 892.8125, "completions/min_length": 137.0, "completions/min_terminated_length": 137.0, "entropy": 0.18693150579929352, "epoch": 0.8920353982300885, "frac_reward_zero_std": 0.5, "grad_norm": 0.663415695744934, "kl": 0.012066975235939026, "learning_rate": 9.549325849814418e-07, "loss": 0.0002, "num_tokens": 20560281.0, "reward": 0.75, "reward_std": 0.44091323018074036, "rewards/decision_reward_func/mean": 0.75, "rewards/decision_reward_func/std": 0.6666666865348816, "sampling/importance_sampling_ratio/max": 1.7520533800125122, "sampling/importance_sampling_ratio/mean": 1.000130295753479, "sampling/importance_sampling_ratio/min": 0.4887895882129669, "sampling/sampling_logp_difference/max": 0.7158231735229492, "sampling/sampling_logp_difference/mean": 0.009347323328256607, "step": 504 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1875.0, "completions/max_terminated_length": 1875.0, "completions/mean_length": 448.296875, "completions/mean_terminated_length": 448.296875, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "entropy": 0.1908402144908905, "epoch": 0.8938053097345132, "frac_reward_zero_std": 0.75, "grad_norm": 1.0409039621698664, "kl": 0.020206402987241745, "learning_rate": 9.546116246867713e-07, "loss": -0.1278, "num_tokens": 20599516.0, "reward": 0.6875, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.6875, "rewards/decision_reward_func/std": 0.7319250702857971, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0000603199005127, "sampling/importance_sampling_ratio/min": 0.187225803732872, "sampling/sampling_logp_difference/max": 1.6754398345947266, "sampling/sampling_logp_difference/mean": 0.011197139509022236, "step": 505 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3574.0, "completions/max_terminated_length": 3574.0, "completions/mean_length": 584.46875, "completions/mean_terminated_length": 584.46875, "completions/min_length": 186.0, "completions/min_terminated_length": 186.0, "entropy": 0.2717937231063843, "epoch": 0.8955752212389381, "frac_reward_zero_std": 0.5, "grad_norm": 1.0055553740705234, "kl": 0.02126394957304001, "learning_rate": 9.542895798702701e-07, "loss": -0.1258, "num_tokens": 20647274.0, "reward": 0.625, "reward_std": 0.36435678601264954, "rewards/decision_reward_func/mean": 0.625, "rewards/decision_reward_func/std": 0.7867957949638367, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0000284910202026, "sampling/importance_sampling_ratio/min": 0.2912392318248749, "sampling/sampling_logp_difference/max": 1.2336102724075317, "sampling/sampling_logp_difference/mean": 0.011695046909153461, "step": 506 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 279.0, "completions/max_terminated_length": 279.0, "completions/mean_length": 159.59375, "completions/mean_terminated_length": 159.59375, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "entropy": 0.13945519924163818, "epoch": 0.8973451327433628, "frac_reward_zero_std": 0.75, "grad_norm": 1.8795466246857886, "kl": 0.019293267279863358, "learning_rate": 9.539664513002084e-07, "loss": 0.0088, "num_tokens": 20667216.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.9457426071166992, "sampling/importance_sampling_ratio/mean": 1.0001296997070312, "sampling/importance_sampling_ratio/min": 0.4528990089893341, "sampling/sampling_logp_difference/max": 0.792086124420166, "sampling/sampling_logp_difference/mean": 0.011546816676855087, "step": 507 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1506.0, "completions/max_terminated_length": 1506.0, "completions/mean_length": 287.828125, "completions/mean_terminated_length": 287.828125, "completions/min_length": 75.0, "completions/min_terminated_length": 75.0, "entropy": 0.18233928084373474, "epoch": 0.8991150442477877, "frac_reward_zero_std": 0.75, "grad_norm": 1.7038957304940952, "kl": 0.022625073790550232, "learning_rate": 9.536422397474418e-07, "loss": 0.115, "num_tokens": 20694901.0, "reward": 0.875, "reward_std": 0.22360679507255554, "rewards/decision_reward_func/mean": 0.875, "rewards/decision_reward_func/std": 0.48795005679130554, "sampling/importance_sampling_ratio/max": 1.578678846359253, "sampling/importance_sampling_ratio/mean": 0.9999147057533264, "sampling/importance_sampling_ratio/min": 0.35862812399864197, "sampling/sampling_logp_difference/max": 1.025469422340393, "sampling/sampling_logp_difference/mean": 0.012360571883618832, "step": 508 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 642.0, "completions/max_terminated_length": 642.0, "completions/mean_length": 227.96875, "completions/mean_terminated_length": 227.96875, "completions/min_length": 92.0, "completions/min_terminated_length": 92.0, "entropy": 0.262223482131958, "epoch": 0.9008849557522124, "frac_reward_zero_std": 1.0, "grad_norm": 0.05810165234296274, "kl": 0.02398998849093914, "learning_rate": 9.533169459854098e-07, "loss": 0.0003, "num_tokens": 20720115.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9999184012413025, "sampling/importance_sampling_ratio/min": 0.5238364934921265, "sampling/sampling_logp_difference/max": 0.7698227167129517, "sampling/sampling_logp_difference/mean": 0.013909758999943733, "step": 509 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1558.0, "completions/max_terminated_length": 1558.0, "completions/mean_length": 533.703125, "completions/mean_terminated_length": 533.703125, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "entropy": 0.2980627119541168, "epoch": 0.9026548672566371, "frac_reward_zero_std": 0.25, "grad_norm": 1.7258051394628993, "kl": 0.01877797208726406, "learning_rate": 9.529905707901333e-07, "loss": 0.1157, "num_tokens": 20764656.0, "reward": 0.5625, "reward_std": 0.6285127401351929, "rewards/decision_reward_func/mean": 0.5625, "rewards/decision_reward_func/std": 0.8333333730697632, "sampling/importance_sampling_ratio/max": 1.6207484006881714, "sampling/importance_sampling_ratio/mean": 0.9999284148216248, "sampling/importance_sampling_ratio/min": 0.551980197429657, "sampling/sampling_logp_difference/max": 0.594243049621582, "sampling/sampling_logp_difference/mean": 0.013655584305524826, "step": 510 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1367.0, "completions/max_terminated_length": 1367.0, "completions/mean_length": 281.8125, "completions/mean_terminated_length": 281.8125, "completions/min_length": 72.0, "completions/min_terminated_length": 72.0, "entropy": 0.24901863932609558, "epoch": 0.904424778761062, "frac_reward_zero_std": 0.75, "grad_norm": 1.6624205424065062, "kl": 0.023065539076924324, "learning_rate": 9.526631149402134e-07, "loss": 0.0099, "num_tokens": 20794548.0, "reward": 0.53125, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.53125, "rewards/decision_reward_func/std": 0.8539125919342041, "sampling/importance_sampling_ratio/max": 1.909961462020874, "sampling/importance_sampling_ratio/mean": 0.9998233914375305, "sampling/importance_sampling_ratio/min": 0.37652334570884705, "sampling/sampling_logp_difference/max": 0.9767752885818481, "sampling/sampling_logp_difference/mean": 0.012988662347197533, "step": 511 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1757.0, "completions/max_terminated_length": 1757.0, "completions/mean_length": 379.515625, "completions/mean_terminated_length": 379.515625, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "entropy": 0.2605680227279663, "epoch": 0.9061946902654867, "frac_reward_zero_std": 1.0, "grad_norm": 0.032075335804240955, "kl": 0.01992485299706459, "learning_rate": 9.523345792168288e-07, "loss": 0.0002, "num_tokens": 20828645.0, "reward": 0.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 1.706834077835083, "sampling/importance_sampling_ratio/mean": 0.9999606013298035, "sampling/importance_sampling_ratio/min": 0.49221864342689514, "sampling/sampling_logp_difference/max": 0.7088322639465332, "sampling/sampling_logp_difference/mean": 0.012871419079601765, "step": 512 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1655.0, "completions/max_terminated_length": 1655.0, "completions/mean_length": 265.484375, "completions/mean_terminated_length": 265.484375, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "entropy": 0.22514209151268005, "epoch": 0.9079646017699115, "frac_reward_zero_std": 0.75, "grad_norm": 1.3613412730411028, "kl": 0.023211799561977386, "learning_rate": 9.520049644037347e-07, "loss": 0.2754, "num_tokens": 20856820.0, "reward": 0.46875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 1.6048742532730103, "sampling/importance_sampling_ratio/mean": 1.000026822090149, "sampling/importance_sampling_ratio/min": 0.3345174193382263, "sampling/sampling_logp_difference/max": 1.0950663089752197, "sampling/sampling_logp_difference/mean": 0.01329261064529419, "step": 513 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1025.0, "completions/max_terminated_length": 1025.0, "completions/mean_length": 370.59375, "completions/mean_terminated_length": 370.59375, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "entropy": 0.2572430968284607, "epoch": 0.9097345132743363, "frac_reward_zero_std": 0.5, "grad_norm": 1.459750618496502, "kl": 0.018980465829372406, "learning_rate": 9.516742712872605e-07, "loss": -0.0066, "num_tokens": 20898618.0, "reward": 0.4375, "reward_std": 0.49553054571151733, "rewards/decision_reward_func/mean": 0.4375, "rewards/decision_reward_func/std": 0.9063270092010498, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.00021231174469, "sampling/importance_sampling_ratio/min": 0.4910165071487427, "sampling/sampling_logp_difference/max": 0.7280449867248535, "sampling/sampling_logp_difference/mean": 0.012723185122013092, "step": 514 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1364.0, "completions/max_terminated_length": 1364.0, "completions/mean_length": 364.484375, "completions/mean_terminated_length": 364.484375, "completions/min_length": 134.0, "completions/min_terminated_length": 134.0, "entropy": 0.17833691835403442, "epoch": 0.911504424778761, "frac_reward_zero_std": 1.0, "grad_norm": 0.03726561459343124, "kl": 0.0227162204682827, "learning_rate": 9.513425006563078e-07, "loss": 0.0002, "num_tokens": 20931673.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.6604782342910767, "sampling/importance_sampling_ratio/mean": 1.000342845916748, "sampling/importance_sampling_ratio/min": 0.48757144808769226, "sampling/sampling_logp_difference/max": 0.7183184623718262, "sampling/sampling_logp_difference/mean": 0.010283341631293297, "step": 515 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 878.0, "completions/max_terminated_length": 878.0, "completions/mean_length": 272.109375, "completions/mean_terminated_length": 272.109375, "completions/min_length": 72.0, "completions/min_terminated_length": 72.0, "entropy": 0.2885431945323944, "epoch": 0.9132743362831859, "frac_reward_zero_std": 0.75, "grad_norm": 1.3880553019554223, "kl": 0.029161231592297554, "learning_rate": 9.51009653302349e-07, "loss": 0.0929, "num_tokens": 20961216.0, "reward": 0.84375, "reward_std": 0.23935678601264954, "rewards/decision_reward_func/mean": 0.84375, "rewards/decision_reward_func/std": 0.5409794449806213, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.999738335609436, "sampling/importance_sampling_ratio/min": 0.3853294253349304, "sampling/sampling_logp_difference/max": 0.9536566734313965, "sampling/sampling_logp_difference/mean": 0.016232019290328026, "step": 516 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1301.0, "completions/max_terminated_length": 1301.0, "completions/mean_length": 466.296875, "completions/mean_terminated_length": 466.296875, "completions/min_length": 161.0, "completions/min_terminated_length": 161.0, "entropy": 0.21986886858940125, "epoch": 0.9150442477876106, "frac_reward_zero_std": 0.5, "grad_norm": 1.0640080584712213, "kl": 0.017498716711997986, "learning_rate": 9.506757300194248e-07, "loss": 0.0858, "num_tokens": 21002867.0, "reward": 0.75, "reward_std": 0.4472135901451111, "rewards/decision_reward_func/mean": 0.75, "rewards/decision_reward_func/std": 0.6666666865348816, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0000202655792236, "sampling/importance_sampling_ratio/min": 0.4807683825492859, "sampling/sampling_logp_difference/max": 0.7323696613311768, "sampling/sampling_logp_difference/mean": 0.01144690066576004, "step": 517 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1260.0, "completions/max_terminated_length": 1260.0, "completions/mean_length": 287.65625, "completions/mean_terminated_length": 287.65625, "completions/min_length": 117.0, "completions/min_terminated_length": 117.0, "entropy": 0.2133372724056244, "epoch": 0.9168141592920354, "frac_reward_zero_std": 0.75, "grad_norm": 1.4431468422152827, "kl": 0.025945238769054413, "learning_rate": 9.50340731604143e-07, "loss": 0.1932, "num_tokens": 21030845.0, "reward": 0.34375, "reward_std": 0.23935678601264954, "rewards/decision_reward_func/mean": 0.34375, "rewards/decision_reward_func/std": 0.9464847445487976, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0001201629638672, "sampling/importance_sampling_ratio/min": 0.32297107577323914, "sampling/sampling_logp_difference/max": 1.130192518234253, "sampling/sampling_logp_difference/mean": 0.014624932780861855, "step": 518 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 705.0, "completions/max_terminated_length": 705.0, "completions/mean_length": 253.171875, "completions/mean_terminated_length": 253.171875, "completions/min_length": 83.0, "completions/min_terminated_length": 83.0, "entropy": 0.15925630927085876, "epoch": 0.9185840707964602, "frac_reward_zero_std": 0.75, "grad_norm": 1.2472722839248847, "kl": 0.023008838295936584, "learning_rate": 9.500046588556761e-07, "loss": -0.0211, "num_tokens": 21056440.0, "reward": 0.9375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.9375, "rewards/decision_reward_func/std": 0.35073620080947876, "sampling/importance_sampling_ratio/max": 1.8941967487335205, "sampling/importance_sampling_ratio/mean": 1.0001165866851807, "sampling/importance_sampling_ratio/min": 0.43685463070869446, "sampling/sampling_logp_difference/max": 0.8281548023223877, "sampling/sampling_logp_difference/mean": 0.011363551020622253, "step": 519 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1257.0, "completions/max_terminated_length": 1257.0, "completions/mean_length": 471.953125, "completions/mean_terminated_length": 471.953125, "completions/min_length": 215.0, "completions/min_terminated_length": 215.0, "entropy": 0.18254661560058594, "epoch": 0.9203539823008849, "frac_reward_zero_std": 0.5, "grad_norm": 1.0389289601930534, "kl": 0.016383303329348564, "learning_rate": 9.496675125757594e-07, "loss": 0.084, "num_tokens": 21096533.0, "reward": -0.46875, "reward_std": 0.29578250646591187, "rewards/decision_reward_func/mean": -0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 1.896669626235962, "sampling/importance_sampling_ratio/mean": 0.9999995231628418, "sampling/importance_sampling_ratio/min": 0.4861721992492676, "sampling/sampling_logp_difference/max": 0.7211923599243164, "sampling/sampling_logp_difference/mean": 0.010693486779928207, "step": 520 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1333.0, "completions/max_terminated_length": 1333.0, "completions/mean_length": 502.453125, "completions/mean_terminated_length": 502.453125, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "entropy": 0.17277035117149353, "epoch": 0.9221238938053097, "frac_reward_zero_std": 0.25, "grad_norm": 1.2155602204284752, "kl": 0.020302508026361465, "learning_rate": 9.493292935686894e-07, "loss": -0.0147, "num_tokens": 21137954.0, "reward": -0.03125, "reward_std": 0.375, "rewards/decision_reward_func/mean": -0.03125, "rewards/decision_reward_func/std": 1.0074130296707153, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0001676082611084, "sampling/importance_sampling_ratio/min": 0.06681614369153976, "sampling/sampling_logp_difference/max": 2.705810546875, "sampling/sampling_logp_difference/mean": 0.0116327665746212, "step": 521 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 469.0, "completions/max_terminated_length": 469.0, "completions/mean_length": 219.578125, "completions/mean_terminated_length": 219.578125, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "entropy": 0.1833188384771347, "epoch": 0.9238938053097345, "frac_reward_zero_std": 0.75, "grad_norm": 1.5421232222577277, "kl": 0.025663238018751144, "learning_rate": 9.489900026413216e-07, "loss": 0.0024, "num_tokens": 21161479.0, "reward": 0.46875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 1.70711088180542, "sampling/importance_sampling_ratio/mean": 1.0002272129058838, "sampling/importance_sampling_ratio/min": 0.30250707268714905, "sampling/sampling_logp_difference/max": 1.195650577545166, "sampling/sampling_logp_difference/mean": 0.012792345136404037, "step": 522 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1480.0, "completions/max_terminated_length": 1480.0, "completions/mean_length": 291.0, "completions/mean_terminated_length": 291.0, "completions/min_length": 69.0, "completions/min_terminated_length": 69.0, "entropy": 0.27441734075546265, "epoch": 0.9256637168141593, "frac_reward_zero_std": 0.75, "grad_norm": 1.1172233415070547, "kl": 0.025678176432847977, "learning_rate": 9.486496406030685e-07, "loss": 0.0297, "num_tokens": 21191175.0, "reward": 0.4375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.4375, "rewards/decision_reward_func/std": 0.9063270092010498, "sampling/importance_sampling_ratio/max": 1.5946173667907715, "sampling/importance_sampling_ratio/mean": 1.0004942417144775, "sampling/importance_sampling_ratio/min": 0.511827290058136, "sampling/sampling_logp_difference/max": 0.6697680950164795, "sampling/sampling_logp_difference/mean": 0.01495583076030016, "step": 523 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1295.0, "completions/max_terminated_length": 1295.0, "completions/mean_length": 563.640625, "completions/mean_terminated_length": 563.640625, "completions/min_length": 178.0, "completions/min_terminated_length": 178.0, "entropy": 0.27270397543907166, "epoch": 0.9274336283185841, "frac_reward_zero_std": 0.0, "grad_norm": 1.266778774089369, "kl": 0.020550522953271866, "learning_rate": 9.483082082658982e-07, "loss": 0.0996, "num_tokens": 21243344.0, "reward": 0.59375, "reward_std": 0.7744960784912109, "rewards/decision_reward_func/mean": 0.59375, "rewards/decision_reward_func/std": 0.8110105991363525, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9999998807907104, "sampling/importance_sampling_ratio/min": 0.5246801376342773, "sampling/sampling_logp_difference/max": 0.7540783882141113, "sampling/sampling_logp_difference/mean": 0.013334444724023342, "step": 524 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 906.0, "completions/max_terminated_length": 906.0, "completions/mean_length": 285.296875, "completions/mean_terminated_length": 285.296875, "completions/min_length": 69.0, "completions/min_terminated_length": 69.0, "entropy": 0.2499884068965912, "epoch": 0.9292035398230089, "frac_reward_zero_std": 0.75, "grad_norm": 1.02736322923442, "kl": 0.028092876076698303, "learning_rate": 9.479657064443321e-07, "loss": -0.0152, "num_tokens": 21272979.0, "reward": 0.34375, "reward_std": 0.23935678601264954, "rewards/decision_reward_func/mean": 0.34375, "rewards/decision_reward_func/std": 0.9464847445487976, "sampling/importance_sampling_ratio/max": 1.94199538230896, "sampling/importance_sampling_ratio/mean": 1.000009298324585, "sampling/importance_sampling_ratio/min": 0.31634464859962463, "sampling/sampling_logp_difference/max": 1.1509230136871338, "sampling/sampling_logp_difference/mean": 0.01449848897755146, "step": 525 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 897.0, "completions/max_terminated_length": 897.0, "completions/mean_length": 375.21875, "completions/mean_terminated_length": 375.21875, "completions/min_length": 120.0, "completions/min_terminated_length": 120.0, "entropy": 0.2833993434906006, "epoch": 0.9309734513274336, "frac_reward_zero_std": 0.75, "grad_norm": 1.0936649013563684, "kl": 0.021095579490065575, "learning_rate": 9.476221359554423e-07, "loss": 0.0108, "num_tokens": 21309345.0, "reward": 0.9375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.9375, "rewards/decision_reward_func/std": 0.35073620080947876, "sampling/importance_sampling_ratio/max": 1.864753007888794, "sampling/importance_sampling_ratio/mean": 0.999671459197998, "sampling/importance_sampling_ratio/min": 0.30697447061538696, "sampling/sampling_logp_difference/max": 1.1809906959533691, "sampling/sampling_logp_difference/mean": 0.014631031081080437, "step": 526 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1646.0, "completions/max_terminated_length": 1646.0, "completions/mean_length": 396.078125, "completions/mean_terminated_length": 396.078125, "completions/min_length": 78.0, "completions/min_terminated_length": 78.0, "entropy": 0.268634170293808, "epoch": 0.9327433628318584, "frac_reward_zero_std": 0.75, "grad_norm": 0.9005100667555918, "kl": 0.02202232927083969, "learning_rate": 9.472774976188513e-07, "loss": -0.038, "num_tokens": 21346422.0, "reward": 0.71875, "reward_std": 0.2561737596988678, "rewards/decision_reward_func/mean": 0.71875, "rewards/decision_reward_func/std": 0.7007648944854736, "sampling/importance_sampling_ratio/max": 1.6294962167739868, "sampling/importance_sampling_ratio/mean": 1.0000548362731934, "sampling/importance_sampling_ratio/min": 0.3007786273956299, "sampling/sampling_logp_difference/max": 1.201380729675293, "sampling/sampling_logp_difference/mean": 0.01456962339580059, "step": 527 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3527.0, "completions/max_terminated_length": 3527.0, "completions/mean_length": 502.609375, "completions/mean_terminated_length": 502.609375, "completions/min_length": 73.0, "completions/min_terminated_length": 73.0, "entropy": 0.37191182374954224, "epoch": 0.9345132743362832, "frac_reward_zero_std": 0.5, "grad_norm": 0.9464645561585285, "kl": 0.024493016302585602, "learning_rate": 9.469317922567286e-07, "loss": -0.0854, "num_tokens": 21395357.0, "reward": 0.8125, "reward_std": 0.3943893015384674, "rewards/decision_reward_func/mean": 0.8125, "rewards/decision_reward_func/std": 0.5875696539878845, "sampling/importance_sampling_ratio/max": 1.597666621208191, "sampling/importance_sampling_ratio/mean": 1.0002484321594238, "sampling/importance_sampling_ratio/min": 0.5128141045570374, "sampling/sampling_logp_difference/max": 0.667841911315918, "sampling/sampling_logp_difference/mean": 0.015820709988474846, "step": 528 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1327.0, "completions/max_terminated_length": 1327.0, "completions/mean_length": 556.5625, "completions/mean_terminated_length": 556.5625, "completions/min_length": 162.0, "completions/min_terminated_length": 162.0, "entropy": 0.3464619219303131, "epoch": 0.9362831858407079, "frac_reward_zero_std": 0.5, "grad_norm": 1.0443091859398246, "kl": 0.024154746904969215, "learning_rate": 9.465850206937887e-07, "loss": -0.0096, "num_tokens": 21443777.0, "reward": -0.0625, "reward_std": 0.3943893015384674, "rewards/decision_reward_func/mean": -0.0625, "rewards/decision_reward_func/std": 1.0059348344802856, "sampling/importance_sampling_ratio/max": 1.7615402936935425, "sampling/importance_sampling_ratio/mean": 0.9996815919876099, "sampling/importance_sampling_ratio/min": 0.12583321332931519, "sampling/sampling_logp_difference/max": 2.072798013687134, "sampling/sampling_logp_difference/mean": 0.014616015367209911, "step": 529 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1644.0, "completions/max_terminated_length": 1644.0, "completions/mean_length": 532.5625, "completions/mean_terminated_length": 532.5625, "completions/min_length": 77.0, "completions/min_terminated_length": 77.0, "entropy": 0.20331741869449615, "epoch": 0.9380530973451328, "frac_reward_zero_std": 0.5, "grad_norm": 1.12313284409312, "kl": 0.01774846389889717, "learning_rate": 9.462371837572906e-07, "loss": -0.0681, "num_tokens": 21489589.0, "reward": 0.5, "reward_std": 0.4787135720252991, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.6582591533660889, "sampling/importance_sampling_ratio/mean": 0.9999247789382935, "sampling/importance_sampling_ratio/min": 0.3758205771446228, "sampling/sampling_logp_difference/max": 0.9786434769630432, "sampling/sampling_logp_difference/mean": 0.012230996042490005, "step": 530 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1743.0, "completions/max_terminated_length": 1743.0, "completions/mean_length": 286.640625, "completions/mean_terminated_length": 286.640625, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "entropy": 0.269568532705307, "epoch": 0.9398230088495575, "frac_reward_zero_std": 0.5, "grad_norm": 1.9943164535112692, "kl": 0.020269380882382393, "learning_rate": 9.45888282277034e-07, "loss": 0.0348, "num_tokens": 21524494.0, "reward": 0.59375, "reward_std": 0.34860679507255554, "rewards/decision_reward_func/mean": 0.59375, "rewards/decision_reward_func/std": 0.8110105991363525, "sampling/importance_sampling_ratio/max": 1.6286609172821045, "sampling/importance_sampling_ratio/mean": 1.000028371810913, "sampling/importance_sampling_ratio/min": 0.3202432692050934, "sampling/sampling_logp_difference/max": 1.1386743783950806, "sampling/sampling_logp_difference/mean": 0.013339969329535961, "step": 531 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2145.0, "completions/max_terminated_length": 2145.0, "completions/mean_length": 582.734375, "completions/mean_terminated_length": 582.734375, "completions/min_length": 142.0, "completions/min_terminated_length": 142.0, "entropy": 0.2859712839126587, "epoch": 0.9415929203539823, "frac_reward_zero_std": 0.75, "grad_norm": 0.7113566220061817, "kl": 0.022886380553245544, "learning_rate": 9.455383170853585e-07, "loss": 0.0471, "num_tokens": 21573613.0, "reward": 0.71875, "reward_std": 0.2561737596988678, "rewards/decision_reward_func/mean": 0.71875, "rewards/decision_reward_func/std": 0.7007648944854736, "sampling/importance_sampling_ratio/max": 1.7381569147109985, "sampling/importance_sampling_ratio/mean": 0.9996509552001953, "sampling/importance_sampling_ratio/min": 0.4597215950489044, "sampling/sampling_logp_difference/max": 0.7771341800689697, "sampling/sampling_logp_difference/mean": 0.012271612882614136, "step": 532 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1216.0, "completions/max_terminated_length": 1216.0, "completions/mean_length": 376.359375, "completions/mean_terminated_length": 376.359375, "completions/min_length": 85.0, "completions/min_terminated_length": 85.0, "entropy": 0.20858976244926453, "epoch": 0.9433628318584071, "frac_reward_zero_std": 0.75, "grad_norm": 1.111623932047464, "kl": 0.027697864919900894, "learning_rate": 9.451872890171419e-07, "loss": -0.0615, "num_tokens": 21617780.0, "reward": 0.5625, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.5625, "rewards/decision_reward_func/std": 0.8333333730697632, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9997859001159668, "sampling/importance_sampling_ratio/min": 0.2685186564922333, "sampling/sampling_logp_difference/max": 1.3148349523544312, "sampling/sampling_logp_difference/mean": 0.013691119849681854, "step": 533 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 816.0, "completions/max_terminated_length": 816.0, "completions/mean_length": 287.625, "completions/mean_terminated_length": 287.625, "completions/min_length": 90.0, "completions/min_terminated_length": 90.0, "entropy": 0.27340298891067505, "epoch": 0.9451327433628318, "frac_reward_zero_std": 0.5, "grad_norm": 2.0746949995492296, "kl": 0.02561030164361, "learning_rate": 9.448351989097962e-07, "loss": 0.1171, "num_tokens": 21647308.0, "reward": 0.875, "reward_std": 0.3265564441680908, "rewards/decision_reward_func/mean": 0.875, "rewards/decision_reward_func/std": 0.48795005679130554, "sampling/importance_sampling_ratio/max": 1.6170252561569214, "sampling/importance_sampling_ratio/mean": 0.9997333288192749, "sampling/importance_sampling_ratio/min": 0.3439459502696991, "sampling/sampling_logp_difference/max": 1.0672707557678223, "sampling/sampling_logp_difference/mean": 0.015955470502376556, "step": 534 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1674.0, "completions/max_terminated_length": 1674.0, "completions/mean_length": 369.5625, "completions/mean_terminated_length": 369.5625, "completions/min_length": 85.0, "completions/min_terminated_length": 85.0, "entropy": 0.34255555272102356, "epoch": 0.9469026548672567, "frac_reward_zero_std": 0.5, "grad_norm": 1.3596016956818258, "kl": 0.02321462705731392, "learning_rate": 9.444820476032685e-07, "loss": 0.0338, "num_tokens": 21682976.0, "reward": 0.125, "reward_std": 0.5, "rewards/decision_reward_func/mean": 0.125, "rewards/decision_reward_func/std": 1.0, "sampling/importance_sampling_ratio/max": 1.818648338317871, "sampling/importance_sampling_ratio/mean": 0.9995927810668945, "sampling/importance_sampling_ratio/min": 0.47599557042121887, "sampling/sampling_logp_difference/max": 0.7423467636108398, "sampling/sampling_logp_difference/mean": 0.01575983315706253, "step": 535 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 230.0, "completions/max_terminated_length": 230.0, "completions/mean_length": 141.8125, "completions/mean_terminated_length": 141.8125, "completions/min_length": 79.0, "completions/min_terminated_length": 79.0, "entropy": 0.12443335354328156, "epoch": 0.9486725663716814, "frac_reward_zero_std": 1.0, "grad_norm": 0.4157713824464641, "kl": 0.03436177596449852, "learning_rate": 9.441278359400364e-07, "loss": 0.0003, "num_tokens": 21701732.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9996136426925659, "sampling/importance_sampling_ratio/min": 0.4161679744720459, "sampling/sampling_logp_difference/max": 0.8766663074493408, "sampling/sampling_logp_difference/mean": 0.012168042361736298, "step": 536 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1591.0, "completions/max_terminated_length": 1591.0, "completions/mean_length": 282.625, "completions/mean_terminated_length": 282.625, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "entropy": 0.3478095531463623, "epoch": 0.9504424778761061, "frac_reward_zero_std": 1.0, "grad_norm": 0.05059723556364282, "kl": 0.02721760794520378, "learning_rate": 9.437725647651078e-07, "loss": 0.0003, "num_tokens": 21730748.0, "reward": 0.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.99959397315979, "sampling/importance_sampling_ratio/min": 0.48076334595680237, "sampling/sampling_logp_difference/max": 0.9805173873901367, "sampling/sampling_logp_difference/mean": 0.016803959384560585, "step": 537 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1750.0, "completions/max_terminated_length": 1750.0, "completions/mean_length": 444.796875, "completions/mean_terminated_length": 444.796875, "completions/min_length": 78.0, "completions/min_terminated_length": 78.0, "entropy": 0.3063478469848633, "epoch": 0.952212389380531, "frac_reward_zero_std": 0.5, "grad_norm": 1.3289012697815763, "kl": 0.025018278509378433, "learning_rate": 9.434162349260178e-07, "loss": -0.2272, "num_tokens": 21770031.0, "reward": 0.5, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.817976713180542, "sampling/importance_sampling_ratio/mean": 0.9999020099639893, "sampling/importance_sampling_ratio/min": 0.34005436301231384, "sampling/sampling_logp_difference/max": 1.0786497592926025, "sampling/sampling_logp_difference/mean": 0.015159091912209988, "step": 538 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1483.0, "completions/max_terminated_length": 1483.0, "completions/mean_length": 484.96875, "completions/mean_terminated_length": 484.96875, "completions/min_length": 149.0, "completions/min_terminated_length": 149.0, "entropy": 0.3148280084133148, "epoch": 0.9539823008849557, "frac_reward_zero_std": 0.5, "grad_norm": 1.3661618075924402, "kl": 0.023418953642249107, "learning_rate": 9.430588472728269e-07, "loss": -0.1075, "num_tokens": 21812205.0, "reward": 0.59375, "reward_std": 0.5061737298965454, "rewards/decision_reward_func/mean": 0.59375, "rewards/decision_reward_func/std": 0.8110105991363525, "sampling/importance_sampling_ratio/max": 1.8570363521575928, "sampling/importance_sampling_ratio/mean": 0.9998496770858765, "sampling/importance_sampling_ratio/min": 0.08608989417552948, "sampling/sampling_logp_difference/max": 2.4523632526397705, "sampling/sampling_logp_difference/mean": 0.014677565544843674, "step": 539 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2178.0, "completions/max_terminated_length": 2178.0, "completions/mean_length": 324.203125, "completions/mean_terminated_length": 324.203125, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "entropy": 0.2846929430961609, "epoch": 0.9557522123893806, "frac_reward_zero_std": 0.5, "grad_norm": 1.6202712818309724, "kl": 0.023219114169478416, "learning_rate": 9.427004026581196e-07, "loss": 0.2403, "num_tokens": 21843098.0, "reward": 0.90625, "reward_std": 0.29578250646591187, "rewards/decision_reward_func/mean": 0.90625, "rewards/decision_reward_func/std": 0.42608407139778137, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0004980564117432, "sampling/importance_sampling_ratio/min": 0.19272294640541077, "sampling/sampling_logp_difference/max": 1.6465015411376953, "sampling/sampling_logp_difference/mean": 0.01398945041000843, "step": 540 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2173.0, "completions/max_terminated_length": 2173.0, "completions/mean_length": 452.9375, "completions/mean_terminated_length": 452.9375, "completions/min_length": 75.0, "completions/min_terminated_length": 75.0, "entropy": 0.24589470028877258, "epoch": 0.9575221238938053, "frac_reward_zero_std": 0.75, "grad_norm": 0.8349928962881076, "kl": 0.01809200644493103, "learning_rate": 9.423409019370014e-07, "loss": -0.0355, "num_tokens": 21883062.0, "reward": 0.40625, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.40625, "rewards/decision_reward_func/std": 0.9209855198860168, "sampling/importance_sampling_ratio/max": 1.7520596981048584, "sampling/importance_sampling_ratio/mean": 0.9997550249099731, "sampling/importance_sampling_ratio/min": 0.4614878296852112, "sampling/sampling_logp_difference/max": 0.7732995748519897, "sampling/sampling_logp_difference/mean": 0.012460077181458473, "step": 541 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2501.0, "completions/max_terminated_length": 2501.0, "completions/mean_length": 379.203125, "completions/mean_terminated_length": 379.203125, "completions/min_length": 89.0, "completions/min_terminated_length": 89.0, "entropy": 0.18397878110408783, "epoch": 0.95929203539823, "frac_reward_zero_std": 0.75, "grad_norm": 1.0203814099560193, "kl": 0.02188866212964058, "learning_rate": 9.419803459670979e-07, "loss": -0.0276, "num_tokens": 21917971.0, "reward": 0.59375, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.59375, "rewards/decision_reward_func/std": 0.8110105991363525, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0002720355987549, "sampling/importance_sampling_ratio/min": 0.29257097840309143, "sampling/sampling_logp_difference/max": 1.2290480136871338, "sampling/sampling_logp_difference/mean": 0.013033962808549404, "step": 542 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 311.0, "completions/max_terminated_length": 311.0, "completions/mean_length": 178.90625, "completions/mean_terminated_length": 178.90625, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "entropy": 0.1637686938047409, "epoch": 0.9610619469026549, "frac_reward_zero_std": 0.75, "grad_norm": 2.6441499583647214, "kl": 0.02739560976624489, "learning_rate": 9.416187356085512e-07, "loss": 0.027, "num_tokens": 21938909.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.9695477485656738, "sampling/importance_sampling_ratio/mean": 1.0001466274261475, "sampling/importance_sampling_ratio/min": 0.5504536628723145, "sampling/sampling_logp_difference/max": 0.6778039932250977, "sampling/sampling_logp_difference/mean": 0.013573607429862022, "step": 543 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 298.0, "completions/max_terminated_length": 298.0, "completions/mean_length": 176.390625, "completions/mean_terminated_length": 176.390625, "completions/min_length": 69.0, "completions/min_terminated_length": 69.0, "entropy": 0.1320582926273346, "epoch": 0.9628318584070796, "frac_reward_zero_std": 1.0, "grad_norm": 0.07132167841710013, "kl": 0.025306664407253265, "learning_rate": 9.412560717240195e-07, "loss": 0.0002, "num_tokens": 21960150.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.6979695558547974, "sampling/importance_sampling_ratio/mean": 1.0000154972076416, "sampling/importance_sampling_ratio/min": 0.5702304840087891, "sampling/sampling_logp_difference/max": 0.5617146492004395, "sampling/sampling_logp_difference/mean": 0.011860273778438568, "step": 544 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1601.0, "completions/max_terminated_length": 1601.0, "completions/mean_length": 323.859375, "completions/mean_terminated_length": 323.859375, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "entropy": 0.33289211988449097, "epoch": 0.9646017699115044, "frac_reward_zero_std": 0.75, "grad_norm": 0.7318094478451347, "kl": 0.033304162323474884, "learning_rate": 9.408923551786742e-07, "loss": -0.0369, "num_tokens": 21993981.0, "reward": 0.9375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.9375, "rewards/decision_reward_func/std": 0.35073620080947876, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.999862551689148, "sampling/importance_sampling_ratio/min": 0.5013141632080078, "sampling/sampling_logp_difference/max": 1.0378296375274658, "sampling/sampling_logp_difference/mean": 0.01616358570754528, "step": 545 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1555.0, "completions/max_terminated_length": 1555.0, "completions/mean_length": 527.828125, "completions/mean_terminated_length": 527.828125, "completions/min_length": 73.0, "completions/min_terminated_length": 73.0, "entropy": 0.28897106647491455, "epoch": 0.9663716814159292, "frac_reward_zero_std": 0.25, "grad_norm": 1.1403832043970759, "kl": 0.02006874978542328, "learning_rate": 9.405275868401974e-07, "loss": 0.0095, "num_tokens": 22041874.0, "reward": 0.28125, "reward_std": 0.6505630612373352, "rewards/decision_reward_func/mean": 0.28125, "rewards/decision_reward_func/std": 0.9672207236289978, "sampling/importance_sampling_ratio/max": 1.6463189125061035, "sampling/importance_sampling_ratio/mean": 0.9999964833259583, "sampling/importance_sampling_ratio/min": 0.30695831775665283, "sampling/sampling_logp_difference/max": 1.1810433864593506, "sampling/sampling_logp_difference/mean": 0.014173662289977074, "step": 546 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 890.0, "completions/max_terminated_length": 890.0, "completions/mean_length": 310.359375, "completions/mean_terminated_length": 310.359375, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.18950799107551575, "epoch": 0.968141592920354, "frac_reward_zero_std": 0.5, "grad_norm": 1.810972857021714, "kl": 0.01892077922821045, "learning_rate": 9.40161767578781e-07, "loss": 0.0435, "num_tokens": 22071289.0, "reward": 0.78125, "reward_std": 0.375, "rewards/decision_reward_func/mean": 0.78125, "rewards/decision_reward_func/std": 0.6291528940200806, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9997496604919434, "sampling/importance_sampling_ratio/min": 0.3650587499141693, "sampling/sampling_logp_difference/max": 1.0076969861984253, "sampling/sampling_logp_difference/mean": 0.013722660019993782, "step": 547 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1540.0, "completions/max_terminated_length": 1540.0, "completions/mean_length": 380.828125, "completions/mean_terminated_length": 380.828125, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "entropy": 0.2410738170146942, "epoch": 0.9699115044247788, "frac_reward_zero_std": 0.75, "grad_norm": 1.0179757731527428, "kl": 0.019469907507300377, "learning_rate": 9.397948982671236e-07, "loss": 0.0775, "num_tokens": 22106494.0, "reward": 0.46875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 1.901466727256775, "sampling/importance_sampling_ratio/mean": 1.000008225440979, "sampling/importance_sampling_ratio/min": 0.5966559648513794, "sampling/sampling_logp_difference/max": 0.6426255702972412, "sampling/sampling_logp_difference/mean": 0.012975430116057396, "step": 548 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1340.0, "completions/max_terminated_length": 1340.0, "completions/mean_length": 481.734375, "completions/mean_terminated_length": 481.734375, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "entropy": 0.17240966856479645, "epoch": 0.9716814159292035, "frac_reward_zero_std": 0.75, "grad_norm": 0.7383032500525908, "kl": 0.016760703176259995, "learning_rate": 9.394269797804288e-07, "loss": -0.0045, "num_tokens": 22150013.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.6964372396469116, "sampling/importance_sampling_ratio/mean": 1.0000250339508057, "sampling/importance_sampling_ratio/min": 0.43722039461135864, "sampling/sampling_logp_difference/max": 0.8273179531097412, "sampling/sampling_logp_difference/mean": 0.010128960944712162, "step": 549 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 512.0, "completions/max_terminated_length": 512.0, "completions/mean_length": 222.203125, "completions/mean_terminated_length": 222.203125, "completions/min_length": 69.0, "completions/min_terminated_length": 69.0, "entropy": 0.1685633361339569, "epoch": 0.9734513274336283, "frac_reward_zero_std": 0.75, "grad_norm": 1.61073986467059, "kl": 0.021445488557219505, "learning_rate": 9.390580129964035e-07, "loss": 0.0117, "num_tokens": 22173434.0, "reward": 0.40625, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.40625, "rewards/decision_reward_func/std": 0.9209855198860168, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9995536804199219, "sampling/importance_sampling_ratio/min": 0.2623375952243805, "sampling/sampling_logp_difference/max": 1.338123083114624, "sampling/sampling_logp_difference/mean": 0.013086425140500069, "step": 550 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1177.0, "completions/max_terminated_length": 1177.0, "completions/mean_length": 352.578125, "completions/mean_terminated_length": 352.578125, "completions/min_length": 71.0, "completions/min_terminated_length": 71.0, "entropy": 0.25274938344955444, "epoch": 0.9752212389380531, "frac_reward_zero_std": 0.5, "grad_norm": 1.349249325940574, "kl": 0.02821458876132965, "learning_rate": 9.386879987952549e-07, "loss": -0.0244, "num_tokens": 22206511.0, "reward": 0.375, "reward_std": 0.5, "rewards/decision_reward_func/mean": 0.375, "rewards/decision_reward_func/std": 0.934353232383728, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0000123977661133, "sampling/importance_sampling_ratio/min": 0.48661288619041443, "sampling/sampling_logp_difference/max": 0.7515044212341309, "sampling/sampling_logp_difference/mean": 0.014286357909440994, "step": 551 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1193.0, "completions/max_terminated_length": 1193.0, "completions/mean_length": 423.828125, "completions/mean_terminated_length": 423.828125, "completions/min_length": 171.0, "completions/min_terminated_length": 171.0, "entropy": 0.2747555375099182, "epoch": 0.9769911504424779, "frac_reward_zero_std": 0.75, "grad_norm": 0.8595936242044729, "kl": 0.015231309458613396, "learning_rate": 9.383169380596892e-07, "loss": -0.1027, "num_tokens": 22246116.0, "reward": 0.375, "reward_std": 0.22360679507255554, "rewards/decision_reward_func/mean": 0.375, "rewards/decision_reward_func/std": 0.934353232383728, "sampling/importance_sampling_ratio/max": 1.9050252437591553, "sampling/importance_sampling_ratio/mean": 0.9999040961265564, "sampling/importance_sampling_ratio/min": 0.4885192811489105, "sampling/sampling_logp_difference/max": 0.7163763046264648, "sampling/sampling_logp_difference/mean": 0.013840913772583008, "step": 552 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1185.0, "completions/max_terminated_length": 1185.0, "completions/mean_length": 386.984375, "completions/mean_terminated_length": 386.984375, "completions/min_length": 137.0, "completions/min_terminated_length": 137.0, "entropy": 0.20818987488746643, "epoch": 0.9787610619469026, "frac_reward_zero_std": 0.75, "grad_norm": 1.203933766091755, "kl": 0.013940658420324326, "learning_rate": 9.37944831674909e-07, "loss": 0.06, "num_tokens": 22280691.0, "reward": 0.4375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.4375, "rewards/decision_reward_func/std": 0.9063270092010498, "sampling/importance_sampling_ratio/max": 1.7040212154388428, "sampling/importance_sampling_ratio/mean": 0.9996476173400879, "sampling/importance_sampling_ratio/min": 0.40576204657554626, "sampling/sampling_logp_difference/max": 0.9019883871078491, "sampling/sampling_logp_difference/mean": 0.011979883536696434, "step": 553 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1756.0, "completions/max_terminated_length": 1756.0, "completions/mean_length": 495.078125, "completions/mean_terminated_length": 495.078125, "completions/min_length": 173.0, "completions/min_terminated_length": 173.0, "entropy": 0.13084039092063904, "epoch": 0.9805309734513274, "frac_reward_zero_std": 0.75, "grad_norm": 0.8629391584525886, "kl": 0.012989269569516182, "learning_rate": 9.37571680528612e-07, "loss": 0.042, "num_tokens": 22322328.0, "reward": 0.375, "reward_std": 0.22360679507255554, "rewards/decision_reward_func/mean": 0.375, "rewards/decision_reward_func/std": 0.934353232383728, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9997462630271912, "sampling/importance_sampling_ratio/min": 0.35328760743141174, "sampling/sampling_logp_difference/max": 1.0404728651046753, "sampling/sampling_logp_difference/mean": 0.008749352768063545, "step": 554 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2466.0, "completions/max_terminated_length": 2466.0, "completions/mean_length": 385.015625, "completions/mean_terminated_length": 385.015625, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "entropy": 0.25528523325920105, "epoch": 0.9823008849557522, "frac_reward_zero_std": 1.0, "grad_norm": 0.027092693044485958, "kl": 0.014969815500080585, "learning_rate": 9.371974855109874e-07, "loss": 0.0001, "num_tokens": 22359881.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0003429651260376, "sampling/importance_sampling_ratio/min": 0.4761839807033539, "sampling/sampling_logp_difference/max": 1.2099828720092773, "sampling/sampling_logp_difference/mean": 0.013863705098628998, "step": 555 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1583.0, "completions/max_terminated_length": 1583.0, "completions/mean_length": 432.296875, "completions/mean_terminated_length": 432.296875, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "entropy": 0.15575763583183289, "epoch": 0.984070796460177, "frac_reward_zero_std": 0.5, "grad_norm": 1.007159888085651, "kl": 0.0174567848443985, "learning_rate": 9.368222475147153e-07, "loss": 0.008, "num_tokens": 22397356.0, "reward": 0.28125, "reward_std": 0.42695626616477966, "rewards/decision_reward_func/mean": 0.28125, "rewards/decision_reward_func/std": 0.9672207236289978, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9997878074645996, "sampling/importance_sampling_ratio/min": 0.35683271288871765, "sampling/sampling_logp_difference/max": 1.1296796798706055, "sampling/sampling_logp_difference/mean": 0.00950632430613041, "step": 556 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1650.0, "completions/max_terminated_length": 1650.0, "completions/mean_length": 488.140625, "completions/mean_terminated_length": 488.140625, "completions/min_length": 232.0, "completions/min_terminated_length": 232.0, "entropy": 0.3266792893409729, "epoch": 0.9858407079646018, "frac_reward_zero_std": 0.5, "grad_norm": 1.2082352565110033, "kl": 0.018449243158102036, "learning_rate": 9.36445967434964e-07, "loss": -0.0484, "num_tokens": 22444149.0, "reward": 0.3125, "reward_std": 0.40311288833618164, "rewards/decision_reward_func/mean": 0.3125, "rewards/decision_reward_func/std": 0.9574271440505981, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.99976646900177, "sampling/importance_sampling_ratio/min": 0.5354207158088684, "sampling/sampling_logp_difference/max": 1.0001440048217773, "sampling/sampling_logp_difference/mean": 0.015226746909320354, "step": 557 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2758.0, "completions/max_terminated_length": 2758.0, "completions/mean_length": 431.5, "completions/mean_terminated_length": 431.5, "completions/min_length": 75.0, "completions/min_terminated_length": 75.0, "entropy": 0.17504915595054626, "epoch": 0.9876106194690265, "frac_reward_zero_std": 1.0, "grad_norm": 0.040228401911457955, "kl": 0.019448310136795044, "learning_rate": 9.360686461693872e-07, "loss": 0.0001, "num_tokens": 22482037.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.7710506916046143, "sampling/importance_sampling_ratio/mean": 1.0000145435333252, "sampling/importance_sampling_ratio/min": 0.35173001885414124, "sampling/sampling_logp_difference/max": 1.044891357421875, "sampling/sampling_logp_difference/mean": 0.012682083994150162, "step": 558 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 781.0, "completions/max_terminated_length": 781.0, "completions/mean_length": 274.1875, "completions/mean_terminated_length": 274.1875, "completions/min_length": 163.0, "completions/min_terminated_length": 163.0, "entropy": 0.29115474224090576, "epoch": 0.9893805309734514, "frac_reward_zero_std": 0.75, "grad_norm": 1.5566081718758167, "kl": 0.019338827580213547, "learning_rate": 9.356902846181228e-07, "loss": 0.0063, "num_tokens": 22511617.0, "reward": 0.90625, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.90625, "rewards/decision_reward_func/std": 0.42608407139778137, "sampling/importance_sampling_ratio/max": 1.632702350616455, "sampling/importance_sampling_ratio/mean": 0.9999765157699585, "sampling/importance_sampling_ratio/min": 0.474845290184021, "sampling/sampling_logp_difference/max": 0.7447662353515625, "sampling/sampling_logp_difference/mean": 0.016085583716630936, "step": 559 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1325.0, "completions/max_terminated_length": 1325.0, "completions/mean_length": 329.84375, "completions/mean_terminated_length": 329.84375, "completions/min_length": 78.0, "completions/min_terminated_length": 78.0, "entropy": 0.2929571866989136, "epoch": 0.9911504424778761, "frac_reward_zero_std": 0.75, "grad_norm": 1.1626840352605465, "kl": 0.01987423375248909, "learning_rate": 9.353108836837905e-07, "loss": 0.0227, "num_tokens": 22545399.0, "reward": 0.90625, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.90625, "rewards/decision_reward_func/std": 0.42608407139778137, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9999916553497314, "sampling/importance_sampling_ratio/min": 0.4704146087169647, "sampling/sampling_logp_difference/max": 0.7541408538818359, "sampling/sampling_logp_difference/mean": 0.015192048624157906, "step": 560 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2296.0, "completions/max_terminated_length": 2296.0, "completions/mean_length": 512.65625, "completions/mean_terminated_length": 512.65625, "completions/min_length": 73.0, "completions/min_terminated_length": 73.0, "entropy": 0.24373102188110352, "epoch": 0.9929203539823008, "frac_reward_zero_std": 0.75, "grad_norm": 0.8299025814775202, "kl": 0.016707338392734528, "learning_rate": 9.349304442714895e-07, "loss": 0.0517, "num_tokens": 22591169.0, "reward": 0.84375, "reward_std": 0.23935678601264954, "rewards/decision_reward_func/mean": 0.84375, "rewards/decision_reward_func/std": 0.5409794449806213, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0005568265914917, "sampling/importance_sampling_ratio/min": 0.13559646904468536, "sampling/sampling_logp_difference/max": 1.9980719089508057, "sampling/sampling_logp_difference/mean": 0.012626406736671925, "step": 561 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1231.0, "completions/max_terminated_length": 1231.0, "completions/mean_length": 257.78125, "completions/mean_terminated_length": 257.78125, "completions/min_length": 82.0, "completions/min_terminated_length": 82.0, "entropy": 0.19387544691562653, "epoch": 0.9946902654867257, "frac_reward_zero_std": 0.75, "grad_norm": 1.4669714257187554, "kl": 0.02043512836098671, "learning_rate": 9.345489672887962e-07, "loss": -0.1775, "num_tokens": 22617779.0, "reward": 0.125, "reward_std": 0.22360679507255554, "rewards/decision_reward_func/mean": 0.125, "rewards/decision_reward_func/std": 1.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.000129222869873, "sampling/importance_sampling_ratio/min": 0.19449982047080994, "sampling/sampling_logp_difference/max": 1.6373240947723389, "sampling/sampling_logp_difference/mean": 0.013507585972547531, "step": 562 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 814.0, "completions/max_terminated_length": 814.0, "completions/mean_length": 265.84375, "completions/mean_terminated_length": 265.84375, "completions/min_length": 83.0, "completions/min_terminated_length": 83.0, "entropy": 0.1400524079799652, "epoch": 0.9964601769911504, "frac_reward_zero_std": 1.0, "grad_norm": 0.06874304836488995, "kl": 0.025154393166303635, "learning_rate": 9.341664536457625e-07, "loss": 0.0002, "num_tokens": 22644553.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.9440027475357056, "sampling/importance_sampling_ratio/mean": 0.9996024370193481, "sampling/importance_sampling_ratio/min": 0.48195815086364746, "sampling/sampling_logp_difference/max": 0.7298979759216309, "sampling/sampling_logp_difference/mean": 0.011824633926153183, "step": 563 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1860.0, "completions/max_terminated_length": 1860.0, "completions/mean_length": 519.515625, "completions/mean_terminated_length": 519.515625, "completions/min_length": 168.0, "completions/min_terminated_length": 168.0, "entropy": 0.37813642621040344, "epoch": 0.9982300884955753, "frac_reward_zero_std": 0.0, "grad_norm": 1.5052711268358359, "kl": 0.01573239453136921, "learning_rate": 9.337829042549133e-07, "loss": 0.0227, "num_tokens": 22688586.0, "reward": 0.625, "reward_std": 0.7824782133102417, "rewards/decision_reward_func/mean": 0.625, "rewards/decision_reward_func/std": 0.7867957949638367, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0005089044570923, "sampling/importance_sampling_ratio/min": 0.41629907488822937, "sampling/sampling_logp_difference/max": 0.8763513565063477, "sampling/sampling_logp_difference/mean": 0.017795484513044357, "step": 564 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 5000.0, "completions/max_terminated_length": 965.0, "completions/mean_length": 415.390625, "completions/mean_terminated_length": 342.61907958984375, "completions/min_length": 64.0, "completions/min_terminated_length": 64.0, "entropy": 0.19897086918354034, "epoch": 1.0, "frac_reward_zero_std": 0.5, "grad_norm": 1.4051253626549454, "kl": 0.016793671995401382, "learning_rate": 9.33398320031244e-07, "loss": -0.0514, "num_tokens": 22724963.0, "reward": 0.0625, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.0625, "rewards/decision_reward_func/std": 1.0059348344802856, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0000076293945312, "sampling/importance_sampling_ratio/min": 0.40962710976600647, "sampling/sampling_logp_difference/max": 0.8925080299377441, "sampling/sampling_logp_difference/mean": 0.012449059635400772, "step": 565 } ], "logging_steps": 1, "max_steps": 2260, "num_input_tokens_seen": 22724963, "num_train_epochs": 4, "save_steps": 565, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 16, "trial_name": null, "trial_params": null }