{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.5625, "eval_steps": 500, "global_step": 250, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.6, "completions/max_length": 199.6, "completions/max_terminated_length": 134.0, "completions/mean_length": 171.9, "completions/mean_terminated_length": 122.36666870117188, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "entropy": 0.06970996516756714, "epoch": 0.03125, "frac_reward_zero_std": 0.6, "grad_norm": 3.630038261413574, "kl": 0.00014932112862879875, "learning_rate": 4.92e-06, "loss": 0.029165178537368774, "num_tokens": 15758.0, "reward": -0.31389998495578764, "reward_std": 0.2122000053524971, "rewards/reward_func/mean": -0.31389998495578764, "rewards/reward_func/std": 0.21219999492168426, "step": 5, "step_time": 14.728857926794444, "tools/call_frequency": 3.45, "tools/failure_frequency": 0.21573015451431274 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 168.6, "completions/max_terminated_length": 168.6, "completions/mean_length": 148.3, "completions/mean_terminated_length": 148.3, "completions/min_length": 129.6, "completions/min_terminated_length": 129.6, "entropy": 0.042718362715095284, "epoch": 0.0625, "frac_reward_zero_std": 0.2, "grad_norm": 3.325033187866211, "kl": 0.037860750965774057, "learning_rate": 4.8200000000000004e-06, "loss": -0.011221970617771148, "num_tokens": 31053.0, "reward": 0.2989000082015991, "reward_std": 0.4415143087506294, "rewards/reward_func/mean": 0.2989000082015991, "rewards/reward_func/std": 0.4415143221616745, "step": 10, "step_time": 9.975367512006779, "tools/call_frequency": 2.5, "tools/failure_frequency": 0.0 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 152.8, "completions/max_terminated_length": 152.8, "completions/mean_length": 131.3, "completions/mean_terminated_length": 131.3, "completions/min_length": 113.6, "completions/min_terminated_length": 113.6, "entropy": 0.016039706021547317, "epoch": 0.09375, "frac_reward_zero_std": 0.4, "grad_norm": 1.1289054155349731, "kl": 0.06640795171260834, "learning_rate": 4.7200000000000005e-06, "loss": 0.04752160608768463, "num_tokens": 45857.0, "reward": 1.1023000121116637, "reward_std": 0.4320605039596558, "rewards/reward_func/mean": 1.1023000121116637, "rewards/reward_func/std": 0.43206052780151366, "step": 15, "step_time": 8.620344271202338, "tools/call_frequency": 2.35, "tools/failure_frequency": 0.0 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1, "completions/max_length": 172.0, "completions/max_terminated_length": 172.0, "completions/mean_length": 136.55, "completions/mean_terminated_length": 137.2500030517578, "completions/min_length": 111.4, "completions/min_terminated_length": 111.4, "entropy": 0.027425602450966834, "epoch": 0.125, "frac_reward_zero_std": 0.6, "grad_norm": 0.8991426229476929, "kl": 0.09577701878733932, "learning_rate": 4.620000000000001e-06, "loss": -0.1201351523399353, "num_tokens": 60826.0, "reward": 0.7200000047683716, "reward_std": 0.3419178485870361, "rewards/reward_func/mean": 0.7200000047683716, "rewards/reward_func/std": 0.3419178485870361, "step": 20, "step_time": 11.403528443601681, "tools/call_frequency": 2.05, "tools/failure_frequency": 0.026666668057441712 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.35, "completions/max_length": 203.2, "completions/max_terminated_length": 161.2, "completions/mean_length": 188.6, "completions/mean_terminated_length": 150.6666687011719, "completions/min_length": 173.4, "completions/min_terminated_length": 139.6, "entropy": 0.033282498246990144, "epoch": 0.15625, "frac_reward_zero_std": 0.0, "grad_norm": 2.041987657546997, "kl": 0.051508421916514634, "learning_rate": 4.520000000000001e-06, "loss": 0.03198407888412476, "num_tokens": 76838.0, "reward": 1.2669333696365357, "reward_std": 0.3234894543886185, "rewards/reward_func/mean": 1.2669333696365357, "rewards/reward_func/std": 0.32348946332931516, "step": 25, "step_time": 13.736867211584467, "tools/call_frequency": 4.4, "tools/failure_frequency": 0.14583333432674409 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.65, "completions/max_length": 209.4, "completions/max_terminated_length": 160.8, "completions/mean_length": 192.45, "completions/mean_terminated_length": 148.3, "completions/min_length": 169.2, "completions/min_terminated_length": 135.8, "entropy": 0.04025774166220799, "epoch": 0.1875, "frac_reward_zero_std": 0.4, "grad_norm": 1.6383038759231567, "kl": 0.09242036554496735, "learning_rate": 4.42e-06, "loss": -0.03659022152423859, "num_tokens": 93054.0, "reward": 1.0333500146865844, "reward_std": 0.38981522917747496, "rewards/reward_func/mean": 1.0333500146865844, "rewards/reward_func/std": 0.389815217256546, "step": 30, "step_time": 14.735964270806289, "tools/call_frequency": 3.85, "tools/failure_frequency": 0.023529411852359773 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.7, "completions/max_length": 207.4, "completions/max_terminated_length": 168.8, "completions/mean_length": 196.4, "completions/mean_terminated_length": 166.7, "completions/min_length": 191.2, "completions/min_terminated_length": 164.6, "entropy": 0.02646293715806678, "epoch": 0.21875, "frac_reward_zero_std": 0.2, "grad_norm": 0.6842532157897949, "kl": 0.09354882184416055, "learning_rate": 4.32e-06, "loss": 0.014650090038776398, "num_tokens": 109141.0, "reward": 1.0134333491325378, "reward_std": 0.28623148798942566, "rewards/reward_func/mean": 1.0134333491325378, "rewards/reward_func/std": 0.2862314820289612, "step": 35, "step_time": 14.25194917320332, "tools/call_frequency": 3.95, "tools/failure_frequency": 0.0 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.7, "completions/max_length": 228.8, "completions/max_terminated_length": 163.6, "completions/mean_length": 207.8, "completions/mean_terminated_length": 162.3, "completions/min_length": 188.8, "completions/min_terminated_length": 161.0, "entropy": 0.049686831969302146, "epoch": 0.25, "frac_reward_zero_std": 0.4, "grad_norm": 2.386836528778076, "kl": 0.12552661653608083, "learning_rate": 4.22e-06, "loss": 0.023246073722839357, "num_tokens": 125712.0, "reward": 0.9764333426952362, "reward_std": 0.3545127585530281, "rewards/reward_func/mean": 0.9764333426952362, "rewards/reward_func/std": 0.35451277494430544, "step": 40, "step_time": 16.735324517198023, "tools/call_frequency": 3.45, "tools/failure_frequency": 0.02857142984867096 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.8, "completions/max_length": 231.4, "completions/max_terminated_length": 127.2, "completions/mean_length": 209.4, "completions/mean_terminated_length": 124.2, "completions/min_length": 187.2, "completions/min_terminated_length": 121.2, "entropy": 0.14096241008955984, "epoch": 0.28125, "frac_reward_zero_std": 0.2, "grad_norm": 5.072839260101318, "kl": 0.10897002797573804, "learning_rate": 4.12e-06, "loss": 0.05337468385696411, "num_tokens": 142131.0, "reward": 1.0291000008583069, "reward_std": 0.5297403573989868, "rewards/reward_func/mean": 1.0291000008583069, "rewards/reward_func/std": 0.5297403573989868, "step": 45, "step_time": 17.371078941601446, "tools/call_frequency": 3.4, "tools/failure_frequency": 0.01428571492433548 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.6, "completions/max_length": 237.2, "completions/max_terminated_length": 172.2, "completions/mean_length": 201.7, "completions/mean_terminated_length": 160.46666870117187, "completions/min_length": 186.0, "completions/min_terminated_length": 150.8, "entropy": 0.1540619947016239, "epoch": 0.3125, "frac_reward_zero_std": 0.0, "grad_norm": 5.2555952072143555, "kl": 0.17082785218954086, "learning_rate": 4.0200000000000005e-06, "loss": 0.06733548641204834, "num_tokens": 158431.0, "reward": 0.8427666783332824, "reward_std": 0.6860074520111084, "rewards/reward_func/mean": 0.8427666783332824, "rewards/reward_func/std": 0.6860074281692505, "step": 50, "step_time": 17.60776922639343, "tools/call_frequency": 3.6, "tools/failure_frequency": 0.027619048953056335 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.55, "completions/max_length": 209.2, "completions/max_terminated_length": 209.2, "completions/mean_length": 198.0, "completions/mean_terminated_length": 206.43333435058594, "completions/min_length": 189.4, "completions/min_terminated_length": 203.8, "entropy": 0.11417091116309167, "epoch": 0.34375, "frac_reward_zero_std": 0.8, "grad_norm": 0.1612984985113144, "kl": 0.14481508396565915, "learning_rate": 3.920000000000001e-06, "loss": -0.0013940947130322457, "num_tokens": 174665.0, "reward": 1.337833333015442, "reward_std": 0.04058598577976227, "rewards/reward_func/mean": 1.337833333015442, "rewards/reward_func/std": 0.04058598577976227, "step": 55, "step_time": 13.894916865596315, "tools/call_frequency": 3.9, "tools/failure_frequency": 0.0 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.35, "completions/max_length": 210.6, "completions/max_terminated_length": 210.0, "completions/mean_length": 189.7, "completions/mean_terminated_length": 184.93333435058594, "completions/min_length": 152.0, "completions/min_terminated_length": 153.4, "entropy": 0.18207021439447998, "epoch": 0.375, "frac_reward_zero_std": 0.2, "grad_norm": 7.57163667678833, "kl": 0.2769763808697462, "learning_rate": 3.820000000000001e-06, "loss": -0.08738029599189759, "num_tokens": 190974.0, "reward": 0.9539999723434448, "reward_std": 0.24900673925876618, "rewards/reward_func/mean": 0.9539999723434448, "rewards/reward_func/std": 0.2490067459642887, "step": 60, "step_time": 13.735741792595945, "tools/call_frequency": 3.35, "tools/failure_frequency": 0.0 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.55, "completions/max_length": 213.6, "completions/max_terminated_length": 186.6, "completions/mean_length": 196.15, "completions/mean_terminated_length": 181.2, "completions/min_length": 173.2, "completions/min_terminated_length": 177.0, "entropy": 0.18931779703125357, "epoch": 0.40625, "frac_reward_zero_std": 0.2, "grad_norm": 0.3368631601333618, "kl": 0.19928277991712093, "learning_rate": 3.7200000000000004e-06, "loss": -0.03082091510295868, "num_tokens": 207221.0, "reward": 1.1948333382606506, "reward_std": 0.3531351625919342, "rewards/reward_func/mean": 1.1948333382606506, "rewards/reward_func/std": 0.3531351566314697, "step": 65, "step_time": 14.853071747999639, "tools/call_frequency": 3.45, "tools/failure_frequency": 0.0 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.45, "completions/max_length": 232.4, "completions/max_terminated_length": 231.4, "completions/mean_length": 214.8, "completions/mean_terminated_length": 221.10000305175782, "completions/min_length": 199.4, "completions/min_terminated_length": 211.6, "entropy": 0.20331259737722576, "epoch": 0.4375, "frac_reward_zero_std": 0.6, "grad_norm": 3.155299663543701, "kl": 0.21616111248731612, "learning_rate": 3.62e-06, "loss": -0.014388753473758698, "num_tokens": 223949.0, "reward": 1.187999999523163, "reward_std": 0.06400000005960464, "rewards/reward_func/mean": 1.187999999523163, "rewards/reward_func/std": 0.06399999856948853, "step": 70, "step_time": 15.7972018689994, "tools/call_frequency": 3.3, "tools/failure_frequency": 0.0 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.75, "completions/max_length": 228.0, "completions/max_terminated_length": 180.2, "completions/mean_length": 217.2, "completions/mean_terminated_length": 174.9, "completions/min_length": 208.2, "completions/min_terminated_length": 169.6, "entropy": 0.09874274502508343, "epoch": 0.46875, "frac_reward_zero_std": 0.4, "grad_norm": 0.1496252417564392, "kl": 0.19251887053251265, "learning_rate": 3.52e-06, "loss": 0.0129203662276268, "num_tokens": 240663.0, "reward": 1.166100013256073, "reward_std": 0.27513332962989806, "rewards/reward_func/mean": 1.166100013256073, "rewards/reward_func/std": 0.275133341550827, "step": 75, "step_time": 15.653593644002104, "tools/call_frequency": 3.15, "tools/failure_frequency": 0.0 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4, "completions/max_length": 250.6, "completions/max_terminated_length": 235.4, "completions/mean_length": 217.2, "completions/mean_terminated_length": 209.23333740234375, "completions/min_length": 169.8, "completions/min_terminated_length": 174.0, "entropy": 0.18624852728098631, "epoch": 0.5, "frac_reward_zero_std": 0.0, "grad_norm": 3.9008662700653076, "kl": 0.19779104925692081, "learning_rate": 3.4200000000000007e-06, "loss": -0.060715597867965695, "num_tokens": 257232.0, "reward": 1.094600009918213, "reward_std": 0.533681058883667, "rewards/reward_func/mean": 1.094600009918213, "rewards/reward_func/std": 0.5336810708045959, "step": 80, "step_time": 16.87674882839783, "tools/call_frequency": 2.7, "tools/failure_frequency": 0.0 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.8, "completions/max_length": 263.2, "completions/max_terminated_length": 149.0, "completions/mean_length": 238.9, "completions/mean_terminated_length": 138.2, "completions/min_length": 220.0, "completions/min_terminated_length": 127.4, "entropy": 0.06761846686713398, "epoch": 0.53125, "frac_reward_zero_std": 0.6, "grad_norm": 0.0583312027156353, "kl": 0.16298045124858618, "learning_rate": 3.3200000000000004e-06, "loss": 0.0317715585231781, "num_tokens": 274377.0, "reward": 1.168333351612091, "reward_std": 0.21399999260902405, "rewards/reward_func/mean": 1.168333351612091, "rewards/reward_func/std": 0.214000004529953, "step": 85, "step_time": 19.253501980405417, "tools/call_frequency": 2.45, "tools/failure_frequency": 0.0 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9, "completions/max_length": 252.0, "completions/max_terminated_length": 79.6, "completions/mean_length": 229.35, "completions/mean_terminated_length": 79.6, "completions/min_length": 212.6, "completions/min_terminated_length": 79.6, "entropy": 0.04304317501373589, "epoch": 0.5625, "frac_reward_zero_std": 0.4, "grad_norm": 0.05765737593173981, "kl": 0.1589741975069046, "learning_rate": 3.2200000000000005e-06, "loss": -0.009884151071310044, "num_tokens": 291640.0, "reward": 1.0771000266075135, "reward_std": 0.2571271777153015, "rewards/reward_func/mean": 1.0771000266075135, "rewards/reward_func/std": 0.257127183675766, "step": 90, "step_time": 19.810263851404308, "tools/call_frequency": 2.7, "tools/failure_frequency": 0.0 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.75, "completions/max_length": 252.2, "completions/max_terminated_length": 155.6, "completions/mean_length": 227.45, "completions/mean_terminated_length": 154.7, "completions/min_length": 202.6, "completions/min_terminated_length": 153.8, "entropy": 0.03856636304408312, "epoch": 0.59375, "frac_reward_zero_std": 0.2, "grad_norm": 2.2899415493011475, "kl": 0.18391469195485116, "learning_rate": 3.12e-06, "loss": 0.012278559803962707, "num_tokens": 308671.0, "reward": 0.9493666887283325, "reward_std": 0.3057107627391815, "rewards/reward_func/mean": 0.9493666887283325, "rewards/reward_func/std": 0.3057107746601105, "step": 95, "step_time": 18.270148772597896, "tools/call_frequency": 2.75, "tools/failure_frequency": 0.0 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.65, "completions/max_length": 229.0, "completions/max_terminated_length": 128.6, "completions/mean_length": 210.05, "completions/mean_terminated_length": 121.23333435058593, "completions/min_length": 191.4, "completions/min_terminated_length": 114.6, "entropy": 0.03718785918317735, "epoch": 0.625, "frac_reward_zero_std": 0.2, "grad_norm": 1.4016427993774414, "kl": 0.19276840873062612, "learning_rate": 3.0200000000000003e-06, "loss": -0.02043401300907135, "num_tokens": 325246.0, "reward": 0.9758000135421753, "reward_std": 0.439729905128479, "rewards/reward_func/mean": 0.9758000135421753, "rewards/reward_func/std": 0.439729905128479, "step": 100, "step_time": 16.536685503809714, "tools/call_frequency": 3.4, "tools/failure_frequency": 0.0 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.45, "completions/max_length": 231.0, "completions/max_terminated_length": 175.6, "completions/mean_length": 209.7, "completions/mean_terminated_length": 164.73333435058595, "completions/min_length": 197.2, "completions/min_terminated_length": 156.2, "entropy": 0.0890876273624599, "epoch": 0.65625, "frac_reward_zero_std": 0.6, "grad_norm": 1.857412338256836, "kl": 0.20793221928179265, "learning_rate": 2.92e-06, "loss": 0.010671529173851012, "num_tokens": 341743.0, "reward": 1.244200015068054, "reward_std": 0.25437753796577456, "rewards/reward_func/mean": 1.244200015068054, "rewards/reward_func/std": 0.25437754988670347, "step": 105, "step_time": 14.550393618003, "tools/call_frequency": 3.4, "tools/failure_frequency": 0.0 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.6, "completions/max_length": 228.0, "completions/max_terminated_length": 215.0, "completions/mean_length": 211.05, "completions/mean_terminated_length": 204.6, "completions/min_length": 194.2, "completions/min_terminated_length": 194.2, "entropy": 0.09650332322344184, "epoch": 0.6875, "frac_reward_zero_std": 0.0, "grad_norm": 0.5915409922599792, "kl": 0.1943995427340269, "learning_rate": 2.82e-06, "loss": -0.007803649455308914, "num_tokens": 358474.0, "reward": 0.9213667035102844, "reward_std": 0.48010437488555907, "rewards/reward_func/mean": 0.9213667035102844, "rewards/reward_func/std": 0.480104398727417, "step": 110, "step_time": 15.29034832160105, "tools/call_frequency": 3.3, "tools/failure_frequency": 0.0 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5, "completions/max_length": 244.8, "completions/max_terminated_length": 244.8, "completions/mean_length": 224.4, "completions/mean_terminated_length": 232.93333435058594, "completions/min_length": 204.4, "completions/min_terminated_length": 219.8, "entropy": 0.06257005939260125, "epoch": 0.71875, "frac_reward_zero_std": 0.2, "grad_norm": 0.09643584489822388, "kl": 0.18671961799263953, "learning_rate": 2.7200000000000002e-06, "loss": 0.0009367348626255989, "num_tokens": 375512.0, "reward": 0.9198000192642212, "reward_std": 0.41239041090011597, "rewards/reward_func/mean": 0.9198000192642212, "rewards/reward_func/std": 0.41239042282104493, "step": 115, "step_time": 16.68962257000094, "tools/call_frequency": 3.05, "tools/failure_frequency": 0.0 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5, "completions/max_length": 238.4, "completions/max_terminated_length": 194.2, "completions/mean_length": 221.6, "completions/mean_terminated_length": 185.86666870117188, "completions/min_length": 202.6, "completions/min_terminated_length": 174.8, "entropy": 0.19831047160550952, "epoch": 0.75, "frac_reward_zero_std": 0.4, "grad_norm": 0.06480103731155396, "kl": 0.2127195455133915, "learning_rate": 2.6200000000000003e-06, "loss": -0.002893347479403019, "num_tokens": 392259.0, "reward": 1.1177000164985658, "reward_std": 0.34459384679794314, "rewards/reward_func/mean": 1.1177000164985658, "rewards/reward_func/std": 0.34459385871887205, "step": 120, "step_time": 15.74592421480629, "tools/call_frequency": 3.1, "tools/failure_frequency": 0.0 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.55, "completions/max_length": 249.6, "completions/max_terminated_length": 248.4, "completions/mean_length": 225.9, "completions/mean_terminated_length": 238.83333435058594, "completions/min_length": 207.6, "completions/min_terminated_length": 229.0, "entropy": 0.12472135615535081, "epoch": 0.78125, "frac_reward_zero_std": 0.2, "grad_norm": 1.3778189420700073, "kl": 0.22096077986061574, "learning_rate": 2.52e-06, "loss": 0.018771570920944215, "num_tokens": 409108.0, "reward": 0.6021333426237107, "reward_std": 0.6133833765983582, "rewards/reward_func/mean": 0.6021333426237107, "rewards/reward_func/std": 0.6133833885192871, "step": 125, "step_time": 17.135429813191877, "tools/call_frequency": 3.0, "tools/failure_frequency": 0.0 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.6, "completions/max_length": 242.2, "completions/max_terminated_length": 214.8, "completions/mean_length": 219.0, "completions/mean_terminated_length": 209.9, "completions/min_length": 193.2, "completions/min_terminated_length": 203.6, "entropy": 0.15268438071943818, "epoch": 0.8125, "frac_reward_zero_std": 0.2, "grad_norm": 2.5108072757720947, "kl": 0.2319534882903099, "learning_rate": 2.42e-06, "loss": -0.013115590810775757, "num_tokens": 425748.0, "reward": 0.9733999967575073, "reward_std": 0.6780932426452637, "rewards/reward_func/mean": 0.9733999967575073, "rewards/reward_func/std": 0.6780932545661926, "step": 130, "step_time": 16.86904642219888, "tools/call_frequency": 2.9, "tools/failure_frequency": 0.0 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.55, "completions/max_length": 238.2, "completions/max_terminated_length": 194.0, "completions/mean_length": 225.05, "completions/mean_terminated_length": 187.35, "completions/min_length": 212.8, "completions/min_terminated_length": 179.6, "entropy": 0.11236475300975143, "epoch": 0.84375, "frac_reward_zero_std": 0.6, "grad_norm": 2.316483736038208, "kl": 0.18593905940651895, "learning_rate": 2.3200000000000002e-06, "loss": 0.003791916370391846, "num_tokens": 442384.0, "reward": 1.3600000143051147, "reward_std": 0.12066323161125184, "rewards/reward_func/mean": 1.3600000143051147, "rewards/reward_func/std": 0.12066323161125184, "step": 135, "step_time": 16.34641739600629, "tools/call_frequency": 3.05, "tools/failure_frequency": 0.0 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.65, "completions/max_length": 242.6, "completions/max_terminated_length": 192.6, "completions/mean_length": 225.85, "completions/mean_terminated_length": 187.1, "completions/min_length": 212.6, "completions/min_terminated_length": 181.6, "entropy": 0.29790011094883084, "epoch": 0.875, "frac_reward_zero_std": 0.2, "grad_norm": 4.7559309005737305, "kl": 0.25753427743911744, "learning_rate": 2.2200000000000003e-06, "loss": 0.003042304515838623, "num_tokens": 459098.0, "reward": 0.8764000177383423, "reward_std": 0.408263224363327, "rewards/reward_func/mean": 0.8764000177383423, "rewards/reward_func/std": 0.408263236284256, "step": 140, "step_time": 17.673848899203584, "tools/call_frequency": 2.9, "tools/failure_frequency": 0.0 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.75, "completions/max_length": 234.8, "completions/max_terminated_length": 140.4, "completions/mean_length": 218.5, "completions/mean_terminated_length": 135.4, "completions/min_length": 204.4, "completions/min_terminated_length": 130.4, "entropy": 0.49127169298008083, "epoch": 0.90625, "frac_reward_zero_std": 0.6, "grad_norm": 0.07470813393592834, "kl": 0.2544811189174652, "learning_rate": 2.12e-06, "loss": 0.011104442924261094, "num_tokens": 475747.0, "reward": 1.2128000020980836, "reward_std": 0.3457141280174255, "rewards/reward_func/mean": 1.2128000020980836, "rewards/reward_func/std": 0.3457141280174255, "step": 145, "step_time": 17.512910800203098, "tools/call_frequency": 3.1, "tools/failure_frequency": 0.01428571492433548 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.6, "completions/max_length": 238.4, "completions/max_terminated_length": 187.0, "completions/mean_length": 219.55, "completions/mean_terminated_length": 176.0, "completions/min_length": 203.0, "completions/min_terminated_length": 163.4, "entropy": 0.3977989808190614, "epoch": 0.9375, "frac_reward_zero_std": 0.2, "grad_norm": 1.4591357707977295, "kl": 0.26335713379085063, "learning_rate": 2.02e-06, "loss": 0.013294479250907898, "num_tokens": 492437.0, "reward": 0.9657333374023438, "reward_std": 0.5567267656326294, "rewards/reward_func/mean": 0.9657333374023438, "rewards/reward_func/std": 0.5567267417907715, "step": 150, "step_time": 17.259096857812256, "tools/call_frequency": 3.15, "tools/failure_frequency": 0.0 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.45, "completions/max_length": 232.2, "completions/max_terminated_length": 222.4, "completions/mean_length": 209.05, "completions/mean_terminated_length": 207.03333435058593, "completions/min_length": 176.2, "completions/min_terminated_length": 193.2, "entropy": 0.5087961174082011, "epoch": 0.96875, "frac_reward_zero_std": 0.4, "grad_norm": 3.283621311187744, "kl": 0.28082513697445394, "learning_rate": 1.9200000000000003e-06, "loss": -0.025146692991256714, "num_tokens": 508782.0, "reward": 1.0452000081539154, "reward_std": 0.44076991081237793, "rewards/reward_func/mean": 1.0452000081539154, "rewards/reward_func/std": 0.4407699227333069, "step": 155, "step_time": 16.901877696395967, "tools/call_frequency": 3.05, "tools/failure_frequency": 0.0 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.75, "completions/max_length": 237.6, "completions/max_terminated_length": 182.8, "completions/mean_length": 219.25, "completions/mean_terminated_length": 182.6, "completions/min_length": 204.8, "completions/min_terminated_length": 182.4, "entropy": 0.6035067357588559, "epoch": 1.0, "frac_reward_zero_std": 0.2, "grad_norm": 3.8756589889526367, "kl": 0.27896949015557765, "learning_rate": 1.8200000000000002e-06, "loss": 0.018515169620513916, "num_tokens": 525445.0, "reward": 0.8603333592414856, "reward_std": 0.4929263710975647, "rewards/reward_func/mean": 0.8603333592414856, "rewards/reward_func/std": 0.4929263830184937, "step": 160, "step_time": 18.22537782279833, "tools/call_frequency": 3.05, "tools/failure_frequency": 0.0 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.8, "completions/max_length": 226.4, "completions/max_terminated_length": 132.6, "completions/mean_length": 208.4, "completions/mean_terminated_length": 113.8, "completions/min_length": 178.0, "completions/min_terminated_length": 95.0, "entropy": 0.81973907337524, "epoch": 1.03125, "frac_reward_zero_std": 0.4, "grad_norm": 0.23878704011440277, "kl": 0.28968340829014777, "learning_rate": 1.72e-06, "loss": -0.01732647567987442, "num_tokens": 541725.0, "reward": 0.801800012588501, "reward_std": 0.3781105220317841, "rewards/reward_func/mean": 0.801800012588501, "rewards/reward_func/std": 0.3781105220317841, "step": 165, "step_time": 17.06264049640158, "tools/call_frequency": 3.0, "tools/failure_frequency": 0.0 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.7, "completions/max_length": 234.2, "completions/max_terminated_length": 156.0, "completions/mean_length": 210.4, "completions/mean_terminated_length": 148.5, "completions/min_length": 176.8, "completions/min_terminated_length": 141.0, "entropy": 0.689805658115074, "epoch": 1.0625, "frac_reward_zero_std": 0.2, "grad_norm": 4.456592082977295, "kl": 0.23804183304309845, "learning_rate": 1.6200000000000002e-06, "loss": -0.05976427793502807, "num_tokens": 558245.0, "reward": 0.8562000155448913, "reward_std": 0.633456540107727, "rewards/reward_func/mean": 0.8562000155448913, "rewards/reward_func/std": 0.633456540107727, "step": 170, "step_time": 18.225926614992204, "tools/call_frequency": 3.05, "tools/failure_frequency": 0.018181818723678588 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.6, "completions/max_length": 226.2, "completions/max_terminated_length": 168.2, "completions/mean_length": 210.85, "completions/mean_terminated_length": 160.45, "completions/min_length": 196.2, "completions/min_terminated_length": 155.4, "entropy": 0.5438663156237453, "epoch": 1.09375, "frac_reward_zero_std": 0.2, "grad_norm": 4.7369794845581055, "kl": 0.27658827155828475, "learning_rate": 1.52e-06, "loss": 0.017269280552864075, "num_tokens": 574485.0, "reward": 1.1061000227928162, "reward_std": 0.29246323108673095, "rewards/reward_func/mean": 1.1061000227928162, "rewards/reward_func/std": 0.29246323108673095, "step": 175, "step_time": 15.657095247798134, "tools/call_frequency": 3.25, "tools/failure_frequency": 0.0 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.7, "completions/max_length": 227.6, "completions/max_terminated_length": 153.8, "completions/mean_length": 210.05, "completions/mean_terminated_length": 150.3, "completions/min_length": 182.4, "completions/min_terminated_length": 146.8, "entropy": 0.45502517083659766, "epoch": 1.125, "frac_reward_zero_std": 0.2, "grad_norm": 2.233670473098755, "kl": 0.24543451368808747, "learning_rate": 1.42e-06, "loss": -0.03468523025512695, "num_tokens": 591329.0, "reward": 0.9866333484649659, "reward_std": 0.43823108077049255, "rewards/reward_func/mean": 0.9866333484649659, "rewards/reward_func/std": 0.43823106288909913, "step": 180, "step_time": 16.63430393260496, "tools/call_frequency": 3.05, "tools/failure_frequency": 0.018181818723678588 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.45, "completions/max_length": 238.8, "completions/max_terminated_length": 235.8, "completions/mean_length": 214.35, "completions/mean_terminated_length": 215.93333740234374, "completions/min_length": 179.6, "completions/min_terminated_length": 192.4, "entropy": 0.3232035285793245, "epoch": 1.15625, "frac_reward_zero_std": 0.2, "grad_norm": 0.042836207896471024, "kl": 0.2335926942527294, "learning_rate": 1.32e-06, "loss": -0.053522664308547976, "num_tokens": 607959.0, "reward": 1.1267000198364259, "reward_std": 0.5779333353042603, "rewards/reward_func/mean": 1.1267000198364259, "rewards/reward_func/std": 0.5779333412647247, "step": 185, "step_time": 16.85379633680277, "tools/call_frequency": 3.05, "tools/failure_frequency": 0.0 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.55, "completions/max_length": 234.8, "completions/max_terminated_length": 162.2, "completions/mean_length": 214.2, "completions/mean_terminated_length": 153.75, "completions/min_length": 188.8, "completions/min_terminated_length": 147.4, "entropy": 0.6248208525590598, "epoch": 1.1875, "frac_reward_zero_std": 0.0, "grad_norm": 3.5707623958587646, "kl": 0.26286336220800877, "learning_rate": 1.2200000000000002e-06, "loss": -0.04159345626831055, "num_tokens": 624606.0, "reward": 0.7690666794776917, "reward_std": 0.5251105308532715, "rewards/reward_func/mean": 0.7690666794776917, "rewards/reward_func/std": 0.5251105427742004, "step": 190, "step_time": 15.895430696196854, "tools/call_frequency": 2.95, "tools/failure_frequency": 0.0 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.75, "completions/max_length": 231.2, "completions/max_terminated_length": 140.8, "completions/mean_length": 212.5, "completions/mean_terminated_length": 130.53333435058593, "completions/min_length": 182.6, "completions/min_terminated_length": 113.6, "entropy": 1.0036215352360158, "epoch": 1.21875, "frac_reward_zero_std": 0.4, "grad_norm": 0.07126332074403763, "kl": 0.2945852160453796, "learning_rate": 1.12e-06, "loss": 0.08146535158157349, "num_tokens": 641178.0, "reward": 0.7420000076293946, "reward_std": 0.2832352787256241, "rewards/reward_func/mean": 0.7420000076293946, "rewards/reward_func/std": 0.28323529064655306, "step": 195, "step_time": 16.828464100402197, "tools/call_frequency": 2.9, "tools/failure_frequency": 0.0 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.8, "completions/max_length": 228.2, "completions/max_terminated_length": 139.8, "completions/mean_length": 217.85, "completions/mean_terminated_length": 138.8, "completions/min_length": 205.4, "completions/min_terminated_length": 137.8, "entropy": 0.7422479507047683, "epoch": 1.25, "frac_reward_zero_std": 0.0, "grad_norm": 4.2798051834106445, "kl": 0.3032657243311405, "learning_rate": 1.02e-06, "loss": 0.017710180580616, "num_tokens": 657673.0, "reward": 0.8391000092029571, "reward_std": 0.7726872444152832, "rewards/reward_func/mean": 0.8391000092029571, "rewards/reward_func/std": 0.7726872682571411, "step": 200, "step_time": 17.576660793198972, "tools/call_frequency": 3.05, "tools/failure_frequency": 0.0 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.75, "completions/max_length": 221.8, "completions/max_terminated_length": 146.4, "completions/mean_length": 207.7, "completions/mean_terminated_length": 143.7, "completions/min_length": 176.2, "completions/min_terminated_length": 141.0, "entropy": 0.28722639731131494, "epoch": 1.28125, "frac_reward_zero_std": 0.8, "grad_norm": 0.05100347474217415, "kl": 0.23604922145605087, "learning_rate": 9.200000000000001e-07, "loss": -0.04504288733005524, "num_tokens": 673877.0, "reward": 1.34099999666214, "reward_std": 0.09399999976158142, "rewards/reward_func/mean": 1.34099999666214, "rewards/reward_func/std": 0.09399999976158142, "step": 205, "step_time": 17.11068033759657, "tools/call_frequency": 2.95, "tools/failure_frequency": 0.0 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.75, "completions/max_length": 238.2, "completions/max_terminated_length": 176.8, "completions/mean_length": 221.0, "completions/mean_terminated_length": 175.9, "completions/min_length": 206.6, "completions/min_terminated_length": 175.0, "entropy": 0.7828503699507564, "epoch": 1.3125, "frac_reward_zero_std": 0.4, "grad_norm": 3.9798154830932617, "kl": 0.2740111470222473, "learning_rate": 8.200000000000001e-07, "loss": 0.008357369899749756, "num_tokens": 690789.0, "reward": 1.0448000192642213, "reward_std": 0.33592903017997744, "rewards/reward_func/mean": 1.0448000192642213, "rewards/reward_func/std": 0.33592904210090635, "step": 210, "step_time": 16.50285193720192, "tools/call_frequency": 3.0, "tools/failure_frequency": 0.0 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.75, "completions/max_length": 234.6, "completions/max_terminated_length": 103.8, "completions/mean_length": 204.35, "completions/mean_terminated_length": 91.8, "completions/min_length": 156.4, "completions/min_terminated_length": 79.8, "entropy": 0.8003528213594109, "epoch": 1.34375, "frac_reward_zero_std": 0.0, "grad_norm": 4.159558296203613, "kl": 0.2852329473942518, "learning_rate": 7.2e-07, "loss": -0.08619436025619506, "num_tokens": 707222.0, "reward": 1.0130333423614502, "reward_std": 0.6909683525562287, "rewards/reward_func/mean": 1.0130333423614502, "rewards/reward_func/std": 0.6909683525562287, "step": 215, "step_time": 17.172416884609266, "tools/call_frequency": 2.75, "tools/failure_frequency": 0.0 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.7, "completions/max_length": 227.6, "completions/max_terminated_length": 194.0, "completions/mean_length": 210.55, "completions/mean_terminated_length": 191.4, "completions/min_length": 179.4, "completions/min_terminated_length": 188.8, "entropy": 0.5576354823075235, "epoch": 1.375, "frac_reward_zero_std": 0.4, "grad_norm": 3.594050168991089, "kl": 0.278278523683548, "learning_rate": 6.200000000000001e-07, "loss": -0.04620848298072815, "num_tokens": 723693.0, "reward": 1.0568666815757752, "reward_std": 0.4007272839546204, "rewards/reward_func/mean": 1.0568666815757752, "rewards/reward_func/std": 0.4007272839546204, "step": 220, "step_time": 16.7619311846036, "tools/call_frequency": 2.9, "tools/failure_frequency": 0.01666666716337204 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.45, "completions/max_length": 238.8, "completions/max_terminated_length": 201.4, "completions/mean_length": 202.6, "completions/mean_terminated_length": 174.96666870117187, "completions/min_length": 148.0, "completions/min_terminated_length": 148.0, "entropy": 0.8566485311836004, "epoch": 1.40625, "frac_reward_zero_std": 0.0, "grad_norm": 4.667319297790527, "kl": 0.2987564954906702, "learning_rate": 5.2e-07, "loss": -0.07293931245803834, "num_tokens": 740154.0, "reward": 0.9942000150680542, "reward_std": 0.6604385733604431, "rewards/reward_func/mean": 0.9942000150680542, "rewards/reward_func/std": 0.6604385554790497, "step": 225, "step_time": 16.59093914159166, "tools/call_frequency": 2.9, "tools/failure_frequency": 0.0 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.8, "completions/max_length": 230.2, "completions/max_terminated_length": 118.0, "completions/mean_length": 205.65, "completions/mean_terminated_length": 118.0, "completions/min_length": 155.0, "completions/min_terminated_length": 118.0, "entropy": 0.6852329360786825, "epoch": 1.4375, "frac_reward_zero_std": 0.6, "grad_norm": 0.13374075293540955, "kl": 0.27822317034006117, "learning_rate": 4.2000000000000006e-07, "loss": -0.07140348553657531, "num_tokens": 756746.0, "reward": 1.137933337688446, "reward_std": 0.24845077395439147, "rewards/reward_func/mean": 1.137933337688446, "rewards/reward_func/std": 0.24845077395439147, "step": 230, "step_time": 16.51037070681341, "tools/call_frequency": 2.65, "tools/failure_frequency": 0.0 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5, "completions/max_length": 229.8, "completions/max_terminated_length": 215.8, "completions/mean_length": 214.75, "completions/mean_terminated_length": 210.7, "completions/min_length": 204.4, "completions/min_terminated_length": 206.6, "entropy": 0.8697535959538072, "epoch": 1.46875, "frac_reward_zero_std": 0.2, "grad_norm": 2.334895133972168, "kl": 0.2648452676832676, "learning_rate": 3.2e-07, "loss": 0.014858978986740112, "num_tokens": 773619.0, "reward": 0.7043333411216736, "reward_std": 0.402424693107605, "rewards/reward_func/mean": 0.7043333411216736, "rewards/reward_func/std": 0.402424693107605, "step": 235, "step_time": 16.278191728607634, "tools/call_frequency": 3.2, "tools/failure_frequency": 0.0 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.7, "completions/max_length": 234.0, "completions/max_terminated_length": 148.6, "completions/mean_length": 207.45, "completions/mean_terminated_length": 142.8, "completions/min_length": 165.0, "completions/min_terminated_length": 137.0, "entropy": 1.037268871301785, "epoch": 1.5, "frac_reward_zero_std": 0.0, "grad_norm": 5.430893421173096, "kl": 0.2893723286688328, "learning_rate": 2.2e-07, "loss": -0.0661659598350525, "num_tokens": 789973.0, "reward": 0.7955000281333924, "reward_std": 0.6784387767314911, "rewards/reward_func/mean": 0.7955000281333924, "rewards/reward_func/std": 0.6784387946128845, "step": 240, "step_time": 17.391802603405086, "tools/call_frequency": 3.05, "tools/failure_frequency": 0.0 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.8, "completions/max_length": 231.2, "completions/max_terminated_length": 138.6, "completions/mean_length": 219.2, "completions/mean_terminated_length": 136.3, "completions/min_length": 211.2, "completions/min_terminated_length": 134.0, "entropy": 0.47988032912835477, "epoch": 1.53125, "frac_reward_zero_std": 0.6, "grad_norm": 0.18447190523147583, "kl": 0.23510150127112867, "learning_rate": 1.2000000000000002e-07, "loss": 0.005726324021816253, "num_tokens": 806865.0, "reward": 1.1196000099182128, "reward_std": 0.15212990045547486, "rewards/reward_func/mean": 1.1196000099182128, "rewards/reward_func/std": 0.15212990045547486, "step": 245, "step_time": 17.48798955640523, "tools/call_frequency": 3.0, "tools/failure_frequency": 0.0 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.7, "completions/max_length": 224.4, "completions/max_terminated_length": 170.0, "completions/mean_length": 210.15, "completions/mean_terminated_length": 167.0, "completions/min_length": 194.4, "completions/min_terminated_length": 164.0, "entropy": 0.7179721655789763, "epoch": 1.5625, "frac_reward_zero_std": 0.2, "grad_norm": 4.354219436645508, "kl": 0.26825664229691026, "learning_rate": 2e-08, "loss": 0.0007836699485778808, "num_tokens": 823319.0, "reward": 0.9574000000953674, "reward_std": 0.6371999979019165, "rewards/reward_func/mean": 0.9574000000953674, "rewards/reward_func/std": 0.6371999979019165, "step": 250, "step_time": 17.161368461008532, "tools/call_frequency": 3.3, "tools/failure_frequency": 0.0 } ], "logging_steps": 5, "max_steps": 250, "num_input_tokens_seen": 823319, "num_train_epochs": 2, "save_steps": 125, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }