{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.78125, "eval_steps": 500, "global_step": 125, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.6, "completions/max_length": 199.6, "completions/max_terminated_length": 134.0, "completions/mean_length": 171.9, "completions/mean_terminated_length": 122.36666870117188, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "entropy": 0.06970996516756714, "epoch": 0.03125, "frac_reward_zero_std": 0.6, "grad_norm": 3.630038261413574, "kl": 0.00014932112862879875, "learning_rate": 4.92e-06, "loss": 0.029165178537368774, "num_tokens": 15758.0, "reward": -0.31389998495578764, "reward_std": 0.2122000053524971, "rewards/reward_func/mean": -0.31389998495578764, "rewards/reward_func/std": 0.21219999492168426, "step": 5, "step_time": 14.728857926794444, "tools/call_frequency": 3.45, "tools/failure_frequency": 0.21573015451431274 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 168.6, "completions/max_terminated_length": 168.6, "completions/mean_length": 148.3, "completions/mean_terminated_length": 148.3, "completions/min_length": 129.6, "completions/min_terminated_length": 129.6, "entropy": 0.042718362715095284, "epoch": 0.0625, "frac_reward_zero_std": 0.2, "grad_norm": 3.325033187866211, "kl": 0.037860750965774057, "learning_rate": 4.8200000000000004e-06, "loss": -0.011221970617771148, "num_tokens": 31053.0, "reward": 0.2989000082015991, "reward_std": 0.4415143087506294, "rewards/reward_func/mean": 0.2989000082015991, "rewards/reward_func/std": 0.4415143221616745, "step": 10, "step_time": 9.975367512006779, "tools/call_frequency": 2.5, "tools/failure_frequency": 0.0 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 152.8, "completions/max_terminated_length": 152.8, "completions/mean_length": 131.3, "completions/mean_terminated_length": 131.3, "completions/min_length": 113.6, "completions/min_terminated_length": 113.6, "entropy": 0.016039706021547317, "epoch": 0.09375, "frac_reward_zero_std": 0.4, "grad_norm": 1.1289054155349731, "kl": 0.06640795171260834, "learning_rate": 4.7200000000000005e-06, "loss": 0.04752160608768463, "num_tokens": 45857.0, "reward": 1.1023000121116637, "reward_std": 0.4320605039596558, "rewards/reward_func/mean": 1.1023000121116637, "rewards/reward_func/std": 0.43206052780151366, "step": 15, "step_time": 8.620344271202338, "tools/call_frequency": 2.35, "tools/failure_frequency": 0.0 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1, "completions/max_length": 172.0, "completions/max_terminated_length": 172.0, "completions/mean_length": 136.55, "completions/mean_terminated_length": 137.2500030517578, "completions/min_length": 111.4, "completions/min_terminated_length": 111.4, "entropy": 0.027425602450966834, "epoch": 0.125, "frac_reward_zero_std": 0.6, "grad_norm": 0.8991426229476929, "kl": 0.09577701878733932, "learning_rate": 4.620000000000001e-06, "loss": -0.1201351523399353, "num_tokens": 60826.0, "reward": 0.7200000047683716, "reward_std": 0.3419178485870361, "rewards/reward_func/mean": 0.7200000047683716, "rewards/reward_func/std": 0.3419178485870361, "step": 20, "step_time": 11.403528443601681, "tools/call_frequency": 2.05, "tools/failure_frequency": 0.026666668057441712 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.35, "completions/max_length": 203.2, "completions/max_terminated_length": 161.2, "completions/mean_length": 188.6, "completions/mean_terminated_length": 150.6666687011719, "completions/min_length": 173.4, "completions/min_terminated_length": 139.6, "entropy": 0.033282498246990144, "epoch": 0.15625, "frac_reward_zero_std": 0.0, "grad_norm": 2.041987657546997, "kl": 0.051508421916514634, "learning_rate": 4.520000000000001e-06, "loss": 0.03198407888412476, "num_tokens": 76838.0, "reward": 1.2669333696365357, "reward_std": 0.3234894543886185, "rewards/reward_func/mean": 1.2669333696365357, "rewards/reward_func/std": 0.32348946332931516, "step": 25, "step_time": 13.736867211584467, "tools/call_frequency": 4.4, "tools/failure_frequency": 0.14583333432674409 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.65, "completions/max_length": 209.4, "completions/max_terminated_length": 160.8, "completions/mean_length": 192.45, "completions/mean_terminated_length": 148.3, "completions/min_length": 169.2, "completions/min_terminated_length": 135.8, "entropy": 0.04025774166220799, "epoch": 0.1875, "frac_reward_zero_std": 0.4, "grad_norm": 1.6383038759231567, "kl": 0.09242036554496735, "learning_rate": 4.42e-06, "loss": -0.03659022152423859, "num_tokens": 93054.0, "reward": 1.0333500146865844, "reward_std": 0.38981522917747496, "rewards/reward_func/mean": 1.0333500146865844, "rewards/reward_func/std": 0.389815217256546, "step": 30, "step_time": 14.735964270806289, "tools/call_frequency": 3.85, "tools/failure_frequency": 0.023529411852359773 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.7, "completions/max_length": 207.4, "completions/max_terminated_length": 168.8, "completions/mean_length": 196.4, "completions/mean_terminated_length": 166.7, "completions/min_length": 191.2, "completions/min_terminated_length": 164.6, "entropy": 0.02646293715806678, "epoch": 0.21875, "frac_reward_zero_std": 0.2, "grad_norm": 0.6842532157897949, "kl": 0.09354882184416055, "learning_rate": 4.32e-06, "loss": 0.014650090038776398, "num_tokens": 109141.0, "reward": 1.0134333491325378, "reward_std": 0.28623148798942566, "rewards/reward_func/mean": 1.0134333491325378, "rewards/reward_func/std": 0.2862314820289612, "step": 35, "step_time": 14.25194917320332, "tools/call_frequency": 3.95, "tools/failure_frequency": 0.0 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.7, "completions/max_length": 228.8, "completions/max_terminated_length": 163.6, "completions/mean_length": 207.8, "completions/mean_terminated_length": 162.3, "completions/min_length": 188.8, "completions/min_terminated_length": 161.0, "entropy": 0.049686831969302146, "epoch": 0.25, "frac_reward_zero_std": 0.4, "grad_norm": 2.386836528778076, "kl": 0.12552661653608083, "learning_rate": 4.22e-06, "loss": 0.023246073722839357, "num_tokens": 125712.0, "reward": 0.9764333426952362, "reward_std": 0.3545127585530281, "rewards/reward_func/mean": 0.9764333426952362, "rewards/reward_func/std": 0.35451277494430544, "step": 40, "step_time": 16.735324517198023, "tools/call_frequency": 3.45, "tools/failure_frequency": 0.02857142984867096 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.8, "completions/max_length": 231.4, "completions/max_terminated_length": 127.2, "completions/mean_length": 209.4, "completions/mean_terminated_length": 124.2, "completions/min_length": 187.2, "completions/min_terminated_length": 121.2, "entropy": 0.14096241008955984, "epoch": 0.28125, "frac_reward_zero_std": 0.2, "grad_norm": 5.072839260101318, "kl": 0.10897002797573804, "learning_rate": 4.12e-06, "loss": 0.05337468385696411, "num_tokens": 142131.0, "reward": 1.0291000008583069, "reward_std": 0.5297403573989868, "rewards/reward_func/mean": 1.0291000008583069, "rewards/reward_func/std": 0.5297403573989868, "step": 45, "step_time": 17.371078941601446, "tools/call_frequency": 3.4, "tools/failure_frequency": 0.01428571492433548 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.6, "completions/max_length": 237.2, "completions/max_terminated_length": 172.2, "completions/mean_length": 201.7, "completions/mean_terminated_length": 160.46666870117187, "completions/min_length": 186.0, "completions/min_terminated_length": 150.8, "entropy": 0.1540619947016239, "epoch": 0.3125, "frac_reward_zero_std": 0.0, "grad_norm": 5.2555952072143555, "kl": 0.17082785218954086, "learning_rate": 4.0200000000000005e-06, "loss": 0.06733548641204834, "num_tokens": 158431.0, "reward": 0.8427666783332824, "reward_std": 0.6860074520111084, "rewards/reward_func/mean": 0.8427666783332824, "rewards/reward_func/std": 0.6860074281692505, "step": 50, "step_time": 17.60776922639343, "tools/call_frequency": 3.6, "tools/failure_frequency": 0.027619048953056335 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.55, "completions/max_length": 209.2, "completions/max_terminated_length": 209.2, "completions/mean_length": 198.0, "completions/mean_terminated_length": 206.43333435058594, "completions/min_length": 189.4, "completions/min_terminated_length": 203.8, "entropy": 0.11417091116309167, "epoch": 0.34375, "frac_reward_zero_std": 0.8, "grad_norm": 0.1612984985113144, "kl": 0.14481508396565915, "learning_rate": 3.920000000000001e-06, "loss": -0.0013940947130322457, "num_tokens": 174665.0, "reward": 1.337833333015442, "reward_std": 0.04058598577976227, "rewards/reward_func/mean": 1.337833333015442, "rewards/reward_func/std": 0.04058598577976227, "step": 55, "step_time": 13.894916865596315, "tools/call_frequency": 3.9, "tools/failure_frequency": 0.0 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.35, "completions/max_length": 210.6, "completions/max_terminated_length": 210.0, "completions/mean_length": 189.7, "completions/mean_terminated_length": 184.93333435058594, "completions/min_length": 152.0, "completions/min_terminated_length": 153.4, "entropy": 0.18207021439447998, "epoch": 0.375, "frac_reward_zero_std": 0.2, "grad_norm": 7.57163667678833, "kl": 0.2769763808697462, "learning_rate": 3.820000000000001e-06, "loss": -0.08738029599189759, "num_tokens": 190974.0, "reward": 0.9539999723434448, "reward_std": 0.24900673925876618, "rewards/reward_func/mean": 0.9539999723434448, "rewards/reward_func/std": 0.2490067459642887, "step": 60, "step_time": 13.735741792595945, "tools/call_frequency": 3.35, "tools/failure_frequency": 0.0 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.55, "completions/max_length": 213.6, "completions/max_terminated_length": 186.6, "completions/mean_length": 196.15, "completions/mean_terminated_length": 181.2, "completions/min_length": 173.2, "completions/min_terminated_length": 177.0, "entropy": 0.18931779703125357, "epoch": 0.40625, "frac_reward_zero_std": 0.2, "grad_norm": 0.3368631601333618, "kl": 0.19928277991712093, "learning_rate": 3.7200000000000004e-06, "loss": -0.03082091510295868, "num_tokens": 207221.0, "reward": 1.1948333382606506, "reward_std": 0.3531351625919342, "rewards/reward_func/mean": 1.1948333382606506, "rewards/reward_func/std": 0.3531351566314697, "step": 65, "step_time": 14.853071747999639, "tools/call_frequency": 3.45, "tools/failure_frequency": 0.0 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.45, "completions/max_length": 232.4, "completions/max_terminated_length": 231.4, "completions/mean_length": 214.8, "completions/mean_terminated_length": 221.10000305175782, "completions/min_length": 199.4, "completions/min_terminated_length": 211.6, "entropy": 0.20331259737722576, "epoch": 0.4375, "frac_reward_zero_std": 0.6, "grad_norm": 3.155299663543701, "kl": 0.21616111248731612, "learning_rate": 3.62e-06, "loss": -0.014388753473758698, "num_tokens": 223949.0, "reward": 1.187999999523163, "reward_std": 0.06400000005960464, "rewards/reward_func/mean": 1.187999999523163, "rewards/reward_func/std": 0.06399999856948853, "step": 70, "step_time": 15.7972018689994, "tools/call_frequency": 3.3, "tools/failure_frequency": 0.0 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.75, "completions/max_length": 228.0, "completions/max_terminated_length": 180.2, "completions/mean_length": 217.2, "completions/mean_terminated_length": 174.9, "completions/min_length": 208.2, "completions/min_terminated_length": 169.6, "entropy": 0.09874274502508343, "epoch": 0.46875, "frac_reward_zero_std": 0.4, "grad_norm": 0.1496252417564392, "kl": 0.19251887053251265, "learning_rate": 3.52e-06, "loss": 0.0129203662276268, "num_tokens": 240663.0, "reward": 1.166100013256073, "reward_std": 0.27513332962989806, "rewards/reward_func/mean": 1.166100013256073, "rewards/reward_func/std": 0.275133341550827, "step": 75, "step_time": 15.653593644002104, "tools/call_frequency": 3.15, "tools/failure_frequency": 0.0 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4, "completions/max_length": 250.6, "completions/max_terminated_length": 235.4, "completions/mean_length": 217.2, "completions/mean_terminated_length": 209.23333740234375, "completions/min_length": 169.8, "completions/min_terminated_length": 174.0, "entropy": 0.18624852728098631, "epoch": 0.5, "frac_reward_zero_std": 0.0, "grad_norm": 3.9008662700653076, "kl": 0.19779104925692081, "learning_rate": 3.4200000000000007e-06, "loss": -0.060715597867965695, "num_tokens": 257232.0, "reward": 1.094600009918213, "reward_std": 0.533681058883667, "rewards/reward_func/mean": 1.094600009918213, "rewards/reward_func/std": 0.5336810708045959, "step": 80, "step_time": 16.87674882839783, "tools/call_frequency": 2.7, "tools/failure_frequency": 0.0 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.8, "completions/max_length": 263.2, "completions/max_terminated_length": 149.0, "completions/mean_length": 238.9, "completions/mean_terminated_length": 138.2, "completions/min_length": 220.0, "completions/min_terminated_length": 127.4, "entropy": 0.06761846686713398, "epoch": 0.53125, "frac_reward_zero_std": 0.6, "grad_norm": 0.0583312027156353, "kl": 0.16298045124858618, "learning_rate": 3.3200000000000004e-06, "loss": 0.0317715585231781, "num_tokens": 274377.0, "reward": 1.168333351612091, "reward_std": 0.21399999260902405, "rewards/reward_func/mean": 1.168333351612091, "rewards/reward_func/std": 0.214000004529953, "step": 85, "step_time": 19.253501980405417, "tools/call_frequency": 2.45, "tools/failure_frequency": 0.0 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9, "completions/max_length": 252.0, "completions/max_terminated_length": 79.6, "completions/mean_length": 229.35, "completions/mean_terminated_length": 79.6, "completions/min_length": 212.6, "completions/min_terminated_length": 79.6, "entropy": 0.04304317501373589, "epoch": 0.5625, "frac_reward_zero_std": 0.4, "grad_norm": 0.05765737593173981, "kl": 0.1589741975069046, "learning_rate": 3.2200000000000005e-06, "loss": -0.009884151071310044, "num_tokens": 291640.0, "reward": 1.0771000266075135, "reward_std": 0.2571271777153015, "rewards/reward_func/mean": 1.0771000266075135, "rewards/reward_func/std": 0.257127183675766, "step": 90, "step_time": 19.810263851404308, "tools/call_frequency": 2.7, "tools/failure_frequency": 0.0 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.75, "completions/max_length": 252.2, "completions/max_terminated_length": 155.6, "completions/mean_length": 227.45, "completions/mean_terminated_length": 154.7, "completions/min_length": 202.6, "completions/min_terminated_length": 153.8, "entropy": 0.03856636304408312, "epoch": 0.59375, "frac_reward_zero_std": 0.2, "grad_norm": 2.2899415493011475, "kl": 0.18391469195485116, "learning_rate": 3.12e-06, "loss": 0.012278559803962707, "num_tokens": 308671.0, "reward": 0.9493666887283325, "reward_std": 0.3057107627391815, "rewards/reward_func/mean": 0.9493666887283325, "rewards/reward_func/std": 0.3057107746601105, "step": 95, "step_time": 18.270148772597896, "tools/call_frequency": 2.75, "tools/failure_frequency": 0.0 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.65, "completions/max_length": 229.0, "completions/max_terminated_length": 128.6, "completions/mean_length": 210.05, "completions/mean_terminated_length": 121.23333435058593, "completions/min_length": 191.4, "completions/min_terminated_length": 114.6, "entropy": 0.03718785918317735, "epoch": 0.625, "frac_reward_zero_std": 0.2, "grad_norm": 1.4016427993774414, "kl": 0.19276840873062612, "learning_rate": 3.0200000000000003e-06, "loss": -0.02043401300907135, "num_tokens": 325246.0, "reward": 0.9758000135421753, "reward_std": 0.439729905128479, "rewards/reward_func/mean": 0.9758000135421753, "rewards/reward_func/std": 0.439729905128479, "step": 100, "step_time": 16.536685503809714, "tools/call_frequency": 3.4, "tools/failure_frequency": 0.0 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.45, "completions/max_length": 231.0, "completions/max_terminated_length": 175.6, "completions/mean_length": 209.7, "completions/mean_terminated_length": 164.73333435058595, "completions/min_length": 197.2, "completions/min_terminated_length": 156.2, "entropy": 0.0890876273624599, "epoch": 0.65625, "frac_reward_zero_std": 0.6, "grad_norm": 1.857412338256836, "kl": 0.20793221928179265, "learning_rate": 2.92e-06, "loss": 0.010671529173851012, "num_tokens": 341743.0, "reward": 1.244200015068054, "reward_std": 0.25437753796577456, "rewards/reward_func/mean": 1.244200015068054, "rewards/reward_func/std": 0.25437754988670347, "step": 105, "step_time": 14.550393618003, "tools/call_frequency": 3.4, "tools/failure_frequency": 0.0 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.6, "completions/max_length": 228.0, "completions/max_terminated_length": 215.0, "completions/mean_length": 211.05, "completions/mean_terminated_length": 204.6, "completions/min_length": 194.2, "completions/min_terminated_length": 194.2, "entropy": 0.09650332322344184, "epoch": 0.6875, "frac_reward_zero_std": 0.0, "grad_norm": 0.5915409922599792, "kl": 0.1943995427340269, "learning_rate": 2.82e-06, "loss": -0.007803649455308914, "num_tokens": 358474.0, "reward": 0.9213667035102844, "reward_std": 0.48010437488555907, "rewards/reward_func/mean": 0.9213667035102844, "rewards/reward_func/std": 0.480104398727417, "step": 110, "step_time": 15.29034832160105, "tools/call_frequency": 3.3, "tools/failure_frequency": 0.0 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5, "completions/max_length": 244.8, "completions/max_terminated_length": 244.8, "completions/mean_length": 224.4, "completions/mean_terminated_length": 232.93333435058594, "completions/min_length": 204.4, "completions/min_terminated_length": 219.8, "entropy": 0.06257005939260125, "epoch": 0.71875, "frac_reward_zero_std": 0.2, "grad_norm": 0.09643584489822388, "kl": 0.18671961799263953, "learning_rate": 2.7200000000000002e-06, "loss": 0.0009367348626255989, "num_tokens": 375512.0, "reward": 0.9198000192642212, "reward_std": 0.41239041090011597, "rewards/reward_func/mean": 0.9198000192642212, "rewards/reward_func/std": 0.41239042282104493, "step": 115, "step_time": 16.68962257000094, "tools/call_frequency": 3.05, "tools/failure_frequency": 0.0 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5, "completions/max_length": 238.4, "completions/max_terminated_length": 194.2, "completions/mean_length": 221.6, "completions/mean_terminated_length": 185.86666870117188, "completions/min_length": 202.6, "completions/min_terminated_length": 174.8, "entropy": 0.19831047160550952, "epoch": 0.75, "frac_reward_zero_std": 0.4, "grad_norm": 0.06480103731155396, "kl": 0.2127195455133915, "learning_rate": 2.6200000000000003e-06, "loss": -0.002893347479403019, "num_tokens": 392259.0, "reward": 1.1177000164985658, "reward_std": 0.34459384679794314, "rewards/reward_func/mean": 1.1177000164985658, "rewards/reward_func/std": 0.34459385871887205, "step": 120, "step_time": 15.74592421480629, "tools/call_frequency": 3.1, "tools/failure_frequency": 0.0 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.55, "completions/max_length": 249.6, "completions/max_terminated_length": 248.4, "completions/mean_length": 225.9, "completions/mean_terminated_length": 238.83333435058594, "completions/min_length": 207.6, "completions/min_terminated_length": 229.0, "entropy": 0.12472135615535081, "epoch": 0.78125, "frac_reward_zero_std": 0.2, "grad_norm": 1.3778189420700073, "kl": 0.22096077986061574, "learning_rate": 2.52e-06, "loss": 0.018771570920944215, "num_tokens": 409108.0, "reward": 0.6021333426237107, "reward_std": 0.6133833765983582, "rewards/reward_func/mean": 0.6021333426237107, "rewards/reward_func/std": 0.6133833885192871, "step": 125, "step_time": 17.135429813191877, "tools/call_frequency": 3.0, "tools/failure_frequency": 0.0 } ], "logging_steps": 5, "max_steps": 250, "num_input_tokens_seen": 409108, "num_train_epochs": 2, "save_steps": 125, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }