{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0666666666666667, "eval_steps": 500, "global_step": 160, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 419.8, "completions/max_terminated_length": 406.8, "completions/mean_length": 314.8625, "completions/mean_terminated_length": 307.5414337158203, "completions/min_length": 206.3, "completions/min_terminated_length": 206.3, "entropy": 0.44715926721692084, "epoch": 0.06666666666666667, "frac_reward_zero_std": 0.15, "grad_norm": 4.625, "kl": 0.09871633112052222, "learning_rate": 9.4375e-06, "loss": -0.010261553525924682, "num_tokens": 38057.0, "reward": 0.9905517578125, "reward_std": 0.6565703094005585, "rewards/JointRewardFunction/mean": 0.9905517578125, "rewards/JointRewardFunction/std": 0.6565703094005585, "step": 10, "step_time": 19.035294922399043 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2, "completions/max_length": 458.4, "completions/max_terminated_length": 426.1, "completions/mean_length": 343.8, "completions/mean_terminated_length": 313.08583984375, "completions/min_length": 211.7, "completions/min_terminated_length": 211.7, "entropy": 0.31359912948682905, "epoch": 0.13333333333333333, "frac_reward_zero_std": 0.1, "grad_norm": 3.125, "kl": 0.14868967132642866, "learning_rate": 8.8125e-06, "loss": -0.023439544439315795, "num_tokens": 79321.0, "reward": 1.072357177734375, "reward_std": 0.5760923266410828, "rewards/JointRewardFunction/mean": 1.072357177734375, "rewards/JointRewardFunction/std": 0.5760923385620117, "step": 20, "step_time": 21.0788949906022 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 351.8, "completions/max_terminated_length": 351.8, "completions/mean_length": 260.625, "completions/mean_terminated_length": 260.625, "completions/min_length": 174.2, "completions/min_terminated_length": 174.2, "entropy": 0.3843053654767573, "epoch": 0.2, "frac_reward_zero_std": 0.1, "grad_norm": 4.5625, "kl": 0.18629266200587152, "learning_rate": 8.1875e-06, "loss": 0.004870641976594925, "num_tokens": 111587.0, "reward": 1.3501953125, "reward_std": 0.6562108278274537, "rewards/JointRewardFunction/mean": 1.3501953125, "rewards/JointRewardFunction/std": 0.6562108367681503, "step": 30, "step_time": 16.898221987598664 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0375, "completions/max_length": 429.1, "completions/max_terminated_length": 425.2, "completions/mean_length": 327.5125, "completions/mean_terminated_length": 321.53809814453126, "completions/min_length": 227.9, "completions/min_terminated_length": 227.9, "entropy": 0.44414809048175813, "epoch": 0.26666666666666666, "frac_reward_zero_std": 0.35, "grad_norm": 6.34375, "kl": 0.14110819818452, "learning_rate": 7.5625e-06, "loss": -0.03268973827362061, "num_tokens": 151100.0, "reward": 1.493701171875, "reward_std": 0.3899420065339655, "rewards/JointRewardFunction/mean": 1.493701171875, "rewards/JointRewardFunction/std": 0.38994201249442995, "step": 40, "step_time": 20.12026042639991 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.225, "completions/max_length": 472.2, "completions/max_terminated_length": 427.6, "completions/mean_length": 352.75, "completions/mean_terminated_length": 313.9489349365234, "completions/min_length": 201.0, "completions/min_terminated_length": 201.0, "entropy": 0.5324544316157699, "epoch": 0.3333333333333333, "frac_reward_zero_std": 0.25, "grad_norm": 5.3125, "kl": 0.13007921809330583, "learning_rate": 6.9375e-06, "loss": -0.03016577959060669, "num_tokens": 192036.0, "reward": 1.0732177734375, "reward_std": 0.5164182722568512, "rewards/JointRewardFunction/mean": 1.0732177734375, "rewards/JointRewardFunction/std": 0.5164182841777801, "step": 50, "step_time": 21.588160099001833 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.225, "completions/max_length": 495.4, "completions/max_terminated_length": 461.4, "completions/mean_length": 396.95, "completions/mean_terminated_length": 363.4666687011719, "completions/min_length": 224.4, "completions/min_terminated_length": 224.4, "entropy": 0.47121726758778093, "epoch": 0.4, "frac_reward_zero_std": 0.05, "grad_norm": 2.46875, "kl": 0.10462474231608213, "learning_rate": 6.3125e-06, "loss": -0.031146174669265746, "num_tokens": 237192.0, "reward": 1.193585205078125, "reward_std": 0.5929911494255066, "rewards/JointRewardFunction/mean": 1.193585205078125, "rewards/JointRewardFunction/std": 0.592991155385971, "step": 60, "step_time": 22.437090325800817 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.275, "completions/max_length": 497.6, "completions/max_terminated_length": 455.6, "completions/mean_length": 392.0375, "completions/mean_terminated_length": 351.8452484130859, "completions/min_length": 243.3, "completions/min_terminated_length": 243.3, "entropy": 0.4913482774049044, "epoch": 0.4666666666666667, "frac_reward_zero_std": 0.2, "grad_norm": 4.21875, "kl": 0.11023280541412532, "learning_rate": 5.6875e-06, "loss": -0.02786441147327423, "num_tokens": 282351.0, "reward": 1.166510009765625, "reward_std": 0.5283917605876922, "rewards/JointRewardFunction/mean": 1.166510009765625, "rewards/JointRewardFunction/std": 0.528391769528389, "step": 70, "step_time": 22.72093682029881 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 507.2, "completions/max_terminated_length": 447.3, "completions/mean_length": 409.9125, "completions/mean_terminated_length": 365.60191040039064, "completions/min_length": 248.5, "completions/min_terminated_length": 248.5, "entropy": 0.43941801562905314, "epoch": 0.5333333333333333, "frac_reward_zero_std": 0.2, "grad_norm": 1.640625, "kl": 0.12287256089039147, "learning_rate": 5.0625e-06, "loss": -0.024439637362957, "num_tokens": 328816.0, "reward": 1.467431640625, "reward_std": 0.4256438536103815, "rewards/JointRewardFunction/mean": 1.467431640625, "rewards/JointRewardFunction/std": 0.42564386553131045, "step": 80, "step_time": 23.137156244499057 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 486.2, "completions/max_terminated_length": 463.3, "completions/mean_length": 379.0625, "completions/mean_terminated_length": 360.1238128662109, "completions/min_length": 241.6, "completions/min_terminated_length": 241.6, "entropy": 0.3873957570642233, "epoch": 0.6, "frac_reward_zero_std": 0.35, "grad_norm": 2.15625, "kl": 0.12395602646283806, "learning_rate": 4.4375e-06, "loss": 0.0433960497379303, "num_tokens": 373289.0, "reward": 1.3514892578125, "reward_std": 0.43082923418842256, "rewards/JointRewardFunction/mean": 1.3514892578125, "rewards/JointRewardFunction/std": 0.4308292371686548, "step": 90, "step_time": 22.175798371599377 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1375, "completions/max_length": 475.4, "completions/max_terminated_length": 439.4, "completions/mean_length": 344.5625, "completions/mean_terminated_length": 319.0309600830078, "completions/min_length": 209.0, "completions/min_terminated_length": 209.0, "entropy": 0.3818320112302899, "epoch": 0.6666666666666666, "frac_reward_zero_std": 0.15, "grad_norm": 1.7265625, "kl": 0.1494429545942694, "learning_rate": 3.8125e-06, "loss": -0.00486765056848526, "num_tokens": 415738.0, "reward": 1.4651611328125, "reward_std": 0.453005512803793, "rewards/JointRewardFunction/mean": 1.4651611328125, "rewards/JointRewardFunction/std": 0.45300552546977996, "step": 100, "step_time": 21.99789129000128 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 500.8, "completions/max_terminated_length": 454.0, "completions/mean_length": 348.225, "completions/mean_terminated_length": 326.1460815429688, "completions/min_length": 184.0, "completions/min_terminated_length": 184.0, "entropy": 0.38654828127473595, "epoch": 0.7333333333333333, "frac_reward_zero_std": 0.25, "grad_norm": 4.25, "kl": 0.19588984637521206, "learning_rate": 3.1875e-06, "loss": -0.04184774756431579, "num_tokens": 456544.0, "reward": 1.449822998046875, "reward_std": 0.40167671740055083, "rewards/JointRewardFunction/mean": 1.449822998046875, "rewards/JointRewardFunction/std": 0.40167671740055083, "step": 110, "step_time": 22.696052773901464 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 474.8, "completions/max_terminated_length": 426.7, "completions/mean_length": 338.5375, "completions/mean_terminated_length": 308.2898864746094, "completions/min_length": 179.5, "completions/min_terminated_length": 179.5, "entropy": 0.42813673401251434, "epoch": 0.8, "frac_reward_zero_std": 0.15, "grad_norm": 3.625, "kl": 0.188834874285385, "learning_rate": 2.5625e-06, "loss": -0.022572511434555055, "num_tokens": 495891.0, "reward": 1.37578125, "reward_std": 0.485464009642601, "rewards/JointRewardFunction/mean": 1.37578125, "rewards/JointRewardFunction/std": 0.4854640245437622, "step": 120, "step_time": 21.932242763400062 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.225, "completions/max_length": 480.4, "completions/max_terminated_length": 457.0, "completions/mean_length": 353.0375, "completions/mean_terminated_length": 309.4391723632813, "completions/min_length": 93.9, "completions/min_terminated_length": 93.9, "entropy": 0.4207899322733283, "epoch": 0.8666666666666667, "frac_reward_zero_std": 0.15, "grad_norm": 3.140625, "kl": 0.19314469541423024, "learning_rate": 1.9375e-06, "loss": -0.10344405174255371, "num_tokens": 538386.0, "reward": 1.3167236328125, "reward_std": 0.5021238774061203, "rewards/JointRewardFunction/mean": 1.3167236328125, "rewards/JointRewardFunction/std": 0.5021239161491394, "step": 130, "step_time": 22.154275490202053 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1375, "completions/max_length": 468.1, "completions/max_terminated_length": 443.4, "completions/mean_length": 351.3625, "completions/mean_terminated_length": 333.25274353027345, "completions/min_length": 215.1, "completions/min_terminated_length": 215.1, "entropy": 0.3897366087883711, "epoch": 0.9333333333333333, "frac_reward_zero_std": 0.4, "grad_norm": 1.8828125, "kl": 0.1783840640448034, "learning_rate": 1.3125000000000001e-06, "loss": -0.012397536635398864, "num_tokens": 579127.0, "reward": 1.5228759765625, "reward_std": 0.3425671649631113, "rewards/JointRewardFunction/mean": 1.5228759765625, "rewards/JointRewardFunction/std": 0.3425671679433435, "step": 140, "step_time": 21.521125249801845 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15, "completions/max_length": 482.1, "completions/max_terminated_length": 444.4, "completions/mean_length": 354.5375, "completions/mean_terminated_length": 328.6072692871094, "completions/min_length": 207.4, "completions/min_terminated_length": 207.4, "entropy": 0.4106031972914934, "epoch": 1.0, "frac_reward_zero_std": 0.2, "grad_norm": 1.703125, "kl": 0.16290537370368838, "learning_rate": 6.875000000000001e-07, "loss": 0.014532069861888885, "num_tokens": 619802.0, "reward": 1.502001953125, "reward_std": 0.44335485696792604, "rewards/JointRewardFunction/mean": 1.502001953125, "rewards/JointRewardFunction/std": 0.443354868888855, "step": 150, "step_time": 22.203639667498646 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 432.7, "completions/max_terminated_length": 389.6, "completions/mean_length": 310.0125, "completions/mean_terminated_length": 283.19702911376953, "completions/min_length": 129.3, "completions/min_terminated_length": 129.3, "entropy": 0.43815543316304684, "epoch": 1.0666666666666667, "frac_reward_zero_std": 0.2, "grad_norm": 3.484375, "kl": 0.21010836036875843, "learning_rate": 6.250000000000001e-08, "loss": -0.05822192430496216, "num_tokens": 656519.0, "reward": 1.34354248046875, "reward_std": 0.5034710764884949, "rewards/JointRewardFunction/mean": 1.34354248046875, "rewards/JointRewardFunction/std": 0.503471103310585, "step": 160, "step_time": 20.17411774720167 } ], "logging_steps": 10, "max_steps": 160, "num_input_tokens_seen": 656519, "num_train_epochs": 2, "save_steps": 10, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }