{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.28, "eval_steps": 500, "global_step": 160, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15, "completions/max_length": 920.8, "completions/max_terminated_length": 815.0, "completions/mean_length": 575.9125, "completions/mean_terminated_length": 512.0958465576172, "completions/min_length": 250.5, "completions/min_terminated_length": 250.5, "entropy": 1.1609928414225579, "epoch": 0.08, "frac_reward_zero_std": 0.225, "grad_norm": 4.6875, "kl": 0.13114853966326337, "learning_rate": 9.4375e-06, "loss": 0.012817811965942384, "num_tokens": 66717.0, "reward": 0.730865478515625, "reward_std": 0.7215771168470383, "rewards/JointRewardFunction/mean": 0.730865478515625, "rewards/JointRewardFunction/std": 0.7215771168470383, "step": 10, "step_time": 38.070080343699374 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 391.7, "completions/max_terminated_length": 391.7, "completions/mean_length": 177.925, "completions/mean_terminated_length": 177.925, "completions/min_length": 49.9, "completions/min_terminated_length": 49.9, "entropy": 2.0190576702356338, "epoch": 0.16, "frac_reward_zero_std": 0.1, "grad_norm": 17.375, "kl": 1.693755827844143, "learning_rate": 8.8125e-06, "loss": 0.07697797417640687, "num_tokens": 101595.0, "reward": 0.496136474609375, "reward_std": 0.7558781564235687, "rewards/JointRewardFunction/mean": 0.496136474609375, "rewards/JointRewardFunction/std": 0.7558781564235687, "step": 20, "step_time": 18.131606991801892 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.175, "completions/max_length": 988.7, "completions/max_terminated_length": 802.5, "completions/mean_length": 531.1125, "completions/mean_terminated_length": 433.47263793945314, "completions/min_length": 180.2, "completions/min_terminated_length": 180.2, "entropy": 1.3736804876476527, "epoch": 0.24, "frac_reward_zero_std": 0.225, "grad_norm": 4.4375, "kl": 0.4292584811337292, "learning_rate": 8.1875e-06, "loss": 0.12034453153610229, "num_tokens": 164528.0, "reward": 0.727508544921875, "reward_std": 0.5448802873492241, "rewards/JointRewardFunction/mean": 0.727508544921875, "rewards/JointRewardFunction/std": 0.5448802877217531, "step": 30, "step_time": 42.41746253830024 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1125, "completions/max_length": 926.7, "completions/max_terminated_length": 754.3, "completions/mean_length": 494.75, "completions/mean_terminated_length": 441.88750610351565, "completions/min_length": 215.1, "completions/min_terminated_length": 215.1, "entropy": 1.0656868401914834, "epoch": 0.32, "frac_reward_zero_std": 0.25, "grad_norm": 5.125, "kl": 0.3164741122163832, "learning_rate": 7.5625e-06, "loss": 0.0320564866065979, "num_tokens": 224682.0, "reward": 0.874420166015625, "reward_std": 0.6246562719345092, "rewards/JointRewardFunction/mean": 0.874420166015625, "rewards/JointRewardFunction/std": 0.6246562749147415, "step": 40, "step_time": 39.87609361880059 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 629.9, "completions/max_terminated_length": 629.9, "completions/mean_length": 327.525, "completions/mean_terminated_length": 327.525, "completions/min_length": 114.7, "completions/min_terminated_length": 114.7, "entropy": 1.2003333028405905, "epoch": 0.4, "frac_reward_zero_std": 0.2, "grad_norm": 7.5, "kl": 0.43942997977137566, "learning_rate": 6.9375e-06, "loss": 0.022251369059085847, "num_tokens": 271714.0, "reward": 1.063922119140625, "reward_std": 0.6383892238140106, "rewards/JointRewardFunction/mean": 1.063922119140625, "rewards/JointRewardFunction/std": 0.6383892357349396, "step": 50, "step_time": 27.557733834501413 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0125, "completions/max_length": 641.7, "completions/max_terminated_length": 590.3, "completions/mean_length": 302.5, "completions/mean_terminated_length": 293.0178588867187, "completions/min_length": 102.5, "completions/min_terminated_length": 102.5, "entropy": 1.3896781962364912, "epoch": 0.48, "frac_reward_zero_std": 0.225, "grad_norm": 4.375, "kl": 0.5564426928758621, "learning_rate": 6.3125e-06, "loss": -0.0015912272036075592, "num_tokens": 316254.0, "reward": 0.9928466796875, "reward_std": 0.6274125874042511, "rewards/JointRewardFunction/mean": 0.9928466796875, "rewards/JointRewardFunction/std": 0.6274125933647156, "step": 60, "step_time": 28.0631938617993 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 586.9, "completions/max_terminated_length": 586.9, "completions/mean_length": 294.55, "completions/mean_terminated_length": 294.55, "completions/min_length": 114.6, "completions/min_terminated_length": 114.6, "entropy": 1.4918637625873088, "epoch": 0.56, "frac_reward_zero_std": 0.25, "grad_norm": 5.15625, "kl": 0.5497013371437788, "learning_rate": 5.6875e-06, "loss": 0.047078275680541994, "num_tokens": 360274.0, "reward": 1.1190673828125, "reward_std": 0.609263214468956, "rewards/JointRewardFunction/mean": 1.1190673828125, "rewards/JointRewardFunction/std": 0.609263226389885, "step": 70, "step_time": 26.138083929998174 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 847.5, "completions/max_terminated_length": 678.8, "completions/mean_length": 365.3375, "completions/mean_terminated_length": 321.4898880004883, "completions/min_length": 90.4, "completions/min_terminated_length": 90.4, "entropy": 1.5532763354480266, "epoch": 0.64, "frac_reward_zero_std": 0.175, "grad_norm": 5.78125, "kl": 0.4981705164536834, "learning_rate": 5.0625e-06, "loss": 0.11404727697372437, "num_tokens": 410115.0, "reward": 0.9597900390625, "reward_std": 0.6095708787441254, "rewards/JointRewardFunction/mean": 0.9597900390625, "rewards/JointRewardFunction/std": 0.6095708817243576, "step": 80, "step_time": 36.49904953569931 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.025, "completions/max_length": 732.5, "completions/max_terminated_length": 677.8, "completions/mean_length": 299.875, "completions/mean_terminated_length": 282.1875030517578, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "entropy": 1.4728075489401817, "epoch": 0.72, "frac_reward_zero_std": 0.225, "grad_norm": 3.5625, "kl": 0.4925263602286577, "learning_rate": 4.4375e-06, "loss": 0.054662787914276124, "num_tokens": 454725.0, "reward": 1.019183349609375, "reward_std": 0.61219422519207, "rewards/JointRewardFunction/mean": 1.019183349609375, "rewards/JointRewardFunction/std": 0.6121942341327667, "step": 90, "step_time": 31.705154634100108 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 760.6, "completions/max_terminated_length": 660.6, "completions/mean_length": 352.475, "completions/mean_terminated_length": 312.0821502685547, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "entropy": 1.3954405788332225, "epoch": 0.8, "frac_reward_zero_std": 0.125, "grad_norm": 5.78125, "kl": 0.4322750365361571, "learning_rate": 3.8125e-06, "loss": 0.009273938834667206, "num_tokens": 503589.0, "reward": 0.9172119140625, "reward_std": 0.6775627821683884, "rewards/JointRewardFunction/mean": 0.9172119140625, "rewards/JointRewardFunction/std": 0.6775627851486206, "step": 100, "step_time": 33.002576629400934 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0125, "completions/max_length": 687.7, "completions/max_terminated_length": 653.3, "completions/mean_length": 315.025, "completions/mean_terminated_length": 306.5357147216797, "completions/min_length": 97.6, "completions/min_terminated_length": 97.6, "entropy": 1.4698836095631123, "epoch": 0.88, "frac_reward_zero_std": 0.275, "grad_norm": 8.125, "kl": 0.4709475075826049, "learning_rate": 3.1875e-06, "loss": 0.09247303009033203, "num_tokens": 549335.0, "reward": 0.997833251953125, "reward_std": 0.6001621454954147, "rewards/JointRewardFunction/mean": 0.997833251953125, "rewards/JointRewardFunction/std": 0.6001621544361114, "step": 110, "step_time": 30.22435642040182 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.05, "completions/max_length": 865.7, "completions/max_terminated_length": 724.8, "completions/mean_length": 377.4125, "completions/mean_terminated_length": 342.6226211547852, "completions/min_length": 102.6, "completions/min_terminated_length": 102.6, "entropy": 1.4377100937068463, "epoch": 0.96, "frac_reward_zero_std": 0.225, "grad_norm": 3.15625, "kl": 0.4150696059688926, "learning_rate": 2.5625e-06, "loss": -0.021556401252746583, "num_tokens": 599900.0, "reward": 1.0236083984375, "reward_std": 0.6392961710691452, "rewards/JointRewardFunction/mean": 1.0236083984375, "rewards/JointRewardFunction/std": 0.6392962068319321, "step": 120, "step_time": 37.14680036250138 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.05, "completions/max_length": 698.9, "completions/max_terminated_length": 598.9, "completions/mean_length": 299.0125, "completions/mean_terminated_length": 260.36726531982424, "completions/min_length": 72.3, "completions/min_terminated_length": 72.3, "entropy": 1.3930659301578998, "epoch": 1.04, "frac_reward_zero_std": 0.2, "grad_norm": 5.71875, "kl": 0.48926177779212593, "learning_rate": 1.9375e-06, "loss": 0.016721364855766297, "num_tokens": 644439.0, "reward": 0.972833251953125, "reward_std": 0.6092687845230103, "rewards/JointRewardFunction/mean": 0.972833251953125, "rewards/JointRewardFunction/std": 0.6092687904834747, "step": 130, "step_time": 30.47871646400163 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 623.9, "completions/max_terminated_length": 623.9, "completions/mean_length": 286.925, "completions/mean_terminated_length": 286.925, "completions/min_length": 71.1, "completions/min_terminated_length": 71.1, "entropy": 1.4094880308955908, "epoch": 1.12, "frac_reward_zero_std": 0.35, "grad_norm": 10.8125, "kl": 0.4836927611380816, "learning_rate": 1.3125000000000001e-06, "loss": 0.08277995586395263, "num_tokens": 688041.0, "reward": 1.046673583984375, "reward_std": 0.5450149297714233, "rewards/JointRewardFunction/mean": 1.046673583984375, "rewards/JointRewardFunction/std": 0.5450149476528168, "step": 140, "step_time": 27.556023125701177 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.05, "completions/max_length": 781.7, "completions/max_terminated_length": 652.2, "completions/mean_length": 349.275, "completions/mean_terminated_length": 317.290478515625, "completions/min_length": 88.2, "completions/min_terminated_length": 88.2, "entropy": 1.334358724579215, "epoch": 1.2, "frac_reward_zero_std": 0.15, "grad_norm": 5.9375, "kl": 0.44102346180006863, "learning_rate": 6.875000000000001e-07, "loss": 0.12827397584915162, "num_tokens": 736559.0, "reward": 0.999505615234375, "reward_std": 0.6881234139204025, "rewards/JointRewardFunction/mean": 0.999505615234375, "rewards/JointRewardFunction/std": 0.6881234139204025, "step": 150, "step_time": 33.79141057000161 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.05, "completions/max_length": 678.4, "completions/max_terminated_length": 558.1, "completions/mean_length": 299.375, "completions/mean_terminated_length": 261.2517883300781, "completions/min_length": 71.7, "completions/min_terminated_length": 71.7, "entropy": 1.3799165710806847, "epoch": 1.28, "frac_reward_zero_std": 0.175, "grad_norm": 7.21875, "kl": 0.4770205244421959, "learning_rate": 6.250000000000001e-08, "loss": 0.08484221696853637, "num_tokens": 781411.0, "reward": 0.87017822265625, "reward_std": 0.6314140051603317, "rewards/JointRewardFunction/mean": 0.87017822265625, "rewards/JointRewardFunction/std": 0.6314140141010285, "step": 160, "step_time": 30.14368499219854 } ], "logging_steps": 10, "max_steps": 160, "num_input_tokens_seen": 781411, "num_train_epochs": 2, "save_steps": 10, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }