{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.28, "eval_steps": 500, "global_step": 160, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 959.3, "completions/max_terminated_length": 883.0, "completions/mean_length": 628.5875, "completions/mean_terminated_length": 604.4089508056641, "completions/min_length": 324.2, "completions/min_terminated_length": 324.2, "entropy": 0.8960087668150664, "epoch": 0.08, "frac_reward_zero_std": 0.225, "grad_norm": 1.734375, "kl": 0.04410275101472507, "learning_rate": 9.4375e-06, "loss": -0.004480601102113724, "num_tokens": 70931.0, "reward": 0.637615966796875, "reward_std": 0.4684956520795822, "rewards/JointRewardFunction/mean": 0.637615966796875, "rewards/JointRewardFunction/std": 0.4684956640005112, "step": 10, "step_time": 39.2735027824996 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1375, "completions/max_length": 943.7, "completions/max_terminated_length": 829.8, "completions/mean_length": 631.4625, "completions/mean_terminated_length": 579.1692932128906, "completions/min_length": 324.2, "completions/min_terminated_length": 324.2, "entropy": 1.0276442520320415, "epoch": 0.16, "frac_reward_zero_std": 0.25, "grad_norm": 1.703125, "kl": 0.08849158070515842, "learning_rate": 8.8125e-06, "loss": 0.01496470272541046, "num_tokens": 142092.0, "reward": 0.539324951171875, "reward_std": 0.49008582532405853, "rewards/JointRewardFunction/mean": 0.539324951171875, "rewards/JointRewardFunction/std": 0.4900858402252197, "step": 20, "step_time": 40.54311619299951 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0125, "completions/max_length": 863.4, "completions/max_terminated_length": 835.9, "completions/mean_length": 554.5875, "completions/mean_terminated_length": 548.3660766601563, "completions/min_length": 307.3, "completions/min_terminated_length": 307.3, "entropy": 1.1838637091219426, "epoch": 0.24, "frac_reward_zero_std": 0.1, "grad_norm": 3.046875, "kl": 0.11827877229079604, "learning_rate": 8.1875e-06, "loss": 0.009982097893953323, "num_tokens": 206903.0, "reward": 0.6125, "reward_std": 0.4704344987869263, "rewards/JointRewardFunction/mean": 0.6125, "rewards/JointRewardFunction/std": 0.47043450474739074, "step": 30, "step_time": 37.010521245800916 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.025, "completions/max_length": 742.1, "completions/max_terminated_length": 659.0, "completions/mean_length": 425.7, "completions/mean_terminated_length": 410.8892883300781, "completions/min_length": 219.2, "completions/min_terminated_length": 219.2, "entropy": 1.2652597405016421, "epoch": 0.32, "frac_reward_zero_std": 0.15, "grad_norm": 6.125, "kl": 0.3052154924720526, "learning_rate": 7.5625e-06, "loss": 0.027592796087265014, "num_tokens": 261533.0, "reward": 0.61771240234375, "reward_std": 0.46430147290229795, "rewards/JointRewardFunction/mean": 0.61771240234375, "rewards/JointRewardFunction/std": 0.46430149376392366, "step": 40, "step_time": 32.063390286398864 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0875, "completions/max_length": 919.3, "completions/max_terminated_length": 713.2, "completions/mean_length": 511.4125, "completions/mean_terminated_length": 464.44822998046874, "completions/min_length": 244.6, "completions/min_terminated_length": 244.6, "entropy": 1.3154176332056522, "epoch": 0.4, "frac_reward_zero_std": 0.1, "grad_norm": 3.6875, "kl": 0.24474074998870493, "learning_rate": 6.9375e-06, "loss": 0.001818625070154667, "num_tokens": 323276.0, "reward": 0.537213134765625, "reward_std": 0.5033262223005295, "rewards/JointRewardFunction/mean": 0.537213134765625, "rewards/JointRewardFunction/std": 0.5033262312412262, "step": 50, "step_time": 39.09056931000159 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 824.5, "completions/max_terminated_length": 711.6, "completions/mean_length": 520.4625, "completions/mean_terminated_length": 484.48512268066406, "completions/min_length": 261.1, "completions/min_terminated_length": 261.1, "entropy": 1.2718341693282127, "epoch": 0.48, "frac_reward_zero_std": 0.225, "grad_norm": 4.6875, "kl": 0.2580411507748067, "learning_rate": 6.3125e-06, "loss": 0.014349016547203063, "num_tokens": 385253.0, "reward": 0.54111328125, "reward_std": 0.4758811920881271, "rewards/JointRewardFunction/mean": 0.54111328125, "rewards/JointRewardFunction/std": 0.47588120102882386, "step": 60, "step_time": 35.274126689498736 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 943.6, "completions/max_terminated_length": 808.7, "completions/mean_length": 537.75, "completions/mean_terminated_length": 502.15358276367186, "completions/min_length": 267.1, "completions/min_terminated_length": 267.1, "entropy": 1.244665590673685, "epoch": 0.56, "frac_reward_zero_std": 0.175, "grad_norm": 4.0625, "kl": 0.24244523425586523, "learning_rate": 5.6875e-06, "loss": 0.0023317448794841766, "num_tokens": 448729.0, "reward": 0.69791259765625, "reward_std": 0.4654367908835411, "rewards/JointRewardFunction/mean": 0.69791259765625, "rewards/JointRewardFunction/std": 0.4654367953538895, "step": 70, "step_time": 40.008808337500525 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.05, "completions/max_length": 810.1, "completions/max_terminated_length": 720.2, "completions/mean_length": 534.1625, "completions/mean_terminated_length": 508.99286193847655, "completions/min_length": 286.4, "completions/min_terminated_length": 286.4, "entropy": 1.2685907267034053, "epoch": 0.64, "frac_reward_zero_std": 0.225, "grad_norm": 4.125, "kl": 0.2162147051654756, "learning_rate": 5.0625e-06, "loss": 0.014069165289402007, "num_tokens": 512076.0, "reward": 0.728045654296875, "reward_std": 0.5096077308058738, "rewards/JointRewardFunction/mean": 0.728045654296875, "rewards/JointRewardFunction/std": 0.5096077516674995, "step": 80, "step_time": 34.66793936170143 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.025, "completions/max_length": 793.3, "completions/max_terminated_length": 733.4, "completions/mean_length": 492.2, "completions/mean_terminated_length": 478.558935546875, "completions/min_length": 245.4, "completions/min_terminated_length": 245.4, "entropy": 1.24496211335063, "epoch": 0.72, "frac_reward_zero_std": 0.175, "grad_norm": 4.03125, "kl": 0.24482853645458819, "learning_rate": 4.4375e-06, "loss": 0.032863426208496097, "num_tokens": 572072.0, "reward": 0.64200439453125, "reward_std": 0.5001831084489823, "rewards/JointRewardFunction/mean": 0.64200439453125, "rewards/JointRewardFunction/std": 0.5001831203699112, "step": 90, "step_time": 34.359502547597 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 768.8, "completions/max_terminated_length": 768.8, "completions/mean_length": 475.65, "completions/mean_terminated_length": 475.65, "completions/min_length": 258.8, "completions/min_terminated_length": 258.8, "entropy": 1.330046895891428, "epoch": 0.8, "frac_reward_zero_std": 0.175, "grad_norm": 3.375, "kl": 0.24854949009604752, "learning_rate": 3.8125e-06, "loss": 0.0744681715965271, "num_tokens": 630790.0, "reward": 0.725848388671875, "reward_std": 0.4486017137765884, "rewards/JointRewardFunction/mean": 0.725848388671875, "rewards/JointRewardFunction/std": 0.4486017107963562, "step": 100, "step_time": 33.44759687739897 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0125, "completions/max_length": 695.2, "completions/max_terminated_length": 669.5, "completions/mean_length": 443.0125, "completions/mean_terminated_length": 436.7017883300781, "completions/min_length": 213.1, "completions/min_terminated_length": 213.1, "entropy": 1.3013170935213565, "epoch": 0.88, "frac_reward_zero_std": 0.275, "grad_norm": 3.984375, "kl": 0.3150750307366252, "learning_rate": 3.1875e-06, "loss": 0.020898757874965666, "num_tokens": 686775.0, "reward": 0.71988525390625, "reward_std": 0.515497374534607, "rewards/JointRewardFunction/mean": 0.71988525390625, "rewards/JointRewardFunction/std": 0.515497374534607, "step": 110, "step_time": 30.408145976099696 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.025, "completions/max_length": 757.0, "completions/max_terminated_length": 667.6, "completions/mean_length": 428.675, "completions/mean_terminated_length": 412.0500061035156, "completions/min_length": 201.6, "completions/min_terminated_length": 201.6, "entropy": 1.2176271453499794, "epoch": 0.96, "frac_reward_zero_std": 0.175, "grad_norm": 6.53125, "kl": 0.29011352979578076, "learning_rate": 2.5625e-06, "loss": 0.023420125246047974, "num_tokens": 741441.0, "reward": 0.738653564453125, "reward_std": 0.45226994156837463, "rewards/JointRewardFunction/mean": 0.738653564453125, "rewards/JointRewardFunction/std": 0.45226994901895523, "step": 120, "step_time": 32.81503715240033 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 645.8, "completions/max_terminated_length": 645.8, "completions/mean_length": 429.1875, "completions/mean_terminated_length": 429.1875, "completions/min_length": 218.8, "completions/min_terminated_length": 218.8, "entropy": 1.313951000571251, "epoch": 1.04, "frac_reward_zero_std": 0.3, "grad_norm": 3.78125, "kl": 0.3095056655351073, "learning_rate": 1.9375e-06, "loss": 0.031660494208335874, "num_tokens": 796394.0, "reward": 0.7450439453125, "reward_std": 0.46799357831478117, "rewards/JointRewardFunction/mean": 0.7450439453125, "rewards/JointRewardFunction/std": 0.46799357831478117, "step": 130, "step_time": 28.36433180910135 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 661.5, "completions/max_terminated_length": 661.5, "completions/mean_length": 416.2625, "completions/mean_terminated_length": 416.2625, "completions/min_length": 235.6, "completions/min_terminated_length": 235.6, "entropy": 1.3542070075869561, "epoch": 1.12, "frac_reward_zero_std": 0.2, "grad_norm": 2.328125, "kl": 0.3334041152149439, "learning_rate": 1.3125000000000001e-06, "loss": 0.022475141286849975, "num_tokens": 850343.0, "reward": 0.67725830078125, "reward_std": 0.3997616931796074, "rewards/JointRewardFunction/mean": 0.67725830078125, "rewards/JointRewardFunction/std": 0.399761700630188, "step": 140, "step_time": 28.77159399340053 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 703.9, "completions/max_terminated_length": 703.9, "completions/mean_length": 422.925, "completions/mean_terminated_length": 422.925, "completions/min_length": 209.7, "completions/min_terminated_length": 209.7, "entropy": 1.2913881182670592, "epoch": 1.2, "frac_reward_zero_std": 0.2, "grad_norm": 3.734375, "kl": 0.31345505844801663, "learning_rate": 6.875000000000001e-07, "loss": 0.029654264450073242, "num_tokens": 904753.0, "reward": 0.79158935546875, "reward_std": 0.4540836468338966, "rewards/JointRewardFunction/mean": 0.79158935546875, "rewards/JointRewardFunction/std": 0.45408365726470945, "step": 150, "step_time": 30.629617839398996 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 645.8, "completions/max_terminated_length": 645.8, "completions/mean_length": 411.0125, "completions/mean_terminated_length": 411.0125, "completions/min_length": 207.4, "completions/min_terminated_length": 207.4, "entropy": 1.2935365058481694, "epoch": 1.28, "frac_reward_zero_std": 0.175, "grad_norm": 3.171875, "kl": 0.3413598489947617, "learning_rate": 6.250000000000001e-08, "loss": -0.008248078823089599, "num_tokens": 958536.0, "reward": 0.695233154296875, "reward_std": 0.40624782145023347, "rewards/JointRewardFunction/mean": 0.695233154296875, "rewards/JointRewardFunction/std": 0.40624783337116244, "step": 160, "step_time": 28.084300646101475 } ], "logging_steps": 10, "max_steps": 160, "num_input_tokens_seen": 958536, "num_train_epochs": 2, "save_steps": 10, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }