{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.05518763796909492, "eval_steps": 500, "global_step": 100, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "clip_ratio/high_max": 0.0009054738155100494, "clip_ratio/high_mean": 0.00035781565529759975, "clip_ratio/low_mean": 0.00016371723540942185, "clip_ratio/low_min": 1.144509224104695e-05, "clip_ratio/region_mean": 0.0005215328827034682, "completions/clipped_ratio": 0.4879464626312256, "completions/max_length": 1024.0, "completions/max_terminated_length": 1021.8, "completions/mean_length": 802.8379760742188, "completions/mean_terminated_length": 591.0183349609375, "completions/min_length": 16.6, "completions/min_terminated_length": 16.6, "entropy": 0.6993172228336334, "epoch": 0.005518763796909493, "frac_reward_zero_std": 0.648214316368103, "grad_norm": 0.123046875, "kl": 0.0009140914742602035, "learning_rate": 9.994100796397954e-06, "loss": 0.0096, "num_tokens": 2134981.0, "reward": 0.17656995058059693, "reward_std": 0.3300946533679962, "rewards/reward_fn/mean": 0.17656994760036468, "rewards/reward_fn/std": 0.3300946354866028, "step": 10, "step_time": 62.725463995942846 }, { "clip_ratio/high_max": 0.0009650078689446672, "clip_ratio/high_mean": 0.00040530616533942523, "clip_ratio/low_mean": 0.00019620073144324123, "clip_ratio/low_min": 2.9645057293237186e-05, "clip_ratio/region_mean": 0.0006015069084241986, "completions/clipped_ratio": 0.5308036029338836, "completions/max_length": 1024.0, "completions/max_terminated_length": 1019.4, "completions/mean_length": 814.8719116210938, "completions/mean_terminated_length": 578.806640625, "completions/min_length": 10.2, "completions/min_terminated_length": 10.2, "entropy": 0.7009260475635528, "epoch": 0.011037527593818985, "frac_reward_zero_std": 0.6464286088943482, "grad_norm": 0.115234375, "kl": 0.0012107100861612707, "learning_rate": 9.889612861977855e-06, "loss": 0.009, "num_tokens": 4322090.0, "reward": 0.16465030908584594, "reward_std": 0.31232678294181826, "rewards/reward_fn/mean": 0.16465030312538148, "rewards/reward_fn/std": 0.31232677698135375, "step": 20, "step_time": 59.907306081056596 }, { "clip_ratio/high_max": 0.0007639058487256989, "clip_ratio/high_mean": 0.00030917047697585074, "clip_ratio/low_mean": 0.00016517131589353085, "clip_ratio/low_min": 3.0255078672780656e-05, "clip_ratio/region_mean": 0.0004743418045109138, "completions/clipped_ratio": 0.49821431636810304, "completions/max_length": 1024.0, "completions/max_terminated_length": 1017.6, "completions/mean_length": 810.8062744140625, "completions/mean_terminated_length": 601.0194946289063, "completions/min_length": 18.2, "completions/min_terminated_length": 18.2, "entropy": 0.707157039642334, "epoch": 0.016556291390728478, "frac_reward_zero_std": 0.6964286088943481, "grad_norm": 0.08984375, "kl": 0.0015323497413191945, "learning_rate": 9.657180469054213e-06, "loss": 0.0056, "num_tokens": 6483648.0, "reward": 0.15746280252933503, "reward_std": 0.31524514555931094, "rewards/reward_fn/mean": 0.15746279954910278, "rewards/reward_fn/std": 0.315245121717453, "step": 30, "step_time": 69.35127711733803 }, { "clip_ratio/high_max": 0.0010893745173234492, "clip_ratio/high_mean": 0.0004552829515887424, "clip_ratio/low_mean": 0.00020492849944275804, "clip_ratio/low_min": 3.608180813898798e-05, "clip_ratio/region_mean": 0.0006602114357519895, "completions/clipped_ratio": 0.4924107313156128, "completions/max_length": 1024.0, "completions/max_terminated_length": 1022.4, "completions/mean_length": 801.3585205078125, "completions/mean_terminated_length": 584.6675537109375, "completions/min_length": 28.6, "completions/min_terminated_length": 28.6, "entropy": 0.7392709702253342, "epoch": 0.02207505518763797, "frac_reward_zero_std": 0.650000023841858, "grad_norm": 0.10205078125, "kl": 0.001778936560731381, "learning_rate": 9.302885579019626e-06, "loss": 0.0102, "num_tokens": 8664175.0, "reward": 0.1526116132736206, "reward_std": 0.3074562013149261, "rewards/reward_fn/mean": 0.15261160731315612, "rewards/reward_fn/std": 0.3074561834335327, "step": 40, "step_time": 66.20743731297553 }, { "clip_ratio/high_max": 0.0009445919014979154, "clip_ratio/high_mean": 0.00039524411113234237, "clip_ratio/low_mean": 0.00019868952222168446, "clip_ratio/low_min": 6.7668152041733265e-06, "clip_ratio/region_mean": 0.0005939336377196013, "completions/clipped_ratio": 0.4633928775787354, "completions/max_length": 1024.0, "completions/max_terminated_length": 1021.6, "completions/mean_length": 785.5151977539062, "completions/mean_terminated_length": 579.9617919921875, "completions/min_length": 17.2, "completions/min_terminated_length": 17.2, "entropy": 0.7299870848655701, "epoch": 0.02759381898454746, "frac_reward_zero_std": 0.6625000357627868, "grad_norm": 0.13671875, "kl": 0.0018997497216332703, "learning_rate": 8.83599887835493e-06, "loss": 0.0106, "num_tokens": 10787441.0, "reward": 0.17087055444717408, "reward_std": 0.3202264249324799, "rewards/reward_fn/mean": 0.17087054550647734, "rewards/reward_fn/std": 0.3202264130115509, "step": 50, "step_time": 61.80385860861279 }, { "clip_ratio/high_max": 0.0008307943062391132, "clip_ratio/high_mean": 0.00035787131600955034, "clip_ratio/low_mean": 0.00023181447832030245, "clip_ratio/low_min": 2.61040790064726e-05, "clip_ratio/region_mean": 0.0005896858012420126, "completions/clipped_ratio": 0.4741071581840515, "completions/max_length": 1024.0, "completions/max_terminated_length": 1011.2, "completions/mean_length": 792.1500244140625, "completions/mean_terminated_length": 583.1341064453125, "completions/min_length": 16.8, "completions/min_terminated_length": 16.8, "entropy": 0.8175096482038497, "epoch": 0.033112582781456956, "frac_reward_zero_std": 0.6428571581840515, "grad_norm": 0.1142578125, "kl": 0.0020997423096559944, "learning_rate": 8.268737196446264e-06, "loss": 0.01, "num_tokens": 12937373.0, "reward": 0.170543172955513, "reward_std": 0.30973091125488283, "rewards/reward_fn/mean": 0.17054316401481628, "rewards/reward_fn/std": 0.3097309172153473, "step": 60, "step_time": 81.64577775509096 }, { "clip_ratio/high_max": 0.0010142354993149638, "clip_ratio/high_mean": 0.0004649270667869132, "clip_ratio/low_mean": 0.0002056810655631125, "clip_ratio/low_min": 2.441328761051409e-05, "clip_ratio/region_mean": 0.0006706080996082165, "completions/clipped_ratio": 0.4513392984867096, "completions/max_length": 1024.0, "completions/max_terminated_length": 1021.2, "completions/mean_length": 780.0888671875, "completions/mean_terminated_length": 579.7443481445313, "completions/min_length": 17.6, "completions/min_terminated_length": 17.6, "entropy": 0.8121535569429398, "epoch": 0.03863134657836645, "frac_reward_zero_std": 0.6232143044471741, "grad_norm": 0.1298828125, "kl": 0.002086066803894937, "learning_rate": 7.61594383291065e-06, "loss": 0.0068, "num_tokens": 15043532.0, "reward": 0.1762128084897995, "reward_std": 0.32508295178413393, "rewards/reward_fn/mean": 0.17621279656887054, "rewards/reward_fn/std": 0.3250829458236694, "step": 70, "step_time": 67.39459452703595 }, { "clip_ratio/high_max": 0.0011546267720405012, "clip_ratio/high_mean": 0.0004448194318683818, "clip_ratio/low_mean": 0.00022613786859437823, "clip_ratio/low_min": 3.512381881591864e-05, "clip_ratio/region_mean": 0.0006709573091939092, "completions/clipped_ratio": 0.45714287757873534, "completions/max_length": 1024.0, "completions/max_terminated_length": 1017.6, "completions/mean_length": 780.5415405273437, "completions/mean_terminated_length": 574.924560546875, "completions/min_length": 13.4, "completions/min_terminated_length": 13.4, "entropy": 0.8120243638753891, "epoch": 0.04415011037527594, "frac_reward_zero_std": 0.6267857432365418, "grad_norm": 0.125, "kl": 0.0022045036195777355, "learning_rate": 6.894700159171535e-06, "loss": 0.0074, "num_tokens": 17150985.0, "reward": 0.18558780550956727, "reward_std": 0.3320887923240662, "rewards/reward_fn/mean": 0.1855877935886383, "rewards/reward_fn/std": 0.3320887744426727, "step": 80, "step_time": 69.55923435157165 }, { "clip_ratio/high_max": 0.0010666975751519204, "clip_ratio/high_mean": 0.0004059886821778491, "clip_ratio/low_mean": 0.00018196528690168635, "clip_ratio/low_min": 1.4905737771186978e-05, "clip_ratio/region_mean": 0.0005879539618035779, "completions/clipped_ratio": 0.4683035910129547, "completions/max_length": 1024.0, "completions/max_terminated_length": 1018.8, "completions/mean_length": 786.5406616210937, "completions/mean_terminated_length": 577.4362426757813, "completions/min_length": 18.8, "completions/min_terminated_length": 18.8, "entropy": 0.8206083297729492, "epoch": 0.04966887417218543, "frac_reward_zero_std": 0.6660714626312256, "grad_norm": 0.107421875, "kl": 0.0022262637387029825, "learning_rate": 6.123878657343648e-06, "loss": 0.0095, "num_tokens": 19265236.0, "reward": 0.16312500536441804, "reward_std": 0.3078223645687103, "rewards/reward_fn/mean": 0.16312500536441804, "rewards/reward_fn/std": 0.30782235860824586, "step": 90, "step_time": 63.29917922359891 }, { "clip_ratio/high_max": 0.001015995239140466, "clip_ratio/high_mean": 0.0005060771596617997, "clip_ratio/low_mean": 0.000221432710532099, "clip_ratio/low_min": 4.502671981754247e-05, "clip_ratio/region_mean": 0.0007275098643731326, "completions/clipped_ratio": 0.45044645071029665, "completions/max_length": 1024.0, "completions/max_terminated_length": 1021.4, "completions/mean_length": 771.9370849609375, "completions/mean_terminated_length": 565.00205078125, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "entropy": 0.8362967848777771, "epoch": 0.05518763796909492, "frac_reward_zero_std": 0.6035714626312256, "grad_norm": 0.09912109375, "kl": 0.0022062973876018077, "learning_rate": 5.323649091872179e-06, "loss": 0.0076, "num_tokens": 21324495.0, "reward": 0.19067709147930145, "reward_std": 0.32638007402420044, "rewards/reward_fn/mean": 0.19067708551883697, "rewards/reward_fn/std": 0.32638007402420044, "step": 100, "step_time": 47.039065402187404 } ], "logging_steps": 10, "max_steps": 200, "num_input_tokens_seen": 21324495, "num_train_epochs": 1, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 16, "trial_name": null, "trial_params": null }