{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.40053404539385845, "eval_steps": 10, "global_step": 600, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.0058695650861962986, "eval_completions/max_length": 9628.91304347826, "eval_completions/max_terminated_length": 8156.0, "eval_completions/mean_length": 368.0647735595703, "eval_completions/mean_terminated_length": 311.3040877632473, "eval_completions/min_length": 7.565217391304348, "eval_completions/min_terminated_length": 7.565217391304348, "eval_entropy": 0.1015095182734987, "eval_frac_reward_zero_std": 1.0, "eval_loss": 0.0, "eval_num_tokens": 0.0, "eval_reward": 0.48876625299453735, "eval_reward_std": 0.4879420218260392, "eval_rewards/TRLRewardAdapter/mean": 0.48876626854357513, "eval_rewards/TRLRewardAdapter/std": 0.4879420360793238, "eval_runtime": 1288.8509, "eval_samples_per_second": 3.543, "eval_sampling/importance_sampling_ratio/max": 1.8158701243607893, "eval_sampling/importance_sampling_ratio/mean": 0.8478950832201086, "eval_sampling/importance_sampling_ratio/min": 7.554826503316405e-45, "eval_sampling/sampling_logp_difference/max": 4.497523665428162, "eval_sampling/sampling_logp_difference/mean": 0.013395039526664692, "eval_steps_per_second": 0.018, "step": 0 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03229166939854622, "completions/max_length": 10000.0, "completions/max_terminated_length": 9722.0, "completions/mean_length": 1250.2750244140625, "completions/mean_terminated_length": 958.3035278320312, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "entropy": 0.13001826157172522, "epoch": 0.0006675567423230974, "frac_reward_zero_std": 0.36666667461395264, "grad_norm": 0.003963929983317199, "learning_rate": 5e-06, "loss": 0.0, "num_tokens": 1615016.0, "reward": 0.46622881293296814, "reward_std": 0.46327731013298035, "rewards/TRLRewardAdapter/mean": 0.46622881293296814, "rewards/TRLRewardAdapter/std": 0.46327728033065796, "sampling/importance_sampling_ratio/max": 2.0905351638793945, "sampling/importance_sampling_ratio/mean": 0.5698453783988953, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 7.875, "sampling/sampling_logp_difference/mean": 0.017614439129829407, "step": 1, "step_time": 349.8505507160444 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013541667722165585, "completions/max_length": 10000.0, "completions/max_terminated_length": 9620.0, "completions/mean_length": 1983.345947265625, "completions/mean_terminated_length": 1873.2967529296875, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "entropy": 0.15800216545661291, "epoch": 0.0013351134846461949, "frac_reward_zero_std": 0.06666667014360428, "grad_norm": 6.939362319777995e-05, "learning_rate": 5e-06, "loss": 0.0001, "num_tokens": 4002484.0, "reward": 0.5981148481369019, "reward_std": 0.420860230922699, "rewards/TRLRewardAdapter/mean": 0.5981148481369019, "rewards/TRLRewardAdapter/std": 0.4208602011203766, "sampling/importance_sampling_ratio/max": 1.067559003829956, "sampling/importance_sampling_ratio/mean": 0.0739017054438591, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 10.5, "sampling/sampling_logp_difference/mean": 0.018924588337540627, "step": 2, "step_time": 195.9517813299317 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01458333432674408, "completions/max_length": 10000.0, "completions/max_terminated_length": 9542.0, "completions/mean_length": 1856.4542236328125, "completions/mean_terminated_length": 1735.9365234375, "completions/min_length": 180.0, "completions/min_terminated_length": 180.0, "entropy": 0.16341984023650488, "epoch": 0.0020026702269692926, "frac_reward_zero_std": 0.06666667014360428, "grad_norm": 0.001521737309633894, "learning_rate": 5e-06, "loss": 0.0002, "num_tokens": 6246152.0, "reward": 0.6672832369804382, "reward_std": 0.3837212026119232, "rewards/TRLRewardAdapter/mean": 0.6672831773757935, "rewards/TRLRewardAdapter/std": 0.3837212026119232, "sampling/importance_sampling_ratio/max": 0.6229729056358337, "sampling/importance_sampling_ratio/mean": 0.026337940245866776, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 18.75, "sampling/sampling_logp_difference/mean": 0.020159056410193443, "step": 3, "step_time": 310.4930986077525 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03437500074505806, "completions/max_length": 10000.0, "completions/max_terminated_length": 9962.0, "completions/mean_length": 1677.27197265625, "completions/mean_terminated_length": 1380.9935302734375, "completions/min_length": 84.0, "completions/min_terminated_length": 84.0, "entropy": 0.1610188608368238, "epoch": 0.0026702269692923898, "frac_reward_zero_std": 0.10000000894069672, "grad_norm": 0.00026909690612077414, "learning_rate": 5e-06, "loss": -0.0, "num_tokens": 8283405.0, "reward": 0.7308850884437561, "reward_std": 0.3842007517814636, "rewards/TRLRewardAdapter/mean": 0.7308850884437561, "rewards/TRLRewardAdapter/std": 0.38420072197914124, "sampling/importance_sampling_ratio/max": 1.3412060737609863, "sampling/importance_sampling_ratio/mean": 0.05025748163461685, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 37.75, "sampling/sampling_logp_difference/mean": 0.02056610956788063, "step": 4, "step_time": 332.3665581381647 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02812500111758709, "completions/max_length": 10000.0, "completions/max_terminated_length": 9797.0, "completions/mean_length": 2173.40625, "completions/mean_terminated_length": 1946.9130859375, "completions/min_length": 179.0, "completions/min_terminated_length": 179.0, "entropy": 0.1481627250711123, "epoch": 0.0033377837116154874, "frac_reward_zero_std": 0.03333333507180214, "grad_norm": 0.0001760998357281595, "learning_rate": 5e-06, "loss": -0.0, "num_tokens": 10783699.0, "reward": 0.6633889079093933, "reward_std": 0.3884478807449341, "rewards/TRLRewardAdapter/mean": 0.6633889079093933, "rewards/TRLRewardAdapter/std": 0.3884478807449341, "sampling/importance_sampling_ratio/max": 1.0769588947296143, "sampling/importance_sampling_ratio/mean": 0.03196404129266739, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 7.205452919006348, "sampling/sampling_logp_difference/mean": 0.018785811960697174, "step": 5, "step_time": 373.0113164811628 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013541667722165585, "completions/max_length": 10000.0, "completions/max_terminated_length": 9959.0, "completions/mean_length": 2476.58349609375, "completions/mean_terminated_length": 2373.30517578125, "completions/min_length": 337.0, "completions/min_terminated_length": 337.0, "entropy": 0.16313111037015915, "epoch": 0.004005340453938585, "frac_reward_zero_std": 0.1666666716337204, "grad_norm": 8.253879711639936e-06, "learning_rate": 5e-06, "loss": -0.0, "num_tokens": 13702563.0, "reward": 0.6100946664810181, "reward_std": 0.3951525390148163, "rewards/TRLRewardAdapter/mean": 0.6100946664810181, "rewards/TRLRewardAdapter/std": 0.39515256881713867, "sampling/importance_sampling_ratio/max": 0.3195042610168457, "sampling/importance_sampling_ratio/mean": 0.006963352672755718, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 10.37500286102295, "sampling/sampling_logp_difference/mean": 0.01997915282845497, "step": 6, "step_time": 340.5359437910374 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0364583358168602, "completions/max_length": 10000.0, "completions/max_terminated_length": 9880.0, "completions/mean_length": 2698.790771484375, "completions/mean_terminated_length": 2422.52880859375, "completions/min_length": 305.0, "completions/min_terminated_length": 305.0, "entropy": 0.18421495457490286, "epoch": 0.004672897196261682, "frac_reward_zero_std": 0.03333333507180214, "grad_norm": 3.4062077062050323e-06, "learning_rate": 5e-06, "loss": -0.0, "num_tokens": 16796122.0, "reward": 0.611805260181427, "reward_std": 0.390032023191452, "rewards/TRLRewardAdapter/mean": 0.6118052005767822, "rewards/TRLRewardAdapter/std": 0.3900320529937744, "sampling/importance_sampling_ratio/max": 0.7145169973373413, "sampling/importance_sampling_ratio/mean": 0.01436836551874876, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 24.0, "sampling/sampling_logp_difference/mean": 0.022083934396505356, "step": 7, "step_time": 362.56761700916104 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008333333767950535, "completions/max_length": 10000.0, "completions/max_terminated_length": 9844.0, "completions/mean_length": 1520.1563720703125, "completions/mean_terminated_length": 1448.8970947265625, "completions/min_length": 134.0, "completions/min_terminated_length": 134.0, "entropy": 0.12639248246947923, "epoch": 0.0053404539385847796, "frac_reward_zero_std": 0.0, "grad_norm": 8.54893146633734e-06, "learning_rate": 5e-06, "loss": 0.0, "num_tokens": 18704016.0, "reward": 0.7656512260437012, "reward_std": 0.31629979610443115, "rewards/TRLRewardAdapter/mean": 0.7656511664390564, "rewards/TRLRewardAdapter/std": 0.31629976630210876, "sampling/importance_sampling_ratio/max": 1.4734091758728027, "sampling/importance_sampling_ratio/mean": 0.06605617702007294, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 6.384025573730469, "sampling/sampling_logp_difference/mean": 0.016400350257754326, "step": 8, "step_time": 294.09013770811725 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.04062500223517418, "completions/max_length": 10000.0, "completions/max_terminated_length": 9995.0, "completions/mean_length": 2445.67822265625, "completions/mean_terminated_length": 2125.788330078125, "completions/min_length": 203.0, "completions/min_terminated_length": 203.0, "entropy": 0.16306241105000177, "epoch": 0.006008010680907877, "frac_reward_zero_std": 0.06666667014360428, "grad_norm": 6.867191573872283e-05, "learning_rate": 5e-06, "loss": -0.0, "num_tokens": 21534779.0, "reward": 0.6143709421157837, "reward_std": 0.41769421100616455, "rewards/TRLRewardAdapter/mean": 0.6143709421157837, "rewards/TRLRewardAdapter/std": 0.41769421100616455, "sampling/importance_sampling_ratio/max": 1.1997272968292236, "sampling/importance_sampling_ratio/mean": 0.03804197162389755, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 46.601531982421875, "sampling/sampling_logp_difference/mean": 0.020102281123399734, "step": 9, "step_time": 403.3070120601915 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.07083334028720856, "completions/max_length": 10000.0, "completions/max_terminated_length": 9965.0, "completions/mean_length": 3102.454345703125, "completions/mean_terminated_length": 2576.63232421875, "completions/min_length": 376.0, "completions/min_terminated_length": 376.0, "entropy": 0.18473305304845175, "epoch": 0.006675567423230975, "frac_reward_zero_std": 0.13333334028720856, "grad_norm": 3.4163132565526005e-05, "learning_rate": 5e-06, "loss": -0.0, "num_tokens": 24949455.0, "reward": 0.550864577293396, "reward_std": 0.4099152982234955, "rewards/TRLRewardAdapter/mean": 0.5508645176887512, "rewards/TRLRewardAdapter/std": 0.4099152982234955, "sampling/importance_sampling_ratio/max": 1.446485161781311, "sampling/importance_sampling_ratio/mean": 0.011628195643424988, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 24.101531982421875, "sampling/sampling_logp_difference/mean": 0.02207454852759838, "step": 10, "step_time": 377.2115316420095 }, { "epoch": 0.006675567423230975, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.028043477350602978, "eval_completions/max_length": 10000.0, "eval_completions/max_terminated_length": 9730.608695652174, "eval_completions/mean_length": 2548.084281589674, "eval_completions/mean_terminated_length": 2332.8760508661685, "eval_completions/min_length": 167.47826086956522, "eval_completions/min_terminated_length": 167.47826086956522, "eval_entropy": 0.17597556826861008, "eval_frac_reward_zero_std": 1.0, "eval_loss": 0.0, "eval_num_tokens": 24949455.0, "eval_reward": 0.6852899919385496, "eval_reward_std": 0.35346189270848816, "eval_rewards/TRLRewardAdapter/mean": 0.6852900126705999, "eval_rewards/TRLRewardAdapter/std": 0.35346189659574756, "eval_runtime": 1482.6659, "eval_samples_per_second": 3.08, "eval_sampling/importance_sampling_ratio/max": 0.7418420923792798, "eval_sampling/importance_sampling_ratio/mean": 0.029462316194954125, "eval_sampling/importance_sampling_ratio/min": 0.0, "eval_sampling/sampling_logp_difference/max": 10.493550735971201, "eval_sampling/sampling_logp_difference/mean": 0.02119544035066729, "eval_steps_per_second": 0.016, "step": 10 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02500000223517418, "completions/max_length": 10000.0, "completions/max_terminated_length": 9938.0, "completions/mean_length": 2751.2021484375, "completions/mean_terminated_length": 2565.335693359375, "completions/min_length": 259.0, "completions/min_terminated_length": 259.0, "entropy": 0.16327736030022302, "epoch": 0.007343124165554072, "frac_reward_zero_std": 0.13333334028720856, "grad_norm": 7.071755903277013e-05, "learning_rate": 5e-06, "loss": 0.0, "num_tokens": 28079825.0, "reward": 0.6465809345245361, "reward_std": 0.3833412528038025, "rewards/TRLRewardAdapter/mean": 0.6465808749198914, "rewards/TRLRewardAdapter/std": 0.3833412230014801, "sampling/importance_sampling_ratio/max": 1.565128207206726, "sampling/importance_sampling_ratio/mean": 0.0151217607781291, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 11.675609588623047, "sampling/sampling_logp_difference/mean": 0.02042572572827339, "step": 11, "step_time": 285.10956294019707 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.04583333432674408, "completions/max_length": 10000.0, "completions/max_terminated_length": 9997.0, "completions/mean_length": 2812.61669921875, "completions/mean_terminated_length": 2467.371337890625, "completions/min_length": 261.0, "completions/min_terminated_length": 261.0, "entropy": 0.16007833182811737, "epoch": 0.00801068090787717, "frac_reward_zero_std": 0.03333333507180214, "grad_norm": 6.132698409544181e-06, "learning_rate": 5e-06, "loss": -0.0, "num_tokens": 31215617.0, "reward": 0.643985390663147, "reward_std": 0.37482520937919617, "rewards/TRLRewardAdapter/mean": 0.643985390663147, "rewards/TRLRewardAdapter/std": 0.37482523918151855, "sampling/importance_sampling_ratio/max": 0.7140831351280212, "sampling/importance_sampling_ratio/mean": 0.016925837844610214, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 32.0, "sampling/sampling_logp_difference/mean": 0.0196083877235651, "step": 12, "step_time": 370.41739310568664 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03333333507180214, "completions/max_length": 10000.0, "completions/max_terminated_length": 9992.0, "completions/mean_length": 2703.39697265625, "completions/mean_terminated_length": 2451.789794921875, "completions/min_length": 270.0, "completions/min_terminated_length": 270.0, "entropy": 0.150363989174366, "epoch": 0.008678237650200267, "frac_reward_zero_std": 0.03333333507180214, "grad_norm": 8.068492843481342e-06, "learning_rate": 5e-06, "loss": -0.0, "num_tokens": 34309246.0, "reward": 0.6158199906349182, "reward_std": 0.3933789134025574, "rewards/TRLRewardAdapter/mean": 0.6158199310302734, "rewards/TRLRewardAdapter/std": 0.3933789134025574, "sampling/importance_sampling_ratio/max": 1.2546262741088867, "sampling/importance_sampling_ratio/mean": 0.027454596012830734, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 46.0, "sampling/sampling_logp_difference/mean": 0.01873675175011158, "step": 13, "step_time": 343.28990873193834 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013541667722165585, "completions/max_length": 10000.0, "completions/max_terminated_length": 9624.0, "completions/mean_length": 1839.6292724609375, "completions/mean_terminated_length": 1727.6072998046875, "completions/min_length": 259.0, "completions/min_terminated_length": 259.0, "entropy": 0.16461336115996042, "epoch": 0.009345794392523364, "frac_reward_zero_std": 0.0, "grad_norm": 8.430481391381056e-05, "learning_rate": 5e-06, "loss": -0.0001, "num_tokens": 36499898.0, "reward": 0.7455891966819763, "reward_std": 0.34534528851509094, "rewards/TRLRewardAdapter/mean": 0.7455891370773315, "rewards/TRLRewardAdapter/std": 0.34534528851509094, "sampling/importance_sampling_ratio/max": 0.4804428517818451, "sampling/importance_sampling_ratio/mean": 0.018482353538274765, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 5.0, "sampling/sampling_logp_difference/mean": 0.021020328626036644, "step": 14, "step_time": 300.4185423569288 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02291666716337204, "completions/max_length": 10000.0, "completions/max_terminated_length": 9987.0, "completions/mean_length": 2474.071044921875, "completions/mean_terminated_length": 2297.556640625, "completions/min_length": 191.0, "completions/min_terminated_length": 191.0, "entropy": 0.16982072591781616, "epoch": 0.010013351134846462, "frac_reward_zero_std": 0.0, "grad_norm": 7.293729803626923e-05, "learning_rate": 5e-06, "loss": -0.0, "num_tokens": 39322238.0, "reward": 0.7068325281143188, "reward_std": 0.33907824754714966, "rewards/TRLRewardAdapter/mean": 0.7068325281143188, "rewards/TRLRewardAdapter/std": 0.33907824754714966, "sampling/importance_sampling_ratio/max": 1.1466498374938965, "sampling/importance_sampling_ratio/mean": 0.03342071920633316, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 11.5, "sampling/sampling_logp_difference/mean": 0.021197987720370293, "step": 15, "step_time": 325.7601255310001 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0260416679084301, "completions/max_length": 10000.0, "completions/max_terminated_length": 9296.0, "completions/mean_length": 1522.5438232421875, "completions/mean_terminated_length": 1295.873779296875, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "entropy": 0.15955317517121634, "epoch": 0.010680907877169559, "frac_reward_zero_std": 0.06666667014360428, "grad_norm": 8.448982229059143e-06, "learning_rate": 5e-06, "loss": -0.0, "num_tokens": 41200104.0, "reward": 0.8010233640670776, "reward_std": 0.2865890562534332, "rewards/TRLRewardAdapter/mean": 0.8010233044624329, "rewards/TRLRewardAdapter/std": 0.28658902645111084, "sampling/importance_sampling_ratio/max": 1.0263442993164062, "sampling/importance_sampling_ratio/mean": 0.04621851444244385, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 6.5, "sampling/sampling_logp_difference/mean": 0.019944362342357635, "step": 16, "step_time": 270.0631648580311 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01145833358168602, "completions/max_length": 10000.0, "completions/max_terminated_length": 9980.0, "completions/mean_length": 2042.39697265625, "completions/mean_terminated_length": 1950.1590576171875, "completions/min_length": 172.0, "completions/min_terminated_length": 172.0, "entropy": 0.1565666620930036, "epoch": 0.011348464619492658, "frac_reward_zero_std": 0.03333333507180214, "grad_norm": 1.7796010189351604e-05, "learning_rate": 5e-06, "loss": -0.0, "num_tokens": 43625765.0, "reward": 0.7884296774864197, "reward_std": 0.25384098291397095, "rewards/TRLRewardAdapter/mean": 0.7884296178817749, "rewards/TRLRewardAdapter/std": 0.25384098291397095, "sampling/importance_sampling_ratio/max": 1.861295461654663, "sampling/importance_sampling_ratio/mean": 0.017603568732738495, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 4.612419128417969, "sampling/sampling_logp_difference/mean": 0.019831787794828415, "step": 17, "step_time": 347.98629364511 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9900.0, "completions/max_terminated_length": 9900.0, "completions/mean_length": 1444.2657470703125, "completions/mean_terminated_length": 1444.2657470703125, "completions/min_length": 227.0, "completions/min_terminated_length": 227.0, "entropy": 0.15096635619799295, "epoch": 0.012016021361815754, "frac_reward_zero_std": 0.0, "grad_norm": 7.570612011506107e-05, "learning_rate": 5e-06, "loss": -0.0, "num_tokens": 45440676.0, "reward": 0.8326675891876221, "reward_std": 0.22673535346984863, "rewards/TRLRewardAdapter/mean": 0.8326675295829773, "rewards/TRLRewardAdapter/std": 0.22673535346984863, "sampling/importance_sampling_ratio/max": 0.968313992023468, "sampling/importance_sampling_ratio/mean": 0.02924155816435814, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 14.5, "sampling/sampling_logp_difference/mean": 0.019063441082835197, "step": 18, "step_time": 133.0458398879273 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0052083334885537624, "completions/max_length": 10000.0, "completions/max_terminated_length": 9047.0, "completions/mean_length": 1453.857421875, "completions/mean_terminated_length": 1409.1131591796875, "completions/min_length": 211.0, "completions/min_terminated_length": 211.0, "entropy": 0.16292286415894827, "epoch": 0.012683578104138851, "frac_reward_zero_std": 0.06666667014360428, "grad_norm": 3.1878002922909265e-05, "learning_rate": 5e-06, "loss": -0.0, "num_tokens": 47252155.0, "reward": 0.7933103442192078, "reward_std": 0.30603086948394775, "rewards/TRLRewardAdapter/mean": 0.793310284614563, "rewards/TRLRewardAdapter/std": 0.30603086948394775, "sampling/importance_sampling_ratio/max": 0.7120311260223389, "sampling/importance_sampling_ratio/mean": 0.03436170145869255, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 5.0568528175354, "sampling/sampling_logp_difference/mean": 0.020125074312090874, "step": 19, "step_time": 239.29895686078817 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0364583358168602, "completions/max_length": 10000.0, "completions/max_terminated_length": 9918.0, "completions/mean_length": 2134.92919921875, "completions/mean_terminated_length": 1837.3319091796875, "completions/min_length": 255.0, "completions/min_terminated_length": 255.0, "entropy": 0.1405639722943306, "epoch": 0.01335113484646195, "frac_reward_zero_std": 0.06666667014360428, "grad_norm": 7.971985035081557e-06, "learning_rate": 5e-06, "loss": -0.0, "num_tokens": 49744727.0, "reward": 0.7721226215362549, "reward_std": 0.31692877411842346, "rewards/TRLRewardAdapter/mean": 0.7721225619316101, "rewards/TRLRewardAdapter/std": 0.31692877411842346, "sampling/importance_sampling_ratio/max": 0.7781100273132324, "sampling/importance_sampling_ratio/mean": 0.03497695177793503, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 6.213587284088135, "sampling/sampling_logp_difference/mean": 0.018059207126498222, "step": 20, "step_time": 397.5263723449316 }, { "epoch": 0.01335113484646195, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.0330434774413057, "eval_completions/max_length": 10000.0, "eval_completions/max_terminated_length": 9637.652173913044, "eval_completions/mean_length": 2604.1188646399455, "eval_completions/mean_terminated_length": 2351.3665824558425, "eval_completions/min_length": 176.2608695652174, "eval_completions/min_terminated_length": 176.2608695652174, "eval_entropy": 0.1739662607078967, "eval_frac_reward_zero_std": 1.0, "eval_loss": 0.0, "eval_num_tokens": 49744727.0, "eval_reward": 0.6817569836326267, "eval_reward_std": 0.3527678212393885, "eval_rewards/TRLRewardAdapter/mean": 0.6817569888156393, "eval_rewards/TRLRewardAdapter/std": 0.35276782253514166, "eval_runtime": 1474.8377, "eval_samples_per_second": 3.096, "eval_sampling/importance_sampling_ratio/max": 0.7803603579168734, "eval_sampling/importance_sampling_ratio/mean": 0.027462159728874332, "eval_sampling/importance_sampling_ratio/min": 0.0, "eval_sampling/sampling_logp_difference/max": 16.37684584700543, "eval_sampling/sampling_logp_difference/mean": 0.021047963396362637, "eval_steps_per_second": 0.016, "step": 20 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0729166716337204, "completions/max_length": 10000.0, "completions/max_terminated_length": 9995.0, "completions/mean_length": 2811.086669921875, "completions/mean_terminated_length": 2245.666259765625, "completions/min_length": 193.0, "completions/min_terminated_length": 193.0, "entropy": 0.15907751520474753, "epoch": 0.014018691588785047, "frac_reward_zero_std": 0.06666667014360428, "grad_norm": 1.7370032066204033e-05, "learning_rate": 5e-06, "loss": 0.0, "num_tokens": 52867722.0, "reward": 0.6375277638435364, "reward_std": 0.40191683173179626, "rewards/TRLRewardAdapter/mean": 0.6375277042388916, "rewards/TRLRewardAdapter/std": 0.40191683173179626, "sampling/importance_sampling_ratio/max": 1.454261064529419, "sampling/importance_sampling_ratio/mean": 0.02419554814696312, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 57.0, "sampling/sampling_logp_difference/mean": 0.01970238797366619, "step": 21, "step_time": 392.6877905498259 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02395833469927311, "completions/max_length": 10000.0, "completions/max_terminated_length": 9924.0, "completions/mean_length": 1882.697998046875, "completions/mean_terminated_length": 1683.4471435546875, "completions/min_length": 155.0, "completions/min_terminated_length": 155.0, "entropy": 0.14062569538752237, "epoch": 0.014686248331108143, "frac_reward_zero_std": 0.10000000894069672, "grad_norm": 4.558253878958991e-05, "learning_rate": 5e-06, "loss": -0.0, "num_tokens": 55109544.0, "reward": 0.7195169925689697, "reward_std": 0.3621172308921814, "rewards/TRLRewardAdapter/mean": 0.719516932964325, "rewards/TRLRewardAdapter/std": 0.3621172308921814, "sampling/importance_sampling_ratio/max": 0.9061365723609924, "sampling/importance_sampling_ratio/mean": 0.04393158107995987, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 15.705451965332031, "sampling/sampling_logp_difference/mean": 0.017876289784908295, "step": 22, "step_time": 266.21505853009876 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0010416667209938169, "completions/max_length": 10000.0, "completions/max_terminated_length": 9480.0, "completions/mean_length": 1694.966796875, "completions/mean_terminated_length": 1686.306640625, "completions/min_length": 162.0, "completions/min_terminated_length": 162.0, "entropy": 0.14270612597465515, "epoch": 0.015353805073431242, "frac_reward_zero_std": 0.10000000894069672, "grad_norm": 1.985058673320514e-05, "learning_rate": 5e-06, "loss": 0.0, "num_tokens": 57172840.0, "reward": 0.729175329208374, "reward_std": 0.34734463691711426, "rewards/TRLRewardAdapter/mean": 0.729175329208374, "rewards/TRLRewardAdapter/std": 0.34734463691711426, "sampling/importance_sampling_ratio/max": 0.7115302085876465, "sampling/importance_sampling_ratio/mean": 0.020532799884676933, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 11.0, "sampling/sampling_logp_difference/mean": 0.018028447404503822, "step": 23, "step_time": 180.27268743491732 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9344.0, "completions/max_terminated_length": 9344.0, "completions/mean_length": 1484.5167236328125, "completions/mean_terminated_length": 1484.5167236328125, "completions/min_length": 198.0, "completions/min_terminated_length": 198.0, "entropy": 0.15847162157297134, "epoch": 0.01602136181575434, "frac_reward_zero_std": 0.0, "grad_norm": 0.0001028474995219759, "learning_rate": 5e-06, "loss": -0.0001, "num_tokens": 59037528.0, "reward": 0.7915544509887695, "reward_std": 0.2957701086997986, "rewards/TRLRewardAdapter/mean": 0.7915543913841248, "rewards/TRLRewardAdapter/std": 0.2957700788974762, "sampling/importance_sampling_ratio/max": 2.068155527114868, "sampling/importance_sampling_ratio/mean": 0.041101083159446716, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 7.384025573730469, "sampling/sampling_logp_difference/mean": 0.020195234566926956, "step": 24, "step_time": 153.1247306568548 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03750000149011612, "completions/max_length": 10000.0, "completions/max_terminated_length": 9968.0, "completions/mean_length": 3015.501220703125, "completions/mean_terminated_length": 2743.377685546875, "completions/min_length": 242.0, "completions/min_terminated_length": 242.0, "entropy": 0.1667938530445099, "epoch": 0.016688918558077435, "frac_reward_zero_std": 0.1666666716337204, "grad_norm": 4.893860387088866e-06, "learning_rate": 5e-06, "loss": 0.0, "num_tokens": 62414105.0, "reward": 0.5980639457702637, "reward_std": 0.41652798652648926, "rewards/TRLRewardAdapter/mean": 0.5980638861656189, "rewards/TRLRewardAdapter/std": 0.41652798652648926, "sampling/importance_sampling_ratio/max": 0.685901403427124, "sampling/importance_sampling_ratio/mean": 0.02078217826783657, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 41.45545196533203, "sampling/sampling_logp_difference/mean": 0.02037177048623562, "step": 25, "step_time": 288.5315183630446 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03750000149011612, "completions/max_length": 10000.0, "completions/max_terminated_length": 9735.0, "completions/mean_length": 1771.1802978515625, "completions/mean_terminated_length": 1450.5767822265625, "completions/min_length": 202.0, "completions/min_terminated_length": 202.0, "entropy": 0.11857244869073232, "epoch": 0.017356475300400534, "frac_reward_zero_std": 0.03333333507180214, "grad_norm": 1.3652553532197679e-05, "learning_rate": 5e-06, "loss": -0.0, "num_tokens": 64551846.0, "reward": 0.7697377800941467, "reward_std": 0.337533563375473, "rewards/TRLRewardAdapter/mean": 0.7697377800941467, "rewards/TRLRewardAdapter/std": 0.33753353357315063, "sampling/importance_sampling_ratio/max": 0.7082079648971558, "sampling/importance_sampling_ratio/mean": 0.0345943346619606, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 11.5, "sampling/sampling_logp_difference/mean": 0.015839187428355217, "step": 26, "step_time": 378.49634133896325 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01770833507180214, "completions/max_length": 10000.0, "completions/max_terminated_length": 9775.0, "completions/mean_length": 2556.24169921875, "completions/mean_terminated_length": 2422.048828125, "completions/min_length": 231.0, "completions/min_terminated_length": 231.0, "entropy": 0.16674817353487015, "epoch": 0.018024032042723633, "frac_reward_zero_std": 0.03333333507180214, "grad_norm": 6.3078272732644785e-06, "learning_rate": 5e-06, "loss": -0.0, "num_tokens": 67498254.0, "reward": 0.6306492686271667, "reward_std": 0.39549604058265686, "rewards/TRLRewardAdapter/mean": 0.6306492686271667, "rewards/TRLRewardAdapter/std": 0.3954960107803345, "sampling/importance_sampling_ratio/max": 1.0434380769729614, "sampling/importance_sampling_ratio/mean": 0.014426070265471935, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 39.1340217590332, "sampling/sampling_logp_difference/mean": 0.020697642117738724, "step": 27, "step_time": 282.7175688589923 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03750000149011612, "completions/max_length": 10000.0, "completions/max_terminated_length": 9967.0, "completions/mean_length": 2606.767822265625, "completions/mean_terminated_length": 2318.7197265625, "completions/min_length": 247.0, "completions/min_terminated_length": 247.0, "entropy": 0.15262326101462045, "epoch": 0.018691588785046728, "frac_reward_zero_std": 0.13333334028720856, "grad_norm": 4.3801614561624256e-05, "learning_rate": 5e-06, "loss": -0.0001, "num_tokens": 70447375.0, "reward": 0.6429911851882935, "reward_std": 0.39121928811073303, "rewards/TRLRewardAdapter/mean": 0.6429911255836487, "rewards/TRLRewardAdapter/std": 0.3912193179130554, "sampling/importance_sampling_ratio/max": 0.8509641289710999, "sampling/importance_sampling_ratio/mean": 0.028081044554710388, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 25.878543853759766, "sampling/sampling_logp_difference/mean": 0.019199039787054062, "step": 28, "step_time": 357.09570511907805 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0020833334419876337, "completions/max_length": 10000.0, "completions/max_terminated_length": 9176.0, "completions/mean_length": 1643.8365478515625, "completions/mean_terminated_length": 1626.3914794921875, "completions/min_length": 191.0, "completions/min_terminated_length": 191.0, "entropy": 0.15326853096485138, "epoch": 0.019359145527369826, "frac_reward_zero_std": 0.03333333507180214, "grad_norm": 0.000247012572821296, "learning_rate": 5e-06, "loss": 0.0001, "num_tokens": 72458514.0, "reward": 0.7526599764823914, "reward_std": 0.3449936807155609, "rewards/TRLRewardAdapter/mean": 0.7526599168777466, "rewards/TRLRewardAdapter/std": 0.3449936807155609, "sampling/importance_sampling_ratio/max": 0.7676636576652527, "sampling/importance_sampling_ratio/mean": 0.03247120976448059, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 25.56729507446289, "sampling/sampling_logp_difference/mean": 0.01978626661002636, "step": 29, "step_time": 223.34505005995743 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03541667014360428, "completions/max_length": 10000.0, "completions/max_terminated_length": 9978.0, "completions/mean_length": 2350.531494140625, "completions/mean_terminated_length": 2069.665283203125, "completions/min_length": 256.0, "completions/min_terminated_length": 256.0, "entropy": 0.14331835011641184, "epoch": 0.020026702269692925, "frac_reward_zero_std": 0.03333333507180214, "grad_norm": 1.3437896643192213e-05, "learning_rate": 5e-06, "loss": -0.0, "num_tokens": 75243280.0, "reward": 0.7099708914756775, "reward_std": 0.3590555787086487, "rewards/TRLRewardAdapter/mean": 0.7099708318710327, "rewards/TRLRewardAdapter/std": 0.35905560851097107, "sampling/importance_sampling_ratio/max": 0.6232369542121887, "sampling/importance_sampling_ratio/mean": 0.023311033844947815, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 20.99374771118164, "sampling/sampling_logp_difference/mean": 0.017780710011720657, "step": 30, "step_time": 361.775055492064 }, { "epoch": 0.020026702269692925, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.03239130354283944, "eval_completions/max_length": 10000.0, "eval_completions/max_terminated_length": 9772.260869565218, "eval_completions/mean_length": 2606.1256050441575, "eval_completions/mean_terminated_length": 2358.0577339504075, "eval_completions/min_length": 169.82608695652175, "eval_completions/min_terminated_length": 169.82608695652175, "eval_entropy": 0.17186743539312613, "eval_frac_reward_zero_std": 1.0, "eval_loss": 0.0, "eval_num_tokens": 75243280.0, "eval_reward": 0.6803432029226552, "eval_reward_std": 0.3560306883376578, "eval_rewards/TRLRewardAdapter/mean": 0.6803432288377181, "eval_rewards/TRLRewardAdapter/std": 0.3560306987036829, "eval_runtime": 1472.6501, "eval_samples_per_second": 3.101, "eval_sampling/importance_sampling_ratio/max": 0.9402208820633267, "eval_sampling/importance_sampling_ratio/mean": 0.03124737205064815, "eval_sampling/importance_sampling_ratio/min": 0.0, "eval_sampling/sampling_logp_difference/max": 8.592522258343903, "eval_sampling/sampling_logp_difference/mean": 0.020907699494906094, "eval_steps_per_second": 0.016, "step": 30 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.04479166865348816, "completions/max_length": 10000.0, "completions/max_terminated_length": 9981.0, "completions/mean_length": 3137.97412109375, "completions/mean_terminated_length": 2816.199462890625, "completions/min_length": 247.0, "completions/min_terminated_length": 247.0, "entropy": 0.16550657898187637, "epoch": 0.02069425901201602, "frac_reward_zero_std": 0.10000000894069672, "grad_norm": 6.4489976586773445e-06, "learning_rate": 5e-06, "loss": -0.0, "num_tokens": 78776503.0, "reward": 0.6009894609451294, "reward_std": 0.3955569863319397, "rewards/TRLRewardAdapter/mean": 0.6009894013404846, "rewards/TRLRewardAdapter/std": 0.3955569863319397, "sampling/importance_sampling_ratio/max": 0.5240107178688049, "sampling/importance_sampling_ratio/mean": 0.0119780907407403, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 14.25, "sampling/sampling_logp_difference/mean": 0.020377689972519875, "step": 31, "step_time": 343.00474963581655 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0010416667209938169, "completions/max_length": 10000.0, "completions/max_terminated_length": 9627.0, "completions/mean_length": 1861.5615234375, "completions/mean_terminated_length": 1853.0750732421875, "completions/min_length": 332.0, "completions/min_terminated_length": 332.0, "entropy": 0.14779063314199448, "epoch": 0.021361815754339118, "frac_reward_zero_std": 0.03333333507180214, "grad_norm": 5.468140486117813e-06, "learning_rate": 5e-06, "loss": -0.0, "num_tokens": 80980978.0, "reward": 0.7782094478607178, "reward_std": 0.2788294553756714, "rewards/TRLRewardAdapter/mean": 0.7782094478607178, "rewards/TRLRewardAdapter/std": 0.278829425573349, "sampling/importance_sampling_ratio/max": 0.6572726964950562, "sampling/importance_sampling_ratio/mean": 0.022478308528661728, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 10.75, "sampling/sampling_logp_difference/mean": 0.018854469060897827, "step": 32, "step_time": 255.422315533855 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.05104167014360428, "completions/max_length": 10000.0, "completions/max_terminated_length": 9926.0, "completions/mean_length": 2384.95947265625, "completions/mean_terminated_length": 1975.368896484375, "completions/min_length": 125.0, "completions/min_terminated_length": 125.0, "entropy": 0.14300387849410376, "epoch": 0.022029372496662217, "frac_reward_zero_std": 0.06666667014360428, "grad_norm": 2.0219617496418375e-05, "learning_rate": 5e-06, "loss": 0.0, "num_tokens": 83763883.0, "reward": 0.7019184827804565, "reward_std": 0.36701837182044983, "rewards/TRLRewardAdapter/mean": 0.7019184827804565, "rewards/TRLRewardAdapter/std": 0.36701837182044983, "sampling/importance_sampling_ratio/max": 1.499758243560791, "sampling/importance_sampling_ratio/mean": 0.06447918713092804, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 15.75, "sampling/sampling_logp_difference/mean": 0.01836785487830639, "step": 33, "step_time": 358.00598470889963 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02916666865348816, "completions/max_length": 10000.0, "completions/max_terminated_length": 9936.0, "completions/mean_length": 2744.158447265625, "completions/mean_terminated_length": 2526.171630859375, "completions/min_length": 180.0, "completions/min_terminated_length": 180.0, "entropy": 0.16053040077288946, "epoch": 0.022696929238985315, "frac_reward_zero_std": 0.06666667014360428, "grad_norm": 9.067959481784585e-05, "learning_rate": 5e-06, "loss": -0.0, "num_tokens": 86873891.0, "reward": 0.6393517255783081, "reward_std": 0.40533024072647095, "rewards/TRLRewardAdapter/mean": 0.6393517255783081, "rewards/TRLRewardAdapter/std": 0.40533021092414856, "sampling/importance_sampling_ratio/max": 1.1513885259628296, "sampling/importance_sampling_ratio/mean": 0.043466292321681976, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 40.20545196533203, "sampling/sampling_logp_difference/mean": 0.01987256109714508, "step": 34, "step_time": 309.8196011830587 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.006250000558793545, "completions/max_length": 10000.0, "completions/max_terminated_length": 9354.0, "completions/mean_length": 1843.533447265625, "completions/mean_terminated_length": 1792.2347412109375, "completions/min_length": 233.0, "completions/min_terminated_length": 233.0, "entropy": 0.15330085655053458, "epoch": 0.02336448598130841, "frac_reward_zero_std": 0.0, "grad_norm": 1.3118477236626115e-05, "learning_rate": 5e-06, "loss": -0.0, "num_tokens": 89097827.0, "reward": 0.7772722244262695, "reward_std": 0.2914155125617981, "rewards/TRLRewardAdapter/mean": 0.7772721648216248, "rewards/TRLRewardAdapter/std": 0.2914155125617981, "sampling/importance_sampling_ratio/max": 0.8629078269004822, "sampling/importance_sampling_ratio/mean": 0.03069988824427128, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 6.0, "sampling/sampling_logp_difference/mean": 0.01882343553006649, "step": 35, "step_time": 177.82547514908947 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0052083334885537624, "completions/max_length": 10000.0, "completions/max_terminated_length": 9696.0, "completions/mean_length": 1837.0552978515625, "completions/mean_terminated_length": 1794.3172607421875, "completions/min_length": 213.0, "completions/min_terminated_length": 213.0, "entropy": 0.15561727186044058, "epoch": 0.02403204272363151, "frac_reward_zero_std": 0.03333333507180214, "grad_norm": 1.319521792394332e-05, "learning_rate": 5e-06, "loss": -0.0, "num_tokens": 91277400.0, "reward": 0.7288545370101929, "reward_std": 0.34979894757270813, "rewards/TRLRewardAdapter/mean": 0.7288544774055481, "rewards/TRLRewardAdapter/std": 0.34979891777038574, "sampling/importance_sampling_ratio/max": 1.118497610092163, "sampling/importance_sampling_ratio/mean": 0.01759815216064453, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 24.5, "sampling/sampling_logp_difference/mean": 0.01941339112818241, "step": 36, "step_time": 239.22541462804656 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08645834028720856, "completions/max_length": 10000.0, "completions/max_terminated_length": 9984.0, "completions/mean_length": 2795.52197265625, "completions/mean_terminated_length": 2113.68408203125, "completions/min_length": 168.0, "completions/min_terminated_length": 168.0, "entropy": 0.1540344258149465, "epoch": 0.024699599465954607, "frac_reward_zero_std": 0.03333333507180214, "grad_norm": 2.2511966425095354e-05, "learning_rate": 5e-06, "loss": -0.0, "num_tokens": 94367309.0, "reward": 0.652622640132904, "reward_std": 0.39773157238960266, "rewards/TRLRewardAdapter/mean": 0.6526225805282593, "rewards/TRLRewardAdapter/std": 0.3977315425872803, "sampling/importance_sampling_ratio/max": 1.2285096645355225, "sampling/importance_sampling_ratio/mean": 0.03183310478925705, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 26.5, "sampling/sampling_logp_difference/mean": 0.019448338076472282, "step": 37, "step_time": 378.83378495124634 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.05000000447034836, "completions/max_length": 10000.0, "completions/max_terminated_length": 8997.0, "completions/mean_length": 1632.0718994140625, "completions/mean_terminated_length": 1191.6546630859375, "completions/min_length": 229.0, "completions/min_terminated_length": 229.0, "entropy": 0.14959441870450974, "epoch": 0.025367156208277702, "frac_reward_zero_std": 0.06666667014360428, "grad_norm": 6.795074141172063e-05, "learning_rate": 5e-06, "loss": -0.0, "num_tokens": 96343218.0, "reward": 0.7979875206947327, "reward_std": 0.2945046126842499, "rewards/TRLRewardAdapter/mean": 0.7979875206947327, "rewards/TRLRewardAdapter/std": 0.2945046126842499, "sampling/importance_sampling_ratio/max": 1.0672026872634888, "sampling/importance_sampling_ratio/mean": 0.038581229746341705, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 7.101529598236084, "sampling/sampling_logp_difference/mean": 0.018767818808555603, "step": 38, "step_time": 392.2913121720776 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.06145833805203438, "completions/max_length": 10000.0, "completions/max_terminated_length": 9840.0, "completions/mean_length": 2673.05224609375, "completions/mean_terminated_length": 2193.26318359375, "completions/min_length": 208.0, "completions/min_terminated_length": 208.0, "entropy": 0.16396502405405045, "epoch": 0.0260347129506008, "frac_reward_zero_std": 0.10000000894069672, "grad_norm": 4.212446028287396e-06, "learning_rate": 5e-06, "loss": -0.0, "num_tokens": 99401252.0, "reward": 0.6855279803276062, "reward_std": 0.36191561818122864, "rewards/TRLRewardAdapter/mean": 0.6855279207229614, "rewards/TRLRewardAdapter/std": 0.36191561818122864, "sampling/importance_sampling_ratio/max": 1.2195574045181274, "sampling/importance_sampling_ratio/mean": 0.030840326100587845, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 38.5, "sampling/sampling_logp_difference/mean": 0.020518174394965172, "step": 39, "step_time": 404.7236966447672 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02291666716337204, "completions/max_length": 10000.0, "completions/max_terminated_length": 9971.0, "completions/mean_length": 2524.9189453125, "completions/mean_terminated_length": 2349.59716796875, "completions/min_length": 280.0, "completions/min_terminated_length": 280.0, "entropy": 0.15453822165727615, "epoch": 0.0267022696929239, "frac_reward_zero_std": 0.03333333507180214, "grad_norm": 8.77127940402416e-06, "learning_rate": 5e-06, "loss": -0.0, "num_tokens": 102324470.0, "reward": 0.7294073700904846, "reward_std": 0.3090914785861969, "rewards/TRLRewardAdapter/mean": 0.7294073104858398, "rewards/TRLRewardAdapter/std": 0.3090914785861969, "sampling/importance_sampling_ratio/max": 0.9619219303131104, "sampling/importance_sampling_ratio/mean": 0.024523980915546417, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 49.5, "sampling/sampling_logp_difference/mean": 0.019379859790205956, "step": 40, "step_time": 343.77263764292 }, { "epoch": 0.0267022696929239, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.038260868224112884, "eval_completions/max_length": 10000.0, "eval_completions/max_terminated_length": 9677.347826086956, "eval_completions/mean_length": 2651.9364385190215, "eval_completions/mean_terminated_length": 2359.996640412704, "eval_completions/min_length": 162.91304347826087, "eval_completions/min_terminated_length": 162.91304347826087, "eval_entropy": 0.16913557376550592, "eval_frac_reward_zero_std": 1.0, "eval_loss": 0.0, "eval_num_tokens": 102324470.0, "eval_reward": 0.6783239141754482, "eval_reward_std": 0.3587090191633805, "eval_rewards/TRLRewardAdapter/mean": 0.6783239349074985, "eval_rewards/TRLRewardAdapter/std": 0.35870902045913367, "eval_runtime": 1477.3676, "eval_samples_per_second": 3.091, "eval_sampling/importance_sampling_ratio/max": 0.7940938731898433, "eval_sampling/importance_sampling_ratio/mean": 0.03470485568370508, "eval_sampling/importance_sampling_ratio/min": 0.0, "eval_sampling/sampling_logp_difference/max": 11.202882041101871, "eval_sampling/sampling_logp_difference/mean": 0.02062327843969283, "eval_steps_per_second": 0.016, "step": 40 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9885.0, "completions/max_terminated_length": 9885.0, "completions/mean_length": 1731.3271484375, "completions/mean_terminated_length": 1731.3271484375, "completions/min_length": 205.0, "completions/min_terminated_length": 205.0, "entropy": 0.14613804717858633, "epoch": 0.027369826435246995, "frac_reward_zero_std": 0.03333333507180214, "grad_norm": 1.772076502732586e-05, "learning_rate": 5e-06, "loss": 0.0, "num_tokens": 104411600.0, "reward": 0.7801780104637146, "reward_std": 0.30260586738586426, "rewards/TRLRewardAdapter/mean": 0.7801779508590698, "rewards/TRLRewardAdapter/std": 0.30260586738586426, "sampling/importance_sampling_ratio/max": 0.7030460238456726, "sampling/importance_sampling_ratio/mean": 0.029588475823402405, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 5.139068126678467, "sampling/sampling_logp_difference/mean": 0.01869039051234722, "step": 41, "step_time": 182.65992252109572 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0031250002793967724, "completions/max_length": 10000.0, "completions/max_terminated_length": 9961.0, "completions/mean_length": 1517.7896728515625, "completions/mean_terminated_length": 1491.1995849609375, "completions/min_length": 167.0, "completions/min_terminated_length": 167.0, "entropy": 0.132563183705012, "epoch": 0.028037383177570093, "frac_reward_zero_std": 0.03333333507180214, "grad_norm": 1.8804255798418503e-05, "learning_rate": 5e-06, "loss": 0.0, "num_tokens": 106283814.0, "reward": 0.7547710537910461, "reward_std": 0.323283851146698, "rewards/TRLRewardAdapter/mean": 0.7547710537910461, "rewards/TRLRewardAdapter/std": 0.3232838213443756, "sampling/importance_sampling_ratio/max": 1.0949538946151733, "sampling/importance_sampling_ratio/mean": 0.046030230820178986, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 4.889068126678467, "sampling/sampling_logp_difference/mean": 0.017126411199569702, "step": 42, "step_time": 230.28195218008477 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01250000111758709, "completions/max_length": 10000.0, "completions/max_terminated_length": 9796.0, "completions/mean_length": 1967.4178466796875, "completions/mean_terminated_length": 1865.7393798828125, "completions/min_length": 238.0, "completions/min_terminated_length": 238.0, "entropy": 0.16157567501068115, "epoch": 0.02870493991989319, "frac_reward_zero_std": 0.03333333507180214, "grad_norm": 1.1981105564905417e-05, "learning_rate": 5e-06, "loss": -0.0, "num_tokens": 108565335.0, "reward": 0.7106693387031555, "reward_std": 0.36234530806541443, "rewards/TRLRewardAdapter/mean": 0.7106693387031555, "rewards/TRLRewardAdapter/std": 0.36234527826309204, "sampling/importance_sampling_ratio/max": 0.9503229856491089, "sampling/importance_sampling_ratio/mean": 0.03879234939813614, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 14.75, "sampling/sampling_logp_difference/mean": 0.020331403240561485, "step": 43, "step_time": 287.2145721669076 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.004166666883975267, "completions/max_length": 10000.0, "completions/max_terminated_length": 9949.0, "completions/mean_length": 2188.185546875, "completions/mean_terminated_length": 2155.5, "completions/min_length": 219.0, "completions/min_terminated_length": 219.0, "entropy": 0.14538385222355524, "epoch": 0.029372496662216287, "frac_reward_zero_std": 0.0, "grad_norm": 5.555480933801738e-05, "learning_rate": 5e-06, "loss": -0.0, "num_tokens": 111128905.0, "reward": 0.6961753368377686, "reward_std": 0.33142733573913574, "rewards/TRLRewardAdapter/mean": 0.6961753368377686, "rewards/TRLRewardAdapter/std": 0.33142733573913574, "sampling/importance_sampling_ratio/max": 0.7592278718948364, "sampling/importance_sampling_ratio/mean": 0.018584245815873146, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 7.889067649841309, "sampling/sampling_logp_difference/mean": 0.018225451931357384, "step": 44, "step_time": 222.636046584812 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03854166716337204, "completions/max_length": 10000.0, "completions/max_terminated_length": 9962.0, "completions/mean_length": 2353.253173828125, "completions/mean_terminated_length": 2046.720458984375, "completions/min_length": 134.0, "completions/min_terminated_length": 134.0, "entropy": 0.13920400043328604, "epoch": 0.030040053404539385, "frac_reward_zero_std": 0.03333333507180214, "grad_norm": 3.0677616034317224e-05, "learning_rate": 5e-06, "loss": 0.0, "num_tokens": 113820508.0, "reward": 0.7413713932037354, "reward_std": 0.3141540586948395, "rewards/TRLRewardAdapter/mean": 0.7413713335990906, "rewards/TRLRewardAdapter/std": 0.3141540586948395, "sampling/importance_sampling_ratio/max": 1.1752647161483765, "sampling/importance_sampling_ratio/mean": 0.04126746580004692, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 23.375, "sampling/sampling_logp_difference/mean": 0.017539475113153458, "step": 45, "step_time": 362.47815403400455 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03020833432674408, "completions/max_length": 10000.0, "completions/max_terminated_length": 9915.0, "completions/mean_length": 1502.8365478515625, "completions/mean_terminated_length": 1238.15576171875, "completions/min_length": 182.0, "completions/min_terminated_length": 182.0, "entropy": 0.13785753895839056, "epoch": 0.030707610146862484, "frac_reward_zero_std": 0.10000000894069672, "grad_norm": 0.0002536990387100454, "learning_rate": 5e-06, "loss": 0.0001, "num_tokens": 115673759.0, "reward": 0.7178462743759155, "reward_std": 0.3932344615459442, "rewards/TRLRewardAdapter/mean": 0.7178462147712708, "rewards/TRLRewardAdapter/std": 0.3932344317436218, "sampling/importance_sampling_ratio/max": 0.5791277289390564, "sampling/importance_sampling_ratio/mean": 0.03743075579404831, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 8.344983100891113, "sampling/sampling_logp_difference/mean": 0.018171459436416626, "step": 46, "step_time": 398.182297894964 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.10208334028720856, "completions/max_length": 10000.0, "completions/max_terminated_length": 9981.0, "completions/mean_length": 2515.656494140625, "completions/mean_terminated_length": 1664.7679443359375, "completions/min_length": 158.0, "completions/min_terminated_length": 158.0, "entropy": 0.1233983871837457, "epoch": 0.03137516688918558, "frac_reward_zero_std": 0.13333334028720856, "grad_norm": 5.252339845388211e-06, "learning_rate": 5e-06, "loss": -0.0, "num_tokens": 118596853.0, "reward": 0.7348817586898804, "reward_std": 0.3557051718235016, "rewards/TRLRewardAdapter/mean": 0.7348816990852356, "rewards/TRLRewardAdapter/std": 0.3557051718235016, "sampling/importance_sampling_ratio/max": 0.9686952829360962, "sampling/importance_sampling_ratio/mean": 0.0441688671708107, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 8.809269905090332, "sampling/sampling_logp_difference/mean": 0.015981197357177734, "step": 47, "step_time": 399.4955422991188 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03958333656191826, "completions/max_length": 10000.0, "completions/max_terminated_length": 9956.0, "completions/mean_length": 2785.502197265625, "completions/mean_terminated_length": 2488.158447265625, "completions/min_length": 159.0, "completions/min_terminated_length": 159.0, "entropy": 0.1528485044836998, "epoch": 0.03204272363150868, "frac_reward_zero_std": 0.06666667014360428, "grad_norm": 0.0001471467823581284, "learning_rate": 5e-06, "loss": 0.0086, "num_tokens": 121808695.0, "reward": 0.6584293842315674, "reward_std": 0.37855854630470276, "rewards/TRLRewardAdapter/mean": 0.6584293246269226, "rewards/TRLRewardAdapter/std": 0.37855857610702515, "sampling/importance_sampling_ratio/max": 1.2035459280014038, "sampling/importance_sampling_ratio/mean": 0.02604484185576439, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 51.0, "sampling/sampling_logp_difference/mean": 0.01925470121204853, "step": 48, "step_time": 317.3244050971698 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02812500111758709, "completions/max_length": 10000.0, "completions/max_terminated_length": 9999.0, "completions/mean_length": 2059.998046875, "completions/mean_terminated_length": 1830.222900390625, "completions/min_length": 221.0, "completions/min_terminated_length": 221.0, "entropy": 0.14659801870584488, "epoch": 0.03271028037383177, "frac_reward_zero_std": 0.03333333507180214, "grad_norm": 1.4577237709802798e-05, "learning_rate": 5e-06, "loss": -0.0, "num_tokens": 124274165.0, "reward": 0.7415719628334045, "reward_std": 0.3256860673427582, "rewards/TRLRewardAdapter/mean": 0.7415719628334045, "rewards/TRLRewardAdapter/std": 0.3256860673427582, "sampling/importance_sampling_ratio/max": 0.7680654525756836, "sampling/importance_sampling_ratio/mean": 0.022542716935276985, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 9.5, "sampling/sampling_logp_difference/mean": 0.018568826839327812, "step": 49, "step_time": 262.9577568358509 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03541667014360428, "completions/max_length": 10000.0, "completions/max_terminated_length": 9965.0, "completions/mean_length": 2897.4052734375, "completions/mean_terminated_length": 2636.61865234375, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "entropy": 0.16663504391908646, "epoch": 0.03337783711615487, "frac_reward_zero_std": 0.06666667014360428, "grad_norm": 1.2715910014820309e-05, "learning_rate": 5e-06, "loss": 0.0001, "num_tokens": 127594362.0, "reward": 0.6132352352142334, "reward_std": 0.39572495222091675, "rewards/TRLRewardAdapter/mean": 0.6132352352142334, "rewards/TRLRewardAdapter/std": 0.39572492241859436, "sampling/importance_sampling_ratio/max": 2.473289966583252, "sampling/importance_sampling_ratio/mean": 0.050576139241456985, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 31.0, "sampling/sampling_logp_difference/mean": 0.02066454105079174, "step": 50, "step_time": 401.5253677498549 }, { "epoch": 0.03337783711615487, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.03478260803967714, "eval_completions/max_length": 10000.0, "eval_completions/max_terminated_length": 9717.304347826086, "eval_completions/mean_length": 2620.2503715183425, "eval_completions/mean_terminated_length": 2354.2954950747285, "eval_completions/min_length": 163.7826086956522, "eval_completions/min_terminated_length": 163.7826086956522, "eval_entropy": 0.16572054961453314, "eval_frac_reward_zero_std": 1.0, "eval_loss": 0.0, "eval_num_tokens": 127594362.0, "eval_reward": 0.6820138148639513, "eval_reward_std": 0.3564222338406936, "eval_rewards/TRLRewardAdapter/mean": 0.6820138304129891, "eval_rewards/TRLRewardAdapter/std": 0.3564222364321999, "eval_runtime": 1477.3333, "eval_samples_per_second": 3.091, "eval_sampling/importance_sampling_ratio/max": 0.794634029917095, "eval_sampling/importance_sampling_ratio/mean": 0.036990652832648026, "eval_sampling/importance_sampling_ratio/min": 0.0, "eval_sampling/sampling_logp_difference/max": 14.93082186450129, "eval_sampling/sampling_logp_difference/mean": 0.02033352293074131, "eval_steps_per_second": 0.016, "step": 50 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.010416666977107525, "completions/max_length": 10000.0, "completions/max_terminated_length": 9917.0, "completions/mean_length": 1558.377197265625, "completions/mean_terminated_length": 1469.517822265625, "completions/min_length": 186.0, "completions/min_terminated_length": 186.0, "entropy": 0.14129466811815897, "epoch": 0.03404539385847797, "frac_reward_zero_std": 0.0, "grad_norm": 2.487918331133647e-05, "learning_rate": 5e-06, "loss": -0.0, "num_tokens": 129513476.0, "reward": 0.7855857014656067, "reward_std": 0.3084965944290161, "rewards/TRLRewardAdapter/mean": 0.7855856418609619, "rewards/TRLRewardAdapter/std": 0.3084965944290161, "sampling/importance_sampling_ratio/max": 0.8416159749031067, "sampling/importance_sampling_ratio/mean": 0.04249129444360733, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 6.101531982421875, "sampling/sampling_logp_difference/mean": 0.01827220991253853, "step": 51, "step_time": 335.93951715307776 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02187500149011612, "completions/max_length": 10000.0, "completions/max_terminated_length": 9671.0, "completions/mean_length": 1791.643798828125, "completions/mean_terminated_length": 1608.0703125, "completions/min_length": 209.0, "completions/min_terminated_length": 209.0, "entropy": 0.15238911658525467, "epoch": 0.03471295060080107, "frac_reward_zero_std": 0.0, "grad_norm": 0.00010058242674977796, "learning_rate": 5e-06, "loss": 0.0002, "num_tokens": 131651406.0, "reward": 0.7073073387145996, "reward_std": 0.36141082644462585, "rewards/TRLRewardAdapter/mean": 0.7073072791099548, "rewards/TRLRewardAdapter/std": 0.36141085624694824, "sampling/importance_sampling_ratio/max": 1.0503010749816895, "sampling/importance_sampling_ratio/mean": 0.04531528800725937, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 15.018336296081543, "sampling/sampling_logp_difference/mean": 0.019222868606448174, "step": 52, "step_time": 334.9349716779543 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01458333432674408, "completions/max_length": 10000.0, "completions/max_terminated_length": 9667.0, "completions/mean_length": 1537.0479736328125, "completions/mean_terminated_length": 1411.8033447265625, "completions/min_length": 205.0, "completions/min_terminated_length": 205.0, "entropy": 0.14606783787409464, "epoch": 0.035380507343124167, "frac_reward_zero_std": 0.03333333507180214, "grad_norm": 0.00027884995844685233, "learning_rate": 5e-06, "loss": -0.0001, "num_tokens": 133539804.0, "reward": 0.76200270652771, "reward_std": 0.34694746136665344, "rewards/TRLRewardAdapter/mean": 0.76200270652771, "rewards/TRLRewardAdapter/std": 0.34694749116897583, "sampling/importance_sampling_ratio/max": 0.6333364248275757, "sampling/importance_sampling_ratio/mean": 0.031054774299263954, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 10.0, "sampling/sampling_logp_difference/mean": 0.01890990138053894, "step": 53, "step_time": 281.68120014993474 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03541667014360428, "completions/max_length": 10000.0, "completions/max_terminated_length": 9958.0, "completions/mean_length": 2412.393798828125, "completions/mean_terminated_length": 2133.799072265625, "completions/min_length": 156.0, "completions/min_terminated_length": 156.0, "entropy": 0.15708432098229727, "epoch": 0.036048064085447265, "frac_reward_zero_std": 0.06666667014360428, "grad_norm": 8.875007042733977e-06, "learning_rate": 5e-06, "loss": -0.0, "num_tokens": 136328598.0, "reward": 0.695740818977356, "reward_std": 0.3620569407939911, "rewards/TRLRewardAdapter/mean": 0.6957407593727112, "rewards/TRLRewardAdapter/std": 0.3620569109916687, "sampling/importance_sampling_ratio/max": 1.2548407316207886, "sampling/importance_sampling_ratio/mean": 0.02836095169186592, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 33.78517150878906, "sampling/sampling_logp_difference/mean": 0.01961705833673477, "step": 54, "step_time": 369.97554254811257 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0010416667209938169, "completions/max_length": 10000.0, "completions/max_terminated_length": 8624.0, "completions/mean_length": 1453.027099609375, "completions/mean_terminated_length": 1444.11474609375, "completions/min_length": 249.0, "completions/min_terminated_length": 249.0, "entropy": 0.15872382124265036, "epoch": 0.036715620827770364, "frac_reward_zero_std": 0.0, "grad_norm": 6.502426198048743e-05, "learning_rate": 5e-06, "loss": 0.0, "num_tokens": 138168464.0, "reward": 0.80043625831604, "reward_std": 0.2794245481491089, "rewards/TRLRewardAdapter/mean": 0.8004361987113953, "rewards/TRLRewardAdapter/std": 0.2794245481491089, "sampling/importance_sampling_ratio/max": 1.269980788230896, "sampling/importance_sampling_ratio/mean": 0.035489533096551895, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 5.0, "sampling/sampling_logp_difference/mean": 0.02028280310332775, "step": 55, "step_time": 161.9516579337651 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01875000074505806, "completions/max_length": 10000.0, "completions/max_terminated_length": 9471.0, "completions/mean_length": 1956.4052734375, "completions/mean_terminated_length": 1802.7060546875, "completions/min_length": 292.0, "completions/min_terminated_length": 292.0, "entropy": 0.1607944816350937, "epoch": 0.037383177570093455, "frac_reward_zero_std": 0.03333333507180214, "grad_norm": 7.793067066136646e-06, "learning_rate": 5e-06, "loss": -0.0, "num_tokens": 140532149.0, "reward": 0.70542973279953, "reward_std": 0.34934133291244507, "rewards/TRLRewardAdapter/mean": 0.70542973279953, "rewards/TRLRewardAdapter/std": 0.34934133291244507, "sampling/importance_sampling_ratio/max": 1.6100655794143677, "sampling/importance_sampling_ratio/mean": 0.025401661172509193, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 12.03597640991211, "sampling/sampling_logp_difference/mean": 0.019879231229424477, "step": 56, "step_time": 258.75526812381577 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0520833358168602, "completions/max_length": 10000.0, "completions/max_terminated_length": 9985.0, "completions/mean_length": 2514.244873046875, "completions/mean_terminated_length": 2102.939697265625, "completions/min_length": 229.0, "completions/min_terminated_length": 229.0, "entropy": 0.13408025602499643, "epoch": 0.038050734312416554, "frac_reward_zero_std": 0.06666667014360428, "grad_norm": 7.0743024948871706e-06, "learning_rate": 5e-06, "loss": -0.0, "num_tokens": 143398784.0, "reward": 0.7334644198417664, "reward_std": 0.34076544642448425, "rewards/TRLRewardAdapter/mean": 0.7334643602371216, "rewards/TRLRewardAdapter/std": 0.34076544642448425, "sampling/importance_sampling_ratio/max": 0.6864460706710815, "sampling/importance_sampling_ratio/mean": 0.022327423095703125, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 8.969564437866211, "sampling/sampling_logp_difference/mean": 0.01708194613456726, "step": 57, "step_time": 392.4701303921174 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0020833334419876337, "completions/max_length": 10000.0, "completions/max_terminated_length": 8191.0, "completions/mean_length": 1720.9022216796875, "completions/mean_terminated_length": 1703.6180419921875, "completions/min_length": 210.0, "completions/min_terminated_length": 210.0, "entropy": 0.1558221528927485, "epoch": 0.03871829105473965, "frac_reward_zero_std": 0.0, "grad_norm": 1.1307355597824463e-05, "learning_rate": 5e-06, "loss": -0.0, "num_tokens": 145453026.0, "reward": 0.742548406124115, "reward_std": 0.32103997468948364, "rewards/TRLRewardAdapter/mean": 0.7425483465194702, "rewards/TRLRewardAdapter/std": 0.32103994488716125, "sampling/importance_sampling_ratio/max": 0.930243968963623, "sampling/importance_sampling_ratio/mean": 0.02700837515294552, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 4.03125, "sampling/sampling_logp_difference/mean": 0.019471243023872375, "step": 58, "step_time": 160.54231168807019 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03854166716337204, "completions/max_length": 10000.0, "completions/max_terminated_length": 9968.0, "completions/mean_length": 2150.572998046875, "completions/mean_terminated_length": 1835.91552734375, "completions/min_length": 205.0, "completions/min_terminated_length": 205.0, "entropy": 0.14494732022285461, "epoch": 0.03938584779706275, "frac_reward_zero_std": 0.0, "grad_norm": 2.658075006130564e-05, "learning_rate": 5e-06, "loss": -0.0001, "num_tokens": 147998632.0, "reward": 0.7358861565589905, "reward_std": 0.3482910990715027, "rewards/TRLRewardAdapter/mean": 0.7358861565589905, "rewards/TRLRewardAdapter/std": 0.3482910990715027, "sampling/importance_sampling_ratio/max": 0.8738262057304382, "sampling/importance_sampling_ratio/mean": 0.02539180777966976, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 7.8449835777282715, "sampling/sampling_logp_difference/mean": 0.018572920933365822, "step": 59, "step_time": 327.89166044886224 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0020833334419876337, "completions/max_length": 10000.0, "completions/max_terminated_length": 9921.0, "completions/mean_length": 2082.259521484375, "completions/mean_terminated_length": 2065.729736328125, "completions/min_length": 184.0, "completions/min_terminated_length": 184.0, "entropy": 0.15034519881010056, "epoch": 0.04005340453938585, "frac_reward_zero_std": 0.0, "grad_norm": 7.950526239986127e-06, "learning_rate": 5e-06, "loss": -0.0, "num_tokens": 150443521.0, "reward": 0.7389928102493286, "reward_std": 0.32102087140083313, "rewards/TRLRewardAdapter/mean": 0.7389927506446838, "rewards/TRLRewardAdapter/std": 0.32102084159851074, "sampling/importance_sampling_ratio/max": 1.3730521202087402, "sampling/importance_sampling_ratio/mean": 0.03811652958393097, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 6.75, "sampling/sampling_logp_difference/mean": 0.019298991188406944, "step": 60, "step_time": 212.96853683702648 }, { "epoch": 0.04005340453938585, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.03739130330960388, "eval_completions/max_length": 10000.0, "eval_completions/max_terminated_length": 9600.95652173913, "eval_completions/mean_length": 2667.938423488451, "eval_completions/mean_terminated_length": 2383.2662831182065, "eval_completions/min_length": 160.65217391304347, "eval_completions/min_terminated_length": 160.65217391304347, "eval_entropy": 0.1669212009595788, "eval_frac_reward_zero_std": 1.0, "eval_loss": 0.0, "eval_num_tokens": 150443521.0, "eval_reward": 0.67734803583311, "eval_reward_std": 0.3569710267626721, "eval_rewards/TRLRewardAdapter/mean": 0.6773480487906415, "eval_rewards/TRLRewardAdapter/std": 0.3569710319456847, "eval_runtime": 1493.4407, "eval_samples_per_second": 3.057, "eval_sampling/importance_sampling_ratio/max": 0.7145140533861907, "eval_sampling/importance_sampling_ratio/mean": 0.03311446892178577, "eval_sampling/importance_sampling_ratio/min": 0.0, "eval_sampling/sampling_logp_difference/max": 13.698363573654838, "eval_sampling/sampling_logp_difference/mean": 0.020419338070180103, "eval_steps_per_second": 0.015, "step": 60 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02395833469927311, "completions/max_length": 10000.0, "completions/max_terminated_length": 9899.0, "completions/mean_length": 1248.53759765625, "completions/mean_terminated_length": 1033.7203369140625, "completions/min_length": 172.0, "completions/min_terminated_length": 172.0, "entropy": 0.14217028270165125, "epoch": 0.04072096128170895, "frac_reward_zero_std": 0.03333333507180214, "grad_norm": 0.00011796276124739197, "learning_rate": 5e-06, "loss": -0.0, "num_tokens": 152041157.0, "reward": 0.8696165084838867, "reward_std": 0.2162042260169983, "rewards/TRLRewardAdapter/mean": 0.8696164488792419, "rewards/TRLRewardAdapter/std": 0.21620424091815948, "sampling/importance_sampling_ratio/max": 1.257613182067871, "sampling/importance_sampling_ratio/mean": 0.037898264825344086, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 36.3840217590332, "sampling/sampling_logp_difference/mean": 0.01870436780154705, "step": 61, "step_time": 352.9036880597705 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00937500037252903, "completions/max_length": 10000.0, "completions/max_terminated_length": 9916.0, "completions/mean_length": 2146.447998046875, "completions/mean_terminated_length": 2072.1240234375, "completions/min_length": 198.0, "completions/min_terminated_length": 198.0, "entropy": 0.15932304908831915, "epoch": 0.04138851802403204, "frac_reward_zero_std": 0.06666667014360428, "grad_norm": 6.012904607620516e-05, "learning_rate": 5e-06, "loss": 0.0001, "num_tokens": 154526483.0, "reward": 0.6515312194824219, "reward_std": 0.39866989850997925, "rewards/TRLRewardAdapter/mean": 0.6515311598777771, "rewards/TRLRewardAdapter/std": 0.39866989850997925, "sampling/importance_sampling_ratio/max": 1.0056208372116089, "sampling/importance_sampling_ratio/mean": 0.02322417125105858, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 17.472009658813477, "sampling/sampling_logp_difference/mean": 0.019971273839473724, "step": 62, "step_time": 324.4705235080328 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01875000074505806, "completions/max_length": 10000.0, "completions/max_terminated_length": 9905.0, "completions/mean_length": 2294.6845703125, "completions/mean_terminated_length": 2147.44921875, "completions/min_length": 237.0, "completions/min_terminated_length": 237.0, "entropy": 0.16674247880776724, "epoch": 0.04205607476635514, "frac_reward_zero_std": 0.10000000894069672, "grad_norm": 1.1247042601455793e-05, "learning_rate": 5e-06, "loss": 0.0001, "num_tokens": 157165636.0, "reward": 0.6759431958198547, "reward_std": 0.3630845844745636, "rewards/TRLRewardAdapter/mean": 0.6759431958198547, "rewards/TRLRewardAdapter/std": 0.3630845844745636, "sampling/importance_sampling_ratio/max": 0.7682601809501648, "sampling/importance_sampling_ratio/mean": 0.023282865062355995, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 28.414257049560547, "sampling/sampling_logp_difference/mean": 0.020606715232133865, "step": 63, "step_time": 221.66806933598127 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02812500111758709, "completions/max_length": 10000.0, "completions/max_terminated_length": 9970.0, "completions/mean_length": 2445.265869140625, "completions/mean_terminated_length": 2226.6396484375, "completions/min_length": 222.0, "completions/min_terminated_length": 222.0, "entropy": 0.14454718679189682, "epoch": 0.042723631508678236, "frac_reward_zero_std": 0.03333333507180214, "grad_norm": 9.104722112966102e-06, "learning_rate": 5e-06, "loss": -0.0, "num_tokens": 160028803.0, "reward": 0.662563681602478, "reward_std": 0.36037781834602356, "rewards/TRLRewardAdapter/mean": 0.6625636219978333, "rewards/TRLRewardAdapter/std": 0.36037781834602356, "sampling/importance_sampling_ratio/max": 0.8027393221855164, "sampling/importance_sampling_ratio/mean": 0.013363425619900227, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 9.0, "sampling/sampling_logp_difference/mean": 0.01815587468445301, "step": 64, "step_time": 280.10217868199106 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03437500074505806, "completions/max_length": 10000.0, "completions/max_terminated_length": 9826.0, "completions/mean_length": 2168.016845703125, "completions/mean_terminated_length": 1889.2081298828125, "completions/min_length": 153.0, "completions/min_terminated_length": 153.0, "entropy": 0.16137456893920898, "epoch": 0.043391188251001335, "frac_reward_zero_std": 0.03333333507180214, "grad_norm": 9.484603977220246e-06, "learning_rate": 5e-06, "loss": -0.0, "num_tokens": 162517779.0, "reward": 0.7265400290489197, "reward_std": 0.35045552253723145, "rewards/TRLRewardAdapter/mean": 0.7265399694442749, "rewards/TRLRewardAdapter/std": 0.35045555233955383, "sampling/importance_sampling_ratio/max": 0.7286416888237, "sampling/importance_sampling_ratio/mean": 0.02126496285200119, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 54.20545196533203, "sampling/sampling_logp_difference/mean": 0.020290831103920937, "step": 65, "step_time": 394.74342836998403 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03333333507180214, "completions/max_length": 10000.0, "completions/max_terminated_length": 9995.0, "completions/mean_length": 2169.8271484375, "completions/mean_terminated_length": 1899.8211669921875, "completions/min_length": 158.0, "completions/min_terminated_length": 158.0, "entropy": 0.1651935875415802, "epoch": 0.044058744993324434, "frac_reward_zero_std": 0.0, "grad_norm": 6.361735125907562e-06, "learning_rate": 5e-06, "loss": -0.0, "num_tokens": 165016173.0, "reward": 0.7482596039772034, "reward_std": 0.3258177936077118, "rewards/TRLRewardAdapter/mean": 0.7482595443725586, "rewards/TRLRewardAdapter/std": 0.3258177936077118, "sampling/importance_sampling_ratio/max": 1.216974139213562, "sampling/importance_sampling_ratio/mean": 0.0362599641084671, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 57.5, "sampling/sampling_logp_difference/mean": 0.020856812596321106, "step": 66, "step_time": 387.46971798781306 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01875000074505806, "completions/max_length": 10000.0, "completions/max_terminated_length": 9463.0, "completions/mean_length": 1832.5927734375, "completions/mean_terminated_length": 1676.5277099609375, "completions/min_length": 194.0, "completions/min_terminated_length": 194.0, "entropy": 0.14711909492810568, "epoch": 0.04472630173564753, "frac_reward_zero_std": 0.0, "grad_norm": 7.522568048695197e-05, "learning_rate": 5e-06, "loss": -0.0001, "num_tokens": 167189894.0, "reward": 0.7589426636695862, "reward_std": 0.3333408534526825, "rewards/TRLRewardAdapter/mean": 0.7589426040649414, "rewards/TRLRewardAdapter/std": 0.3333408534526825, "sampling/importance_sampling_ratio/max": 1.1472727060317993, "sampling/importance_sampling_ratio/mean": 0.04496336355805397, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 6.0, "sampling/sampling_logp_difference/mean": 0.01889060251414776, "step": 67, "step_time": 335.93690255994443 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01770833507180214, "completions/max_length": 10000.0, "completions/max_terminated_length": 9939.0, "completions/mean_length": 2344.94287109375, "completions/mean_terminated_length": 2206.940673828125, "completions/min_length": 251.0, "completions/min_terminated_length": 251.0, "entropy": 0.13546809057394663, "epoch": 0.04539385847797063, "frac_reward_zero_std": 0.06666667014360428, "grad_norm": 6.317273326038305e-05, "learning_rate": 5e-06, "loss": -0.0, "num_tokens": 169885679.0, "reward": 0.6955230832099915, "reward_std": 0.35937732458114624, "rewards/TRLRewardAdapter/mean": 0.6955230832099915, "rewards/TRLRewardAdapter/std": 0.35937732458114624, "sampling/importance_sampling_ratio/max": 0.8100619316101074, "sampling/importance_sampling_ratio/mean": 0.030552079901099205, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 18.406776428222656, "sampling/sampling_logp_difference/mean": 0.017371172085404396, "step": 68, "step_time": 334.10541500896215 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02291666716337204, "completions/max_length": 10000.0, "completions/max_terminated_length": 9950.0, "completions/mean_length": 2119.266845703125, "completions/mean_terminated_length": 1934.4307861328125, "completions/min_length": 162.0, "completions/min_terminated_length": 162.0, "entropy": 0.14877265691757202, "epoch": 0.04606141522029372, "frac_reward_zero_std": 0.06666667014360428, "grad_norm": 1.587541589768494e-05, "learning_rate": 5e-06, "loss": -0.0, "num_tokens": 172301615.0, "reward": 0.7075679898262024, "reward_std": 0.36388370394706726, "rewards/TRLRewardAdapter/mean": 0.7075679898262024, "rewards/TRLRewardAdapter/std": 0.3638836741447449, "sampling/importance_sampling_ratio/max": 1.2501400709152222, "sampling/importance_sampling_ratio/mean": 0.020456012338399887, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 43.0, "sampling/sampling_logp_difference/mean": 0.018799128010869026, "step": 69, "step_time": 304.7129064288456 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.05104167014360428, "completions/max_length": 10000.0, "completions/max_terminated_length": 9988.0, "completions/mean_length": 2536.557373046875, "completions/mean_terminated_length": 2135.120849609375, "completions/min_length": 173.0, "completions/min_terminated_length": 173.0, "entropy": 0.14444882422685623, "epoch": 0.04672897196261682, "frac_reward_zero_std": 0.03333333507180214, "grad_norm": 2.558528094589886e-05, "learning_rate": 5e-06, "loss": -0.0, "num_tokens": 175144102.0, "reward": 0.6613590121269226, "reward_std": 0.3791937828063965, "rewards/TRLRewardAdapter/mean": 0.6613589525222778, "rewards/TRLRewardAdapter/std": 0.3791937530040741, "sampling/importance_sampling_ratio/max": 1.233285903930664, "sampling/importance_sampling_ratio/mean": 0.05510319396853447, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 25.164257049560547, "sampling/sampling_logp_difference/mean": 0.018613969907164574, "step": 70, "step_time": 325.7512485139305 }, { "epoch": 0.04672897196261682, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.035652172630247864, "eval_completions/max_length": 10000.0, "eval_completions/max_terminated_length": 9774.565217391304, "eval_completions/mean_length": 2658.926460597826, "eval_completions/mean_terminated_length": 2387.5854173743205, "eval_completions/min_length": 157.34782608695653, "eval_completions/min_terminated_length": 157.34782608695653, "eval_entropy": 0.16580300616181415, "eval_frac_reward_zero_std": 1.0, "eval_loss": 0.0, "eval_num_tokens": 175144102.0, "eval_reward": 0.6802333904349286, "eval_reward_std": 0.3551672697067261, "eval_rewards/TRLRewardAdapter/mean": 0.6802334008009537, "eval_rewards/TRLRewardAdapter/std": 0.35516727229823236, "eval_runtime": 1476.0903, "eval_samples_per_second": 3.093, "eval_sampling/importance_sampling_ratio/max": 0.8909386538940928, "eval_sampling/importance_sampling_ratio/mean": 0.03523914305412251, "eval_sampling/importance_sampling_ratio/min": 0.0, "eval_sampling/sampling_logp_difference/max": 14.83068623750106, "eval_sampling/sampling_logp_difference/mean": 0.020305679058251175, "eval_steps_per_second": 0.016, "step": 70 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0260416679084301, "completions/max_length": 10000.0, "completions/max_terminated_length": 9972.0, "completions/mean_length": 2838.899169921875, "completions/mean_terminated_length": 2647.42578125, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.14133755614360174, "epoch": 0.04739652870493992, "frac_reward_zero_std": 0.06666667014360428, "grad_norm": 5.932590626603191e-06, "learning_rate": 5e-06, "loss": -0.0, "num_tokens": 178317957.0, "reward": 0.6400629878044128, "reward_std": 0.3772102892398834, "rewards/TRLRewardAdapter/mean": 0.6400629878044128, "rewards/TRLRewardAdapter/std": 0.3772103190422058, "sampling/importance_sampling_ratio/max": 1.1096084117889404, "sampling/importance_sampling_ratio/mean": 0.031690943986177444, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 45.75, "sampling/sampling_logp_difference/mean": 0.01782938465476036, "step": 71, "step_time": 333.26652249414474 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.06145833805203438, "completions/max_length": 10000.0, "completions/max_terminated_length": 9893.0, "completions/mean_length": 2085.52099609375, "completions/mean_terminated_length": 1567.2586669921875, "completions/min_length": 173.0, "completions/min_terminated_length": 173.0, "entropy": 0.13386343171199164, "epoch": 0.04806408544726302, "frac_reward_zero_std": 0.10000000894069672, "grad_norm": 7.965056986359468e-06, "learning_rate": 5e-06, "loss": -0.0, "num_tokens": 180740761.0, "reward": 0.7376719117164612, "reward_std": 0.34587788581848145, "rewards/TRLRewardAdapter/mean": 0.7376718521118164, "rewards/TRLRewardAdapter/std": 0.34587791562080383, "sampling/importance_sampling_ratio/max": 1.2526075839996338, "sampling/importance_sampling_ratio/mean": 0.04742979630827904, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 18.5, "sampling/sampling_logp_difference/mean": 0.017150117084383965, "step": 72, "step_time": 392.9166680871276 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03437500074505806, "completions/max_length": 10000.0, "completions/max_terminated_length": 9965.0, "completions/mean_length": 1862.682373046875, "completions/mean_terminated_length": 1573.0042724609375, "completions/min_length": 285.0, "completions/min_terminated_length": 285.0, "entropy": 0.14536099632581076, "epoch": 0.048731642189586116, "frac_reward_zero_std": 0.06666667014360428, "grad_norm": 7.868526214105219e-05, "learning_rate": 5e-06, "loss": 0.0, "num_tokens": 182941864.0, "reward": 0.75994473695755, "reward_std": 0.3328167200088501, "rewards/TRLRewardAdapter/mean": 0.75994473695755, "rewards/TRLRewardAdapter/std": 0.3328167200088501, "sampling/importance_sampling_ratio/max": 0.8337895274162292, "sampling/importance_sampling_ratio/mean": 0.02442414127290249, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 14.5, "sampling/sampling_logp_difference/mean": 0.01881137304008007, "step": 73, "step_time": 387.70367073616944 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.06666667014360428, "completions/max_length": 10000.0, "completions/max_terminated_length": 9970.0, "completions/mean_length": 2783.557373046875, "completions/mean_terminated_length": 2268.09716796875, "completions/min_length": 163.0, "completions/min_terminated_length": 163.0, "entropy": 0.14974276721477509, "epoch": 0.049399198931909215, "frac_reward_zero_std": 0.1666666716337204, "grad_norm": 2.7934706240441464e-05, "learning_rate": 5e-06, "loss": -0.0, "num_tokens": 186062463.0, "reward": 0.6291797757148743, "reward_std": 0.4120813012123108, "rewards/TRLRewardAdapter/mean": 0.6291797757148743, "rewards/TRLRewardAdapter/std": 0.4120812714099884, "sampling/importance_sampling_ratio/max": 1.3157711029052734, "sampling/importance_sampling_ratio/mean": 0.028286224231123924, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 27.0, "sampling/sampling_logp_difference/mean": 0.019144143909215927, "step": 74, "step_time": 391.8015195840271 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02708333544433117, "completions/max_length": 10000.0, "completions/max_terminated_length": 9806.0, "completions/mean_length": 1852.5948486328125, "completions/mean_terminated_length": 1625.7933349609375, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "entropy": 0.13722233722607294, "epoch": 0.050066755674232306, "frac_reward_zero_std": 0.03333333507180214, "grad_norm": 1.5913177579267e-05, "learning_rate": 5e-06, "loss": -0.0, "num_tokens": 188298138.0, "reward": 0.7668462991714478, "reward_std": 0.32614240050315857, "rewards/TRLRewardAdapter/mean": 0.7668462991714478, "rewards/TRLRewardAdapter/std": 0.3261423707008362, "sampling/importance_sampling_ratio/max": 0.911151647567749, "sampling/importance_sampling_ratio/mean": 0.04735877364873886, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 10.887245178222656, "sampling/sampling_logp_difference/mean": 0.017750313505530357, "step": 75, "step_time": 291.1565871370258 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03020833432674408, "completions/max_length": 10000.0, "completions/max_terminated_length": 9973.0, "completions/mean_length": 1658.6251220703125, "completions/mean_terminated_length": 1398.7969970703125, "completions/min_length": 233.0, "completions/min_terminated_length": 233.0, "entropy": 0.1412949115037918, "epoch": 0.050734312416555405, "frac_reward_zero_std": 0.03333333507180214, "grad_norm": 5.19330086053826e-06, "learning_rate": 5e-06, "loss": -0.0, "num_tokens": 190327858.0, "reward": 0.7990599274635315, "reward_std": 0.2879991829395294, "rewards/TRLRewardAdapter/mean": 0.7990598678588867, "rewards/TRLRewardAdapter/std": 0.2879991829395294, "sampling/importance_sampling_ratio/max": 0.6500809192657471, "sampling/importance_sampling_ratio/mean": 0.035903919488191605, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 56.65446472167969, "sampling/sampling_logp_difference/mean": 0.018143681809306145, "step": 76, "step_time": 396.20533115707804 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03229166939854622, "completions/max_length": 10000.0, "completions/max_terminated_length": 9825.0, "completions/mean_length": 1830.197998046875, "completions/mean_terminated_length": 1557.5780029296875, "completions/min_length": 194.0, "completions/min_terminated_length": 194.0, "entropy": 0.13402414073546728, "epoch": 0.0514018691588785, "frac_reward_zero_std": 0.03333333507180214, "grad_norm": 0.00010048087389925611, "learning_rate": 5e-06, "loss": 0.0001, "num_tokens": 192515728.0, "reward": 0.7267057299613953, "reward_std": 0.35491371154785156, "rewards/TRLRewardAdapter/mean": 0.7267056703567505, "rewards/TRLRewardAdapter/std": 0.35491371154785156, "sampling/importance_sampling_ratio/max": 0.7732408046722412, "sampling/importance_sampling_ratio/mean": 0.03547185659408569, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 21.934267044067383, "sampling/sampling_logp_difference/mean": 0.017406335100531578, "step": 77, "step_time": 377.71342872513924 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02708333544433117, "completions/max_length": 10000.0, "completions/max_terminated_length": 9998.0, "completions/mean_length": 1914.8958740234375, "completions/mean_terminated_length": 1689.82861328125, "completions/min_length": 162.0, "completions/min_terminated_length": 162.0, "entropy": 0.1489210178454717, "epoch": 0.0520694259012016, "frac_reward_zero_std": 0.0, "grad_norm": 4.468677719056663e-06, "learning_rate": 5e-06, "loss": -0.0, "num_tokens": 194791436.0, "reward": 0.7274760603904724, "reward_std": 0.35954663157463074, "rewards/TRLRewardAdapter/mean": 0.7274760007858276, "rewards/TRLRewardAdapter/std": 0.35954660177230835, "sampling/importance_sampling_ratio/max": 0.9771834015846252, "sampling/importance_sampling_ratio/mean": 0.03937491029500961, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 46.0, "sampling/sampling_logp_difference/mean": 0.01920836977660656, "step": 78, "step_time": 368.25277722009923 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.06145833805203438, "completions/max_length": 10000.0, "completions/max_terminated_length": 9939.0, "completions/mean_length": 2511.95947265625, "completions/mean_terminated_length": 2021.62158203125, "completions/min_length": 230.0, "completions/min_terminated_length": 230.0, "entropy": 0.16210116694370905, "epoch": 0.0527369826435247, "frac_reward_zero_std": 0.1666666716337204, "grad_norm": 0.00013130615500437664, "learning_rate": 5e-06, "loss": -0.0, "num_tokens": 197624933.0, "reward": 0.6340721249580383, "reward_std": 0.39811307191848755, "rewards/TRLRewardAdapter/mean": 0.6340721249580383, "rewards/TRLRewardAdapter/std": 0.39811310172080994, "sampling/importance_sampling_ratio/max": 1.1340045928955078, "sampling/importance_sampling_ratio/mean": 0.01982637494802475, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 25.906387329101562, "sampling/sampling_logp_difference/mean": 0.020099647343158722, "step": 79, "step_time": 379.0357849432621 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02708333544433117, "completions/max_length": 10000.0, "completions/max_terminated_length": 9924.0, "completions/mean_length": 1992.424072265625, "completions/mean_terminated_length": 1769.5150146484375, "completions/min_length": 148.0, "completions/min_terminated_length": 148.0, "entropy": 0.1427620400985082, "epoch": 0.0534045393858478, "frac_reward_zero_std": 0.03333333507180214, "grad_norm": 0.0002168943309472499, "learning_rate": 5e-06, "loss": 0.0002, "num_tokens": 199950684.0, "reward": 0.729174017906189, "reward_std": 0.33642566204071045, "rewards/TRLRewardAdapter/mean": 0.7291739583015442, "rewards/TRLRewardAdapter/std": 0.33642569184303284, "sampling/importance_sampling_ratio/max": 1.0314520597457886, "sampling/importance_sampling_ratio/mean": 0.03351456671953201, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 51.5, "sampling/sampling_logp_difference/mean": 0.018346285447478294, "step": 80, "step_time": 302.3161868050229 }, { "epoch": 0.0534045393858478, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.03782608582759681, "eval_completions/max_length": 10000.0, "eval_completions/max_terminated_length": 9701.130434782608, "eval_completions/mean_length": 2679.2797108525815, "eval_completions/mean_terminated_length": 2391.40038001019, "eval_completions/min_length": 159.17391304347825, "eval_completions/min_terminated_length": 159.17391304347825, "eval_entropy": 0.16493160504361856, "eval_frac_reward_zero_std": 1.0, "eval_loss": 0.0, "eval_num_tokens": 199950684.0, "eval_reward": 0.676997941473256, "eval_reward_std": 0.35801317121671594, "eval_rewards/TRLRewardAdapter/mean": 0.6769979647968126, "eval_rewards/TRLRewardAdapter/std": 0.35801316862520965, "eval_runtime": 1498.2076, "eval_samples_per_second": 3.048, "eval_sampling/importance_sampling_ratio/max": 0.8462197430755781, "eval_sampling/importance_sampling_ratio/mean": 0.034992810662673866, "eval_sampling/importance_sampling_ratio/min": 0.0, "eval_sampling/sampling_logp_difference/max": 11.06032337313113, "eval_sampling/sampling_logp_difference/mean": 0.02026832800196565, "eval_steps_per_second": 0.015, "step": 80 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 10000.0, "completions/max_terminated_length": 9745.0, "completions/mean_length": 2265.08544921875, "completions/mean_terminated_length": 2142.30908203125, "completions/min_length": 304.0, "completions/min_terminated_length": 304.0, "entropy": 0.151587908466657, "epoch": 0.0540720961281709, "frac_reward_zero_std": 0.0, "grad_norm": 7.060776327383176e-06, "learning_rate": 5e-06, "loss": -0.0, "num_tokens": 202622350.0, "reward": 0.7550315856933594, "reward_std": 0.275991827249527, "rewards/TRLRewardAdapter/mean": 0.7550315260887146, "rewards/TRLRewardAdapter/std": 0.275991827249527, "sampling/importance_sampling_ratio/max": 0.48406749963760376, "sampling/importance_sampling_ratio/mean": 0.007976965978741646, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 7.637245178222656, "sampling/sampling_logp_difference/mean": 0.01868729665875435, "step": 81, "step_time": 333.47204646188766 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.05000000447034836, "completions/max_length": 10000.0, "completions/max_terminated_length": 9859.0, "completions/mean_length": 2660.248046875, "completions/mean_terminated_length": 2273.9453125, "completions/min_length": 258.0, "completions/min_terminated_length": 258.0, "entropy": 0.16254321485757828, "epoch": 0.05473965287049399, "frac_reward_zero_std": 0.06666667014360428, "grad_norm": 5.095605124309459e-06, "learning_rate": 5e-06, "loss": 0.0001, "num_tokens": 205656860.0, "reward": 0.5811673998832703, "reward_std": 0.40137696266174316, "rewards/TRLRewardAdapter/mean": 0.5811673402786255, "rewards/TRLRewardAdapter/std": 0.40137696266174316, "sampling/importance_sampling_ratio/max": 0.38509440422058105, "sampling/importance_sampling_ratio/mean": 0.009860657155513763, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 26.0, "sampling/sampling_logp_difference/mean": 0.02017413079738617, "step": 82, "step_time": 399.52434419374913 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.07604166865348816, "completions/max_length": 10000.0, "completions/max_terminated_length": 9966.0, "completions/mean_length": 3018.701171875, "completions/mean_terminated_length": 2444.140869140625, "completions/min_length": 302.0, "completions/min_terminated_length": 302.0, "entropy": 0.15005473295847574, "epoch": 0.05540720961281709, "frac_reward_zero_std": 0.03333333507180214, "grad_norm": 3.871189319652983e-06, "learning_rate": 5e-06, "loss": 0.0, "num_tokens": 209039613.0, "reward": 0.601599931716919, "reward_std": 0.39142748713493347, "rewards/TRLRewardAdapter/mean": 0.6015998721122742, "rewards/TRLRewardAdapter/std": 0.39142748713493347, "sampling/importance_sampling_ratio/max": 0.5469470620155334, "sampling/importance_sampling_ratio/mean": 0.008920359425246716, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 60.263919830322266, "sampling/sampling_logp_difference/mean": 0.019194064661860466, "step": 83, "step_time": 393.4791493578814 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03020833432674408, "completions/max_length": 10000.0, "completions/max_terminated_length": 9969.0, "completions/mean_length": 2780.466796875, "completions/mean_terminated_length": 2555.583251953125, "completions/min_length": 218.0, "completions/min_terminated_length": 218.0, "entropy": 0.15767975399891535, "epoch": 0.056074766355140186, "frac_reward_zero_std": 0.06666667014360428, "grad_norm": 2.7980090774832705e-05, "learning_rate": 5e-06, "loss": -0.0, "num_tokens": 212235453.0, "reward": 0.61905837059021, "reward_std": 0.39095404744148254, "rewards/TRLRewardAdapter/mean": 0.61905837059021, "rewards/TRLRewardAdapter/std": 0.39095401763916016, "sampling/importance_sampling_ratio/max": 0.484703004360199, "sampling/importance_sampling_ratio/mean": 0.013957545161247253, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 30.75, "sampling/sampling_logp_difference/mean": 0.01983979530632496, "step": 84, "step_time": 303.7903692849213 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.05416667088866234, "completions/max_length": 10000.0, "completions/max_terminated_length": 9989.0, "completions/mean_length": 2359.180419921875, "completions/mean_terminated_length": 1921.60009765625, "completions/min_length": 216.0, "completions/min_terminated_length": 216.0, "entropy": 0.13438516110181808, "epoch": 0.056742323097463285, "frac_reward_zero_std": 0.13333334028720856, "grad_norm": 5.272938029575077e-05, "learning_rate": 5e-06, "loss": -0.0, "num_tokens": 214936362.0, "reward": 0.6936622858047485, "reward_std": 0.3909655809402466, "rewards/TRLRewardAdapter/mean": 0.6936622858047485, "rewards/TRLRewardAdapter/std": 0.3909655809402466, "sampling/importance_sampling_ratio/max": 1.720121145248413, "sampling/importance_sampling_ratio/mean": 0.05531347915530205, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 12.0, "sampling/sampling_logp_difference/mean": 0.017275039106607437, "step": 85, "step_time": 385.5425960799912 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.05104167014360428, "completions/max_length": 10000.0, "completions/max_terminated_length": 9983.0, "completions/mean_length": 2700.42822265625, "completions/mean_terminated_length": 2307.8056640625, "completions/min_length": 191.0, "completions/min_terminated_length": 191.0, "entropy": 0.13827123244603476, "epoch": 0.05740987983978638, "frac_reward_zero_std": 0.03333333507180214, "grad_norm": 4.56224102481842e-06, "learning_rate": 5e-06, "loss": -0.0, "num_tokens": 217973285.0, "reward": 0.6207055449485779, "reward_std": 0.3931494951248169, "rewards/TRLRewardAdapter/mean": 0.6207055449485779, "rewards/TRLRewardAdapter/std": 0.3931494951248169, "sampling/importance_sampling_ratio/max": 2.220673084259033, "sampling/importance_sampling_ratio/mean": 0.03011242114007473, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 9.25, "sampling/sampling_logp_difference/mean": 0.017564833164215088, "step": 86, "step_time": 370.1403947339859 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03854166716337204, "completions/max_length": 10000.0, "completions/max_terminated_length": 9878.0, "completions/mean_length": 2241.466796875, "completions/mean_terminated_length": 1930.452880859375, "completions/min_length": 139.0, "completions/min_terminated_length": 139.0, "entropy": 0.13359994689623514, "epoch": 0.05807743658210948, "frac_reward_zero_std": 0.06666667014360428, "grad_norm": 3.888901332348103e-06, "learning_rate": 5e-06, "loss": -0.0, "num_tokens": 220593797.0, "reward": 0.7682173848152161, "reward_std": 0.3108425736427307, "rewards/TRLRewardAdapter/mean": 0.7682173252105713, "rewards/TRLRewardAdapter/std": 0.3108425736427307, "sampling/importance_sampling_ratio/max": 1.9385888576507568, "sampling/importance_sampling_ratio/mean": 0.0405079685151577, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 55.460601806640625, "sampling/sampling_logp_difference/mean": 0.016870247200131416, "step": 87, "step_time": 382.6960062190192 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.06041666865348816, "completions/max_length": 10000.0, "completions/max_terminated_length": 9941.0, "completions/mean_length": 2631.3427734375, "completions/mean_terminated_length": 2157.526611328125, "completions/min_length": 210.0, "completions/min_terminated_length": 210.0, "entropy": 0.1350964034597079, "epoch": 0.05874499332443257, "frac_reward_zero_std": 0.0, "grad_norm": 1.0511186518822007e-05, "learning_rate": 5e-06, "loss": -0.0, "num_tokens": 223556110.0, "reward": 0.6751776933670044, "reward_std": 0.36552193760871887, "rewards/TRLRewardAdapter/mean": 0.6751776337623596, "rewards/TRLRewardAdapter/std": 0.3655219078063965, "sampling/importance_sampling_ratio/max": 0.9123778343200684, "sampling/importance_sampling_ratio/mean": 0.024781782180070877, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 15.094983100891113, "sampling/sampling_logp_difference/mean": 0.01719815842807293, "step": 88, "step_time": 344.6582413249416 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01770833507180214, "completions/max_length": 10000.0, "completions/max_terminated_length": 9845.0, "completions/mean_length": 2480.721923828125, "completions/mean_terminated_length": 2345.167724609375, "completions/min_length": 238.0, "completions/min_terminated_length": 238.0, "entropy": 0.16375661889712015, "epoch": 0.05941255006675567, "frac_reward_zero_std": 0.03333333507180214, "grad_norm": 5.3529333004646694e-05, "learning_rate": 5e-06, "loss": -0.0, "num_tokens": 226377891.0, "reward": 0.6971970796585083, "reward_std": 0.3449723422527313, "rewards/TRLRewardAdapter/mean": 0.6971970200538635, "rewards/TRLRewardAdapter/std": 0.34497231245040894, "sampling/importance_sampling_ratio/max": 0.5510303378105164, "sampling/importance_sampling_ratio/mean": 0.010871204547584057, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 31.875, "sampling/sampling_logp_difference/mean": 0.020559627562761307, "step": 89, "step_time": 299.1213160697371 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0031250002793967724, "completions/max_length": 10000.0, "completions/max_terminated_length": 9234.0, "completions/mean_length": 2003.72509765625, "completions/mean_terminated_length": 1978.658203125, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "entropy": 0.1286503771940867, "epoch": 0.06008010680907877, "frac_reward_zero_std": 0.0, "grad_norm": 5.955417409852086e-06, "learning_rate": 5e-06, "loss": -0.0, "num_tokens": 228729659.0, "reward": 0.7722655534744263, "reward_std": 0.2801056206226349, "rewards/TRLRewardAdapter/mean": 0.7722654938697815, "rewards/TRLRewardAdapter/std": 0.2801056206226349, "sampling/importance_sampling_ratio/max": 1.350596308708191, "sampling/importance_sampling_ratio/mean": 0.0644492581486702, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 10.25, "sampling/sampling_logp_difference/mean": 0.016756439581513405, "step": 90, "step_time": 212.41032147884835 }, { "epoch": 0.06008010680907877, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.034999998731781605, "eval_completions/max_length": 10000.0, "eval_completions/max_terminated_length": 9671.0, "eval_completions/mean_length": 2677.3153978430705, "eval_completions/mean_terminated_length": 2411.456925101902, "eval_completions/min_length": 156.8695652173913, "eval_completions/min_terminated_length": 156.8695652173913, "eval_entropy": 0.1629978165678356, "eval_frac_reward_zero_std": 1.0, "eval_loss": 0.0, "eval_num_tokens": 228729659.0, "eval_reward": 0.6752638428107552, "eval_reward_std": 0.3557957281237063, "eval_rewards/TRLRewardAdapter/mean": 0.6752638557682866, "eval_rewards/TRLRewardAdapter/std": 0.3557957307152126, "eval_runtime": 1474.3121, "eval_samples_per_second": 3.097, "eval_sampling/importance_sampling_ratio/max": 0.9181544093982034, "eval_sampling/importance_sampling_ratio/mean": 0.033565782415478126, "eval_sampling/importance_sampling_ratio/min": 0.0, "eval_sampling/sampling_logp_difference/max": 13.76831807260928, "eval_sampling/sampling_logp_difference/mean": 0.0200790551531574, "eval_steps_per_second": 0.016, "step": 90 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03958333656191826, "completions/max_length": 10000.0, "completions/max_terminated_length": 9949.0, "completions/mean_length": 2197.25634765625, "completions/mean_terminated_length": 1875.6680908203125, "completions/min_length": 204.0, "completions/min_terminated_length": 204.0, "entropy": 0.1458988959590594, "epoch": 0.06074766355140187, "frac_reward_zero_std": 0.03333333507180214, "grad_norm": 0.000137848705519037, "learning_rate": 5e-06, "loss": -0.0001, "num_tokens": 231320945.0, "reward": 0.7134481072425842, "reward_std": 0.3452042043209076, "rewards/TRLRewardAdapter/mean": 0.7134481072425842, "rewards/TRLRewardAdapter/std": 0.3452042043209076, "sampling/importance_sampling_ratio/max": 1.5020859241485596, "sampling/importance_sampling_ratio/mean": 0.05105439946055412, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 59.806854248046875, "sampling/sampling_logp_difference/mean": 0.018633682280778885, "step": 91, "step_time": 288.9141471219482 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02395833469927311, "completions/max_length": 10000.0, "completions/max_terminated_length": 9993.0, "completions/mean_length": 2256.25, "completions/mean_terminated_length": 2066.16845703125, "completions/min_length": 160.0, "completions/min_terminated_length": 160.0, "entropy": 0.1421996826926867, "epoch": 0.06141522029372497, "frac_reward_zero_std": 0.06666667014360428, "grad_norm": 1.1716244095977037e-05, "learning_rate": 5e-06, "loss": -0.0, "num_tokens": 233970337.0, "reward": 0.7449586391448975, "reward_std": 0.3284967541694641, "rewards/TRLRewardAdapter/mean": 0.7449586391448975, "rewards/TRLRewardAdapter/std": 0.3284967541694641, "sampling/importance_sampling_ratio/max": 1.6878645420074463, "sampling/importance_sampling_ratio/mean": 0.04522738605737686, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 63.0, "sampling/sampling_logp_difference/mean": 0.018111854791641235, "step": 92, "step_time": 278.5600901239086 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.04791666939854622, "completions/max_length": 10000.0, "completions/max_terminated_length": 9875.0, "completions/mean_length": 2459.331298828125, "completions/mean_terminated_length": 2079.82275390625, "completions/min_length": 189.0, "completions/min_terminated_length": 189.0, "entropy": 0.14456617087125778, "epoch": 0.062082777036048066, "frac_reward_zero_std": 0.03333333507180214, "grad_norm": 1.7555137226421894e-05, "learning_rate": 5e-06, "loss": -0.0, "num_tokens": 236838047.0, "reward": 0.7351071834564209, "reward_std": 0.31546133756637573, "rewards/TRLRewardAdapter/mean": 0.7351071834564209, "rewards/TRLRewardAdapter/std": 0.31546130776405334, "sampling/importance_sampling_ratio/max": 1.260790228843689, "sampling/importance_sampling_ratio/mean": 0.05147914960980415, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 22.213584899902344, "sampling/sampling_logp_difference/mean": 0.018336139619350433, "step": 93, "step_time": 366.3280927699525 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01250000111758709, "completions/max_length": 10000.0, "completions/max_terminated_length": 9590.0, "completions/mean_length": 2000.5897216796875, "completions/mean_terminated_length": 1899.3311767578125, "completions/min_length": 189.0, "completions/min_terminated_length": 189.0, "entropy": 0.1426944062113762, "epoch": 0.06275033377837116, "frac_reward_zero_std": 0.06666667014360428, "grad_norm": 4.242106291441243e-06, "learning_rate": 5e-06, "loss": -0.0, "num_tokens": 239214261.0, "reward": 0.7551478743553162, "reward_std": 0.3180229067802429, "rewards/TRLRewardAdapter/mean": 0.7551478147506714, "rewards/TRLRewardAdapter/std": 0.3180229067802429, "sampling/importance_sampling_ratio/max": 1.5415446758270264, "sampling/importance_sampling_ratio/mean": 0.05270615220069885, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 7.8449835777282715, "sampling/sampling_logp_difference/mean": 0.018265053629875183, "step": 94, "step_time": 248.01226840086747 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.06354167312383652, "completions/max_length": 10000.0, "completions/max_terminated_length": 9996.0, "completions/mean_length": 2716.19189453125, "completions/mean_terminated_length": 2221.962158203125, "completions/min_length": 166.0, "completions/min_terminated_length": 166.0, "entropy": 0.14077825844287872, "epoch": 0.06341789052069426, "frac_reward_zero_std": 0.10000000894069672, "grad_norm": 5.2035045984615976e-05, "learning_rate": 5e-06, "loss": 0.0, "num_tokens": 242301997.0, "reward": 0.6847456693649292, "reward_std": 0.37937992811203003, "rewards/TRLRewardAdapter/mean": 0.6847456693649292, "rewards/TRLRewardAdapter/std": 0.37937992811203003, "sampling/importance_sampling_ratio/max": 1.245941400527954, "sampling/importance_sampling_ratio/mean": 0.06092015653848648, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 12.112422943115234, "sampling/sampling_logp_difference/mean": 0.018098922446370125, "step": 95, "step_time": 337.14323221624363 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01458333432674408, "completions/max_length": 10000.0, "completions/max_terminated_length": 9862.0, "completions/mean_length": 1858.39697265625, "completions/mean_terminated_length": 1737.907958984375, "completions/min_length": 173.0, "completions/min_terminated_length": 173.0, "entropy": 0.14471669495105743, "epoch": 0.06408544726301736, "frac_reward_zero_std": 0.03333333507180214, "grad_norm": 1.409973561812695e-05, "learning_rate": 5e-06, "loss": 0.0, "num_tokens": 244520074.0, "reward": 0.7525879144668579, "reward_std": 0.337872177362442, "rewards/TRLRewardAdapter/mean": 0.7525879144668579, "rewards/TRLRewardAdapter/std": 0.3378722071647644, "sampling/importance_sampling_ratio/max": 1.8474104404449463, "sampling/importance_sampling_ratio/mean": 0.0300491563975811, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 8.25, "sampling/sampling_logp_difference/mean": 0.018526067957282066, "step": 96, "step_time": 277.3910303299781 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0416666679084301, "completions/max_length": 10000.0, "completions/max_terminated_length": 9970.0, "completions/mean_length": 2651.52294921875, "completions/mean_terminated_length": 2332.02392578125, "completions/min_length": 152.0, "completions/min_terminated_length": 152.0, "entropy": 0.1383460263411204, "epoch": 0.06475300400534045, "frac_reward_zero_std": 0.20000001788139343, "grad_norm": 2.5806449403262047e-05, "learning_rate": 5e-06, "loss": 0.0, "num_tokens": 247544096.0, "reward": 0.5895397663116455, "reward_std": 0.4292527139186859, "rewards/TRLRewardAdapter/mean": 0.5895397067070007, "rewards/TRLRewardAdapter/std": 0.4292527139186859, "sampling/importance_sampling_ratio/max": 0.6505938172340393, "sampling/importance_sampling_ratio/mean": 0.028609365224838257, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 32.0, "sampling/sampling_logp_difference/mean": 0.01774783618748188, "step": 97, "step_time": 362.7196400988614 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0010416667209938169, "completions/max_length": 10000.0, "completions/max_terminated_length": 9263.0, "completions/mean_length": 2190.883544921875, "completions/mean_terminated_length": 2182.740478515625, "completions/min_length": 277.0, "completions/min_terminated_length": 277.0, "entropy": 0.16144517560799918, "epoch": 0.06542056074766354, "frac_reward_zero_std": 0.0, "grad_norm": 2.2735210949301408e-05, "learning_rate": 5e-06, "loss": -0.0, "num_tokens": 250080496.0, "reward": 0.7399779558181763, "reward_std": 0.3022691309452057, "rewards/TRLRewardAdapter/mean": 0.7399778962135315, "rewards/TRLRewardAdapter/std": 0.3022691309452057, "sampling/importance_sampling_ratio/max": 0.39036208391189575, "sampling/importance_sampling_ratio/mean": 0.014772331342101097, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 14.75, "sampling/sampling_logp_difference/mean": 0.020279688760638237, "step": 98, "step_time": 240.20208841201384 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.04375000298023224, "completions/max_length": 10000.0, "completions/max_terminated_length": 9891.0, "completions/mean_length": 2643.091796875, "completions/mean_terminated_length": 2306.5009765625, "completions/min_length": 236.0, "completions/min_terminated_length": 236.0, "entropy": 0.15731064726909003, "epoch": 0.06608811748998665, "frac_reward_zero_std": 0.06666667014360428, "grad_norm": 5.139668412573616e-05, "learning_rate": 5e-06, "loss": -0.0, "num_tokens": 253047464.0, "reward": 0.6131888628005981, "reward_std": 0.4177123010158539, "rewards/TRLRewardAdapter/mean": 0.6131888031959534, "rewards/TRLRewardAdapter/std": 0.4177123010158539, "sampling/importance_sampling_ratio/max": 1.3352314233779907, "sampling/importance_sampling_ratio/mean": 0.025230878964066505, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 16.63401985168457, "sampling/sampling_logp_difference/mean": 0.02000582590699196, "step": 99, "step_time": 373.0483907449525 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02395833469927311, "completions/max_length": 10000.0, "completions/max_terminated_length": 9891.0, "completions/mean_length": 2302.0908203125, "completions/mean_terminated_length": 2113.134521484375, "completions/min_length": 176.0, "completions/min_terminated_length": 176.0, "entropy": 0.1636600742737452, "epoch": 0.06675567423230974, "frac_reward_zero_std": 0.03333333507180214, "grad_norm": 4.361874912003742e-06, "learning_rate": 5e-06, "loss": -0.0, "num_tokens": 255660415.0, "reward": 0.6913338303565979, "reward_std": 0.3497500419616699, "rewards/TRLRewardAdapter/mean": 0.6913337707519531, "rewards/TRLRewardAdapter/std": 0.34975001215934753, "sampling/importance_sampling_ratio/max": 1.200569987297058, "sampling/importance_sampling_ratio/mean": 0.017374275252223015, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 11.75, "sampling/sampling_logp_difference/mean": 0.020559554919600487, "step": 100, "step_time": 268.73075630003586 }, { "epoch": 0.06675567423230974, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.0354347816546974, "eval_completions/max_length": 10000.0, "eval_completions/max_terminated_length": 9721.391304347826, "eval_completions/mean_length": 2661.774955417799, "eval_completions/mean_terminated_length": 2391.7133152173915, "eval_completions/min_length": 153.8695652173913, "eval_completions/min_terminated_length": 153.8695652173913, "eval_entropy": 0.16317026576270227, "eval_frac_reward_zero_std": 1.0, "eval_loss": 0.0, "eval_num_tokens": 255660415.0, "eval_reward": 0.6823054215182429, "eval_reward_std": 0.3529604686343152, "eval_rewards/TRLRewardAdapter/mean": 0.682305439658787, "eval_rewards/TRLRewardAdapter/std": 0.35296047122582147, "eval_runtime": 1500.1814, "eval_samples_per_second": 3.044, "eval_sampling/importance_sampling_ratio/max": 0.8863680012848066, "eval_sampling/importance_sampling_ratio/mean": 0.03699261201140673, "eval_sampling/importance_sampling_ratio/min": 0.0, "eval_sampling/sampling_logp_difference/max": 13.034526223721711, "eval_sampling/sampling_logp_difference/mean": 0.020073617606059364, "eval_steps_per_second": 0.015, "step": 100 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.06458333879709244, "completions/max_length": 10000.0, "completions/max_terminated_length": 9987.0, "completions/mean_length": 2716.86376953125, "completions/mean_terminated_length": 2214.01904296875, "completions/min_length": 270.0, "completions/min_terminated_length": 270.0, "entropy": 0.1626757209499677, "epoch": 0.06742323097463285, "frac_reward_zero_std": 0.06666667014360428, "grad_norm": 1.92333739392485e-05, "learning_rate": 5e-06, "loss": 0.0002, "num_tokens": 258717212.0, "reward": 0.6664325594902039, "reward_std": 0.3719951808452606, "rewards/TRLRewardAdapter/mean": 0.6664324998855591, "rewards/TRLRewardAdapter/std": 0.3719951808452606, "sampling/importance_sampling_ratio/max": 0.3236977159976959, "sampling/importance_sampling_ratio/mean": 0.009704610332846642, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 15.279460906982422, "sampling/sampling_logp_difference/mean": 0.020007988438010216, "step": 101, "step_time": 363.48943146306556 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03541667014360428, "completions/max_length": 10000.0, "completions/max_terminated_length": 9996.0, "completions/mean_length": 2217.969970703125, "completions/mean_terminated_length": 1932.2364501953125, "completions/min_length": 133.0, "completions/min_terminated_length": 133.0, "entropy": 0.1270774652560552, "epoch": 0.06809078771695594, "frac_reward_zero_std": 0.10000000894069672, "grad_norm": 6.338267918398242e-05, "learning_rate": 5e-06, "loss": -0.0001, "num_tokens": 261278847.0, "reward": 0.681500256061554, "reward_std": 0.395784467458725, "rewards/TRLRewardAdapter/mean": 0.681500256061554, "rewards/TRLRewardAdapter/std": 0.395784467458725, "sampling/importance_sampling_ratio/max": 1.6667042970657349, "sampling/importance_sampling_ratio/mean": 0.075107142329216, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 40.5, "sampling/sampling_logp_difference/mean": 0.016786491498351097, "step": 102, "step_time": 360.78844368993305 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0260416679084301, "completions/max_length": 10000.0, "completions/max_terminated_length": 9938.0, "completions/mean_length": 2226.8115234375, "completions/mean_terminated_length": 2018.9722900390625, "completions/min_length": 196.0, "completions/min_terminated_length": 196.0, "entropy": 0.13397220025459924, "epoch": 0.06875834445927904, "frac_reward_zero_std": 0.06666667014360428, "grad_norm": 5.630486698580379e-05, "learning_rate": 5e-06, "loss": -0.0001, "num_tokens": 263882218.0, "reward": 0.7362752556800842, "reward_std": 0.3222164809703827, "rewards/TRLRewardAdapter/mean": 0.7362752556800842, "rewards/TRLRewardAdapter/std": 0.3222164809703827, "sampling/importance_sampling_ratio/max": 0.8529259562492371, "sampling/importance_sampling_ratio/mean": 0.028011253103613853, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 11.0, "sampling/sampling_logp_difference/mean": 0.01741555891931057, "step": 103, "step_time": 383.51708911685273 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.05937500298023224, "completions/max_length": 10000.0, "completions/max_terminated_length": 9994.0, "completions/mean_length": 3340.235595703125, "completions/mean_terminated_length": 2919.8515625, "completions/min_length": 211.0, "completions/min_terminated_length": 211.0, "entropy": 0.15842589239279428, "epoch": 0.06942590120160214, "frac_reward_zero_std": 0.06666667014360428, "grad_norm": 5.524552878572733e-06, "learning_rate": 5e-06, "loss": 0.0, "num_tokens": 267591788.0, "reward": 0.5037292242050171, "reward_std": 0.4092491567134857, "rewards/TRLRewardAdapter/mean": 0.5037291646003723, "rewards/TRLRewardAdapter/std": 0.4092491865158081, "sampling/importance_sampling_ratio/max": 1.0899382829666138, "sampling/importance_sampling_ratio/mean": 0.025424133986234665, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 23.5, "sampling/sampling_logp_difference/mean": 0.019492268562316895, "step": 104, "step_time": 379.35866772115696 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0020833334419876337, "completions/max_length": 10000.0, "completions/max_terminated_length": 6994.0, "completions/mean_length": 1273.2437744140625, "completions/mean_terminated_length": 1255.0250244140625, "completions/min_length": 152.0, "completions/min_terminated_length": 152.0, "entropy": 0.13694378485282263, "epoch": 0.07009345794392523, "frac_reward_zero_std": 0.10000000894069672, "grad_norm": 1.0044323272051776e-05, "learning_rate": 5e-06, "loss": 0.0, "num_tokens": 269206550.0, "reward": 0.6985800862312317, "reward_std": 0.3866020441055298, "rewards/TRLRewardAdapter/mean": 0.6985800266265869, "rewards/TRLRewardAdapter/std": 0.3866020143032074, "sampling/importance_sampling_ratio/max": 0.7879406213760376, "sampling/importance_sampling_ratio/mean": 0.027954060584306717, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 7.5568528175354, "sampling/sampling_logp_difference/mean": 0.017942676320672035, "step": 105, "step_time": 108.60235662502237 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00937500037252903, "completions/max_length": 10000.0, "completions/max_terminated_length": 9769.0, "completions/mean_length": 2329.978271484375, "completions/mean_terminated_length": 2257.39111328125, "completions/min_length": 221.0, "completions/min_terminated_length": 221.0, "entropy": 0.15800411502520242, "epoch": 0.07076101468624833, "frac_reward_zero_std": 0.03333333507180214, "grad_norm": 6.678093628636355e-05, "learning_rate": 5e-06, "loss": 0.0, "num_tokens": 271916129.0, "reward": 0.6606379747390747, "reward_std": 0.37899860739707947, "rewards/TRLRewardAdapter/mean": 0.6606379151344299, "rewards/TRLRewardAdapter/std": 0.37899860739707947, "sampling/importance_sampling_ratio/max": 0.7154010534286499, "sampling/importance_sampling_ratio/mean": 0.017179226502776146, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 41.11679458618164, "sampling/sampling_logp_difference/mean": 0.020003406330943108, "step": 106, "step_time": 213.37511706980877 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.06458333879709244, "completions/max_length": 10000.0, "completions/max_terminated_length": 9871.0, "completions/mean_length": 2361.415771484375, "completions/mean_terminated_length": 1834.0301513671875, "completions/min_length": 253.0, "completions/min_terminated_length": 253.0, "entropy": 0.12839752932389578, "epoch": 0.07142857142857142, "frac_reward_zero_std": 0.13333334028720856, "grad_norm": 3.07587266843966e-05, "learning_rate": 5e-06, "loss": -0.0, "num_tokens": 274636176.0, "reward": 0.6671415567398071, "reward_std": 0.4046602249145508, "rewards/TRLRewardAdapter/mean": 0.6671415567398071, "rewards/TRLRewardAdapter/std": 0.4046602249145508, "sampling/importance_sampling_ratio/max": 0.6048396825790405, "sampling/importance_sampling_ratio/mean": 0.036186255514621735, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 11.068679809570312, "sampling/sampling_logp_difference/mean": 0.01696709729731083, "step": 107, "step_time": 380.9905782500282 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 10000.0, "completions/max_terminated_length": 9969.0, "completions/mean_length": 1948.7239990234375, "completions/mean_terminated_length": 1820.9259033203125, "completions/min_length": 153.0, "completions/min_terminated_length": 153.0, "entropy": 0.1378732994198799, "epoch": 0.07209612817089453, "frac_reward_zero_std": 0.0, "grad_norm": 1.2184237104359538e-05, "learning_rate": 5e-06, "loss": -0.0, "num_tokens": 276935303.0, "reward": 0.699522078037262, "reward_std": 0.37042608857154846, "rewards/TRLRewardAdapter/mean": 0.6995220184326172, "rewards/TRLRewardAdapter/std": 0.37042608857154846, "sampling/importance_sampling_ratio/max": 2.03745436668396, "sampling/importance_sampling_ratio/mean": 0.03697923943400383, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 12.03516960144043, "sampling/sampling_logp_difference/mean": 0.01785460114479065, "step": 108, "step_time": 322.7427495152224 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00729166716337204, "completions/max_length": 10000.0, "completions/max_terminated_length": 9514.0, "completions/mean_length": 2027.6729736328125, "completions/mean_terminated_length": 1969.1142578125, "completions/min_length": 153.0, "completions/min_terminated_length": 153.0, "entropy": 0.14827200025320053, "epoch": 0.07276368491321762, "frac_reward_zero_std": 0.06666667014360428, "grad_norm": 1.552873580875549e-05, "learning_rate": 5e-06, "loss": 0.0003, "num_tokens": 279311437.0, "reward": 0.7096471786499023, "reward_std": 0.34718069434165955, "rewards/TRLRewardAdapter/mean": 0.7096471190452576, "rewards/TRLRewardAdapter/std": 0.34718066453933716, "sampling/importance_sampling_ratio/max": 1.5298820734024048, "sampling/importance_sampling_ratio/mean": 0.04458180069923401, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 5.539353847503662, "sampling/sampling_logp_difference/mean": 0.019058087840676308, "step": 109, "step_time": 262.48713790089823 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02812500111758709, "completions/max_length": 10000.0, "completions/max_terminated_length": 9946.0, "completions/mean_length": 1582.315673828125, "completions/mean_terminated_length": 1338.7169189453125, "completions/min_length": 209.0, "completions/min_terminated_length": 209.0, "entropy": 0.1356815298398336, "epoch": 0.07343124165554073, "frac_reward_zero_std": 0.03333333507180214, "grad_norm": 1.1497191541377741e-05, "learning_rate": 5e-06, "loss": -0.0, "num_tokens": 281285628.0, "reward": 0.8262121081352234, "reward_std": 0.2615508735179901, "rewards/TRLRewardAdapter/mean": 0.8262120485305786, "rewards/TRLRewardAdapter/std": 0.2615508735179901, "sampling/importance_sampling_ratio/max": 1.0249249935150146, "sampling/importance_sampling_ratio/mean": 0.03145848587155342, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 17.53516960144043, "sampling/sampling_logp_difference/mean": 0.0181068517267704, "step": 110, "step_time": 389.6709736858029 }, { "epoch": 0.07343124165554073, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.0371739120910997, "eval_completions/max_length": 10000.0, "eval_completions/max_terminated_length": 9650.826086956522, "eval_completions/mean_length": 2586.824940557065, "eval_completions/mean_terminated_length": 2300.635683806046, "eval_completions/min_length": 150.04347826086956, "eval_completions/min_terminated_length": 150.04347826086956, "eval_entropy": 0.15868248434170432, "eval_frac_reward_zero_std": 1.0, "eval_loss": 0.0, "eval_num_tokens": 281285628.0, "eval_reward": 0.6838794117388518, "eval_reward_std": 0.35755952026533044, "eval_rewards/TRLRewardAdapter/mean": 0.683879422104877, "eval_rewards/TRLRewardAdapter/std": 0.357559525448343, "eval_runtime": 1484.8014, "eval_samples_per_second": 3.075, "eval_sampling/importance_sampling_ratio/max": 1.0733686789222385, "eval_sampling/importance_sampling_ratio/mean": 0.04419691593426725, "eval_sampling/importance_sampling_ratio/min": 0.0, "eval_sampling/sampling_logp_difference/max": 10.507311292316603, "eval_sampling/sampling_logp_difference/mean": 0.01979201546181803, "eval_steps_per_second": 0.015, "step": 110 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03750000149011612, "completions/max_length": 10000.0, "completions/max_terminated_length": 9774.0, "completions/mean_length": 2168.6083984375, "completions/mean_terminated_length": 1863.4891357421875, "completions/min_length": 156.0, "completions/min_terminated_length": 156.0, "entropy": 0.14211047689119974, "epoch": 0.07409879839786382, "frac_reward_zero_std": 0.10000000894069672, "grad_norm": 2.762075799265439e-05, "learning_rate": 5e-06, "loss": -0.0, "num_tokens": 283759364.0, "reward": 0.7307542562484741, "reward_std": 0.3685877323150635, "rewards/TRLRewardAdapter/mean": 0.7307541966438293, "rewards/TRLRewardAdapter/std": 0.36858776211738586, "sampling/importance_sampling_ratio/max": 1.8532450199127197, "sampling/importance_sampling_ratio/mean": 0.06059015542268753, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 45.25, "sampling/sampling_logp_difference/mean": 0.018408263102173805, "step": 111, "step_time": 364.71903429192025 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.05937500298023224, "completions/max_length": 10000.0, "completions/max_terminated_length": 9945.0, "completions/mean_length": 2549.485595703125, "completions/mean_terminated_length": 2079.187255859375, "completions/min_length": 210.0, "completions/min_terminated_length": 210.0, "entropy": 0.12491912394762039, "epoch": 0.07476635514018691, "frac_reward_zero_std": 0.06666667014360428, "grad_norm": 5.124029537824762e-06, "learning_rate": 5e-06, "loss": -0.0, "num_tokens": 286657302.0, "reward": 0.6680839657783508, "reward_std": 0.3786970376968384, "rewards/TRLRewardAdapter/mean": 0.6680839657783508, "rewards/TRLRewardAdapter/std": 0.378697007894516, "sampling/importance_sampling_ratio/max": 1.111661434173584, "sampling/importance_sampling_ratio/mean": 0.05194465070962906, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 8.5, "sampling/sampling_logp_difference/mean": 0.016423916444182396, "step": 112, "step_time": 358.2157700159587 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.07187500596046448, "completions/max_length": 10000.0, "completions/max_terminated_length": 9981.0, "completions/mean_length": 2870.518798828125, "completions/mean_terminated_length": 2318.404052734375, "completions/min_length": 145.0, "completions/min_terminated_length": 145.0, "entropy": 0.14289992054303488, "epoch": 0.07543391188251002, "frac_reward_zero_std": 0.13333334028720856, "grad_norm": 5.476308504968447e-06, "learning_rate": 5e-06, "loss": -0.0, "num_tokens": 289869000.0, "reward": 0.6486866474151611, "reward_std": 0.4078255593776703, "rewards/TRLRewardAdapter/mean": 0.6486865878105164, "rewards/TRLRewardAdapter/std": 0.4078255593776703, "sampling/importance_sampling_ratio/max": 1.1474806070327759, "sampling/importance_sampling_ratio/mean": 0.027241289615631104, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 25.75, "sampling/sampling_logp_difference/mean": 0.018427712842822075, "step": 113, "step_time": 368.93629105505534 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03333333507180214, "completions/max_length": 10000.0, "completions/max_terminated_length": 8781.0, "completions/mean_length": 2302.13134765625, "completions/mean_terminated_length": 2036.6875, "completions/min_length": 174.0, "completions/min_terminated_length": 174.0, "entropy": 0.13633264849583307, "epoch": 0.07610146862483311, "frac_reward_zero_std": 0.03333333507180214, "grad_norm": 1.0345171797071992e-05, "learning_rate": 5e-06, "loss": -0.0, "num_tokens": 292550374.0, "reward": 0.680455207824707, "reward_std": 0.3437182903289795, "rewards/TRLRewardAdapter/mean": 0.6804551482200623, "rewards/TRLRewardAdapter/std": 0.3437182903289795, "sampling/importance_sampling_ratio/max": 1.4839133024215698, "sampling/importance_sampling_ratio/mean": 0.02404327318072319, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 10.84498405456543, "sampling/sampling_logp_difference/mean": 0.017463622614741325, "step": 114, "step_time": 380.09064431814477 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02187500149011612, "completions/max_length": 10000.0, "completions/max_terminated_length": 9972.0, "completions/mean_length": 2453.541748046875, "completions/mean_terminated_length": 2284.77099609375, "completions/min_length": 255.0, "completions/min_terminated_length": 255.0, "entropy": 0.13463238378365835, "epoch": 0.07676902536715621, "frac_reward_zero_std": 0.06666667014360428, "grad_norm": 5.827281265950944e-05, "learning_rate": 5e-06, "loss": 0.0017, "num_tokens": 295354414.0, "reward": 0.7011988162994385, "reward_std": 0.3466876745223999, "rewards/TRLRewardAdapter/mean": 0.7011987566947937, "rewards/TRLRewardAdapter/std": 0.3466876745223999, "sampling/importance_sampling_ratio/max": 0.8680779337882996, "sampling/importance_sampling_ratio/mean": 0.035540953278541565, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 10.862422943115234, "sampling/sampling_logp_difference/mean": 0.017390254884958267, "step": 115, "step_time": 297.8712749632541 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.04270833730697632, "completions/max_length": 10000.0, "completions/max_terminated_length": 9968.0, "completions/mean_length": 2953.541748046875, "completions/mean_terminated_length": 2639.1728515625, "completions/min_length": 147.0, "completions/min_terminated_length": 147.0, "entropy": 0.12944173316160837, "epoch": 0.0774365821094793, "frac_reward_zero_std": 0.03333333507180214, "grad_norm": 5.521420755760809e-06, "learning_rate": 5e-06, "loss": -0.0, "num_tokens": 298628278.0, "reward": 0.6558054089546204, "reward_std": 0.370282918214798, "rewards/TRLRewardAdapter/mean": 0.6558054089546204, "rewards/TRLRewardAdapter/std": 0.370282918214798, "sampling/importance_sampling_ratio/max": 1.7290534973144531, "sampling/importance_sampling_ratio/mean": 0.07123389840126038, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 9.285170555114746, "sampling/sampling_logp_difference/mean": 0.01659364439547062, "step": 116, "step_time": 354.80985788209364 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01770833507180214, "completions/max_length": 10000.0, "completions/max_terminated_length": 9958.0, "completions/mean_length": 2415.67626953125, "completions/mean_terminated_length": 2278.94921875, "completions/min_length": 135.0, "completions/min_terminated_length": 135.0, "entropy": 0.1290756290157636, "epoch": 0.07810413885180241, "frac_reward_zero_std": 0.10000000894069672, "grad_norm": 2.4834709332456294e-05, "learning_rate": 5e-06, "loss": -0.0, "num_tokens": 301375071.0, "reward": 0.6837696433067322, "reward_std": 0.38381534814834595, "rewards/TRLRewardAdapter/mean": 0.6837695837020874, "rewards/TRLRewardAdapter/std": 0.38381531834602356, "sampling/importance_sampling_ratio/max": 1.7397611141204834, "sampling/importance_sampling_ratio/mean": 0.05955116078257561, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 5.362421035766602, "sampling/sampling_logp_difference/mean": 0.016752973198890686, "step": 117, "step_time": 348.4998032071162 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0885416716337204, "completions/max_length": 10000.0, "completions/max_terminated_length": 9986.0, "completions/mean_length": 2864.80224609375, "completions/mean_terminated_length": 2171.66845703125, "completions/min_length": 137.0, "completions/min_terminated_length": 137.0, "entropy": 0.12191026533643405, "epoch": 0.0787716955941255, "frac_reward_zero_std": 0.13333334028720856, "grad_norm": 6.296000195622732e-05, "learning_rate": 5e-06, "loss": -0.0, "num_tokens": 304562369.0, "reward": 0.6391050219535828, "reward_std": 0.40427348017692566, "rewards/TRLRewardAdapter/mean": 0.639104962348938, "rewards/TRLRewardAdapter/std": 0.40427348017692566, "sampling/importance_sampling_ratio/max": 1.369431972503662, "sampling/importance_sampling_ratio/mean": 0.03710899502038956, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 33.056854248046875, "sampling/sampling_logp_difference/mean": 0.015965700149536133, "step": 118, "step_time": 393.4261315149488 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.04375000298023224, "completions/max_length": 10000.0, "completions/max_terminated_length": 9946.0, "completions/mean_length": 1722.651123046875, "completions/mean_terminated_length": 1343.94873046875, "completions/min_length": 142.0, "completions/min_terminated_length": 142.0, "entropy": 0.11877363920211792, "epoch": 0.0794392523364486, "frac_reward_zero_std": 0.06666667014360428, "grad_norm": 0.00012086519944899602, "learning_rate": 5e-06, "loss": 0.0001, "num_tokens": 306639186.0, "reward": 0.759742259979248, "reward_std": 0.3495451509952545, "rewards/TRLRewardAdapter/mean": 0.7597422003746033, "rewards/TRLRewardAdapter/std": 0.3495451509952545, "sampling/importance_sampling_ratio/max": 1.507949948310852, "sampling/importance_sampling_ratio/mean": 0.07925907522439957, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 20.814239501953125, "sampling/sampling_logp_difference/mean": 0.016102928668260574, "step": 119, "step_time": 391.83013640902936 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0833333358168602, "completions/max_length": 10000.0, "completions/max_terminated_length": 9980.0, "completions/mean_length": 2634.5126953125, "completions/mean_terminated_length": 1964.922607421875, "completions/min_length": 266.0, "completions/min_terminated_length": 266.0, "entropy": 0.1419415126244227, "epoch": 0.0801068090787717, "frac_reward_zero_std": 0.1666666716337204, "grad_norm": 7.203284775066432e-06, "learning_rate": 5e-06, "loss": -0.0, "num_tokens": 309575134.0, "reward": 0.6144430637359619, "reward_std": 0.41007378697395325, "rewards/TRLRewardAdapter/mean": 0.6144430041313171, "rewards/TRLRewardAdapter/std": 0.41007378697395325, "sampling/importance_sampling_ratio/max": 1.0754402875900269, "sampling/importance_sampling_ratio/mean": 0.03898593783378601, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 44.389068603515625, "sampling/sampling_logp_difference/mean": 0.018080348148941994, "step": 120, "step_time": 381.3020469429903 }, { "epoch": 0.0801068090787717, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.0354347816546974, "eval_completions/max_length": 10000.0, "eval_completions/max_terminated_length": 9721.608695652174, "eval_completions/mean_length": 2551.782131029212, "eval_completions/mean_terminated_length": 2278.177346934443, "eval_completions/min_length": 147.69565217391303, "eval_completions/min_terminated_length": 147.69565217391303, "eval_entropy": 0.15825442127559497, "eval_frac_reward_zero_std": 1.0, "eval_loss": 0.0, "eval_num_tokens": 309575134.0, "eval_reward": 0.6852614490882211, "eval_reward_std": 0.35820923162543256, "eval_rewards/TRLRewardAdapter/mean": 0.6852614594542462, "eval_rewards/TRLRewardAdapter/std": 0.358209235512692, "eval_runtime": 1486.6849, "eval_samples_per_second": 3.071, "eval_sampling/importance_sampling_ratio/max": 0.9480521445688994, "eval_sampling/importance_sampling_ratio/mean": 0.04812641814351082, "eval_sampling/importance_sampling_ratio/min": 0.0, "eval_sampling/sampling_logp_difference/max": 11.392459164495053, "eval_sampling/sampling_logp_difference/mean": 0.019747365590022957, "eval_steps_per_second": 0.015, "step": 120 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02812500111758709, "completions/max_length": 10000.0, "completions/max_terminated_length": 9967.0, "completions/mean_length": 2229.367919921875, "completions/mean_terminated_length": 2004.4940185546875, "completions/min_length": 140.0, "completions/min_terminated_length": 140.0, "entropy": 0.14791556199391684, "epoch": 0.08077436582109479, "frac_reward_zero_std": 0.06666667014360428, "grad_norm": 2.828065037519715e-05, "learning_rate": 5e-06, "loss": -0.0, "num_tokens": 312181695.0, "reward": 0.649276614189148, "reward_std": 0.4037320017814636, "rewards/TRLRewardAdapter/mean": 0.649276614189148, "rewards/TRLRewardAdapter/std": 0.40373197197914124, "sampling/importance_sampling_ratio/max": 1.4255483150482178, "sampling/importance_sampling_ratio/mean": 0.06448955088853836, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 52.889068603515625, "sampling/sampling_logp_difference/mean": 0.01917499117553234, "step": 121, "step_time": 292.9878922898788 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02187500149011612, "completions/max_length": 10000.0, "completions/max_terminated_length": 9958.0, "completions/mean_length": 2131.839599609375, "completions/mean_terminated_length": 1955.874267578125, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.12868883833289146, "epoch": 0.0814419225634179, "frac_reward_zero_std": 0.03333333507180214, "grad_norm": 3.779535633457183e-05, "learning_rate": 5e-06, "loss": -0.0, "num_tokens": 314675301.0, "reward": 0.6742384433746338, "reward_std": 0.3880481719970703, "rewards/TRLRewardAdapter/mean": 0.674238383769989, "rewards/TRLRewardAdapter/std": 0.3880481421947479, "sampling/importance_sampling_ratio/max": 0.7546657919883728, "sampling/importance_sampling_ratio/mean": 0.02844267524778843, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 36.101531982421875, "sampling/sampling_logp_difference/mean": 0.017138971015810966, "step": 122, "step_time": 357.8990430969279 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0010416667209938169, "completions/max_length": 10000.0, "completions/max_terminated_length": 9808.0, "completions/mean_length": 1583.9532470703125, "completions/mean_terminated_length": 1575.17724609375, "completions/min_length": 237.0, "completions/min_terminated_length": 237.0, "entropy": 0.13454273343086243, "epoch": 0.08210947930574099, "frac_reward_zero_std": 0.06666667014360428, "grad_norm": 0.00012209824455959142, "learning_rate": 5e-06, "loss": 0.0, "num_tokens": 316681208.0, "reward": 0.7204576134681702, "reward_std": 0.3507741391658783, "rewards/TRLRewardAdapter/mean": 0.7204576134681702, "rewards/TRLRewardAdapter/std": 0.3507741391658783, "sampling/importance_sampling_ratio/max": 1.0937618017196655, "sampling/importance_sampling_ratio/mean": 0.02557499334216118, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 13.222846031188965, "sampling/sampling_logp_difference/mean": 0.01755525730550289, "step": 123, "step_time": 183.55984387686476 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02812500111758709, "completions/max_length": 10000.0, "completions/max_terminated_length": 9965.0, "completions/mean_length": 2222.675048828125, "completions/mean_terminated_length": 1997.607666015625, "completions/min_length": 190.0, "completions/min_terminated_length": 190.0, "entropy": 0.1349995881319046, "epoch": 0.08277703604806408, "frac_reward_zero_std": 0.10000000894069672, "grad_norm": 0.00012002968402174833, "learning_rate": 5e-06, "loss": -0.0, "num_tokens": 319301280.0, "reward": 0.7282646298408508, "reward_std": 0.3517504036426544, "rewards/TRLRewardAdapter/mean": 0.7282646298408508, "rewards/TRLRewardAdapter/std": 0.35175037384033203, "sampling/importance_sampling_ratio/max": 0.816942572593689, "sampling/importance_sampling_ratio/mean": 0.033260196447372437, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 58.0, "sampling/sampling_logp_difference/mean": 0.01754726469516754, "step": 124, "step_time": 260.67253338510636 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02500000223517418, "completions/max_length": 10000.0, "completions/max_terminated_length": 9973.0, "completions/mean_length": 2258.25634765625, "completions/mean_terminated_length": 2059.75, "completions/min_length": 117.0, "completions/min_terminated_length": 117.0, "entropy": 0.15745735416809717, "epoch": 0.08344459279038718, "frac_reward_zero_std": 0.0, "grad_norm": 3.18922629037599e-05, "learning_rate": 5e-06, "loss": -0.0, "num_tokens": 321898966.0, "reward": 0.6639809608459473, "reward_std": 0.38727834820747375, "rewards/TRLRewardAdapter/mean": 0.6639809012413025, "rewards/TRLRewardAdapter/std": 0.38727831840515137, "sampling/importance_sampling_ratio/max": 1.677968978881836, "sampling/importance_sampling_ratio/mean": 0.05420904606580734, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 7.75, "sampling/sampling_logp_difference/mean": 0.020272715017199516, "step": 125, "step_time": 267.4458894281415 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0031250002793967724, "completions/max_length": 10000.0, "completions/max_terminated_length": 9547.0, "completions/mean_length": 1775.4698486328125, "completions/mean_terminated_length": 1749.6875, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "entropy": 0.14140631010135016, "epoch": 0.08411214953271028, "frac_reward_zero_std": 0.06666667014360428, "grad_norm": 0.00011003828545134753, "learning_rate": 5e-06, "loss": 0.0, "num_tokens": 324026041.0, "reward": 0.7158477902412415, "reward_std": 0.3477281332015991, "rewards/TRLRewardAdapter/mean": 0.7158477902412415, "rewards/TRLRewardAdapter/std": 0.3477281332015991, "sampling/importance_sampling_ratio/max": 1.695421576499939, "sampling/importance_sampling_ratio/mean": 0.057005055248737335, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 9.112419128417969, "sampling/sampling_logp_difference/mean": 0.01796667091548443, "step": 126, "step_time": 198.82232712500263 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 10000.0, "completions/max_terminated_length": 9966.0, "completions/mean_length": 2344.971923828125, "completions/mean_terminated_length": 2223.463623046875, "completions/min_length": 157.0, "completions/min_terminated_length": 157.0, "entropy": 0.13647818565368652, "epoch": 0.08477970627503338, "frac_reward_zero_std": 0.06666667014360428, "grad_norm": 4.1356148600655986e-05, "learning_rate": 5e-06, "loss": -0.0, "num_tokens": 326719774.0, "reward": 0.6698030829429626, "reward_std": 0.3651942014694214, "rewards/TRLRewardAdapter/mean": 0.6698030233383179, "rewards/TRLRewardAdapter/std": 0.365194171667099, "sampling/importance_sampling_ratio/max": 1.3290491104125977, "sampling/importance_sampling_ratio/mean": 0.04404326155781746, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 26.68427085876465, "sampling/sampling_logp_difference/mean": 0.01775551214814186, "step": 127, "step_time": 340.3491854190361 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03854166716337204, "completions/max_length": 10000.0, "completions/max_terminated_length": 9839.0, "completions/mean_length": 1486.6334228515625, "completions/mean_terminated_length": 1145.3607177734375, "completions/min_length": 173.0, "completions/min_terminated_length": 173.0, "entropy": 0.11044824620087941, "epoch": 0.08544726301735647, "frac_reward_zero_std": 0.06666667014360428, "grad_norm": 3.594325649466259e-05, "learning_rate": 5e-06, "loss": -0.0, "num_tokens": 328611454.0, "reward": 0.7845466732978821, "reward_std": 0.3377356231212616, "rewards/TRLRewardAdapter/mean": 0.7845466732978821, "rewards/TRLRewardAdapter/std": 0.337735652923584, "sampling/importance_sampling_ratio/max": 1.3770049810409546, "sampling/importance_sampling_ratio/mean": 0.08728305995464325, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 6.25, "sampling/sampling_logp_difference/mean": 0.015297630801796913, "step": 128, "step_time": 287.27334287704434 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00937500037252903, "completions/max_length": 10000.0, "completions/max_terminated_length": 9973.0, "completions/mean_length": 1742.61572265625, "completions/mean_terminated_length": 1664.469970703125, "completions/min_length": 226.0, "completions/min_terminated_length": 226.0, "entropy": 0.13573889682690302, "epoch": 0.08611481975967958, "frac_reward_zero_std": 0.06666667014360428, "grad_norm": 3.844913070956158e-05, "learning_rate": 5e-06, "loss": -0.0001, "num_tokens": 330725101.0, "reward": 0.7157770991325378, "reward_std": 0.3767005503177643, "rewards/TRLRewardAdapter/mean": 0.7157770991325378, "rewards/TRLRewardAdapter/std": 0.3767005503177643, "sampling/importance_sampling_ratio/max": 0.9500621557235718, "sampling/importance_sampling_ratio/mean": 0.04412941262125969, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 9.746208190917969, "sampling/sampling_logp_difference/mean": 0.017548177391290665, "step": 129, "step_time": 343.54249943909235 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01250000111758709, "completions/max_length": 10000.0, "completions/max_terminated_length": 9873.0, "completions/mean_length": 1818.7490234375, "completions/mean_terminated_length": 1715.188720703125, "completions/min_length": 170.0, "completions/min_terminated_length": 170.0, "entropy": 0.14987633377313614, "epoch": 0.08678237650200267, "frac_reward_zero_std": 0.13333334028720856, "grad_norm": 1.8260770066058735e-05, "learning_rate": 5e-06, "loss": -0.0, "num_tokens": 332961660.0, "reward": 0.6647705435752869, "reward_std": 0.3835143744945526, "rewards/TRLRewardAdapter/mean": 0.6647704839706421, "rewards/TRLRewardAdapter/std": 0.3835143744945526, "sampling/importance_sampling_ratio/max": 1.551910638809204, "sampling/importance_sampling_ratio/mean": 0.05130656808614731, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 19.5, "sampling/sampling_logp_difference/mean": 0.019407054409384727, "step": 130, "step_time": 280.8307554081548 }, { "epoch": 0.08678237650200267, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.029782607908482136, "eval_completions/max_length": 10000.0, "eval_completions/max_terminated_length": 9816.95652173913, "eval_completions/mean_length": 2468.4951808763585, "eval_completions/mean_terminated_length": 2237.3007600203805, "eval_completions/min_length": 133.2173913043478, "eval_completions/min_terminated_length": 133.2173913043478, "eval_entropy": 0.15530690874742425, "eval_frac_reward_zero_std": 1.0, "eval_loss": 0.0, "eval_num_tokens": 332961660.0, "eval_reward": 0.6874672666839932, "eval_reward_std": 0.3613548784152321, "eval_rewards/TRLRewardAdapter/mean": 0.687467274458512, "eval_rewards/TRLRewardAdapter/std": 0.3613548797109853, "eval_runtime": 1464.4291, "eval_samples_per_second": 3.118, "eval_sampling/importance_sampling_ratio/max": 0.9168462986531465, "eval_sampling/importance_sampling_ratio/mean": 0.05230657383799553, "eval_sampling/importance_sampling_ratio/min": 0.0, "eval_sampling/sampling_logp_difference/max": 12.392858111340066, "eval_sampling/sampling_logp_difference/mean": 0.019455165237836216, "eval_steps_per_second": 0.016, "step": 130 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.004166666883975267, "completions/max_length": 10000.0, "completions/max_terminated_length": 9899.0, "completions/mean_length": 1566.2021484375, "completions/mean_terminated_length": 1530.9141845703125, "completions/min_length": 181.0, "completions/min_terminated_length": 181.0, "entropy": 0.1163618229329586, "epoch": 0.08744993324432576, "frac_reward_zero_std": 0.03333333507180214, "grad_norm": 9.990400794582324e-05, "learning_rate": 5e-06, "loss": 0.0001, "num_tokens": 334911806.0, "reward": 0.7543298602104187, "reward_std": 0.33369165658950806, "rewards/TRLRewardAdapter/mean": 0.7543298006057739, "rewards/TRLRewardAdapter/std": 0.33369165658950806, "sampling/importance_sampling_ratio/max": 1.1147093772888184, "sampling/importance_sampling_ratio/mean": 0.04032627493143082, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 10.75, "sampling/sampling_logp_difference/mean": 0.015747634693980217, "step": 131, "step_time": 249.80597927817143 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03229166939854622, "completions/max_length": 10000.0, "completions/max_terminated_length": 9428.0, "completions/mean_length": 2161.057373046875, "completions/mean_terminated_length": 1899.4779052734375, "completions/min_length": 227.0, "completions/min_terminated_length": 227.0, "entropy": 0.1256259394188722, "epoch": 0.08811748998664887, "frac_reward_zero_std": 0.06666667014360428, "grad_norm": 8.107188141294206e-05, "learning_rate": 5e-06, "loss": -0.0, "num_tokens": 337413717.0, "reward": 0.6780837774276733, "reward_std": 0.36943140625953674, "rewards/TRLRewardAdapter/mean": 0.6780837178230286, "rewards/TRLRewardAdapter/std": 0.36943134665489197, "sampling/importance_sampling_ratio/max": 0.8488670587539673, "sampling/importance_sampling_ratio/mean": 0.037174295634031296, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 33.498069763183594, "sampling/sampling_logp_difference/mean": 0.016485383734107018, "step": 132, "step_time": 359.2915428769775 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01875000074505806, "completions/max_length": 10000.0, "completions/max_terminated_length": 9995.0, "completions/mean_length": 2370.617919921875, "completions/mean_terminated_length": 2224.83349609375, "completions/min_length": 148.0, "completions/min_terminated_length": 148.0, "entropy": 0.13913450141747793, "epoch": 0.08878504672897196, "frac_reward_zero_std": 0.06666667014360428, "grad_norm": 6.45982275115606e-06, "learning_rate": 5e-06, "loss": -0.0, "num_tokens": 340171206.0, "reward": 0.7048442959785461, "reward_std": 0.3529007136821747, "rewards/TRLRewardAdapter/mean": 0.7048442959785461, "rewards/TRLRewardAdapter/std": 0.3529007136821747, "sampling/importance_sampling_ratio/max": 1.279177188873291, "sampling/importance_sampling_ratio/mean": 0.04539105296134949, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 26.212902069091797, "sampling/sampling_logp_difference/mean": 0.018074048683047295, "step": 133, "step_time": 308.4134695229586 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03020833432674408, "completions/max_length": 10000.0, "completions/max_terminated_length": 9941.0, "completions/mean_length": 2043.0125732421875, "completions/mean_terminated_length": 1795.157958984375, "completions/min_length": 193.0, "completions/min_terminated_length": 193.0, "entropy": 0.12847655887405077, "epoch": 0.08945260347129506, "frac_reward_zero_std": 0.0, "grad_norm": 0.00021827064679739243, "learning_rate": 5e-06, "loss": -0.0, "num_tokens": 342574450.0, "reward": 0.7504717707633972, "reward_std": 0.3379877805709839, "rewards/TRLRewardAdapter/mean": 0.7504717707633972, "rewards/TRLRewardAdapter/std": 0.3379877507686615, "sampling/importance_sampling_ratio/max": 0.7730284333229065, "sampling/importance_sampling_ratio/mean": 0.046417154371738434, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 6.059268474578857, "sampling/sampling_logp_difference/mean": 0.017021289095282555, "step": 134, "step_time": 331.23659227683675 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0364583358168602, "completions/max_length": 10000.0, "completions/max_terminated_length": 9954.0, "completions/mean_length": 1845.822998046875, "completions/mean_terminated_length": 1537.2864990234375, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "entropy": 0.1348555808266004, "epoch": 0.09012016021361816, "frac_reward_zero_std": 0.03333333507180214, "grad_norm": 1.4494569170695323e-05, "learning_rate": 5e-06, "loss": -0.0, "num_tokens": 344759368.0, "reward": 0.7522239089012146, "reward_std": 0.3312745690345764, "rewards/TRLRewardAdapter/mean": 0.7522238492965698, "rewards/TRLRewardAdapter/std": 0.3312745988368988, "sampling/importance_sampling_ratio/max": 1.501960039138794, "sampling/importance_sampling_ratio/mean": 0.04803052544593811, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 15.0, "sampling/sampling_logp_difference/mean": 0.018077347427606583, "step": 135, "step_time": 397.3124769788701 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03541667014360428, "completions/max_length": 10000.0, "completions/max_terminated_length": 9958.0, "completions/mean_length": 2621.73974609375, "completions/mean_terminated_length": 2350.83154296875, "completions/min_length": 200.0, "completions/min_terminated_length": 200.0, "entropy": 0.16868794709444046, "epoch": 0.09078771695594126, "frac_reward_zero_std": 0.06666667014360428, "grad_norm": 1.96205465517862e-05, "learning_rate": 5e-06, "loss": 0.0, "num_tokens": 347741998.0, "reward": 0.6161584258079529, "reward_std": 0.4030417203903198, "rewards/TRLRewardAdapter/mean": 0.6161584258079529, "rewards/TRLRewardAdapter/std": 0.4030417203903198, "sampling/importance_sampling_ratio/max": 0.704612672328949, "sampling/importance_sampling_ratio/mean": 0.03168671950697899, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 49.25, "sampling/sampling_logp_difference/mean": 0.021658172830939293, "step": 136, "step_time": 376.7566877650097 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01666666753590107, "completions/max_length": 10000.0, "completions/max_terminated_length": 9907.0, "completions/mean_length": 2204.33251953125, "completions/mean_terminated_length": 2072.202392578125, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "entropy": 0.12537605315446854, "epoch": 0.09145527369826435, "frac_reward_zero_std": 0.06666667014360428, "grad_norm": 0.0001455457796766848, "learning_rate": 5e-06, "loss": 0.0001, "num_tokens": 350326541.0, "reward": 0.6935116648674011, "reward_std": 0.3763260245323181, "rewards/TRLRewardAdapter/mean": 0.6935116648674011, "rewards/TRLRewardAdapter/std": 0.3763260543346405, "sampling/importance_sampling_ratio/max": 1.673742413520813, "sampling/importance_sampling_ratio/mean": 0.07849831879138947, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 9.0, "sampling/sampling_logp_difference/mean": 0.016588479280471802, "step": 137, "step_time": 318.9485376807861 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0020833334419876337, "completions/max_length": 10000.0, "completions/max_terminated_length": 9335.0, "completions/mean_length": 1607.5709228515625, "completions/mean_terminated_length": 1590.0501708984375, "completions/min_length": 181.0, "completions/min_terminated_length": 181.0, "entropy": 0.13056512425343195, "epoch": 0.09212283044058744, "frac_reward_zero_std": 0.06666667014360428, "grad_norm": 4.17332719009416e-05, "learning_rate": 5e-06, "loss": -0.0001, "num_tokens": 352318161.0, "reward": 0.7119481563568115, "reward_std": 0.3616694211959839, "rewards/TRLRewardAdapter/mean": 0.7119481563568115, "rewards/TRLRewardAdapter/std": 0.3616694211959839, "sampling/importance_sampling_ratio/max": 1.9035117626190186, "sampling/importance_sampling_ratio/mean": 0.06594683229923248, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 27.25, "sampling/sampling_logp_difference/mean": 0.017519645392894745, "step": 138, "step_time": 195.15077394398395 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.06354167312383652, "completions/max_length": 10000.0, "completions/max_terminated_length": 9946.0, "completions/mean_length": 2349.66259765625, "completions/mean_terminated_length": 1830.562744140625, "completions/min_length": 129.0, "completions/min_terminated_length": 129.0, "entropy": 0.12953656911849976, "epoch": 0.09279038718291055, "frac_reward_zero_std": 0.06666667014360428, "grad_norm": 0.00033734750832134183, "learning_rate": 5e-06, "loss": -0.0001, "num_tokens": 355030573.0, "reward": 0.6989985704421997, "reward_std": 0.3801172077655792, "rewards/TRLRewardAdapter/mean": 0.6989985108375549, "rewards/TRLRewardAdapter/std": 0.3801172375679016, "sampling/importance_sampling_ratio/max": 1.576914668083191, "sampling/importance_sampling_ratio/mean": 0.06069541722536087, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 32.137245178222656, "sampling/sampling_logp_difference/mean": 0.016589831560850143, "step": 139, "step_time": 377.3417408228852 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.04479166865348816, "completions/max_length": 10000.0, "completions/max_terminated_length": 9972.0, "completions/mean_length": 2159.079345703125, "completions/mean_terminated_length": 1791.40234375, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "entropy": 0.13095690310001373, "epoch": 0.09345794392523364, "frac_reward_zero_std": 0.06666667014360428, "grad_norm": 0.00020026362079222687, "learning_rate": 5e-06, "loss": 0.0, "num_tokens": 357574521.0, "reward": 0.7151730060577393, "reward_std": 0.385638564825058, "rewards/TRLRewardAdapter/mean": 0.7151729464530945, "rewards/TRLRewardAdapter/std": 0.38563859462738037, "sampling/importance_sampling_ratio/max": 1.1447616815567017, "sampling/importance_sampling_ratio/mean": 0.07236989587545395, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 45.75, "sampling/sampling_logp_difference/mean": 0.017540689557790756, "step": 140, "step_time": 383.9879429220455 }, { "epoch": 0.09345794392523364, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.028478260152041912, "eval_completions/max_length": 10000.0, "eval_completions/max_terminated_length": 9567.0, "eval_completions/mean_length": 2355.380169412364, "eval_completions/mean_terminated_length": 2131.3561268682065, "eval_completions/min_length": 125.6086956521739, "eval_completions/min_terminated_length": 125.6086956521739, "eval_entropy": 0.15246716660002005, "eval_frac_reward_zero_std": 1.0, "eval_loss": 0.0, "eval_num_tokens": 357574521.0, "eval_reward": 0.7001817407815353, "eval_reward_std": 0.3580330247464387, "eval_rewards/TRLRewardAdapter/mean": 0.7001817511475604, "eval_rewards/TRLRewardAdapter/std": 0.35803302215493243, "eval_runtime": 1455.4271, "eval_samples_per_second": 3.137, "eval_sampling/importance_sampling_ratio/max": 1.1529578592466272, "eval_sampling/importance_sampling_ratio/mean": 0.063112479351137, "eval_sampling/importance_sampling_ratio/min": 0.0, "eval_sampling/sampling_logp_difference/max": 11.113044448520826, "eval_sampling/sampling_logp_difference/mean": 0.019226921641308327, "eval_steps_per_second": 0.016, "step": 140 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03750000149011612, "completions/max_length": 10000.0, "completions/max_terminated_length": 9976.0, "completions/mean_length": 2438.77001953125, "completions/mean_terminated_length": 2144.176513671875, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "entropy": 0.1497784579793612, "epoch": 0.09412550066755675, "frac_reward_zero_std": 0.06666667014360428, "grad_norm": 6.57565855321949e-05, "learning_rate": 5e-06, "loss": -0.0, "num_tokens": 360400092.0, "reward": 0.6834118366241455, "reward_std": 0.36148059368133545, "rewards/TRLRewardAdapter/mean": 0.6834117770195007, "rewards/TRLRewardAdapter/std": 0.36148056387901306, "sampling/importance_sampling_ratio/max": 2.1518704891204834, "sampling/importance_sampling_ratio/mean": 0.05623745918273926, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 12.594276428222656, "sampling/sampling_logp_difference/mean": 0.019145064055919647, "step": 141, "step_time": 284.60477535484824 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0416666679084301, "completions/max_length": 10000.0, "completions/max_terminated_length": 9771.0, "completions/mean_length": 2141.763671875, "completions/mean_terminated_length": 1800.1009521484375, "completions/min_length": 148.0, "completions/min_terminated_length": 148.0, "entropy": 0.130792165795962, "epoch": 0.09479305740987984, "frac_reward_zero_std": 0.06666667014360428, "grad_norm": 6.989645746726181e-05, "learning_rate": 5e-06, "loss": 0.0, "num_tokens": 362959257.0, "reward": 0.7391140460968018, "reward_std": 0.34996849298477173, "rewards/TRLRewardAdapter/mean": 0.739113986492157, "rewards/TRLRewardAdapter/std": 0.34996849298477173, "sampling/importance_sampling_ratio/max": 1.0413411855697632, "sampling/importance_sampling_ratio/mean": 0.046591490507125854, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 50.139068603515625, "sampling/sampling_logp_difference/mean": 0.01730712503194809, "step": 142, "step_time": 393.3254542648792 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03229166939854622, "completions/max_length": 10000.0, "completions/max_terminated_length": 9964.0, "completions/mean_length": 1811.565673828125, "completions/mean_terminated_length": 1538.323974609375, "completions/min_length": 220.0, "completions/min_terminated_length": 220.0, "entropy": 0.13132629791895548, "epoch": 0.09546061415220294, "frac_reward_zero_std": 0.06666667014360428, "grad_norm": 6.212859534131486e-05, "learning_rate": 5e-06, "loss": 0.0001, "num_tokens": 365161272.0, "reward": 0.7331397533416748, "reward_std": 0.36269137263298035, "rewards/TRLRewardAdapter/mean": 0.7331397533416748, "rewards/TRLRewardAdapter/std": 0.36269137263298035, "sampling/importance_sampling_ratio/max": 0.5995067954063416, "sampling/importance_sampling_ratio/mean": 0.02724597416818142, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 6.101531982421875, "sampling/sampling_logp_difference/mean": 0.01708817295730114, "step": 143, "step_time": 244.08953675592784 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01145833358168602, "completions/max_length": 10000.0, "completions/max_terminated_length": 9905.0, "completions/mean_length": 1664.17822265625, "completions/mean_terminated_length": 1567.5562744140625, "completions/min_length": 207.0, "completions/min_terminated_length": 207.0, "entropy": 0.12960029890139899, "epoch": 0.09612817089452604, "frac_reward_zero_std": 0.03333333507180214, "grad_norm": 9.539925033539107e-05, "learning_rate": 5e-06, "loss": 0.0001, "num_tokens": 367164387.0, "reward": 0.758220911026001, "reward_std": 0.3305254578590393, "rewards/TRLRewardAdapter/mean": 0.7582208514213562, "rewards/TRLRewardAdapter/std": 0.3305254280567169, "sampling/importance_sampling_ratio/max": 1.5038800239562988, "sampling/importance_sampling_ratio/mean": 0.06554661691188812, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 8.112421035766602, "sampling/sampling_logp_difference/mean": 0.01695973426103592, "step": 144, "step_time": 249.5160663970746 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01979166828095913, "completions/max_length": 10000.0, "completions/max_terminated_length": 9565.0, "completions/mean_length": 1791.877197265625, "completions/mean_terminated_length": 1626.14453125, "completions/min_length": 177.0, "completions/min_terminated_length": 177.0, "entropy": 0.11942102139194806, "epoch": 0.09679572763684913, "frac_reward_zero_std": 0.06666667014360428, "grad_norm": 7.364744440420649e-05, "learning_rate": 5e-06, "loss": -0.0, "num_tokens": 369396269.0, "reward": 0.7236497402191162, "reward_std": 0.36489593982696533, "rewards/TRLRewardAdapter/mean": 0.7236497402191162, "rewards/TRLRewardAdapter/std": 0.36489593982696533, "sampling/importance_sampling_ratio/max": 1.175023078918457, "sampling/importance_sampling_ratio/mean": 0.05448322743177414, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 37.263919830322266, "sampling/sampling_logp_difference/mean": 0.015838494524359703, "step": 145, "step_time": 318.82094568107277 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03750000149011612, "completions/max_length": 10000.0, "completions/max_terminated_length": 9876.0, "completions/mean_length": 1528.6917724609375, "completions/mean_terminated_length": 1198.6407470703125, "completions/min_length": 164.0, "completions/min_terminated_length": 164.0, "entropy": 0.11710108444094658, "epoch": 0.09746328437917223, "frac_reward_zero_std": 0.06666667014360428, "grad_norm": 0.00023603061918306872, "learning_rate": 5e-06, "loss": 0.0, "num_tokens": 371304485.0, "reward": 0.7977306842803955, "reward_std": 0.3176351487636566, "rewards/TRLRewardAdapter/mean": 0.7977306246757507, "rewards/TRLRewardAdapter/std": 0.31763511896133423, "sampling/importance_sampling_ratio/max": 1.5682920217514038, "sampling/importance_sampling_ratio/mean": 0.055906809866428375, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 31.45545196533203, "sampling/sampling_logp_difference/mean": 0.016202116385102272, "step": 146, "step_time": 390.41005204000976 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03437500074505806, "completions/max_length": 10000.0, "completions/max_terminated_length": 9496.0, "completions/mean_length": 2014.49072265625, "completions/mean_terminated_length": 1730.216796875, "completions/min_length": 145.0, "completions/min_terminated_length": 145.0, "entropy": 0.12879295150438944, "epoch": 0.09813084112149532, "frac_reward_zero_std": 0.06666667014360428, "grad_norm": 5.1310358895206175e-06, "learning_rate": 5e-06, "loss": -0.0, "num_tokens": 373703996.0, "reward": 0.7048105001449585, "reward_std": 0.3717564642429352, "rewards/TRLRewardAdapter/mean": 0.7048104405403137, "rewards/TRLRewardAdapter/std": 0.3717564642429352, "sampling/importance_sampling_ratio/max": 1.3909571170806885, "sampling/importance_sampling_ratio/mean": 0.05960006266832352, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 15.035170555114746, "sampling/sampling_logp_difference/mean": 0.017401769757270813, "step": 147, "step_time": 391.8164426990552 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03541667014360428, "completions/max_length": 10000.0, "completions/max_terminated_length": 9967.0, "completions/mean_length": 2415.05126953125, "completions/mean_terminated_length": 2136.553955078125, "completions/min_length": 191.0, "completions/min_terminated_length": 191.0, "entropy": 0.13208756844202676, "epoch": 0.09879839786381843, "frac_reward_zero_std": 0.03333333507180214, "grad_norm": 7.467457801268445e-06, "learning_rate": 5e-06, "loss": -0.0, "num_tokens": 376485165.0, "reward": 0.6942039132118225, "reward_std": 0.3856877088546753, "rewards/TRLRewardAdapter/mean": 0.6942039132118225, "rewards/TRLRewardAdapter/std": 0.3856877088546753, "sampling/importance_sampling_ratio/max": 1.076148509979248, "sampling/importance_sampling_ratio/mean": 0.0702730268239975, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 38.5, "sampling/sampling_logp_difference/mean": 0.017344176769256592, "step": 148, "step_time": 289.07324224791955 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00729166716337204, "completions/max_length": 10000.0, "completions/max_terminated_length": 9823.0, "completions/mean_length": 1376.56884765625, "completions/mean_terminated_length": 1313.2276611328125, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "entropy": 0.12632046143213907, "epoch": 0.09946595460614152, "frac_reward_zero_std": 0.03333333507180214, "grad_norm": 4.998420337660802e-05, "learning_rate": 5e-06, "loss": -0.0001, "num_tokens": 378230895.0, "reward": 0.7921905517578125, "reward_std": 0.31154507398605347, "rewards/TRLRewardAdapter/mean": 0.7921904921531677, "rewards/TRLRewardAdapter/std": 0.31154507398605347, "sampling/importance_sampling_ratio/max": 1.8002007007598877, "sampling/importance_sampling_ratio/mean": 0.08501424640417099, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 4.25, "sampling/sampling_logp_difference/mean": 0.017084039747714996, "step": 149, "step_time": 260.1785296329763 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0364583358168602, "completions/max_length": 10000.0, "completions/max_terminated_length": 9994.0, "completions/mean_length": 1681.3802490234375, "completions/mean_terminated_length": 1366.6217041015625, "completions/min_length": 207.0, "completions/min_terminated_length": 207.0, "entropy": 0.11658043290177982, "epoch": 0.10013351134846461, "frac_reward_zero_std": 0.10000000894069672, "grad_norm": 0.00010239766064762541, "learning_rate": 5e-06, "loss": -0.0001, "num_tokens": 380282972.0, "reward": 0.7411487698554993, "reward_std": 0.3671128749847412, "rewards/TRLRewardAdapter/mean": 0.7411487698554993, "rewards/TRLRewardAdapter/std": 0.3671128749847412, "sampling/importance_sampling_ratio/max": 2.462307929992676, "sampling/importance_sampling_ratio/mean": 0.07966098189353943, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 17.75, "sampling/sampling_logp_difference/mean": 0.01595127396285534, "step": 150, "step_time": 334.5855113271391 }, { "epoch": 0.10013351134846461, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.027826086091606514, "eval_completions/max_length": 10000.0, "eval_completions/max_terminated_length": 9677.91304347826, "eval_completions/mean_length": 2379.2464652683425, "eval_completions/mean_terminated_length": 2161.611386506454, "eval_completions/min_length": 122.73913043478261, "eval_completions/min_terminated_length": 122.73913043478261, "eval_entropy": 0.1519258747930112, "eval_frac_reward_zero_std": 1.0, "eval_loss": 0.0, "eval_num_tokens": 380282972.0, "eval_reward": 0.6937050923057224, "eval_reward_std": 0.3625601387542227, "eval_rewards/TRLRewardAdapter/mean": 0.6937051104462665, "eval_rewards/TRLRewardAdapter/std": 0.36256014782449475, "eval_runtime": 1477.3959, "eval_samples_per_second": 3.091, "eval_sampling/importance_sampling_ratio/max": 1.1339411191318347, "eval_sampling/importance_sampling_ratio/mean": 0.06450947017773338, "eval_sampling/importance_sampling_ratio/min": 0.0, "eval_sampling/sampling_logp_difference/max": 11.573608730150307, "eval_sampling/sampling_logp_difference/mean": 0.019229933133591778, "eval_steps_per_second": 0.016, "step": 150 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01145833358168602, "completions/max_length": 10000.0, "completions/max_terminated_length": 9773.0, "completions/mean_length": 1611.279296875, "completions/mean_terminated_length": 1514.044189453125, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "entropy": 0.1055021844804287, "epoch": 0.10080106809078772, "frac_reward_zero_std": 0.03333333507180214, "grad_norm": 2.5331163112837194e-05, "learning_rate": 5e-06, "loss": -0.0, "num_tokens": 382217384.0, "reward": 0.7711198329925537, "reward_std": 0.33840814232826233, "rewards/TRLRewardAdapter/mean": 0.7711198329925537, "rewards/TRLRewardAdapter/std": 0.33840814232826233, "sampling/importance_sampling_ratio/max": 2.915998697280884, "sampling/importance_sampling_ratio/mean": 0.10054993629455566, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 29.45545196533203, "sampling/sampling_logp_difference/mean": 0.014749045483767986, "step": 151, "step_time": 236.6893855878152 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00937500037252903, "completions/max_length": 10000.0, "completions/max_terminated_length": 9945.0, "completions/mean_length": 1507.8521728515625, "completions/mean_terminated_length": 1427.484619140625, "completions/min_length": 180.0, "completions/min_terminated_length": 180.0, "entropy": 0.12517911568284035, "epoch": 0.10146862483311081, "frac_reward_zero_std": 0.03333333507180214, "grad_norm": 5.962142543616558e-05, "learning_rate": 5e-06, "loss": 0.0, "num_tokens": 384089018.0, "reward": 0.7268325686454773, "reward_std": 0.3861857056617737, "rewards/TRLRewardAdapter/mean": 0.7268325090408325, "rewards/TRLRewardAdapter/std": 0.3861857056617737, "sampling/importance_sampling_ratio/max": 0.9711188673973083, "sampling/importance_sampling_ratio/mean": 0.06700102984905243, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 17.95545196533203, "sampling/sampling_logp_difference/mean": 0.016836466267704964, "step": 152, "step_time": 299.85249807231594 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01875000074505806, "completions/max_length": 10000.0, "completions/max_terminated_length": 9645.0, "completions/mean_length": 1574.14794921875, "completions/mean_terminated_length": 1413.1444091796875, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "entropy": 0.13228378693262735, "epoch": 0.10213618157543392, "frac_reward_zero_std": 0.06666667014360428, "grad_norm": 1.1240816477766067e-05, "learning_rate": 5e-06, "loss": -0.0, "num_tokens": 385996296.0, "reward": 0.7366042137145996, "reward_std": 0.36069464683532715, "rewards/TRLRewardAdapter/mean": 0.7366041541099548, "rewards/TRLRewardAdapter/std": 0.36069464683532715, "sampling/importance_sampling_ratio/max": 1.8058819770812988, "sampling/importance_sampling_ratio/mean": 0.06281433254480362, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 17.219562530517578, "sampling/sampling_logp_difference/mean": 0.0171809084713459, "step": 153, "step_time": 275.29101295094006 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 10000.0, "completions/max_terminated_length": 9530.0, "completions/mean_length": 1446.46044921875, "completions/mean_terminated_length": 1310.68994140625, "completions/min_length": 176.0, "completions/min_terminated_length": 176.0, "entropy": 0.10414906839529674, "epoch": 0.102803738317757, "frac_reward_zero_std": 0.03333333507180214, "grad_norm": 0.0004272391739016317, "learning_rate": 5e-06, "loss": 0.0001, "num_tokens": 387836258.0, "reward": 0.7876104116439819, "reward_std": 0.31673189997673035, "rewards/TRLRewardAdapter/mean": 0.7876103520393372, "rewards/TRLRewardAdapter/std": 0.31673189997673035, "sampling/importance_sampling_ratio/max": 0.9811664819717407, "sampling/importance_sampling_ratio/mean": 0.06324245780706406, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 10.205451965332031, "sampling/sampling_logp_difference/mean": 0.014519675634801388, "step": 154, "step_time": 244.34026735904627 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.05416667088866234, "completions/max_length": 10000.0, "completions/max_terminated_length": 9941.0, "completions/mean_length": 1777.2115478515625, "completions/mean_terminated_length": 1306.3028564453125, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "entropy": 0.11276281997561455, "epoch": 0.10347129506008011, "frac_reward_zero_std": 0.06666667014360428, "grad_norm": 1.51783215593763e-05, "learning_rate": 5e-06, "loss": 0.0, "num_tokens": 389956973.0, "reward": 0.7846503257751465, "reward_std": 0.3251423239707947, "rewards/TRLRewardAdapter/mean": 0.7846502661705017, "rewards/TRLRewardAdapter/std": 0.3251423239707947, "sampling/importance_sampling_ratio/max": 1.4768052101135254, "sampling/importance_sampling_ratio/mean": 0.09205622971057892, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 27.71956443786621, "sampling/sampling_logp_difference/mean": 0.015325195156037807, "step": 155, "step_time": 398.74202973989304 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0010416667209938169, "completions/max_length": 10000.0, "completions/max_terminated_length": 9560.0, "completions/mean_length": 1603.4271240234375, "completions/mean_terminated_length": 1594.6715087890625, "completions/min_length": 154.0, "completions/min_terminated_length": 154.0, "entropy": 0.13454734285672507, "epoch": 0.1041388518024032, "frac_reward_zero_std": 0.0, "grad_norm": 3.1351585816758735e-05, "learning_rate": 5e-06, "loss": -0.0, "num_tokens": 391909959.0, "reward": 0.830170214176178, "reward_std": 0.2295290231704712, "rewards/TRLRewardAdapter/mean": 0.830170214176178, "rewards/TRLRewardAdapter/std": 0.2295290231704712, "sampling/importance_sampling_ratio/max": 1.3259351253509521, "sampling/importance_sampling_ratio/mean": 0.04692316800355911, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 11.969564437866211, "sampling/sampling_logp_difference/mean": 0.017764762043952942, "step": 156, "step_time": 151.47441462508868 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02083333395421505, "completions/max_length": 10000.0, "completions/max_terminated_length": 9749.0, "completions/mean_length": 2186.6396484375, "completions/mean_terminated_length": 2020.3978271484375, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "entropy": 0.135329137245814, "epoch": 0.1048064085447263, "frac_reward_zero_std": 0.06666667014360428, "grad_norm": 2.419241183173897e-05, "learning_rate": 5e-06, "loss": -0.0, "num_tokens": 394431277.0, "reward": 0.6653361916542053, "reward_std": 0.3968357443809509, "rewards/TRLRewardAdapter/mean": 0.6653361916542053, "rewards/TRLRewardAdapter/std": 0.39683568477630615, "sampling/importance_sampling_ratio/max": 1.545964241027832, "sampling/importance_sampling_ratio/mean": 0.11880049854516983, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 40.5, "sampling/sampling_logp_difference/mean": 0.01839216612279415, "step": 157, "step_time": 248.3765511349775 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02395833469927311, "completions/max_length": 10000.0, "completions/max_terminated_length": 9775.0, "completions/mean_length": 2003.087646484375, "completions/mean_terminated_length": 1806.7918701171875, "completions/min_length": 184.0, "completions/min_terminated_length": 184.0, "entropy": 0.11702617506186168, "epoch": 0.1054739652870494, "frac_reward_zero_std": 0.13333334028720856, "grad_norm": 6.585846838739693e-05, "learning_rate": 5e-06, "loss": 0.001, "num_tokens": 396819521.0, "reward": 0.6708590388298035, "reward_std": 0.40644270181655884, "rewards/TRLRewardAdapter/mean": 0.6708590388298035, "rewards/TRLRewardAdapter/std": 0.40644270181655884, "sampling/importance_sampling_ratio/max": 1.6557930707931519, "sampling/importance_sampling_ratio/mean": 0.07631010562181473, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 14.309269905090332, "sampling/sampling_logp_difference/mean": 0.01588190533220768, "step": 158, "step_time": 327.4894297491992 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0052083334885537624, "completions/max_length": 10000.0, "completions/max_terminated_length": 9816.0, "completions/mean_length": 1521.3896484375, "completions/mean_terminated_length": 1476.9990234375, "completions/min_length": 176.0, "completions/min_terminated_length": 176.0, "entropy": 0.11458468188842137, "epoch": 0.10614152202937249, "frac_reward_zero_std": 0.0, "grad_norm": 9.832217706461293e-05, "learning_rate": 5e-06, "loss": -0.0001, "num_tokens": 398701559.0, "reward": 0.7800343036651611, "reward_std": 0.3128453195095062, "rewards/TRLRewardAdapter/mean": 0.7800342440605164, "rewards/TRLRewardAdapter/std": 0.3128453195095062, "sampling/importance_sampling_ratio/max": 1.2843810319900513, "sampling/importance_sampling_ratio/mean": 0.051958054304122925, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 6.28125, "sampling/sampling_logp_difference/mean": 0.01560790091753006, "step": 159, "step_time": 286.50617076898925 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00729166716337204, "completions/max_length": 10000.0, "completions/max_terminated_length": 9853.0, "completions/mean_length": 1724.0010986328125, "completions/mean_terminated_length": 1663.2119140625, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "entropy": 0.12688563267389932, "epoch": 0.1068090787716956, "frac_reward_zero_std": 0.06666667014360428, "grad_norm": 4.006229290563662e-05, "learning_rate": 5e-06, "loss": -0.0, "num_tokens": 400810936.0, "reward": 0.7313457727432251, "reward_std": 0.3640768229961395, "rewards/TRLRewardAdapter/mean": 0.7313457727432251, "rewards/TRLRewardAdapter/std": 0.3640768229961395, "sampling/importance_sampling_ratio/max": 1.986310601234436, "sampling/importance_sampling_ratio/mean": 0.10960646718740463, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 9.875, "sampling/sampling_logp_difference/mean": 0.016984816640615463, "step": 160, "step_time": 290.75331439613365 }, { "epoch": 0.1068090787716956, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.0284782601925342, "eval_completions/max_length": 10000.0, "eval_completions/max_terminated_length": 9702.04347826087, "eval_completions/mean_length": 2352.8240860648775, "eval_completions/mean_terminated_length": 2128.8152120838995, "eval_completions/min_length": 114.30434782608695, "eval_completions/min_terminated_length": 114.30434782608695, "eval_entropy": 0.1525176024955252, "eval_frac_reward_zero_std": 1.0, "eval_loss": 0.0, "eval_num_tokens": 400810936.0, "eval_reward": 0.6961859697880952, "eval_reward_std": 0.362205912237582, "eval_rewards/TRLRewardAdapter/mean": 0.6961859827456267, "eval_rewards/TRLRewardAdapter/std": 0.3622059187163477, "eval_runtime": 1452.2508, "eval_samples_per_second": 3.144, "eval_sampling/importance_sampling_ratio/max": 1.326102199761764, "eval_sampling/importance_sampling_ratio/mean": 0.07548981953574263, "eval_sampling/importance_sampling_ratio/min": 0.0, "eval_sampling/sampling_logp_difference/max": 11.759410671565844, "eval_sampling/sampling_logp_difference/mean": 0.019364864524939785, "eval_steps_per_second": 0.016, "step": 160 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.05625000223517418, "completions/max_length": 10000.0, "completions/max_terminated_length": 9954.0, "completions/mean_length": 1551.4813232421875, "completions/mean_terminated_length": 1047.9271240234375, "completions/min_length": 226.0, "completions/min_terminated_length": 226.0, "entropy": 0.11241903280218442, "epoch": 0.10747663551401869, "frac_reward_zero_std": 0.10000000894069672, "grad_norm": 0.0006817205751395026, "learning_rate": 5e-06, "loss": 0.0005, "num_tokens": 402713638.0, "reward": 0.7665219902992249, "reward_std": 0.3447123169898987, "rewards/TRLRewardAdapter/mean": 0.7665219902992249, "rewards/TRLRewardAdapter/std": 0.3447123169898987, "sampling/importance_sampling_ratio/max": 1.3029905557632446, "sampling/importance_sampling_ratio/mean": 0.051413048058748245, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 7.469563007354736, "sampling/sampling_logp_difference/mean": 0.015512119047343731, "step": 161, "step_time": 394.698671505088 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02500000223517418, "completions/max_length": 10000.0, "completions/max_terminated_length": 9924.0, "completions/mean_length": 2017.7510986328125, "completions/mean_terminated_length": 1813.078125, "completions/min_length": 190.0, "completions/min_terminated_length": 190.0, "entropy": 0.11456204702456792, "epoch": 0.1081441922563418, "frac_reward_zero_std": 0.10000000894069672, "grad_norm": 7.848842516422998e-05, "learning_rate": 5e-06, "loss": -0.0001, "num_tokens": 405117591.0, "reward": 0.7282195091247559, "reward_std": 0.38265085220336914, "rewards/TRLRewardAdapter/mean": 0.7282194495201111, "rewards/TRLRewardAdapter/std": 0.38265082240104675, "sampling/importance_sampling_ratio/max": 0.9695773720741272, "sampling/importance_sampling_ratio/mean": 0.09382928907871246, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 58.213584899902344, "sampling/sampling_logp_difference/mean": 0.015914976596832275, "step": 162, "step_time": 312.2502204857301 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.010416666977107525, "completions/max_length": 10000.0, "completions/max_terminated_length": 9915.0, "completions/mean_length": 1561.7042236328125, "completions/mean_terminated_length": 1472.8798828125, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "entropy": 0.12358297159274419, "epoch": 0.10881174899866489, "frac_reward_zero_std": 0.10000000894069672, "grad_norm": 0.00010687441008406579, "learning_rate": 5e-06, "loss": -0.0, "num_tokens": 407003419.0, "reward": 0.6793323755264282, "reward_std": 0.40592122077941895, "rewards/TRLRewardAdapter/mean": 0.6793323755264282, "rewards/TRLRewardAdapter/std": 0.40592119097709656, "sampling/importance_sampling_ratio/max": 1.3988263607025146, "sampling/importance_sampling_ratio/mean": 0.11524154245853424, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 22.53516960144043, "sampling/sampling_logp_difference/mean": 0.016662752255797386, "step": 163, "step_time": 249.21956890402362 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02500000223517418, "completions/max_length": 10000.0, "completions/max_terminated_length": 9871.0, "completions/mean_length": 1939.759521484375, "completions/mean_terminated_length": 1733.086669921875, "completions/min_length": 121.0, "completions/min_terminated_length": 121.0, "entropy": 0.09411596631010373, "epoch": 0.10947930574098798, "frac_reward_zero_std": 0.0, "grad_norm": 0.00010972651083887833, "learning_rate": 5e-06, "loss": -0.0002, "num_tokens": 409316916.0, "reward": 0.7786769270896912, "reward_std": 0.30987218022346497, "rewards/TRLRewardAdapter/mean": 0.7786768674850464, "rewards/TRLRewardAdapter/std": 0.3098721504211426, "sampling/importance_sampling_ratio/max": 1.3678574562072754, "sampling/importance_sampling_ratio/mean": 0.09667681902647018, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 9.692296028137207, "sampling/sampling_logp_difference/mean": 0.013183471746742725, "step": 164, "step_time": 339.14866313082166 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0020833334419876337, "completions/max_length": 10000.0, "completions/max_terminated_length": 9592.0, "completions/mean_length": 1573.425048828125, "completions/mean_terminated_length": 1555.8330078125, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "entropy": 0.13461816310882568, "epoch": 0.11014686248331108, "frac_reward_zero_std": 0.03333333507180214, "grad_norm": 8.242464570565928e-05, "learning_rate": 5e-06, "loss": -0.0, "num_tokens": 411284812.0, "reward": 0.7970845699310303, "reward_std": 0.2950076162815094, "rewards/TRLRewardAdapter/mean": 0.7970845699310303, "rewards/TRLRewardAdapter/std": 0.2950076162815094, "sampling/importance_sampling_ratio/max": 1.6130881309509277, "sampling/importance_sampling_ratio/mean": 0.0815000832080841, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 5.1818528175354, "sampling/sampling_logp_difference/mean": 0.01816798746585846, "step": 165, "step_time": 214.31559586594813 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02500000223517418, "completions/max_length": 10000.0, "completions/max_terminated_length": 9964.0, "completions/mean_length": 1914.69384765625, "completions/mean_terminated_length": 1707.3782958984375, "completions/min_length": 85.0, "completions/min_terminated_length": 85.0, "entropy": 0.12591932465632757, "epoch": 0.11081441922563418, "frac_reward_zero_std": 0.0, "grad_norm": 0.00012725181508851387, "learning_rate": 5e-06, "loss": 0.0002, "num_tokens": 413642054.0, "reward": 0.7520473599433899, "reward_std": 0.3353344202041626, "rewards/TRLRewardAdapter/mean": 0.7520473599433899, "rewards/TRLRewardAdapter/std": 0.3353344202041626, "sampling/importance_sampling_ratio/max": 2.3689584732055664, "sampling/importance_sampling_ratio/mean": 0.07621227204799652, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 17.45545196533203, "sampling/sampling_logp_difference/mean": 0.016877667978405952, "step": 166, "step_time": 325.60074338386767 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.04479166865348816, "completions/max_length": 10000.0, "completions/max_terminated_length": 9705.0, "completions/mean_length": 2305.492919921875, "completions/mean_terminated_length": 1944.6815185546875, "completions/min_length": 128.0, "completions/min_terminated_length": 128.0, "entropy": 0.14563562472661337, "epoch": 0.11148197596795728, "frac_reward_zero_std": 0.03333333507180214, "grad_norm": 6.142816370724688e-05, "learning_rate": 5e-06, "loss": -0.0001, "num_tokens": 416322303.0, "reward": 0.690281867980957, "reward_std": 0.3684394359588623, "rewards/TRLRewardAdapter/mean": 0.6902818083763123, "rewards/TRLRewardAdapter/std": 0.3684394061565399, "sampling/importance_sampling_ratio/max": 1.755854606628418, "sampling/importance_sampling_ratio/mean": 0.09705977141857147, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 18.375, "sampling/sampling_logp_difference/mean": 0.01864590309560299, "step": 167, "step_time": 283.01568886695895 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.06979166716337204, "completions/max_length": 10000.0, "completions/max_terminated_length": 9830.0, "completions/mean_length": 1938.838623046875, "completions/mean_terminated_length": 1334.0257568359375, "completions/min_length": 94.0, "completions/min_terminated_length": 94.0, "entropy": 0.13773382206757864, "epoch": 0.11214953271028037, "frac_reward_zero_std": 0.06666667014360428, "grad_norm": 0.0001225243812592436, "learning_rate": 5e-06, "loss": 0.0, "num_tokens": 418624068.0, "reward": 0.7478582262992859, "reward_std": 0.3696529269218445, "rewards/TRLRewardAdapter/mean": 0.7478581666946411, "rewards/TRLRewardAdapter/std": 0.3696529269218445, "sampling/importance_sampling_ratio/max": 1.6435810327529907, "sampling/importance_sampling_ratio/mean": 0.1233157068490982, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 11.205453872680664, "sampling/sampling_logp_difference/mean": 0.018473615869879723, "step": 168, "step_time": 399.46194162697066 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02291666716337204, "completions/max_length": 10000.0, "completions/max_terminated_length": 9910.0, "completions/mean_length": 1889.5074462890625, "completions/mean_terminated_length": 1699.2825927734375, "completions/min_length": 134.0, "completions/min_terminated_length": 134.0, "entropy": 0.15432061006625494, "epoch": 0.11281708945260348, "frac_reward_zero_std": 0.10000000894069672, "grad_norm": 8.429299319420713e-05, "learning_rate": 5e-06, "loss": -0.0, "num_tokens": 420878219.0, "reward": 0.6872667670249939, "reward_std": 0.39916694164276123, "rewards/TRLRewardAdapter/mean": 0.6872667074203491, "rewards/TRLRewardAdapter/std": 0.39916691184043884, "sampling/importance_sampling_ratio/max": 2.780564308166504, "sampling/importance_sampling_ratio/mean": 0.13409915566444397, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 57.0, "sampling/sampling_logp_difference/mean": 0.019472500309348106, "step": 169, "step_time": 343.45780162513256 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0031250002793967724, "completions/max_length": 10000.0, "completions/max_terminated_length": 9776.0, "completions/mean_length": 1756.3604736328125, "completions/mean_terminated_length": 1730.5181884765625, "completions/min_length": 79.0, "completions/min_terminated_length": 79.0, "entropy": 0.14001645892858505, "epoch": 0.11348464619492657, "frac_reward_zero_std": 0.03333333507180214, "grad_norm": 2.3721163292165785e-05, "learning_rate": 5e-06, "loss": -0.0, "num_tokens": 423003973.0, "reward": 0.685125470161438, "reward_std": 0.3935852348804474, "rewards/TRLRewardAdapter/mean": 0.6851254105567932, "rewards/TRLRewardAdapter/std": 0.3935852348804474, "sampling/importance_sampling_ratio/max": 1.5537537336349487, "sampling/importance_sampling_ratio/mean": 0.13283765316009521, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 4.966457843780518, "sampling/sampling_logp_difference/mean": 0.018683936446905136, "step": 170, "step_time": 164.62587461201474 }, { "epoch": 0.11348464619492657, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.03478260751327743, "eval_completions/max_length": 10000.0, "eval_completions/max_terminated_length": 9750.91304347826, "eval_completions/mean_length": 2378.2897312330165, "eval_completions/mean_terminated_length": 2103.494591754416, "eval_completions/min_length": 78.82608695652173, "eval_completions/min_terminated_length": 78.82608695652173, "eval_entropy": 0.15513058784215347, "eval_frac_reward_zero_std": 1.0, "eval_loss": 0.0, "eval_num_tokens": 423003973.0, "eval_reward": 0.6909672410591788, "eval_reward_std": 0.37204208710919257, "eval_rewards/TRLRewardAdapter/mean": 0.6909672566082167, "eval_rewards/TRLRewardAdapter/std": 0.3720420845176863, "eval_runtime": 1471.393, "eval_samples_per_second": 3.103, "eval_sampling/importance_sampling_ratio/max": 1.287332788757656, "eval_sampling/importance_sampling_ratio/mean": 0.1068178363468336, "eval_sampling/importance_sampling_ratio/min": 0.0, "eval_sampling/sampling_logp_difference/max": 15.720460176467896, "eval_sampling/sampling_logp_difference/mean": 0.019769671899469002, "eval_steps_per_second": 0.016, "step": 170 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9151.0, "completions/max_terminated_length": 9151.0, "completions/mean_length": 1063.89794921875, "completions/mean_terminated_length": 1063.89794921875, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "entropy": 0.11908687154452006, "epoch": 0.11415220293724966, "frac_reward_zero_std": 0.03333333507180214, "grad_norm": 0.00026745912275093827, "learning_rate": 5e-06, "loss": 0.0003, "num_tokens": 424437091.0, "reward": 0.8286584615707397, "reward_std": 0.2696155607700348, "rewards/TRLRewardAdapter/mean": 0.828658401966095, "rewards/TRLRewardAdapter/std": 0.2696155607700348, "sampling/importance_sampling_ratio/max": 2.8751087188720703, "sampling/importance_sampling_ratio/mean": 0.10444097220897675, "sampling/importance_sampling_ratio/min": 2.7927878393993604e-42, "sampling/sampling_logp_difference/max": 4.5, "sampling/sampling_logp_difference/mean": 0.016071438789367676, "step": 171, "step_time": 95.64353674114682 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0989583358168602, "completions/max_length": 10000.0, "completions/max_terminated_length": 9922.0, "completions/mean_length": 3197.195068359375, "completions/mean_terminated_length": 2450.06591796875, "completions/min_length": 57.0, "completions/min_terminated_length": 57.0, "entropy": 0.1374619280298551, "epoch": 0.11481975967957277, "frac_reward_zero_std": 0.1666666716337204, "grad_norm": 9.350555646090139e-06, "learning_rate": 5e-06, "loss": -0.0, "num_tokens": 427960062.0, "reward": 0.6007449626922607, "reward_std": 0.4292110800743103, "rewards/TRLRewardAdapter/mean": 0.6007449626922607, "rewards/TRLRewardAdapter/std": 0.4292110800743103, "sampling/importance_sampling_ratio/max": 1.2448867559432983, "sampling/importance_sampling_ratio/mean": 0.060755275189876556, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 23.63402557373047, "sampling/sampling_logp_difference/mean": 0.018138069659471512, "step": 172, "step_time": 379.78182051493786 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09062500298023224, "completions/max_length": 10000.0, "completions/max_terminated_length": 9959.0, "completions/mean_length": 2627.431396484375, "completions/mean_terminated_length": 1892.7080078125, "completions/min_length": 73.0, "completions/min_terminated_length": 73.0, "entropy": 0.12334443007906278, "epoch": 0.11548731642189586, "frac_reward_zero_std": 0.1666666716337204, "grad_norm": 5.95191764565004e-05, "learning_rate": 5e-06, "loss": -0.0001, "num_tokens": 430949212.0, "reward": 0.6271807551383972, "reward_std": 0.4327279329299927, "rewards/TRLRewardAdapter/mean": 0.6271807551383972, "rewards/TRLRewardAdapter/std": 0.4327279329299927, "sampling/importance_sampling_ratio/max": 1.8865550756454468, "sampling/importance_sampling_ratio/mean": 0.1162252426147461, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 36.75, "sampling/sampling_logp_difference/mean": 0.016925811767578125, "step": 173, "step_time": 405.15434820216615 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02291666716337204, "completions/max_length": 10000.0, "completions/max_terminated_length": 9809.0, "completions/mean_length": 1791.9344482421875, "completions/mean_terminated_length": 1599.421142578125, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "entropy": 0.1439859022696813, "epoch": 0.11615487316421896, "frac_reward_zero_std": 0.10000000894069672, "grad_norm": 8.652375792364598e-05, "learning_rate": 5e-06, "loss": -0.0001, "num_tokens": 433096413.0, "reward": 0.6881095767021179, "reward_std": 0.40231311321258545, "rewards/TRLRewardAdapter/mean": 0.6881095170974731, "rewards/TRLRewardAdapter/std": 0.40231311321258545, "sampling/importance_sampling_ratio/max": 1.613453984260559, "sampling/importance_sampling_ratio/mean": 0.11005332320928574, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 48.5, "sampling/sampling_logp_difference/mean": 0.019123367965221405, "step": 174, "step_time": 314.5446395711042 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0010416667209938169, "completions/max_length": 10000.0, "completions/max_terminated_length": 9617.0, "completions/mean_length": 1170.67822265625, "completions/mean_terminated_length": 1161.4713134765625, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "entropy": 0.12343673532207806, "epoch": 0.11682242990654206, "frac_reward_zero_std": 0.0, "grad_norm": 4.957837660371401e-05, "learning_rate": 5e-06, "loss": -0.0, "num_tokens": 434656200.0, "reward": 0.8631541132926941, "reward_std": 0.23277562856674194, "rewards/TRLRewardAdapter/mean": 0.8631541132926941, "rewards/TRLRewardAdapter/std": 0.23277562856674194, "sampling/importance_sampling_ratio/max": 1.5887049436569214, "sampling/importance_sampling_ratio/mean": 0.1320357620716095, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 9.34498405456543, "sampling/sampling_logp_difference/mean": 0.01685042679309845, "step": 175, "step_time": 188.68177357304376 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03750000149011612, "completions/max_length": 10000.0, "completions/max_terminated_length": 9920.0, "completions/mean_length": 1944.479248046875, "completions/mean_terminated_length": 1630.627685546875, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "entropy": 0.13745656112829843, "epoch": 0.11748998664886515, "frac_reward_zero_std": 0.06666667014360428, "grad_norm": 0.0001248822432544961, "learning_rate": 5e-06, "loss": -0.0, "num_tokens": 436951220.0, "reward": 0.718528151512146, "reward_std": 0.37853848934173584, "rewards/TRLRewardAdapter/mean": 0.7185280919075012, "rewards/TRLRewardAdapter/std": 0.3785385191440582, "sampling/importance_sampling_ratio/max": 1.4728782176971436, "sampling/importance_sampling_ratio/mean": 0.10399536043405533, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 48.75, "sampling/sampling_logp_difference/mean": 0.018090328201651573, "step": 176, "step_time": 368.7817706208443 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0364583358168602, "completions/max_length": 10000.0, "completions/max_terminated_length": 9744.0, "completions/mean_length": 2219.322021484375, "completions/mean_terminated_length": 1924.9178466796875, "completions/min_length": 70.0, "completions/min_terminated_length": 70.0, "entropy": 0.12368625029921532, "epoch": 0.11815754339118825, "frac_reward_zero_std": 0.03333333507180214, "grad_norm": 1.0778792091391825e-05, "learning_rate": 5e-06, "loss": -0.0, "num_tokens": 439536009.0, "reward": 0.6815489530563354, "reward_std": 0.3821203410625458, "rewards/TRLRewardAdapter/mean": 0.6815489530563354, "rewards/TRLRewardAdapter/std": 0.3821203112602234, "sampling/importance_sampling_ratio/max": 1.85831618309021, "sampling/importance_sampling_ratio/mean": 0.07373838871717453, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 56.75, "sampling/sampling_logp_difference/mean": 0.016376739367842674, "step": 177, "step_time": 383.43573823885527 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.07187500596046448, "completions/max_length": 10000.0, "completions/max_terminated_length": 9987.0, "completions/mean_length": 2250.966796875, "completions/mean_terminated_length": 1650.873291015625, "completions/min_length": 149.0, "completions/min_terminated_length": 149.0, "entropy": 0.12446353708704312, "epoch": 0.11882510013351134, "frac_reward_zero_std": 0.06666667014360428, "grad_norm": 0.00017975583513250062, "learning_rate": 5e-06, "loss": -0.0001, "num_tokens": 442175657.0, "reward": 0.6742488741874695, "reward_std": 0.4120146930217743, "rewards/TRLRewardAdapter/mean": 0.6742488145828247, "rewards/TRLRewardAdapter/std": 0.4120146930217743, "sampling/importance_sampling_ratio/max": 1.1795647144317627, "sampling/importance_sampling_ratio/mean": 0.07377847284078598, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 34.75, "sampling/sampling_logp_difference/mean": 0.01679581217467785, "step": 178, "step_time": 398.78103870199993 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00729166716337204, "completions/max_length": 10000.0, "completions/max_terminated_length": 9478.0, "completions/mean_length": 1137.3250732421875, "completions/mean_terminated_length": 1072.2265625, "completions/min_length": 62.0, "completions/min_terminated_length": 62.0, "entropy": 0.12555034582813582, "epoch": 0.11949265687583445, "frac_reward_zero_std": 0.06666667014360428, "grad_norm": 0.0001250500047722847, "learning_rate": 5e-06, "loss": -0.0001, "num_tokens": 443657569.0, "reward": 0.8061574697494507, "reward_std": 0.33637648820877075, "rewards/TRLRewardAdapter/mean": 0.8061574101448059, "rewards/TRLRewardAdapter/std": 0.33637648820877075, "sampling/importance_sampling_ratio/max": 2.3244686126708984, "sampling/importance_sampling_ratio/mean": 0.15397408604621887, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 10.358428955078125, "sampling/sampling_logp_difference/mean": 0.017526661977171898, "step": 179, "step_time": 296.38614066294394 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.04479166865348816, "completions/max_length": 10000.0, "completions/max_terminated_length": 9780.0, "completions/mean_length": 2336.485595703125, "completions/mean_terminated_length": 1977.1275634765625, "completions/min_length": 77.0, "completions/min_terminated_length": 77.0, "entropy": 0.1303736021121343, "epoch": 0.12016021361815754, "frac_reward_zero_std": 0.03333333507180214, "grad_norm": 5.717001565466574e-05, "learning_rate": 5e-06, "loss": -0.0002, "num_tokens": 446383347.0, "reward": 0.6712610125541687, "reward_std": 0.38799822330474854, "rewards/TRLRewardAdapter/mean": 0.6712609529495239, "rewards/TRLRewardAdapter/std": 0.38799822330474854, "sampling/importance_sampling_ratio/max": 1.9918242692947388, "sampling/importance_sampling_ratio/mean": 0.11425326019525528, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 27.208984375, "sampling/sampling_logp_difference/mean": 0.017097637057304382, "step": 180, "step_time": 333.5126931448467 }, { "epoch": 0.12016021361815754, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.03521739031471636, "eval_completions/max_length": 10000.0, "eval_completions/max_terminated_length": 9715.95652173913, "eval_completions/mean_length": 2360.693019701087, "eval_completions/mean_terminated_length": 2081.8454377547555, "eval_completions/min_length": 74.21739130434783, "eval_completions/min_terminated_length": 74.21739130434783, "eval_entropy": 0.15638975928659024, "eval_frac_reward_zero_std": 1.0, "eval_loss": 0.0, "eval_num_tokens": 446383347.0, "eval_reward": 0.6934679902118185, "eval_reward_std": 0.37024089564447815, "eval_rewards/TRLRewardAdapter/mean": 0.6934679979863374, "eval_rewards/TRLRewardAdapter/std": 0.3702408995317376, "eval_runtime": 1469.094, "eval_samples_per_second": 3.108, "eval_sampling/importance_sampling_ratio/max": 1.5733484962712163, "eval_sampling/importance_sampling_ratio/mean": 0.12031055662942969, "eval_sampling/importance_sampling_ratio/min": 0.0, "eval_sampling/sampling_logp_difference/max": 13.339880176212477, "eval_sampling/sampling_logp_difference/mean": 0.019986141392070313, "eval_steps_per_second": 0.016, "step": 180 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 10000.0, "completions/max_terminated_length": 9958.0, "completions/mean_length": 1937.619873046875, "completions/mean_terminated_length": 1809.6455078125, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "entropy": 0.11436842133601506, "epoch": 0.12082777036048065, "frac_reward_zero_std": 0.03333333507180214, "grad_norm": 0.0004729986872480162, "learning_rate": 5e-06, "loss": -0.0001, "num_tokens": 448684262.0, "reward": 0.7022427320480347, "reward_std": 0.38038793206214905, "rewards/TRLRewardAdapter/mean": 0.7022427320480347, "rewards/TRLRewardAdapter/std": 0.3803878724575043, "sampling/importance_sampling_ratio/max": 1.6211986541748047, "sampling/importance_sampling_ratio/mean": 0.11462689936161041, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 10.75, "sampling/sampling_logp_difference/mean": 0.01598750613629818, "step": 181, "step_time": 258.8561512280721 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00937500037252903, "completions/max_length": 10000.0, "completions/max_terminated_length": 9644.0, "completions/mean_length": 1697.9854736328125, "completions/mean_terminated_length": 1619.4173583984375, "completions/min_length": 155.0, "completions/min_terminated_length": 155.0, "entropy": 0.15524009615182877, "epoch": 0.12149532710280374, "frac_reward_zero_std": 0.0, "grad_norm": 8.227240669000627e-05, "learning_rate": 5e-06, "loss": 0.0, "num_tokens": 450773976.0, "reward": 0.7843005657196045, "reward_std": 0.3007963299751282, "rewards/TRLRewardAdapter/mean": 0.7843005657196045, "rewards/TRLRewardAdapter/std": 0.3007963299751282, "sampling/importance_sampling_ratio/max": 1.528947353363037, "sampling/importance_sampling_ratio/mean": 0.06349576264619827, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 14.5, "sampling/sampling_logp_difference/mean": 0.020133279263973236, "step": 182, "step_time": 211.79882701404858 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 10000.0, "completions/max_terminated_length": 9723.0, "completions/mean_length": 2005.5635986328125, "completions/mean_terminated_length": 1747.6785888671875, "completions/min_length": 78.0, "completions/min_terminated_length": 78.0, "entropy": 0.1179102895160516, "epoch": 0.12216288384512683, "frac_reward_zero_std": 0.10000000894069672, "grad_norm": 6.047085814737633e-05, "learning_rate": 5e-06, "loss": -0.0001, "num_tokens": 453088437.0, "reward": 0.688823938369751, "reward_std": 0.394115686416626, "rewards/TRLRewardAdapter/mean": 0.6888238787651062, "rewards/TRLRewardAdapter/std": 0.394115686416626, "sampling/importance_sampling_ratio/max": 2.64967942237854, "sampling/importance_sampling_ratio/mean": 0.12295623868703842, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 50.5, "sampling/sampling_logp_difference/mean": 0.016105731949210167, "step": 183, "step_time": 385.4205216129776 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00729166716337204, "completions/max_length": 10000.0, "completions/max_terminated_length": 9928.0, "completions/mean_length": 1802.916748046875, "completions/mean_terminated_length": 1742.7071533203125, "completions/min_length": 65.0, "completions/min_terminated_length": 65.0, "entropy": 0.13280471165974936, "epoch": 0.12283044058744993, "frac_reward_zero_std": 0.0, "grad_norm": 2.0303571836287468e-05, "learning_rate": 5e-06, "loss": -0.0, "num_tokens": 455278693.0, "reward": 0.7768964171409607, "reward_std": 0.31320250034332275, "rewards/TRLRewardAdapter/mean": 0.7768963575363159, "rewards/TRLRewardAdapter/std": 0.31320247054100037, "sampling/importance_sampling_ratio/max": 2.2319748401641846, "sampling/importance_sampling_ratio/mean": 0.08985724300146103, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 15.625, "sampling/sampling_logp_difference/mean": 0.017921578139066696, "step": 184, "step_time": 339.88350935583003 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 10000.0, "completions/max_terminated_length": 9966.0, "completions/mean_length": 1277.6063232421875, "completions/mean_terminated_length": 1139.155517578125, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "entropy": 0.11982540786266327, "epoch": 0.12349799732977303, "frac_reward_zero_std": 0.0, "grad_norm": 0.0004378204180766612, "learning_rate": 5e-06, "loss": -0.0001, "num_tokens": 456943083.0, "reward": 0.8398846983909607, "reward_std": 0.25813421607017517, "rewards/TRLRewardAdapter/mean": 0.8398846387863159, "rewards/TRLRewardAdapter/std": 0.25813421607017517, "sampling/importance_sampling_ratio/max": 2.5287773609161377, "sampling/importance_sampling_ratio/mean": 0.18552328646183014, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 40.498069763183594, "sampling/sampling_logp_difference/mean": 0.01603524573147297, "step": 185, "step_time": 299.5427632268984 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.05625000223517418, "completions/max_length": 10000.0, "completions/max_terminated_length": 9981.0, "completions/mean_length": 2458.84912109375, "completions/mean_terminated_length": 2009.3763427734375, "completions/min_length": 84.0, "completions/min_terminated_length": 84.0, "entropy": 0.13629320512215296, "epoch": 0.12416555407209613, "frac_reward_zero_std": 0.10000000894069672, "grad_norm": 4.9603828677109936e-05, "learning_rate": 5e-06, "loss": -0.0001, "num_tokens": 459730810.0, "reward": 0.7173424363136292, "reward_std": 0.3475840985774994, "rewards/TRLRewardAdapter/mean": 0.7173423767089844, "rewards/TRLRewardAdapter/std": 0.3475840985774994, "sampling/importance_sampling_ratio/max": 1.458384394645691, "sampling/importance_sampling_ratio/mean": 0.10778453201055527, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 46.40446472167969, "sampling/sampling_logp_difference/mean": 0.01772146485745907, "step": 186, "step_time": 386.3632582979044 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01250000111758709, "completions/max_length": 10000.0, "completions/max_terminated_length": 9834.0, "completions/mean_length": 2279.40625, "completions/mean_terminated_length": 2181.677001953125, "completions/min_length": 74.0, "completions/min_terminated_length": 74.0, "entropy": 0.14334231615066528, "epoch": 0.12483311081441922, "frac_reward_zero_std": 0.06666667014360428, "grad_norm": 0.00020397936462264804, "learning_rate": 5e-06, "loss": -0.0, "num_tokens": 462385824.0, "reward": 0.6299487948417664, "reward_std": 0.4165264964103699, "rewards/TRLRewardAdapter/mean": 0.6299487352371216, "rewards/TRLRewardAdapter/std": 0.41652652621269226, "sampling/importance_sampling_ratio/max": 1.8486647605895996, "sampling/importance_sampling_ratio/mean": 0.08643887937068939, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 50.639068603515625, "sampling/sampling_logp_difference/mean": 0.01908588968217373, "step": 187, "step_time": 239.4411448261235 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.04583333432674408, "completions/max_length": 10000.0, "completions/max_terminated_length": 9715.0, "completions/mean_length": 1492.576171875, "completions/mean_terminated_length": 1083.9224853515625, "completions/min_length": 151.0, "completions/min_terminated_length": 151.0, "entropy": 0.11953786015510559, "epoch": 0.12550066755674233, "frac_reward_zero_std": 0.06666667014360428, "grad_norm": 6.355019319037353e-05, "learning_rate": 5e-06, "loss": -0.0001, "num_tokens": 464264297.0, "reward": 0.7966182827949524, "reward_std": 0.32988008856773376, "rewards/TRLRewardAdapter/mean": 0.7966182827949524, "rewards/TRLRewardAdapter/std": 0.32988008856773376, "sampling/importance_sampling_ratio/max": 1.9703387022018433, "sampling/importance_sampling_ratio/mean": 0.11671576648950577, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 9.0, "sampling/sampling_logp_difference/mean": 0.01679096557199955, "step": 188, "step_time": 282.49522982677445 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0416666679084301, "completions/max_length": 10000.0, "completions/max_terminated_length": 9944.0, "completions/mean_length": 1752.8448486328125, "completions/mean_terminated_length": 1394.272705078125, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "entropy": 0.1357521985967954, "epoch": 0.1261682242990654, "frac_reward_zero_std": 0.03333333507180214, "grad_norm": 3.959730483265012e-05, "learning_rate": 5e-06, "loss": -0.0001, "num_tokens": 466386164.0, "reward": 0.774365246295929, "reward_std": 0.3353410065174103, "rewards/TRLRewardAdapter/mean": 0.774365246295929, "rewards/TRLRewardAdapter/std": 0.3353409767150879, "sampling/importance_sampling_ratio/max": 2.2213077545166016, "sampling/importance_sampling_ratio/mean": 0.11746090650558472, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 20.503543853759766, "sampling/sampling_logp_difference/mean": 0.017650311812758446, "step": 189, "step_time": 393.9472883648705 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.04791666939854622, "completions/max_length": 10000.0, "completions/max_terminated_length": 9952.0, "completions/mean_length": 2099.951171875, "completions/mean_terminated_length": 1702.3555908203125, "completions/min_length": 71.0, "completions/min_terminated_length": 71.0, "entropy": 0.1336199715733528, "epoch": 0.1268357810413885, "frac_reward_zero_std": 0.10000000894069672, "grad_norm": 1.1018491010008453e-05, "learning_rate": 5e-06, "loss": -0.0, "num_tokens": 468849445.0, "reward": 0.704240083694458, "reward_std": 0.3940180838108063, "rewards/TRLRewardAdapter/mean": 0.7042400240898132, "rewards/TRLRewardAdapter/std": 0.3940180540084839, "sampling/importance_sampling_ratio/max": 1.7454073429107666, "sampling/importance_sampling_ratio/mean": 0.138761505484581, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 21.18427085876465, "sampling/sampling_logp_difference/mean": 0.01806732639670372, "step": 190, "step_time": 381.58433087693993 }, { "epoch": 0.1268357810413885, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.03152173810193072, "eval_completions/max_length": 10000.0, "eval_completions/max_terminated_length": 9728.826086956522, "eval_completions/mean_length": 2301.4821087381115, "eval_completions/mean_terminated_length": 2051.183630901834, "eval_completions/min_length": 67.30434782608695, "eval_completions/min_terminated_length": 67.30434782608695, "eval_entropy": 0.15593015953250552, "eval_frac_reward_zero_std": 1.0, "eval_loss": 0.0, "eval_num_tokens": 468849445.0, "eval_reward": 0.692557912805806, "eval_reward_std": 0.373788657395736, "eval_rewards/TRLRewardAdapter/mean": 0.6925579309463501, "eval_rewards/TRLRewardAdapter/std": 0.37378866387450177, "eval_runtime": 1464.7891, "eval_samples_per_second": 3.117, "eval_sampling/importance_sampling_ratio/max": 1.5481649585392163, "eval_sampling/importance_sampling_ratio/mean": 0.12751681228046832, "eval_sampling/importance_sampling_ratio/min": 0.0, "eval_sampling/sampling_logp_difference/max": 12.812516689300537, "eval_sampling/sampling_logp_difference/mean": 0.019935898725753246, "eval_steps_per_second": 0.016, "step": 190 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008333333767950535, "completions/max_length": 10000.0, "completions/max_terminated_length": 9961.0, "completions/mean_length": 1946.8740234375, "completions/mean_terminated_length": 1879.20068359375, "completions/min_length": 85.0, "completions/min_terminated_length": 85.0, "entropy": 0.12434505174557368, "epoch": 0.12750333778371162, "frac_reward_zero_std": 0.03333333507180214, "grad_norm": 2.6645100421268675e-05, "learning_rate": 5e-06, "loss": -0.0, "num_tokens": 471205580.0, "reward": 0.7731790542602539, "reward_std": 0.3208652138710022, "rewards/TRLRewardAdapter/mean": 0.7731789946556091, "rewards/TRLRewardAdapter/std": 0.3208652138710022, "sampling/importance_sampling_ratio/max": 2.0625455379486084, "sampling/importance_sampling_ratio/mean": 0.13667091727256775, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 46.39455795288086, "sampling/sampling_logp_difference/mean": 0.0168630238622427, "step": 191, "step_time": 275.7678975200979 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0020833334419876337, "completions/max_length": 10000.0, "completions/max_terminated_length": 9498.0, "completions/mean_length": 1280.652099609375, "completions/mean_terminated_length": 1262.4488525390625, "completions/min_length": 151.0, "completions/min_terminated_length": 151.0, "entropy": 0.13023329029480615, "epoch": 0.12817089452603472, "frac_reward_zero_std": 0.0, "grad_norm": 0.0008121199486142648, "learning_rate": 5e-06, "loss": 0.0002, "num_tokens": 472901470.0, "reward": 0.7892448902130127, "reward_std": 0.3289344012737274, "rewards/TRLRewardAdapter/mean": 0.7892448306083679, "rewards/TRLRewardAdapter/std": 0.3289344012737274, "sampling/importance_sampling_ratio/max": 2.5493242740631104, "sampling/importance_sampling_ratio/mean": 0.11255039274692535, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 36.86874771118164, "sampling/sampling_logp_difference/mean": 0.017277738079428673, "step": 192, "step_time": 174.32445088005625 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01666666753590107, "completions/max_length": 10000.0, "completions/max_terminated_length": 9664.0, "completions/mean_length": 1931.4896240234375, "completions/mean_terminated_length": 1794.7352294921875, "completions/min_length": 53.0, "completions/min_terminated_length": 53.0, "entropy": 0.11334129174550374, "epoch": 0.1288384512683578, "frac_reward_zero_std": 0.06666667014360428, "grad_norm": 0.00018217831673042164, "learning_rate": 5e-06, "loss": -0.0, "num_tokens": 475200628.0, "reward": 0.698695182800293, "reward_std": 0.36349397897720337, "rewards/TRLRewardAdapter/mean": 0.6986951231956482, "rewards/TRLRewardAdapter/std": 0.36349397897720337, "sampling/importance_sampling_ratio/max": 2.0054123401641846, "sampling/importance_sampling_ratio/mean": 0.08561547100543976, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 8.763919830322266, "sampling/sampling_logp_difference/mean": 0.015243276953697205, "step": 193, "step_time": 220.42240917601157 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.006250000558793545, "completions/max_length": 10000.0, "completions/max_terminated_length": 9881.0, "completions/mean_length": 1636.3751220703125, "completions/mean_terminated_length": 1583.7735595703125, "completions/min_length": 76.0, "completions/min_terminated_length": 76.0, "entropy": 0.12291658048828442, "epoch": 0.1295060080106809, "frac_reward_zero_std": 0.03333333507180214, "grad_norm": 0.00021269483133117578, "learning_rate": 5e-06, "loss": -0.0001, "num_tokens": 477245884.0, "reward": 0.7543378472328186, "reward_std": 0.3365703225135803, "rewards/TRLRewardAdapter/mean": 0.7543378472328186, "rewards/TRLRewardAdapter/std": 0.3365703225135803, "sampling/importance_sampling_ratio/max": 2.3275351524353027, "sampling/importance_sampling_ratio/mean": 0.13820452988147736, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 12.28516960144043, "sampling/sampling_logp_difference/mean": 0.016706937924027443, "step": 194, "step_time": 269.77215437218547 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03958333656191826, "completions/max_length": 10000.0, "completions/max_terminated_length": 9737.0, "completions/mean_length": 2282.954345703125, "completions/mean_terminated_length": 1964.8980712890625, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "entropy": 0.12832057724396387, "epoch": 0.130173564753004, "frac_reward_zero_std": 0.13333334028720856, "grad_norm": 0.00020799667823885346, "learning_rate": 5e-06, "loss": -0.0001, "num_tokens": 479914608.0, "reward": 0.6932317018508911, "reward_std": 0.368312269449234, "rewards/TRLRewardAdapter/mean": 0.6932316422462463, "rewards/TRLRewardAdapter/std": 0.3683122396469116, "sampling/importance_sampling_ratio/max": 1.5257395505905151, "sampling/importance_sampling_ratio/mean": 0.11392951011657715, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 33.025718688964844, "sampling/sampling_logp_difference/mean": 0.01679644174873829, "step": 195, "step_time": 324.4559900170425 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00729166716337204, "completions/max_length": 10000.0, "completions/max_terminated_length": 9971.0, "completions/mean_length": 2108.633544921875, "completions/mean_terminated_length": 2050.66943359375, "completions/min_length": 167.0, "completions/min_terminated_length": 167.0, "entropy": 0.13398372878630957, "epoch": 0.1308411214953271, "frac_reward_zero_std": 0.06666667014360428, "grad_norm": 4.847530107096583e-05, "learning_rate": 5e-06, "loss": -0.0, "num_tokens": 482443152.0, "reward": 0.7125573754310608, "reward_std": 0.3508776128292084, "rewards/TRLRewardAdapter/mean": 0.7125573754310608, "rewards/TRLRewardAdapter/std": 0.3508776128292084, "sampling/importance_sampling_ratio/max": 1.6037027835845947, "sampling/importance_sampling_ratio/mean": 0.0900057703256607, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 10.612421035766602, "sampling/sampling_logp_difference/mean": 0.017937107011675835, "step": 196, "step_time": 283.4464213917963 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.04062500223517418, "completions/max_length": 10000.0, "completions/max_terminated_length": 9871.0, "completions/mean_length": 2211.438720703125, "completions/mean_terminated_length": 1881.6297607421875, "completions/min_length": 64.0, "completions/min_terminated_length": 64.0, "entropy": 0.14496001849571863, "epoch": 0.1315086782376502, "frac_reward_zero_std": 0.06666667014360428, "grad_norm": 3.96752140766997e-05, "learning_rate": 5e-06, "loss": 0.0007, "num_tokens": 485043573.0, "reward": 0.6677923202514648, "reward_std": 0.38336098194122314, "rewards/TRLRewardAdapter/mean": 0.6677922606468201, "rewards/TRLRewardAdapter/std": 0.38336098194122314, "sampling/importance_sampling_ratio/max": 2.530822277069092, "sampling/importance_sampling_ratio/mean": 0.07543853670358658, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 32.15638732910156, "sampling/sampling_logp_difference/mean": 0.018978912383317947, "step": 197, "step_time": 397.19266091822647 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.06354167312383652, "completions/max_length": 10000.0, "completions/max_terminated_length": 9956.0, "completions/mean_length": 2705.032470703125, "completions/mean_terminated_length": 2210.04541015625, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "entropy": 0.13534817596276602, "epoch": 0.1321762349799733, "frac_reward_zero_std": 0.03333333507180214, "grad_norm": 0.00017931896916693345, "learning_rate": 5e-06, "loss": -0.0001, "num_tokens": 488072052.0, "reward": 0.5956633687019348, "reward_std": 0.4256119430065155, "rewards/TRLRewardAdapter/mean": 0.59566330909729, "rewards/TRLRewardAdapter/std": 0.4256119132041931, "sampling/importance_sampling_ratio/max": 1.402414083480835, "sampling/importance_sampling_ratio/mean": 0.11498638242483139, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 18.5, "sampling/sampling_logp_difference/mean": 0.01710636168718338, "step": 198, "step_time": 369.43524990580045 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00729166716337204, "completions/max_length": 10000.0, "completions/max_terminated_length": 9981.0, "completions/mean_length": 1737.384521484375, "completions/mean_terminated_length": 1676.6934814453125, "completions/min_length": 73.0, "completions/min_terminated_length": 73.0, "entropy": 0.14282888919115067, "epoch": 0.1328437917222964, "frac_reward_zero_std": 0.0, "grad_norm": 6.805470089687317e-05, "learning_rate": 5e-06, "loss": -0.0001, "num_tokens": 490173317.0, "reward": 0.7703692317008972, "reward_std": 0.34410467743873596, "rewards/TRLRewardAdapter/mean": 0.7703692317008972, "rewards/TRLRewardAdapter/std": 0.3441046476364136, "sampling/importance_sampling_ratio/max": 2.8179895877838135, "sampling/importance_sampling_ratio/mean": 0.16381195187568665, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 52.03925704956055, "sampling/sampling_logp_difference/mean": 0.019513225182890892, "step": 199, "step_time": 237.3188303412171 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02187500149011612, "completions/max_length": 10000.0, "completions/max_terminated_length": 9971.0, "completions/mean_length": 1874.728271484375, "completions/mean_terminated_length": 1693.0126953125, "completions/min_length": 82.0, "completions/min_terminated_length": 82.0, "entropy": 0.123588511099418, "epoch": 0.13351134846461948, "frac_reward_zero_std": 0.03333333507180214, "grad_norm": 3.4850351620411085e-05, "learning_rate": 5e-06, "loss": -0.0, "num_tokens": 492384096.0, "reward": 0.7406291365623474, "reward_std": 0.36823615431785583, "rewards/TRLRewardAdapter/mean": 0.7406290769577026, "rewards/TRLRewardAdapter/std": 0.36823615431785583, "sampling/importance_sampling_ratio/max": 2.071995973587036, "sampling/importance_sampling_ratio/mean": 0.19679692387580872, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 35.0, "sampling/sampling_logp_difference/mean": 0.01636655442416668, "step": 200, "step_time": 357.0852424709592 }, { "epoch": 0.13351134846461948, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.03173912956338862, "eval_completions/max_length": 10000.0, "eval_completions/max_terminated_length": 9611.652173913044, "eval_completions/mean_length": 2291.2564750339675, "eval_completions/mean_terminated_length": 2038.3417384935462, "eval_completions/min_length": 59.17391304347826, "eval_completions/min_terminated_length": 59.17391304347826, "eval_entropy": 0.15610312119774197, "eval_frac_reward_zero_std": 1.0, "eval_loss": 0.0, "eval_num_tokens": 492384096.0, "eval_reward": 0.6897373147632765, "eval_reward_std": 0.37708438868108, "eval_rewards/TRLRewardAdapter/mean": 0.6897373432698457, "eval_rewards/TRLRewardAdapter/std": 0.3770843912725863, "eval_runtime": 1455.7283, "eval_samples_per_second": 3.137, "eval_sampling/importance_sampling_ratio/max": 1.5704801289931587, "eval_sampling/importance_sampling_ratio/mean": 0.1495622402947882, "eval_sampling/importance_sampling_ratio/min": 0.0, "eval_sampling/sampling_logp_difference/max": 11.087380471436873, "eval_sampling/sampling_logp_difference/mean": 0.019887989467900734, "eval_steps_per_second": 0.016, "step": 200 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0364583358168602, "completions/max_length": 10000.0, "completions/max_terminated_length": 9961.0, "completions/mean_length": 1646.6531982421875, "completions/mean_terminated_length": 1330.58056640625, "completions/min_length": 76.0, "completions/min_terminated_length": 76.0, "entropy": 0.1150330180923144, "epoch": 0.1341789052069426, "frac_reward_zero_std": 0.10000000894069672, "grad_norm": 0.00032615161118886086, "learning_rate": 5e-06, "loss": 0.0002, "num_tokens": 494415955.0, "reward": 0.7077589631080627, "reward_std": 0.4107767641544342, "rewards/TRLRewardAdapter/mean": 0.707758903503418, "rewards/TRLRewardAdapter/std": 0.4107767343521118, "sampling/importance_sampling_ratio/max": 1.364948034286499, "sampling/importance_sampling_ratio/mean": 0.16353939473628998, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 11.889068603515625, "sampling/sampling_logp_difference/mean": 0.01578902266919613, "step": 201, "step_time": 384.429304581834 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9904.0, "completions/max_terminated_length": 9904.0, "completions/mean_length": 1611.185546875, "completions/mean_terminated_length": 1611.185546875, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "entropy": 0.12759979193409285, "epoch": 0.1348464619492657, "frac_reward_zero_std": 0.03333333507180214, "grad_norm": 0.0005095698182848696, "learning_rate": 5e-06, "loss": 0.0, "num_tokens": 496428357.0, "reward": 0.7603576183319092, "reward_std": 0.33512499928474426, "rewards/TRLRewardAdapter/mean": 0.7603576183319092, "rewards/TRLRewardAdapter/std": 0.33512499928474426, "sampling/importance_sampling_ratio/max": 1.556559681892395, "sampling/importance_sampling_ratio/mean": 0.17274565994739532, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 6.5, "sampling/sampling_logp_difference/mean": 0.017253393307328224, "step": 202, "step_time": 191.43635482620448 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0572916679084301, "completions/max_length": 10000.0, "completions/max_terminated_length": 9993.0, "completions/mean_length": 1694.056396484375, "completions/mean_terminated_length": 1189.2750244140625, "completions/min_length": 56.0, "completions/min_terminated_length": 56.0, "entropy": 0.12605165938536325, "epoch": 0.13551401869158877, "frac_reward_zero_std": 0.10000000894069672, "grad_norm": 0.0001116110935234895, "learning_rate": 5e-06, "loss": 0.0001, "num_tokens": 498436603.0, "reward": 0.739044725894928, "reward_std": 0.38303279876708984, "rewards/TRLRewardAdapter/mean": 0.739044725894928, "rewards/TRLRewardAdapter/std": 0.38303282856941223, "sampling/importance_sampling_ratio/max": 2.0926554203033447, "sampling/importance_sampling_ratio/mean": 0.20355002582073212, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 37.25, "sampling/sampling_logp_difference/mean": 0.017055395990610123, "step": 203, "step_time": 394.33618776185904 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0010416667209938169, "completions/max_length": 10000.0, "completions/max_terminated_length": 7932.0, "completions/mean_length": 1018.9885864257812, "completions/mean_terminated_length": 1009.6235961914062, "completions/min_length": 83.0, "completions/min_terminated_length": 83.0, "entropy": 0.10723814368247986, "epoch": 0.13618157543391188, "frac_reward_zero_std": 0.0, "grad_norm": 0.0011673730772554574, "learning_rate": 5e-06, "loss": -0.0008, "num_tokens": 499874096.0, "reward": 0.8518272638320923, "reward_std": 0.2577032446861267, "rewards/TRLRewardAdapter/mean": 0.8518272638320923, "rewards/TRLRewardAdapter/std": 0.2577032446861267, "sampling/importance_sampling_ratio/max": 2.5152435302734375, "sampling/importance_sampling_ratio/mean": 0.20846128463745117, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 7.219564437866211, "sampling/sampling_logp_difference/mean": 0.015014168806374073, "step": 204, "step_time": 174.30321881000418 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0010416667209938169, "completions/max_length": 10000.0, "completions/max_terminated_length": 9421.0, "completions/mean_length": 1422.839599609375, "completions/mean_terminated_length": 1413.895751953125, "completions/min_length": 84.0, "completions/min_terminated_length": 84.0, "entropy": 0.13823332140843073, "epoch": 0.13684913217623498, "frac_reward_zero_std": 0.0, "grad_norm": 0.000109584544074251, "learning_rate": 5e-06, "loss": -0.0002, "num_tokens": 501661878.0, "reward": 0.8277803659439087, "reward_std": 0.2692677676677704, "rewards/TRLRewardAdapter/mean": 0.8277803659439087, "rewards/TRLRewardAdapter/std": 0.2692677676677704, "sampling/importance_sampling_ratio/max": 1.7348121404647827, "sampling/importance_sampling_ratio/mean": 0.15951445698738098, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 27.0, "sampling/sampling_logp_difference/mean": 0.01881805807352066, "step": 205, "step_time": 213.26632172090467 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01250000111758709, "completions/max_length": 10000.0, "completions/max_terminated_length": 9659.0, "completions/mean_length": 1569.74072265625, "completions/mean_terminated_length": 1463.0284423828125, "completions/min_length": 76.0, "completions/min_terminated_length": 76.0, "entropy": 0.12567211935917535, "epoch": 0.1375166889185581, "frac_reward_zero_std": 0.03333333507180214, "grad_norm": 0.00017101412852386328, "learning_rate": 5e-06, "loss": -0.0, "num_tokens": 503587261.0, "reward": 0.7256647348403931, "reward_std": 0.38579457998275757, "rewards/TRLRewardAdapter/mean": 0.7256647348403931, "rewards/TRLRewardAdapter/std": 0.38579457998275757, "sampling/importance_sampling_ratio/max": 2.679903268814087, "sampling/importance_sampling_ratio/mean": 0.2032599151134491, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 8.875, "sampling/sampling_logp_difference/mean": 0.017993198707699776, "step": 206, "step_time": 236.95892046194058 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03437500074505806, "completions/max_length": 10000.0, "completions/max_terminated_length": 9915.0, "completions/mean_length": 2142.81689453125, "completions/mean_terminated_length": 1863.111083984375, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "entropy": 0.14388840148846307, "epoch": 0.13818424566088117, "frac_reward_zero_std": 0.03333333507180214, "grad_norm": 0.00017909333028797616, "learning_rate": 5e-06, "loss": -0.0, "num_tokens": 506097101.0, "reward": 0.6916309595108032, "reward_std": 0.3819180727005005, "rewards/TRLRewardAdapter/mean": 0.6916309595108032, "rewards/TRLRewardAdapter/std": 0.3819180727005005, "sampling/importance_sampling_ratio/max": 2.053178548812866, "sampling/importance_sampling_ratio/mean": 0.10323525965213776, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 51.25, "sampling/sampling_logp_difference/mean": 0.019294891506433487, "step": 207, "step_time": 365.5542562409537 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.06041666865348816, "completions/max_length": 10000.0, "completions/max_terminated_length": 9988.0, "completions/mean_length": 2458.652099609375, "completions/mean_terminated_length": 1973.731689453125, "completions/min_length": 69.0, "completions/min_terminated_length": 69.0, "entropy": 0.1404273957014084, "epoch": 0.13885180240320427, "frac_reward_zero_std": 0.06666667014360428, "grad_norm": 9.162050467330734e-05, "learning_rate": 5e-06, "loss": -0.0, "num_tokens": 508913919.0, "reward": 0.7137560844421387, "reward_std": 0.3842812478542328, "rewards/TRLRewardAdapter/mean": 0.7137560248374939, "rewards/TRLRewardAdapter/std": 0.3842812478542328, "sampling/importance_sampling_ratio/max": 2.099184513092041, "sampling/importance_sampling_ratio/mean": 0.15702301263809204, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 31.375, "sampling/sampling_logp_difference/mean": 0.019000068306922913, "step": 208, "step_time": 392.167562186718 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008333333767950535, "completions/max_length": 10000.0, "completions/max_terminated_length": 9829.0, "completions/mean_length": 2127.079345703125, "completions/mean_terminated_length": 2060.920166015625, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "entropy": 0.1404390757282575, "epoch": 0.13951935914552738, "frac_reward_zero_std": 0.0, "grad_norm": 0.0003599178285833685, "learning_rate": 5e-06, "loss": 0.0011, "num_tokens": 511459403.0, "reward": 0.7380829453468323, "reward_std": 0.3072601854801178, "rewards/TRLRewardAdapter/mean": 0.7380828857421875, "rewards/TRLRewardAdapter/std": 0.3072601854801178, "sampling/importance_sampling_ratio/max": 1.5503085851669312, "sampling/importance_sampling_ratio/mean": 0.06734655797481537, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 6.634025573730469, "sampling/sampling_logp_difference/mean": 0.018089916557073593, "step": 209, "step_time": 207.88306302891579 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03750000149011612, "completions/max_length": 10000.0, "completions/max_terminated_length": 9931.0, "completions/mean_length": 2704.837646484375, "completions/mean_terminated_length": 2420.6103515625, "completions/min_length": 72.0, "completions/min_terminated_length": 72.0, "entropy": 0.1549710457523664, "epoch": 0.14018691588785046, "frac_reward_zero_std": 0.1666666716337204, "grad_norm": 1.1667447114318316e-05, "learning_rate": 5e-06, "loss": 0.0, "num_tokens": 514517647.0, "reward": 0.5874022245407104, "reward_std": 0.41766613721847534, "rewards/TRLRewardAdapter/mean": 0.5874022245407104, "rewards/TRLRewardAdapter/std": 0.41766610741615295, "sampling/importance_sampling_ratio/max": 1.2606360912322998, "sampling/importance_sampling_ratio/mean": 0.07208629697561264, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 34.101531982421875, "sampling/sampling_logp_difference/mean": 0.019920676946640015, "step": 210, "step_time": 344.6699317059247 }, { "epoch": 0.14018691588785046, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.0313043472883494, "eval_completions/max_length": 10000.0, "eval_completions/max_terminated_length": 9674.608695652174, "eval_completions/mean_length": 2305.2312754755435, "eval_completions/mean_terminated_length": 2056.4312691066575, "eval_completions/min_length": 57.26086956521739, "eval_completions/min_terminated_length": 57.26086956521739, "eval_entropy": 0.16556609583937604, "eval_frac_reward_zero_std": 1.0, "eval_loss": 0.0, "eval_num_tokens": 514517647.0, "eval_reward": 0.6859321879303973, "eval_reward_std": 0.3783438905425694, "eval_rewards/TRLRewardAdapter/mean": 0.6859322086624478, "eval_rewards/TRLRewardAdapter/std": 0.3783438944298288, "eval_runtime": 1470.1988, "eval_samples_per_second": 3.106, "eval_sampling/importance_sampling_ratio/max": 1.3815378157988838, "eval_sampling/importance_sampling_ratio/mean": 0.1519586097287095, "eval_sampling/importance_sampling_ratio/min": 0.0, "eval_sampling/sampling_logp_difference/max": 11.141765967659328, "eval_sampling/sampling_logp_difference/mean": 0.02098420788736447, "eval_steps_per_second": 0.016, "step": 210 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01458333432674408, "completions/max_length": 10000.0, "completions/max_terminated_length": 9941.0, "completions/mean_length": 1236.9896240234375, "completions/mean_terminated_length": 1107.304443359375, "completions/min_length": 54.0, "completions/min_terminated_length": 54.0, "entropy": 0.11145749563972156, "epoch": 0.14085447263017356, "frac_reward_zero_std": 0.06666667014360428, "grad_norm": 0.00017138792672578, "learning_rate": 5e-06, "loss": 0.0, "num_tokens": 516147237.0, "reward": 0.8120801448822021, "reward_std": 0.33006012439727783, "rewards/TRLRewardAdapter/mean": 0.8120801448822021, "rewards/TRLRewardAdapter/std": 0.33006012439727783, "sampling/importance_sampling_ratio/max": 1.7302113771438599, "sampling/importance_sampling_ratio/mean": 0.2704245150089264, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 17.106597900390625, "sampling/sampling_logp_difference/mean": 0.015840983018279076, "step": 211, "step_time": 239.57413419196382 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.004166666883975267, "completions/max_length": 10000.0, "completions/max_terminated_length": 9620.0, "completions/mean_length": 1490.685546875, "completions/mean_terminated_length": 1455.08154296875, "completions/min_length": 78.0, "completions/min_terminated_length": 78.0, "entropy": 0.11269605904817581, "epoch": 0.14152202937249667, "frac_reward_zero_std": 0.0, "grad_norm": 0.0003777020931240578, "learning_rate": 5e-06, "loss": 0.0, "num_tokens": 517964599.0, "reward": 0.812628448009491, "reward_std": 0.2905798554420471, "rewards/TRLRewardAdapter/mean": 0.812628448009491, "rewards/TRLRewardAdapter/std": 0.2905798852443695, "sampling/importance_sampling_ratio/max": 2.491873264312744, "sampling/importance_sampling_ratio/mean": 0.2042619287967682, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 19.5, "sampling/sampling_logp_difference/mean": 0.015593433752655983, "step": 212, "step_time": 208.86285544093698 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.04479166865348816, "completions/max_length": 10000.0, "completions/max_terminated_length": 9989.0, "completions/mean_length": 2852.81982421875, "completions/mean_terminated_length": 2517.673828125, "completions/min_length": 52.0, "completions/min_terminated_length": 52.0, "entropy": 0.15347749988238016, "epoch": 0.14218958611481977, "frac_reward_zero_std": 0.06666667014360428, "grad_norm": 0.00039648621656565434, "learning_rate": 5e-06, "loss": 0.0002, "num_tokens": 521099914.0, "reward": 0.6200533509254456, "reward_std": 0.403478741645813, "rewards/TRLRewardAdapter/mean": 0.6200532913208008, "rewards/TRLRewardAdapter/std": 0.403478741645813, "sampling/importance_sampling_ratio/max": 2.078186511993408, "sampling/importance_sampling_ratio/mean": 0.14535066485404968, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 26.25, "sampling/sampling_logp_difference/mean": 0.02034621313214302, "step": 213, "step_time": 361.1829838817939 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03541667014360428, "completions/max_length": 10000.0, "completions/max_terminated_length": 10000.0, "completions/mean_length": 2685.767822265625, "completions/mean_terminated_length": 2417.21044921875, "completions/min_length": 71.0, "completions/min_terminated_length": 71.0, "entropy": 0.16920050730307898, "epoch": 0.14285714285714285, "frac_reward_zero_std": 0.10000000894069672, "grad_norm": 0.00013026176542958376, "learning_rate": 5e-06, "loss": -0.0, "num_tokens": 524144491.0, "reward": 0.6139687895774841, "reward_std": 0.39491480588912964, "rewards/TRLRewardAdapter/mean": 0.6139687895774841, "rewards/TRLRewardAdapter/std": 0.39491480588912964, "sampling/importance_sampling_ratio/max": 1.8128118515014648, "sampling/importance_sampling_ratio/mean": 0.06771945208311081, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 51.5, "sampling/sampling_logp_difference/mean": 0.021351447328925133, "step": 214, "step_time": 356.83772364701144 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01250000111758709, "completions/max_length": 10000.0, "completions/max_terminated_length": 9721.0, "completions/mean_length": 1862.4677734375, "completions/mean_terminated_length": 1759.4609375, "completions/min_length": 55.0, "completions/min_terminated_length": 55.0, "entropy": 0.15587185323238373, "epoch": 0.14352469959946595, "frac_reward_zero_std": 0.0, "grad_norm": 0.00023993378246663273, "learning_rate": 5e-06, "loss": 0.0, "num_tokens": 526386444.0, "reward": 0.694473385810852, "reward_std": 0.38540592789649963, "rewards/TRLRewardAdapter/mean": 0.6944733262062073, "rewards/TRLRewardAdapter/std": 0.38540589809417725, "sampling/importance_sampling_ratio/max": 2.008355140686035, "sampling/importance_sampling_ratio/mean": 0.13144156336784363, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 16.29045295715332, "sampling/sampling_logp_difference/mean": 0.02014809660613537, "step": 215, "step_time": 317.4295933190733 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.04583333432674408, "completions/max_length": 10000.0, "completions/max_terminated_length": 9994.0, "completions/mean_length": 2588.86376953125, "completions/mean_terminated_length": 2232.8701171875, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "entropy": 0.15831729024648666, "epoch": 0.14419225634178906, "frac_reward_zero_std": 0.13333334028720856, "grad_norm": 4.787982746632917e-05, "learning_rate": 5e-06, "loss": 0.0, "num_tokens": 529348137.0, "reward": 0.6610273718833923, "reward_std": 0.3981195092201233, "rewards/TRLRewardAdapter/mean": 0.6610273122787476, "rewards/TRLRewardAdapter/std": 0.3981195390224457, "sampling/importance_sampling_ratio/max": 1.1917563676834106, "sampling/importance_sampling_ratio/mean": 0.12787184119224548, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 14.219564437866211, "sampling/sampling_logp_difference/mean": 0.020938202738761902, "step": 216, "step_time": 358.89320123184007 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.07395833730697632, "completions/max_length": 10000.0, "completions/max_terminated_length": 9908.0, "completions/mean_length": 3257.210693359375, "completions/mean_terminated_length": 2718.697509765625, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "entropy": 0.16043553998072943, "epoch": 0.14485981308411214, "frac_reward_zero_std": 0.13333334028720856, "grad_norm": 4.949407925365091e-05, "learning_rate": 5e-06, "loss": 0.0, "num_tokens": 532945843.0, "reward": 0.6300030946731567, "reward_std": 0.3992212116718292, "rewards/TRLRewardAdapter/mean": 0.630003035068512, "rewards/TRLRewardAdapter/std": 0.39922118186950684, "sampling/importance_sampling_ratio/max": 1.7398160696029663, "sampling/importance_sampling_ratio/mean": 0.044545553624629974, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 38.5, "sampling/sampling_logp_difference/mean": 0.02058839611709118, "step": 217, "step_time": 380.7431150650373 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02812500111758709, "completions/max_length": 10000.0, "completions/max_terminated_length": 9886.0, "completions/mean_length": 2809.3408203125, "completions/mean_terminated_length": 2601.250732421875, "completions/min_length": 133.0, "completions/min_terminated_length": 133.0, "entropy": 0.1504109725356102, "epoch": 0.14552736982643524, "frac_reward_zero_std": 0.03333333507180214, "grad_norm": 9.446984331106686e-06, "learning_rate": 5e-06, "loss": -0.0, "num_tokens": 536107162.0, "reward": 0.6429275274276733, "reward_std": 0.39721569418907166, "rewards/TRLRewardAdapter/mean": 0.6429274678230286, "rewards/TRLRewardAdapter/std": 0.39721569418907166, "sampling/importance_sampling_ratio/max": 2.413382053375244, "sampling/importance_sampling_ratio/mean": 0.12508940696716309, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 31.75, "sampling/sampling_logp_difference/mean": 0.019077060744166374, "step": 218, "step_time": 318.6709590309765 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0052083334885537624, "completions/max_length": 10000.0, "completions/max_terminated_length": 9773.0, "completions/mean_length": 1562.424072265625, "completions/mean_terminated_length": 1518.2481689453125, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "entropy": 0.13090457022190094, "epoch": 0.14619492656875835, "frac_reward_zero_std": 0.06666667014360428, "grad_norm": 6.598160527475557e-05, "learning_rate": 5e-06, "loss": -0.0, "num_tokens": 537993265.0, "reward": 0.7685914635658264, "reward_std": 0.31511324644088745, "rewards/TRLRewardAdapter/mean": 0.7685914635658264, "rewards/TRLRewardAdapter/std": 0.31511327624320984, "sampling/importance_sampling_ratio/max": 1.8701825141906738, "sampling/importance_sampling_ratio/mean": 0.1529538929462433, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 5.5, "sampling/sampling_logp_difference/mean": 0.017173076048493385, "step": 219, "step_time": 181.09795229195151 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.11041667312383652, "completions/max_length": 10000.0, "completions/max_terminated_length": 9951.0, "completions/mean_length": 3062.345947265625, "completions/mean_terminated_length": 2201.231689453125, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "entropy": 0.13970363388458887, "epoch": 0.14686248331108145, "frac_reward_zero_std": 0.10000000894069672, "grad_norm": 7.883906459596697e-05, "learning_rate": 5e-06, "loss": 0.0, "num_tokens": 541383549.0, "reward": 0.6262157559394836, "reward_std": 0.4038894474506378, "rewards/TRLRewardAdapter/mean": 0.6262157559394836, "rewards/TRLRewardAdapter/std": 0.40388938784599304, "sampling/importance_sampling_ratio/max": 1.8397835493087769, "sampling/importance_sampling_ratio/mean": 0.1236046552658081, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 43.0, "sampling/sampling_logp_difference/mean": 0.0183850284665823, "step": 220, "step_time": 397.2088272710098 }, { "epoch": 0.14686248331108145, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.02956521668997796, "eval_completions/max_length": 10000.0, "eval_completions/max_terminated_length": 9706.521739130434, "eval_completions/mean_length": 2450.5897269870925, "eval_completions/mean_terminated_length": 2220.236768639606, "eval_completions/min_length": 58.43478260869565, "eval_completions/min_terminated_length": 58.43478260869565, "eval_entropy": 0.17167627098767654, "eval_frac_reward_zero_std": 1.0, "eval_loss": 0.0, "eval_num_tokens": 541383549.0, "eval_reward": 0.6837772571522257, "eval_reward_std": 0.3676858868287957, "eval_rewards/TRLRewardAdapter/mean": 0.6837772623352383, "eval_rewards/TRLRewardAdapter/std": 0.367685889420302, "eval_runtime": 1475.0602, "eval_samples_per_second": 3.095, "eval_sampling/importance_sampling_ratio/max": 1.3729540524275408, "eval_sampling/importance_sampling_ratio/mean": 0.12779174000024796, "eval_sampling/importance_sampling_ratio/min": 0.0, "eval_sampling/sampling_logp_difference/max": 11.398404173229052, "eval_sampling/sampling_logp_difference/mean": 0.02148029183887917, "eval_steps_per_second": 0.016, "step": 220 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.05625000223517418, "completions/max_length": 10000.0, "completions/max_terminated_length": 9897.0, "completions/mean_length": 2282.711669921875, "completions/mean_terminated_length": 1822.7406005859375, "completions/min_length": 60.0, "completions/min_terminated_length": 60.0, "entropy": 0.1478897506992022, "epoch": 0.14753004005340453, "frac_reward_zero_std": 0.06666667014360428, "grad_norm": 5.6405749474563824e-05, "learning_rate": 5e-06, "loss": -0.0001, "num_tokens": 544002792.0, "reward": 0.6809467077255249, "reward_std": 0.39029183983802795, "rewards/TRLRewardAdapter/mean": 0.6809466481208801, "rewards/TRLRewardAdapter/std": 0.39029181003570557, "sampling/importance_sampling_ratio/max": 2.382572650909424, "sampling/importance_sampling_ratio/mean": 0.13199195265769958, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 45.70234680175781, "sampling/sampling_logp_difference/mean": 0.019250014796853065, "step": 221, "step_time": 380.9774377680151 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0260416679084301, "completions/max_length": 10000.0, "completions/max_terminated_length": 9796.0, "completions/mean_length": 1986.029296875, "completions/mean_terminated_length": 1771.751953125, "completions/min_length": 61.0, "completions/min_terminated_length": 61.0, "entropy": 0.1404216637214025, "epoch": 0.14819759679572764, "frac_reward_zero_std": 0.10000000894069672, "grad_norm": 5.6339682407078155e-05, "learning_rate": 5e-06, "loss": -0.0, "num_tokens": 546320164.0, "reward": 0.7136998772621155, "reward_std": 0.3755733072757721, "rewards/TRLRewardAdapter/mean": 0.7136998772621155, "rewards/TRLRewardAdapter/std": 0.3755733370780945, "sampling/importance_sampling_ratio/max": 2.0837202072143555, "sampling/importance_sampling_ratio/mean": 0.17248161137104034, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 13.25, "sampling/sampling_logp_difference/mean": 0.018414271995425224, "step": 222, "step_time": 247.69018374406733 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.04479166865348816, "completions/max_length": 10000.0, "completions/max_terminated_length": 9998.0, "completions/mean_length": 2786.033447265625, "completions/mean_terminated_length": 2447.755615234375, "completions/min_length": 63.0, "completions/min_terminated_length": 63.0, "entropy": 0.15148198356231055, "epoch": 0.14886515353805074, "frac_reward_zero_std": 0.1666666716337204, "grad_norm": 1.8986044422027174e-05, "learning_rate": 5e-06, "loss": -0.0, "num_tokens": 549451204.0, "reward": 0.5941374897956848, "reward_std": 0.41329607367515564, "rewards/TRLRewardAdapter/mean": 0.59413743019104, "rewards/TRLRewardAdapter/std": 0.41329607367515564, "sampling/importance_sampling_ratio/max": 2.7284903526306152, "sampling/importance_sampling_ratio/mean": 0.09612078219652176, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 14.0, "sampling/sampling_logp_difference/mean": 0.019390394911170006, "step": 223, "step_time": 302.19259297591634 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0052083334885537624, "completions/max_length": 10000.0, "completions/max_terminated_length": 9953.0, "completions/mean_length": 1839.776123046875, "completions/mean_terminated_length": 1797.0523681640625, "completions/min_length": 76.0, "completions/min_terminated_length": 76.0, "entropy": 0.1440351977944374, "epoch": 0.14953271028037382, "frac_reward_zero_std": 0.03333333507180214, "grad_norm": 2.4171813349104643e-05, "learning_rate": 5e-06, "loss": -0.0, "num_tokens": 551640813.0, "reward": 0.7723255157470703, "reward_std": 0.31194525957107544, "rewards/TRLRewardAdapter/mean": 0.7723254561424255, "rewards/TRLRewardAdapter/std": 0.31194525957107544, "sampling/importance_sampling_ratio/max": 2.0320050716400146, "sampling/importance_sampling_ratio/mean": 0.13455773890018463, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 4.387245178222656, "sampling/sampling_logp_difference/mean": 0.018581373617053032, "step": 224, "step_time": 165.72538273700047 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.06562500447034836, "completions/max_length": 10000.0, "completions/max_terminated_length": 9993.0, "completions/mean_length": 2452.367919921875, "completions/mean_terminated_length": 1922.2664794921875, "completions/min_length": 56.0, "completions/min_terminated_length": 56.0, "entropy": 0.13798446456591287, "epoch": 0.15020026702269693, "frac_reward_zero_std": 0.06666667014360428, "grad_norm": 7.391011312375886e-05, "learning_rate": 5e-06, "loss": 0.0, "num_tokens": 554451758.0, "reward": 0.7119971513748169, "reward_std": 0.3786100745201111, "rewards/TRLRewardAdapter/mean": 0.7119970917701721, "rewards/TRLRewardAdapter/std": 0.37861010432243347, "sampling/importance_sampling_ratio/max": 2.00235652923584, "sampling/importance_sampling_ratio/mean": 0.1614367812871933, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 57.0, "sampling/sampling_logp_difference/mean": 0.018367722630500793, "step": 225, "step_time": 348.2170521640219 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03229166939854622, "completions/max_length": 10000.0, "completions/max_terminated_length": 9958.0, "completions/mean_length": 2459.044921875, "completions/mean_terminated_length": 2207.408935546875, "completions/min_length": 67.0, "completions/min_terminated_length": 67.0, "entropy": 0.13121585299571356, "epoch": 0.15086782376502003, "frac_reward_zero_std": 0.13333334028720856, "grad_norm": 0.0001058439159593459, "learning_rate": 5e-06, "loss": -0.0001, "num_tokens": 557287481.0, "reward": 0.6702618598937988, "reward_std": 0.3979147672653198, "rewards/TRLRewardAdapter/mean": 0.670261800289154, "rewards/TRLRewardAdapter/std": 0.39791473746299744, "sampling/importance_sampling_ratio/max": 2.0339035987854004, "sampling/importance_sampling_ratio/mean": 0.12945644557476044, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 22.0, "sampling/sampling_logp_difference/mean": 0.017412742599844933, "step": 226, "step_time": 356.6741873790743 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01770833507180214, "completions/max_length": 10000.0, "completions/max_terminated_length": 9860.0, "completions/mean_length": 2692.99609375, "completions/mean_terminated_length": 2561.268310546875, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "entropy": 0.157304048538208, "epoch": 0.15153538050734314, "frac_reward_zero_std": 0.03333333507180214, "grad_norm": 1.879763477899067e-05, "learning_rate": 5e-06, "loss": 0.0, "num_tokens": 560370901.0, "reward": 0.6609500646591187, "reward_std": 0.39863574504852295, "rewards/TRLRewardAdapter/mean": 0.6609500050544739, "rewards/TRLRewardAdapter/std": 0.39863574504852295, "sampling/importance_sampling_ratio/max": 1.6416095495224, "sampling/importance_sampling_ratio/mean": 0.0984826534986496, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 31.5, "sampling/sampling_logp_difference/mean": 0.020811710506677628, "step": 227, "step_time": 302.9800662531052 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03229166939854622, "completions/max_length": 10000.0, "completions/max_terminated_length": 9303.0, "completions/mean_length": 1634.42822265625, "completions/mean_terminated_length": 1355.2755126953125, "completions/min_length": 83.0, "completions/min_terminated_length": 83.0, "entropy": 0.13333143790562949, "epoch": 0.15220293724966621, "frac_reward_zero_std": 0.03333333507180214, "grad_norm": 7.628030524798522e-05, "learning_rate": 5e-06, "loss": -0.0, "num_tokens": 562394352.0, "reward": 0.7843987941741943, "reward_std": 0.33384159207344055, "rewards/TRLRewardAdapter/mean": 0.7843987941741943, "rewards/TRLRewardAdapter/std": 0.33384156227111816, "sampling/importance_sampling_ratio/max": 1.9783769845962524, "sampling/importance_sampling_ratio/mean": 0.18755541741847992, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 20.940258026123047, "sampling/sampling_logp_difference/mean": 0.01808823272585869, "step": 228, "step_time": 323.4756710929796 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02083333395421505, "completions/max_length": 10000.0, "completions/max_terminated_length": 9920.0, "completions/mean_length": 2471.049072265625, "completions/mean_terminated_length": 2310.8583984375, "completions/min_length": 55.0, "completions/min_terminated_length": 55.0, "entropy": 0.12866351505120596, "epoch": 0.15287049399198932, "frac_reward_zero_std": 0.1666666716337204, "grad_norm": 0.00015435420784045808, "learning_rate": 5e-06, "loss": 0.0, "num_tokens": 565198495.0, "reward": 0.6045307517051697, "reward_std": 0.4285826086997986, "rewards/TRLRewardAdapter/mean": 0.6045306921005249, "rewards/TRLRewardAdapter/std": 0.42858263850212097, "sampling/importance_sampling_ratio/max": 2.184624671936035, "sampling/importance_sampling_ratio/mean": 0.19405272603034973, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 33.351531982421875, "sampling/sampling_logp_difference/mean": 0.017281679436564445, "step": 229, "step_time": 307.74209555401467 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0677083358168602, "completions/max_length": 10000.0, "completions/max_terminated_length": 9959.0, "completions/mean_length": 2269.308349609375, "completions/mean_terminated_length": 1707.8614501953125, "completions/min_length": 52.0, "completions/min_terminated_length": 52.0, "entropy": 0.14046658078829447, "epoch": 0.15353805073431243, "frac_reward_zero_std": 0.06666667014360428, "grad_norm": 1.6894700423942795e-05, "learning_rate": 5e-06, "loss": -0.0, "num_tokens": 567859943.0, "reward": 0.6770106554031372, "reward_std": 0.3992496430873871, "rewards/TRLRewardAdapter/mean": 0.6770105957984924, "rewards/TRLRewardAdapter/std": 0.3992496430873871, "sampling/importance_sampling_ratio/max": 2.5847110748291016, "sampling/importance_sampling_ratio/mean": 0.14862877130508423, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 57.71358871459961, "sampling/sampling_logp_difference/mean": 0.018156714737415314, "step": 230, "step_time": 378.5915809079306 }, { "epoch": 0.15353805073431243, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.027826086172591084, "eval_completions/max_length": 10000.0, "eval_completions/max_terminated_length": 9555.130434782608, "eval_completions/mean_length": 2332.269297724185, "eval_completions/mean_terminated_length": 2112.9179846722145, "eval_completions/min_length": 59.56521739130435, "eval_completions/min_terminated_length": 59.56521739130435, "eval_entropy": 0.17030050508354022, "eval_frac_reward_zero_std": 1.0, "eval_loss": 0.0, "eval_num_tokens": 567859943.0, "eval_reward": 0.6823973215144613, "eval_reward_std": 0.37490142039630725, "eval_rewards/TRLRewardAdapter/mean": 0.6823973292889802, "eval_rewards/TRLRewardAdapter/std": 0.3749014242835667, "eval_runtime": 1457.4005, "eval_samples_per_second": 3.133, "eval_sampling/importance_sampling_ratio/max": 1.5276398010875867, "eval_sampling/importance_sampling_ratio/mean": 0.12767064538986786, "eval_sampling/importance_sampling_ratio/min": 0.0, "eval_sampling/sampling_logp_difference/max": 15.646784160448158, "eval_sampling/sampling_logp_difference/mean": 0.021372626819040463, "eval_steps_per_second": 0.016, "step": 230 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.04375000298023224, "completions/max_length": 10000.0, "completions/max_terminated_length": 9938.0, "completions/mean_length": 1918.2303466796875, "completions/mean_terminated_length": 1548.4759521484375, "completions/min_length": 63.0, "completions/min_terminated_length": 63.0, "entropy": 0.1294457664092382, "epoch": 0.1542056074766355, "frac_reward_zero_std": 0.10000000894069672, "grad_norm": 5.090541907899166e-05, "learning_rate": 5e-06, "loss": 0.0007, "num_tokens": 570144324.0, "reward": 0.7421225905418396, "reward_std": 0.35673385858535767, "rewards/TRLRewardAdapter/mean": 0.7421225309371948, "rewards/TRLRewardAdapter/std": 0.35673385858535767, "sampling/importance_sampling_ratio/max": 1.707370400428772, "sampling/importance_sampling_ratio/mean": 0.18030741810798645, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 5.763919353485107, "sampling/sampling_logp_difference/mean": 0.017356663942337036, "step": 231, "step_time": 394.17981490294915 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.006250000558793545, "completions/max_length": 10000.0, "completions/max_terminated_length": 9879.0, "completions/mean_length": 1493.0875244140625, "completions/mean_terminated_length": 1439.5848388671875, "completions/min_length": 70.0, "completions/min_terminated_length": 70.0, "entropy": 0.13661173234383264, "epoch": 0.1548731642189586, "frac_reward_zero_std": 0.0, "grad_norm": 0.00010316670183645597, "learning_rate": 5e-06, "loss": -0.0, "num_tokens": 572001112.0, "reward": 0.7584226727485657, "reward_std": 0.3476516604423523, "rewards/TRLRewardAdapter/mean": 0.7584226727485657, "rewards/TRLRewardAdapter/std": 0.3476516306400299, "sampling/importance_sampling_ratio/max": 1.962481141090393, "sampling/importance_sampling_ratio/mean": 0.14177675545215607, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 38.719566345214844, "sampling/sampling_logp_difference/mean": 0.0183275043964386, "step": 232, "step_time": 261.716058775899 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02083333395421505, "completions/max_length": 10000.0, "completions/max_terminated_length": 9917.0, "completions/mean_length": 1881.7615966796875, "completions/mean_terminated_length": 1709.032958984375, "completions/min_length": 82.0, "completions/min_terminated_length": 82.0, "entropy": 0.1358405128121376, "epoch": 0.15554072096128171, "frac_reward_zero_std": 0.03333333507180214, "grad_norm": 6.997302092853137e-05, "learning_rate": 5e-06, "loss": -0.0, "num_tokens": 574206003.0, "reward": 0.701707124710083, "reward_std": 0.3867097795009613, "rewards/TRLRewardAdapter/mean": 0.7017070651054382, "rewards/TRLRewardAdapter/std": 0.3867098093032837, "sampling/importance_sampling_ratio/max": 2.26735520362854, "sampling/importance_sampling_ratio/mean": 0.12147235870361328, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 20.539257049560547, "sampling/sampling_logp_difference/mean": 0.017925335094332695, "step": 233, "step_time": 308.04308908921666 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0468750037252903, "completions/max_length": 10000.0, "completions/max_terminated_length": 9956.0, "completions/mean_length": 2317.1396484375, "completions/mean_terminated_length": 1939.2940673828125, "completions/min_length": 68.0, "completions/min_terminated_length": 68.0, "entropy": 0.1410179833571116, "epoch": 0.15620827770360482, "frac_reward_zero_std": 0.10000000894069672, "grad_norm": 5.906430876579312e-05, "learning_rate": 5e-06, "loss": 0.0, "num_tokens": 576873817.0, "reward": 0.7171027064323425, "reward_std": 0.3845464587211609, "rewards/TRLRewardAdapter/mean": 0.7171027064323425, "rewards/TRLRewardAdapter/std": 0.3845464289188385, "sampling/importance_sampling_ratio/max": 1.7244752645492554, "sampling/importance_sampling_ratio/mean": 0.18729670345783234, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 11.889067649841309, "sampling/sampling_logp_difference/mean": 0.018392423167824745, "step": 234, "step_time": 326.17607745493297 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02291666716337204, "completions/max_length": 10000.0, "completions/max_terminated_length": 9951.0, "completions/mean_length": 1466.3209228515625, "completions/mean_terminated_length": 1266.170654296875, "completions/min_length": 60.0, "completions/min_terminated_length": 60.0, "entropy": 0.1038264421125253, "epoch": 0.1568758344459279, "frac_reward_zero_std": 0.10000000894069672, "grad_norm": 0.000491750307477849, "learning_rate": 5e-06, "loss": 0.0001, "num_tokens": 578760493.0, "reward": 0.7268691062927246, "reward_std": 0.4014666676521301, "rewards/TRLRewardAdapter/mean": 0.7268690466880798, "rewards/TRLRewardAdapter/std": 0.4014666676521301, "sampling/importance_sampling_ratio/max": 1.4628689289093018, "sampling/importance_sampling_ratio/mean": 0.20350125432014465, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 55.939327239990234, "sampling/sampling_logp_difference/mean": 0.014266200363636017, "step": 235, "step_time": 360.1372175948927 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02187500149011612, "completions/max_length": 10000.0, "completions/max_terminated_length": 9952.0, "completions/mean_length": 1506.6678466796875, "completions/mean_terminated_length": 1316.720947265625, "completions/min_length": 78.0, "completions/min_terminated_length": 78.0, "entropy": 0.12165612230698268, "epoch": 0.157543391188251, "frac_reward_zero_std": 0.10000000894069672, "grad_norm": 0.0002462880406562505, "learning_rate": 5e-06, "loss": 0.0001, "num_tokens": 580610702.0, "reward": 0.7220149040222168, "reward_std": 0.3866575360298157, "rewards/TRLRewardAdapter/mean": 0.722014844417572, "rewards/TRLRewardAdapter/std": 0.3866575360298157, "sampling/importance_sampling_ratio/max": 1.9090312719345093, "sampling/importance_sampling_ratio/mean": 0.17853078246116638, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 9.205451965332031, "sampling/sampling_logp_difference/mean": 0.01616903580725193, "step": 236, "step_time": 326.8476574450033 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.010416666977107525, "completions/max_length": 10000.0, "completions/max_terminated_length": 9848.0, "completions/mean_length": 2172.20947265625, "completions/mean_terminated_length": 2089.8115234375, "completions/min_length": 179.0, "completions/min_terminated_length": 179.0, "entropy": 0.13894809037446976, "epoch": 0.1582109479305741, "frac_reward_zero_std": 0.0, "grad_norm": 8.32602629927669e-06, "learning_rate": 5e-06, "loss": -0.0, "num_tokens": 583133239.0, "reward": 0.7388115525245667, "reward_std": 0.34556108713150024, "rewards/TRLRewardAdapter/mean": 0.7388114929199219, "rewards/TRLRewardAdapter/std": 0.34556105732917786, "sampling/importance_sampling_ratio/max": 2.5705149173736572, "sampling/importance_sampling_ratio/mean": 0.09861212968826294, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 10.5, "sampling/sampling_logp_difference/mean": 0.017519807443022728, "step": 237, "step_time": 292.71636863588355 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01770833507180214, "completions/max_length": 10000.0, "completions/max_terminated_length": 8243.0, "completions/mean_length": 1946.3167724609375, "completions/mean_terminated_length": 1801.12841796875, "completions/min_length": 57.0, "completions/min_terminated_length": 57.0, "entropy": 0.16308429092168808, "epoch": 0.1588785046728972, "frac_reward_zero_std": 0.03333333507180214, "grad_norm": 0.0004615762890250965, "learning_rate": 5e-06, "loss": -0.0, "num_tokens": 585412487.0, "reward": 0.6944828629493713, "reward_std": 0.378595769405365, "rewards/TRLRewardAdapter/mean": 0.6944828033447266, "rewards/TRLRewardAdapter/std": 0.378595769405365, "sampling/importance_sampling_ratio/max": 2.243154525756836, "sampling/importance_sampling_ratio/mean": 0.1913086473941803, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 40.763919830322266, "sampling/sampling_logp_difference/mean": 0.020851675420999527, "step": 238, "step_time": 285.48189668089617 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0031250002793967724, "completions/max_length": 10000.0, "completions/max_terminated_length": 9686.0, "completions/mean_length": 2441.6376953125, "completions/mean_terminated_length": 2417.943603515625, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.1483694687485695, "epoch": 0.1595460614152203, "frac_reward_zero_std": 0.03333333507180214, "grad_norm": 2.7822864213889852e-05, "learning_rate": 5e-06, "loss": -0.0, "num_tokens": 588192427.0, "reward": 0.6832602620124817, "reward_std": 0.3617902994155884, "rewards/TRLRewardAdapter/mean": 0.6832602024078369, "rewards/TRLRewardAdapter/std": 0.361790269613266, "sampling/importance_sampling_ratio/max": 1.610601544380188, "sampling/importance_sampling_ratio/mean": 0.08532974123954773, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 8.705451965332031, "sampling/sampling_logp_difference/mean": 0.019107386469841003, "step": 239, "step_time": 242.03955541201867 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 10000.0, "completions/max_terminated_length": 9845.0, "completions/mean_length": 1740.38232421875, "completions/mean_terminated_length": 1609.2772216796875, "completions/min_length": 121.0, "completions/min_terminated_length": 121.0, "entropy": 0.1324122796456019, "epoch": 0.1602136181575434, "frac_reward_zero_std": 0.06666667014360428, "grad_norm": 2.3345297377749545e-05, "learning_rate": 5e-06, "loss": -0.0001, "num_tokens": 590316730.0, "reward": 0.7608112096786499, "reward_std": 0.328910231590271, "rewards/TRLRewardAdapter/mean": 0.7608111500740051, "rewards/TRLRewardAdapter/std": 0.3289102613925934, "sampling/importance_sampling_ratio/max": 2.1496741771698, "sampling/importance_sampling_ratio/mean": 0.11628293991088867, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 30.5, "sampling/sampling_logp_difference/mean": 0.017289597541093826, "step": 240, "step_time": 304.29483910510316 }, { "epoch": 0.1602136181575434, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.0254347819187071, "eval_completions/max_length": 10000.0, "eval_completions/max_terminated_length": 9647.91304347826, "eval_completions/mean_length": 2316.6708241338315, "eval_completions/mean_terminated_length": 2116.603711998981, "eval_completions/min_length": 59.78260869565217, "eval_completions/min_terminated_length": 59.78260869565217, "eval_entropy": 0.1675577215526415, "eval_frac_reward_zero_std": 1.0, "eval_loss": 0.0, "eval_num_tokens": 590316730.0, "eval_reward": 0.6879674170328223, "eval_reward_std": 0.37240002336709394, "eval_rewards/TRLRewardAdapter/mean": 0.687967422215835, "eval_rewards/TRLRewardAdapter/std": 0.3724000259586003, "eval_runtime": 1447.9828, "eval_samples_per_second": 3.153, "eval_sampling/importance_sampling_ratio/max": 1.494857316431792, "eval_sampling/importance_sampling_ratio/mean": 0.13234469683273978, "eval_sampling/importance_sampling_ratio/min": 0.0, "eval_sampling/sampling_logp_difference/max": 8.095063883325327, "eval_sampling/sampling_logp_difference/mean": 0.021020554330037987, "eval_steps_per_second": 0.016, "step": 240 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01250000111758709, "completions/max_length": 10000.0, "completions/max_terminated_length": 9555.0, "completions/mean_length": 1478.8021240234375, "completions/mean_terminated_length": 1370.938720703125, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "entropy": 0.15163111438353857, "epoch": 0.16088117489986647, "frac_reward_zero_std": 0.0, "grad_norm": 7.402080916284027e-05, "learning_rate": 5e-06, "loss": -0.0001, "num_tokens": 592130460.0, "reward": 0.8188214302062988, "reward_std": 0.2705446779727936, "rewards/TRLRewardAdapter/mean": 0.818821370601654, "rewards/TRLRewardAdapter/std": 0.2705446779727936, "sampling/importance_sampling_ratio/max": 2.1010048389434814, "sampling/importance_sampling_ratio/mean": 0.13743464648723602, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 9.0, "sampling/sampling_logp_difference/mean": 0.019511863589286804, "step": 241, "step_time": 288.7785897270078 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02083333395421505, "completions/max_length": 10000.0, "completions/max_terminated_length": 9929.0, "completions/mean_length": 1699.6917724609375, "completions/mean_terminated_length": 1523.08935546875, "completions/min_length": 86.0, "completions/min_terminated_length": 86.0, "entropy": 0.11945017303029697, "epoch": 0.16154873164218958, "frac_reward_zero_std": 0.06666667014360428, "grad_norm": 0.0006271125485570422, "learning_rate": 5e-06, "loss": 0.0, "num_tokens": 594181972.0, "reward": 0.7553777098655701, "reward_std": 0.3601926863193512, "rewards/TRLRewardAdapter/mean": 0.7553776502609253, "rewards/TRLRewardAdapter/std": 0.3601926863193512, "sampling/importance_sampling_ratio/max": 1.9153199195861816, "sampling/importance_sampling_ratio/mean": 0.12550663948059082, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 48.75, "sampling/sampling_logp_difference/mean": 0.01612902246415615, "step": 242, "step_time": 374.855492920964 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03020833432674408, "completions/max_length": 10000.0, "completions/max_terminated_length": 9390.0, "completions/mean_length": 1418.5084228515625, "completions/mean_terminated_length": 1151.200927734375, "completions/min_length": 131.0, "completions/min_terminated_length": 131.0, "entropy": 0.133168692390124, "epoch": 0.16221628838451269, "frac_reward_zero_std": 0.03333333507180214, "grad_norm": 5.102780074863889e-05, "learning_rate": 5e-06, "loss": -0.0001, "num_tokens": 595985148.0, "reward": 0.8154519200325012, "reward_std": 0.3032754063606262, "rewards/TRLRewardAdapter/mean": 0.8154518604278564, "rewards/TRLRewardAdapter/std": 0.3032754063606262, "sampling/importance_sampling_ratio/max": 1.8609968423843384, "sampling/importance_sampling_ratio/mean": 0.12431769073009491, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 52.0, "sampling/sampling_logp_difference/mean": 0.01771688461303711, "step": 243, "step_time": 383.1792370810872 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.004166666883975267, "completions/max_length": 10000.0, "completions/max_terminated_length": 9907.0, "completions/mean_length": 1836.517822265625, "completions/mean_terminated_length": 1802.36083984375, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "entropy": 0.15853641430536905, "epoch": 0.1628838451268358, "frac_reward_zero_std": 0.06666667014360428, "grad_norm": 2.242002925683786e-05, "learning_rate": 5e-06, "loss": -0.0, "num_tokens": 598268365.0, "reward": 0.7584381699562073, "reward_std": 0.33371487259864807, "rewards/TRLRewardAdapter/mean": 0.7584381103515625, "rewards/TRLRewardAdapter/std": 0.33371487259864807, "sampling/importance_sampling_ratio/max": 1.20493483543396, "sampling/importance_sampling_ratio/mean": 0.0573461651802063, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 6.88906717300415, "sampling/sampling_logp_difference/mean": 0.02024095319211483, "step": 244, "step_time": 226.73080047091935 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01458333432674408, "completions/max_length": 10000.0, "completions/max_terminated_length": 9994.0, "completions/mean_length": 1560.5980224609375, "completions/mean_terminated_length": 1435.701904296875, "completions/min_length": 59.0, "completions/min_terminated_length": 59.0, "entropy": 0.13162552813688913, "epoch": 0.16355140186915887, "frac_reward_zero_std": 0.10000000894069672, "grad_norm": 1.3562784873169292e-05, "learning_rate": 5e-06, "loss": -0.0, "num_tokens": 600243371.0, "reward": 0.7058282494544983, "reward_std": 0.39950576424598694, "rewards/TRLRewardAdapter/mean": 0.7058282494544983, "rewards/TRLRewardAdapter/std": 0.39950576424598694, "sampling/importance_sampling_ratio/max": 2.2477633953094482, "sampling/importance_sampling_ratio/mean": 0.14909403026103973, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 59.0, "sampling/sampling_logp_difference/mean": 0.018379291519522667, "step": 245, "step_time": 341.50019613606855 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02187500149011612, "completions/max_length": 10000.0, "completions/max_terminated_length": 9863.0, "completions/mean_length": 1924.0230712890625, "completions/mean_terminated_length": 1743.4100341796875, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "entropy": 0.14533623307943344, "epoch": 0.16421895861148197, "frac_reward_zero_std": 0.10000000894069672, "grad_norm": 0.0004075338832277061, "learning_rate": 5e-06, "loss": -0.0, "num_tokens": 602522241.0, "reward": 0.7207520604133606, "reward_std": 0.37726902961730957, "rewards/TRLRewardAdapter/mean": 0.7207520008087158, "rewards/TRLRewardAdapter/std": 0.3772689998149872, "sampling/importance_sampling_ratio/max": 2.7370214462280273, "sampling/importance_sampling_ratio/mean": 0.1630059778690338, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 7.955453872680664, "sampling/sampling_logp_difference/mean": 0.018873095512390137, "step": 246, "step_time": 291.6803758300375 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02187500149011612, "completions/max_length": 10000.0, "completions/max_terminated_length": 9961.0, "completions/mean_length": 2592.31787109375, "completions/mean_terminated_length": 2426.650634765625, "completions/min_length": 72.0, "completions/min_terminated_length": 72.0, "entropy": 0.13370201488335928, "epoch": 0.16488651535380508, "frac_reward_zero_std": 0.0, "grad_norm": 6.977379037835883e-05, "learning_rate": 5e-06, "loss": -0.0, "num_tokens": 605455826.0, "reward": 0.7250389456748962, "reward_std": 0.32589083909988403, "rewards/TRLRewardAdapter/mean": 0.7250388860702515, "rewards/TRLRewardAdapter/std": 0.32589083909988403, "sampling/importance_sampling_ratio/max": 1.4049186706542969, "sampling/importance_sampling_ratio/mean": 0.07641161233186722, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 46.5, "sampling/sampling_logp_difference/mean": 0.017722513526678085, "step": 247, "step_time": 354.41592712001875 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.05625000223517418, "completions/max_length": 10000.0, "completions/max_terminated_length": 9782.0, "completions/mean_length": 2671.35546875, "completions/mean_terminated_length": 2234.548583984375, "completions/min_length": 76.0, "completions/min_terminated_length": 76.0, "entropy": 0.12907550980647406, "epoch": 0.16555407209612816, "frac_reward_zero_std": 0.06666667014360428, "grad_norm": 5.2284772120772575e-05, "learning_rate": 5e-06, "loss": 0.0003, "num_tokens": 608467047.0, "reward": 0.662641167640686, "reward_std": 0.3918914794921875, "rewards/TRLRewardAdapter/mean": 0.662641167640686, "rewards/TRLRewardAdapter/std": 0.3918914794921875, "sampling/importance_sampling_ratio/max": 2.258040428161621, "sampling/importance_sampling_ratio/mean": 0.1627049446105957, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 52.5, "sampling/sampling_logp_difference/mean": 0.01702229119837284, "step": 248, "step_time": 394.91010094818193 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09583333879709244, "completions/max_length": 10000.0, "completions/max_terminated_length": 9956.0, "completions/mean_length": 3476.015869140625, "completions/mean_terminated_length": 2784.533447265625, "completions/min_length": 82.0, "completions/min_terminated_length": 82.0, "entropy": 0.1371245558063189, "epoch": 0.16622162883845126, "frac_reward_zero_std": 0.13333334028720856, "grad_norm": 1.3407312309747806e-05, "learning_rate": 5e-06, "loss": -0.0, "num_tokens": 612298006.0, "reward": 0.5984952449798584, "reward_std": 0.4162593483924866, "rewards/TRLRewardAdapter/mean": 0.5984952449798584, "rewards/TRLRewardAdapter/std": 0.4162593483924866, "sampling/importance_sampling_ratio/max": 1.5052777528762817, "sampling/importance_sampling_ratio/mean": 0.07441232353448868, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 66.2480697631836, "sampling/sampling_logp_difference/mean": 0.017415976151823997, "step": 249, "step_time": 393.8503445899114 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02395833469927311, "completions/max_length": 10000.0, "completions/max_terminated_length": 10000.0, "completions/mean_length": 1760.3177490234375, "completions/mean_terminated_length": 1558.06298828125, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "entropy": 0.11295569315552711, "epoch": 0.16688918558077437, "frac_reward_zero_std": 0.0, "grad_norm": 1.4537311568772472e-05, "learning_rate": 5e-06, "loss": -0.0, "num_tokens": 614471911.0, "reward": 0.8099569082260132, "reward_std": 0.299437016248703, "rewards/TRLRewardAdapter/mean": 0.8099568486213684, "rewards/TRLRewardAdapter/std": 0.2994369864463806, "sampling/importance_sampling_ratio/max": 1.6138883829116821, "sampling/importance_sampling_ratio/mean": 0.14735673367977142, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 21.5, "sampling/sampling_logp_difference/mean": 0.015159010887145996, "step": 250, "step_time": 320.54451878508553 }, { "epoch": 0.16688918558077437, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.028695651249069233, "eval_completions/max_length": 10000.0, "eval_completions/max_terminated_length": 9646.434782608696, "eval_completions/mean_length": 2365.37452233356, "eval_completions/mean_terminated_length": 2139.8463028617525, "eval_completions/min_length": 67.6086956521739, "eval_completions/min_terminated_length": 67.6086956521739, "eval_entropy": 0.15418669257474982, "eval_frac_reward_zero_std": 1.0, "eval_loss": 0.0, "eval_num_tokens": 614471911.0, "eval_reward": 0.6888729178387186, "eval_reward_std": 0.3701730694459832, "eval_rewards/TRLRewardAdapter/mean": 0.68887293079625, "eval_rewards/TRLRewardAdapter/std": 0.3701730694459832, "eval_runtime": 1457.5468, "eval_samples_per_second": 3.133, "eval_sampling/importance_sampling_ratio/max": 1.4258308281069216, "eval_sampling/importance_sampling_ratio/mean": 0.12158221168362576, "eval_sampling/importance_sampling_ratio/min": 0.0, "eval_sampling/sampling_logp_difference/max": 13.934375617815101, "eval_sampling/sampling_logp_difference/mean": 0.019598346608488457, "eval_steps_per_second": 0.016, "step": 250 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.05625000223517418, "completions/max_length": 10000.0, "completions/max_terminated_length": 9946.0, "completions/mean_length": 2864.156494140625, "completions/mean_terminated_length": 2438.841064453125, "completions/min_length": 64.0, "completions/min_terminated_length": 64.0, "entropy": 0.14145439863204956, "epoch": 0.16755674232309747, "frac_reward_zero_std": 0.06666667014360428, "grad_norm": 6.327984442424555e-05, "learning_rate": 5e-06, "loss": 0.0, "num_tokens": 617655709.0, "reward": 0.6307635307312012, "reward_std": 0.39345794916152954, "rewards/TRLRewardAdapter/mean": 0.6307634711265564, "rewards/TRLRewardAdapter/std": 0.39345794916152954, "sampling/importance_sampling_ratio/max": 1.7987382411956787, "sampling/importance_sampling_ratio/mean": 0.10100911557674408, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 45.75, "sampling/sampling_logp_difference/mean": 0.018307561054825783, "step": 251, "step_time": 380.26561016915366 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03333333507180214, "completions/max_length": 10000.0, "completions/max_terminated_length": 9896.0, "completions/mean_length": 2240.4990234375, "completions/mean_terminated_length": 1972.929931640625, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "entropy": 0.12486602614323299, "epoch": 0.16822429906542055, "frac_reward_zero_std": 0.03333333507180214, "grad_norm": 3.0937928151145613e-05, "learning_rate": 5e-06, "loss": -0.0, "num_tokens": 620272540.0, "reward": 0.7508034110069275, "reward_std": 0.3305306136608124, "rewards/TRLRewardAdapter/mean": 0.7508033514022827, "rewards/TRLRewardAdapter/std": 0.3305306136608124, "sampling/importance_sampling_ratio/max": 1.9861724376678467, "sampling/importance_sampling_ratio/mean": 0.12150933593511581, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 10.5, "sampling/sampling_logp_difference/mean": 0.016387343406677246, "step": 252, "step_time": 368.14781181898434 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03437500074505806, "completions/max_length": 10000.0, "completions/max_terminated_length": 9978.0, "completions/mean_length": 1869.7657470703125, "completions/mean_terminated_length": 1580.3397216796875, "completions/min_length": 90.0, "completions/min_terminated_length": 90.0, "entropy": 0.11977737148602803, "epoch": 0.16889185580774366, "frac_reward_zero_std": 0.03333333507180214, "grad_norm": 0.00017927894423684716, "learning_rate": 5e-06, "loss": -0.0002, "num_tokens": 622512731.0, "reward": 0.7525489926338196, "reward_std": 0.36158207058906555, "rewards/TRLRewardAdapter/mean": 0.7525489926338196, "rewards/TRLRewardAdapter/std": 0.36158207058906555, "sampling/importance_sampling_ratio/max": 2.388068675994873, "sampling/importance_sampling_ratio/mean": 0.1553247570991516, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 51.513919830322266, "sampling/sampling_logp_difference/mean": 0.015971872955560684, "step": 253, "step_time": 345.56787755177356 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01250000111758709, "completions/max_length": 10000.0, "completions/max_terminated_length": 9952.0, "completions/mean_length": 2030.4219970703125, "completions/mean_terminated_length": 1929.541015625, "completions/min_length": 74.0, "completions/min_terminated_length": 74.0, "entropy": 0.12328075245022774, "epoch": 0.16955941255006676, "frac_reward_zero_std": 0.06666667014360428, "grad_norm": 2.4902752702821146e-05, "learning_rate": 5e-06, "loss": -0.0, "num_tokens": 624886352.0, "reward": 0.7136213779449463, "reward_std": 0.3750477433204651, "rewards/TRLRewardAdapter/mean": 0.7136213183403015, "rewards/TRLRewardAdapter/std": 0.3750477433204651, "sampling/importance_sampling_ratio/max": 2.0539515018463135, "sampling/importance_sampling_ratio/mean": 0.096359983086586, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 8.889068603515625, "sampling/sampling_logp_difference/mean": 0.016352171078324318, "step": 254, "step_time": 239.81884510279633 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00729166716337204, "completions/max_length": 10000.0, "completions/max_terminated_length": 9972.0, "completions/mean_length": 1621.3333740234375, "completions/mean_terminated_length": 1559.7900390625, "completions/min_length": 140.0, "completions/min_terminated_length": 140.0, "entropy": 0.11992477873961131, "epoch": 0.17022696929238984, "frac_reward_zero_std": 0.03333333507180214, "grad_norm": 2.8443882254938697e-05, "learning_rate": 5e-06, "loss": -0.0001, "num_tokens": 626831600.0, "reward": 0.7867948412895203, "reward_std": 0.3225991725921631, "rewards/TRLRewardAdapter/mean": 0.7867947816848755, "rewards/TRLRewardAdapter/std": 0.3225991725921631, "sampling/importance_sampling_ratio/max": 2.2261083126068115, "sampling/importance_sampling_ratio/mean": 0.11120593547821045, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 20.139068603515625, "sampling/sampling_logp_difference/mean": 0.015945816412568092, "step": 255, "step_time": 212.3253509060014 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.05312500149011612, "completions/max_length": 10000.0, "completions/max_terminated_length": 9975.0, "completions/mean_length": 2407.190673828125, "completions/mean_terminated_length": 1981.19140625, "completions/min_length": 90.0, "completions/min_terminated_length": 90.0, "entropy": 0.12314822897315025, "epoch": 0.17089452603471295, "frac_reward_zero_std": 0.03333333507180214, "grad_norm": 1.998939641121485e-05, "learning_rate": 5e-06, "loss": -0.0001, "num_tokens": 629610727.0, "reward": 0.6973899006843567, "reward_std": 0.3874163329601288, "rewards/TRLRewardAdapter/mean": 0.6973898410797119, "rewards/TRLRewardAdapter/std": 0.3874163329601288, "sampling/importance_sampling_ratio/max": 2.0515389442443848, "sampling/importance_sampling_ratio/mean": 0.10690023750066757, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 14.955452919006348, "sampling/sampling_logp_difference/mean": 0.016299400478601456, "step": 256, "step_time": 323.02543275419157 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.06354167312383652, "completions/max_length": 10000.0, "completions/max_terminated_length": 9985.0, "completions/mean_length": 3000.557373046875, "completions/mean_terminated_length": 2525.622802734375, "completions/min_length": 147.0, "completions/min_terminated_length": 147.0, "entropy": 0.1455593059460322, "epoch": 0.17156208277703605, "frac_reward_zero_std": 0.06666667014360428, "grad_norm": 0.000140514955425879, "learning_rate": 5e-06, "loss": 0.0, "num_tokens": 632971326.0, "reward": 0.6440715193748474, "reward_std": 0.36810481548309326, "rewards/TRLRewardAdapter/mean": 0.6440715193748474, "rewards/TRLRewardAdapter/std": 0.36810481548309326, "sampling/importance_sampling_ratio/max": 1.325792670249939, "sampling/importance_sampling_ratio/mean": 0.052727796137332916, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 11.78516960144043, "sampling/sampling_logp_difference/mean": 0.019238924607634544, "step": 257, "step_time": 365.9848408391699 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02708333544433117, "completions/max_length": 10000.0, "completions/max_terminated_length": 9943.0, "completions/mean_length": 2248.438720703125, "completions/mean_terminated_length": 2032.65625, "completions/min_length": 64.0, "completions/min_terminated_length": 64.0, "entropy": 0.13047201931476593, "epoch": 0.17222963951935916, "frac_reward_zero_std": 0.06666667014360428, "grad_norm": 4.5728367260255444e-05, "learning_rate": 5e-06, "loss": 0.0, "num_tokens": 635619139.0, "reward": 0.6354899406433105, "reward_std": 0.4124302268028259, "rewards/TRLRewardAdapter/mean": 0.6354898810386658, "rewards/TRLRewardAdapter/std": 0.41243019700050354, "sampling/importance_sampling_ratio/max": 2.2882955074310303, "sampling/importance_sampling_ratio/mean": 0.1258556842803955, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 24.75, "sampling/sampling_logp_difference/mean": 0.01728152111172676, "step": 258, "step_time": 370.60041607299354 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02916666865348816, "completions/max_length": 10000.0, "completions/max_terminated_length": 9750.0, "completions/mean_length": 2388.1064453125, "completions/mean_terminated_length": 2159.4228515625, "completions/min_length": 57.0, "completions/min_terminated_length": 57.0, "entropy": 0.11757166807850201, "epoch": 0.17289719626168223, "frac_reward_zero_std": 0.10000000894069672, "grad_norm": 2.274062367914379e-05, "learning_rate": 5e-06, "loss": -0.0, "num_tokens": 638370313.0, "reward": 0.702405571937561, "reward_std": 0.36202505230903625, "rewards/TRLRewardAdapter/mean": 0.702405571937561, "rewards/TRLRewardAdapter/std": 0.36202505230903625, "sampling/importance_sampling_ratio/max": 1.911769151687622, "sampling/importance_sampling_ratio/mean": 0.13218224048614502, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 14.24807071685791, "sampling/sampling_logp_difference/mean": 0.015737012028694153, "step": 259, "step_time": 302.40132703376 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.004166666883975267, "completions/max_length": 10000.0, "completions/max_terminated_length": 9768.0, "completions/mean_length": 1536.1417236328125, "completions/mean_terminated_length": 1500.72802734375, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "entropy": 0.11911117906371753, "epoch": 0.17356475300400534, "frac_reward_zero_std": 0.03333333507180214, "grad_norm": 9.238942669875025e-05, "learning_rate": 5e-06, "loss": -0.0002, "num_tokens": 640258481.0, "reward": 0.7950014472007751, "reward_std": 0.29436561465263367, "rewards/TRLRewardAdapter/mean": 0.7950013875961304, "rewards/TRLRewardAdapter/std": 0.29436561465263367, "sampling/importance_sampling_ratio/max": 2.162294387817383, "sampling/importance_sampling_ratio/mean": 0.12432336807250977, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 10.134025573730469, "sampling/sampling_logp_difference/mean": 0.01611616276204586, "step": 260, "step_time": 206.40729457803536 }, { "epoch": 0.17356475300400534, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.021304347228420818, "eval_completions/max_length": 10000.0, "eval_completions/max_terminated_length": 9528.130434782608, "eval_completions/mean_length": 2172.7008003566575, "eval_completions/mean_terminated_length": 2002.2277142068615, "eval_completions/min_length": 66.69565217391305, "eval_completions/min_terminated_length": 66.69565217391305, "eval_entropy": 0.14900501914646314, "eval_frac_reward_zero_std": 1.0, "eval_loss": 0.0, "eval_num_tokens": 640258481.0, "eval_reward": 0.6983263544414354, "eval_reward_std": 0.3690158297186312, "eval_rewards/TRLRewardAdapter/mean": 0.6983263725819795, "eval_rewards/TRLRewardAdapter/std": 0.36901583490164386, "eval_runtime": 1444.575, "eval_samples_per_second": 3.161, "eval_sampling/importance_sampling_ratio/max": 1.320948354575945, "eval_sampling/importance_sampling_ratio/mean": 0.12717954036982163, "eval_sampling/importance_sampling_ratio/min": 0.0, "eval_sampling/sampling_logp_difference/max": 11.012294727823008, "eval_sampling/sampling_logp_difference/mean": 0.01910772717193417, "eval_steps_per_second": 0.016, "step": 260 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.006250000558793545, "completions/max_length": 10000.0, "completions/max_terminated_length": 9965.0, "completions/mean_length": 1740.065673828125, "completions/mean_terminated_length": 1688.1163330078125, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "entropy": 0.12752499307195345, "epoch": 0.17423230974632845, "frac_reward_zero_std": 0.10000000894069672, "grad_norm": 2.6556685156782165e-05, "learning_rate": 5e-06, "loss": -0.0, "num_tokens": 642386768.0, "reward": 0.688409686088562, "reward_std": 0.3850640654563904, "rewards/TRLRewardAdapter/mean": 0.688409686088562, "rewards/TRLRewardAdapter/std": 0.3850640654563904, "sampling/importance_sampling_ratio/max": 1.1817439794540405, "sampling/importance_sampling_ratio/mean": 0.08567533642053604, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 15.5, "sampling/sampling_logp_difference/mean": 0.01745976321399212, "step": 261, "step_time": 177.1135061280802 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.05520833656191826, "completions/max_length": 10000.0, "completions/max_terminated_length": 9854.0, "completions/mean_length": 2775.846923828125, "completions/mean_terminated_length": 2353.7080078125, "completions/min_length": 89.0, "completions/min_terminated_length": 89.0, "entropy": 0.12017121662696202, "epoch": 0.17489986648865152, "frac_reward_zero_std": 0.10000000894069672, "grad_norm": 1.3567151466209598e-05, "learning_rate": 5e-06, "loss": -0.0, "num_tokens": 645491709.0, "reward": 0.6261187195777893, "reward_std": 0.40653324127197266, "rewards/TRLRewardAdapter/mean": 0.6261186599731445, "rewards/TRLRewardAdapter/std": 0.40653324127197266, "sampling/importance_sampling_ratio/max": 1.0348937511444092, "sampling/importance_sampling_ratio/mean": 0.06614522635936737, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 51.137245178222656, "sampling/sampling_logp_difference/mean": 0.016357218846678734, "step": 262, "step_time": 308.0113607061794 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0010416667209938169, "completions/max_length": 10000.0, "completions/max_terminated_length": 9606.0, "completions/mean_length": 1827.1636962890625, "completions/mean_terminated_length": 1818.641357421875, "completions/min_length": 41.0, "completions/min_terminated_length": 41.0, "entropy": 0.12126223370432854, "epoch": 0.17556742323097463, "frac_reward_zero_std": 0.06666667014360428, "grad_norm": 5.585714707693353e-05, "learning_rate": 5e-06, "loss": -0.0001, "num_tokens": 647709626.0, "reward": 0.755581259727478, "reward_std": 0.33362525701522827, "rewards/TRLRewardAdapter/mean": 0.7555812001228333, "rewards/TRLRewardAdapter/std": 0.3336252272129059, "sampling/importance_sampling_ratio/max": 2.0581912994384766, "sampling/importance_sampling_ratio/mean": 0.12893173098564148, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 4.182637691497803, "sampling/sampling_logp_difference/mean": 0.016820687800645828, "step": 263, "step_time": 223.76363374001812 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0020833334419876337, "completions/max_length": 10000.0, "completions/max_terminated_length": 9852.0, "completions/mean_length": 2119.419921875, "completions/mean_terminated_length": 2102.9677734375, "completions/min_length": 148.0, "completions/min_terminated_length": 148.0, "entropy": 0.1192654234667619, "epoch": 0.17623497997329773, "frac_reward_zero_std": 0.0, "grad_norm": 0.00014235522996687146, "learning_rate": 5e-06, "loss": -0.0002, "num_tokens": 650209069.0, "reward": 0.6907052993774414, "reward_std": 0.3776255249977112, "rewards/TRLRewardAdapter/mean": 0.6907052397727966, "rewards/TRLRewardAdapter/std": 0.3776255249977112, "sampling/importance_sampling_ratio/max": 2.215994119644165, "sampling/importance_sampling_ratio/mean": 0.12776921689510345, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 11.851531982421875, "sampling/sampling_logp_difference/mean": 0.016013458371162415, "step": 264, "step_time": 260.23448799701873 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03958333656191826, "completions/max_length": 10000.0, "completions/max_terminated_length": 9901.0, "completions/mean_length": 1864.3115234375, "completions/mean_terminated_length": 1529.0010986328125, "completions/min_length": 70.0, "completions/min_terminated_length": 70.0, "entropy": 0.10615233207742374, "epoch": 0.17690253671562084, "frac_reward_zero_std": 0.06666667014360428, "grad_norm": 4.237556756061739e-05, "learning_rate": 5e-06, "loss": -0.0001, "num_tokens": 652437816.0, "reward": 0.751419186592102, "reward_std": 0.37443405389785767, "rewards/TRLRewardAdapter/mean": 0.7514191269874573, "rewards/TRLRewardAdapter/std": 0.37443405389785767, "sampling/importance_sampling_ratio/max": 1.8301793336868286, "sampling/importance_sampling_ratio/mean": 0.18975035846233368, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 27.496212005615234, "sampling/sampling_logp_difference/mean": 0.015240705572068691, "step": 265, "step_time": 368.02439479192253 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.04583333432674408, "completions/max_length": 10000.0, "completions/max_terminated_length": 9997.0, "completions/mean_length": 2073.485595703125, "completions/mean_terminated_length": 1692.73583984375, "completions/min_length": 77.0, "completions/min_terminated_length": 77.0, "entropy": 0.12325968345006307, "epoch": 0.17757009345794392, "frac_reward_zero_std": 0.1666666716337204, "grad_norm": 0.0001701612755884356, "learning_rate": 5e-06, "loss": 0.0002, "num_tokens": 654887050.0, "reward": 0.7094910144805908, "reward_std": 0.3944751024246216, "rewards/TRLRewardAdapter/mean": 0.709490954875946, "rewards/TRLRewardAdapter/std": 0.39447513222694397, "sampling/importance_sampling_ratio/max": 1.710984468460083, "sampling/importance_sampling_ratio/mean": 0.1605999767780304, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 11.6875, "sampling/sampling_logp_difference/mean": 0.016771679744124413, "step": 266, "step_time": 392.9392851489829 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01875000074505806, "completions/max_length": 10000.0, "completions/max_terminated_length": 9594.0, "completions/mean_length": 1871.3115234375, "completions/mean_terminated_length": 1715.986328125, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "entropy": 0.11923009902238846, "epoch": 0.17823765020026702, "frac_reward_zero_std": 0.03333333507180214, "grad_norm": 0.0004533508799414859, "learning_rate": 5e-06, "loss": -0.0001, "num_tokens": 657167989.0, "reward": 0.7110332250595093, "reward_std": 0.3810991942882538, "rewards/TRLRewardAdapter/mean": 0.7110331654548645, "rewards/TRLRewardAdapter/std": 0.3810991644859314, "sampling/importance_sampling_ratio/max": 1.4911428689956665, "sampling/importance_sampling_ratio/mean": 0.1441706269979477, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 23.25, "sampling/sampling_logp_difference/mean": 0.016382744535803795, "step": 267, "step_time": 253.89419544814155 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 10000.0, "completions/max_terminated_length": 9928.0, "completions/mean_length": 1657.7823486328125, "completions/mean_terminated_length": 1525.3662109375, "completions/min_length": 75.0, "completions/min_terminated_length": 75.0, "entropy": 0.13070925821860632, "epoch": 0.17890520694259013, "frac_reward_zero_std": 0.10000000894069672, "grad_norm": 7.935323023872495e-05, "learning_rate": 5e-06, "loss": 0.0, "num_tokens": 659222500.0, "reward": 0.7374539375305176, "reward_std": 0.3711867034435272, "rewards/TRLRewardAdapter/mean": 0.7374538779258728, "rewards/TRLRewardAdapter/std": 0.3711867034435272, "sampling/importance_sampling_ratio/max": 1.9248546361923218, "sampling/importance_sampling_ratio/mean": 0.11583695560693741, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 8.0, "sampling/sampling_logp_difference/mean": 0.017782563343644142, "step": 268, "step_time": 212.1298906300217 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02708333544433117, "completions/max_length": 10000.0, "completions/max_terminated_length": 9824.0, "completions/mean_length": 2638.19482421875, "completions/mean_terminated_length": 2433.26220703125, "completions/min_length": 120.0, "completions/min_terminated_length": 120.0, "entropy": 0.12146906306346257, "epoch": 0.1795727636849132, "frac_reward_zero_std": 0.23333334922790527, "grad_norm": 2.0876117399730177e-05, "learning_rate": 5e-06, "loss": -0.0, "num_tokens": 662199231.0, "reward": 0.5706943869590759, "reward_std": 0.4312523305416107, "rewards/TRLRewardAdapter/mean": 0.5706943273544312, "rewards/TRLRewardAdapter/std": 0.4312523305416107, "sampling/importance_sampling_ratio/max": 1.5165576934814453, "sampling/importance_sampling_ratio/mean": 0.12394453585147858, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 13.5, "sampling/sampling_logp_difference/mean": 0.016380680724978447, "step": 269, "step_time": 251.66130811709445 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 10000.0, "completions/max_terminated_length": 10000.0, "completions/mean_length": 1509.0313720703125, "completions/mean_terminated_length": 1374.2540283203125, "completions/min_length": 75.0, "completions/min_terminated_length": 75.0, "entropy": 0.1005406342446804, "epoch": 0.1802403204272363, "frac_reward_zero_std": 0.03333333507180214, "grad_norm": 9.608616458127262e-05, "learning_rate": 5e-06, "loss": -0.0, "num_tokens": 664075261.0, "reward": 0.7976763248443604, "reward_std": 0.30779212713241577, "rewards/TRLRewardAdapter/mean": 0.7976762652397156, "rewards/TRLRewardAdapter/std": 0.30779212713241577, "sampling/importance_sampling_ratio/max": 2.2124838829040527, "sampling/importance_sampling_ratio/mean": 0.1567261517047882, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 21.70545196533203, "sampling/sampling_logp_difference/mean": 0.014291415922343731, "step": 270, "step_time": 226.1090277651092 }, { "epoch": 0.1802403204272363, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.018043477817074112, "eval_completions/max_length": 10000.0, "eval_completions/max_terminated_length": 9313.478260869566, "eval_completions/mean_length": 1973.1914699388587, "eval_completions/mean_terminated_length": 1825.6540686565897, "eval_completions/min_length": 60.0, "eval_completions/min_terminated_length": 60.0, "eval_entropy": 0.13930678723946863, "eval_frac_reward_zero_std": 1.0, "eval_loss": 0.0, "eval_num_tokens": 664075261.0, "eval_reward": 0.7040041944255, "eval_reward_std": 0.37509275648904883, "eval_rewards/TRLRewardAdapter/mean": 0.7040042073830314, "eval_rewards/TRLRewardAdapter/std": 0.3750927590805551, "eval_runtime": 1422.621, "eval_samples_per_second": 3.21, "eval_sampling/importance_sampling_ratio/max": 1.6197976392248403, "eval_sampling/importance_sampling_ratio/mean": 0.16155111336189767, "eval_sampling/importance_sampling_ratio/min": 0.0, "eval_sampling/sampling_logp_difference/max": 9.02320166774418, "eval_sampling/sampling_logp_difference/mean": 0.01846125470879285, "eval_steps_per_second": 0.016, "step": 270 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.006250000558793545, "completions/max_length": 10000.0, "completions/max_terminated_length": 9807.0, "completions/mean_length": 1583.482421875, "completions/mean_terminated_length": 1530.5482177734375, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "entropy": 0.1140472690264384, "epoch": 0.18090787716955942, "frac_reward_zero_std": 0.06666667014360428, "grad_norm": 0.0003388270157470243, "learning_rate": 5e-06, "loss": -0.0001, "num_tokens": 666030668.0, "reward": 0.7345895767211914, "reward_std": 0.37124931812286377, "rewards/TRLRewardAdapter/mean": 0.7345895171165466, "rewards/TRLRewardAdapter/std": 0.3712492883205414, "sampling/importance_sampling_ratio/max": 2.733048915863037, "sampling/importance_sampling_ratio/mean": 0.17211182415485382, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 6.75, "sampling/sampling_logp_difference/mean": 0.016054624691605568, "step": 271, "step_time": 220.71064519602805 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01458333432674408, "completions/max_length": 10000.0, "completions/max_terminated_length": 9829.0, "completions/mean_length": 1569.0302734375, "completions/mean_terminated_length": 1444.2589111328125, "completions/min_length": 77.0, "completions/min_terminated_length": 77.0, "entropy": 0.1048960176606973, "epoch": 0.18157543391188252, "frac_reward_zero_std": 0.0, "grad_norm": 8.90700914371837e-05, "learning_rate": 5e-06, "loss": -0.0, "num_tokens": 668034505.0, "reward": 0.7610125541687012, "reward_std": 0.343519926071167, "rewards/TRLRewardAdapter/mean": 0.7610124945640564, "rewards/TRLRewardAdapter/std": 0.343519926071167, "sampling/importance_sampling_ratio/max": 2.4661293029785156, "sampling/importance_sampling_ratio/mean": 0.16516920924186707, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 9.196102142333984, "sampling/sampling_logp_difference/mean": 0.015064316801726818, "step": 272, "step_time": 355.29698444821406 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0677083358168602, "completions/max_length": 10000.0, "completions/max_terminated_length": 9993.0, "completions/mean_length": 2466.510498046875, "completions/mean_terminated_length": 1919.3853759765625, "completions/min_length": 84.0, "completions/min_terminated_length": 84.0, "entropy": 0.11910906185706456, "epoch": 0.1822429906542056, "frac_reward_zero_std": 0.20000001788139343, "grad_norm": 0.00031160616661170823, "learning_rate": 5e-06, "loss": 0.0109, "num_tokens": 670838707.0, "reward": 0.6135138273239136, "reward_std": 0.4481686055660248, "rewards/TRLRewardAdapter/mean": 0.6135138273239136, "rewards/TRLRewardAdapter/std": 0.4481685757637024, "sampling/importance_sampling_ratio/max": 1.5238147974014282, "sampling/importance_sampling_ratio/mean": 0.20410621166229248, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 44.5, "sampling/sampling_logp_difference/mean": 0.015928512439131737, "step": 273, "step_time": 331.96126990485936 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 10000.0, "completions/max_terminated_length": 9860.0, "completions/mean_length": 1299.1865234375, "completions/mean_terminated_length": 1018.5150756835938, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "entropy": 0.10251324375470479, "epoch": 0.1829105473965287, "frac_reward_zero_std": 0.13333334028720856, "grad_norm": 0.0003949104657117351, "learning_rate": 5e-06, "loss": 0.0001, "num_tokens": 672535622.0, "reward": 0.7556927800178528, "reward_std": 0.38567665219306946, "rewards/TRLRewardAdapter/mean": 0.755692720413208, "rewards/TRLRewardAdapter/std": 0.38567665219306946, "sampling/importance_sampling_ratio/max": 1.653946042060852, "sampling/importance_sampling_ratio/mean": 0.22351744771003723, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 7.362421035766602, "sampling/sampling_logp_difference/mean": 0.01556961890310049, "step": 274, "step_time": 341.20451758406125 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0010416667209938169, "completions/max_length": 10000.0, "completions/max_terminated_length": 9304.0, "completions/mean_length": 1219.267822265625, "completions/mean_terminated_length": 1210.111572265625, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "entropy": 0.09406319012244542, "epoch": 0.1835781041388518, "frac_reward_zero_std": 0.03333333507180214, "grad_norm": 0.00025124289062071465, "learning_rate": 5e-06, "loss": -0.0, "num_tokens": 674126119.0, "reward": 0.77979975938797, "reward_std": 0.3504678010940552, "rewards/TRLRewardAdapter/mean": 0.7797996997833252, "rewards/TRLRewardAdapter/std": 0.3504678010940552, "sampling/importance_sampling_ratio/max": 1.4780569076538086, "sampling/importance_sampling_ratio/mean": 0.18377763032913208, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 17.8624210357666, "sampling/sampling_logp_difference/mean": 0.014012235216796398, "step": 275, "step_time": 217.71556278783828 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.07187500596046448, "completions/max_length": 10000.0, "completions/max_terminated_length": 9958.0, "completions/mean_length": 3110.2158203125, "completions/mean_terminated_length": 2576.663330078125, "completions/min_length": 54.0, "completions/min_terminated_length": 54.0, "entropy": 0.13455243905385336, "epoch": 0.1842456608811749, "frac_reward_zero_std": 0.1666666716337204, "grad_norm": 3.3102835186530424e-05, "learning_rate": 5e-06, "loss": -0.0, "num_tokens": 677542326.0, "reward": 0.5664615631103516, "reward_std": 0.42113104462623596, "rewards/TRLRewardAdapter/mean": 0.5664615631103516, "rewards/TRLRewardAdapter/std": 0.42113104462623596, "sampling/importance_sampling_ratio/max": 1.8934800624847412, "sampling/importance_sampling_ratio/mean": 0.09749589115381241, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 57.50093460083008, "sampling/sampling_logp_difference/mean": 0.01801121048629284, "step": 276, "step_time": 279.7367320859339 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.006250000558793545, "completions/max_length": 10000.0, "completions/max_terminated_length": 9970.0, "completions/mean_length": 1834.572021484375, "completions/mean_terminated_length": 1783.2169189453125, "completions/min_length": 86.0, "completions/min_terminated_length": 86.0, "entropy": 0.1313824007908503, "epoch": 0.184913217623498, "frac_reward_zero_std": 0.10000000894069672, "grad_norm": 0.0001887082308026208, "learning_rate": 5e-06, "loss": -0.0001, "num_tokens": 679776923.0, "reward": 0.686741054058075, "reward_std": 0.390485018491745, "rewards/TRLRewardAdapter/mean": 0.6867409944534302, "rewards/TRLRewardAdapter/std": 0.390485018491745, "sampling/importance_sampling_ratio/max": 1.5730290412902832, "sampling/importance_sampling_ratio/mean": 0.11409027874469757, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 33.95234680175781, "sampling/sampling_logp_difference/mean": 0.01841125264763832, "step": 277, "step_time": 268.59276663919445 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01145833358168602, "completions/max_length": 10000.0, "completions/max_terminated_length": 9719.0, "completions/mean_length": 2045.1949462890625, "completions/mean_terminated_length": 1952.9893798828125, "completions/min_length": 75.0, "completions/min_terminated_length": 75.0, "entropy": 0.13251476486523947, "epoch": 0.1855807743658211, "frac_reward_zero_std": 0.03333333507180214, "grad_norm": 0.00013683387198090176, "learning_rate": 5e-06, "loss": 0.0001, "num_tokens": 682283478.0, "reward": 0.7076067924499512, "reward_std": 0.38631945848464966, "rewards/TRLRewardAdapter/mean": 0.7076067328453064, "rewards/TRLRewardAdapter/std": 0.38631945848464966, "sampling/importance_sampling_ratio/max": 2.094226598739624, "sampling/importance_sampling_ratio/mean": 0.17003469169139862, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 44.0, "sampling/sampling_logp_difference/mean": 0.018464000895619392, "step": 278, "step_time": 222.7802886220161 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0364583358168602, "completions/max_length": 10000.0, "completions/max_terminated_length": 9981.0, "completions/mean_length": 1732.11669921875, "completions/mean_terminated_length": 1419.27783203125, "completions/min_length": 92.0, "completions/min_terminated_length": 92.0, "entropy": 0.08776258304715157, "epoch": 0.1862483311081442, "frac_reward_zero_std": 0.0, "grad_norm": 0.0011247203268805529, "learning_rate": 5e-06, "loss": 0.0003, "num_tokens": 684364262.0, "reward": 0.7471456527709961, "reward_std": 0.36778318881988525, "rewards/TRLRewardAdapter/mean": 0.7471455931663513, "rewards/TRLRewardAdapter/std": 0.36778315901756287, "sampling/importance_sampling_ratio/max": 2.367309331893921, "sampling/importance_sampling_ratio/mean": 0.20460394024848938, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 58.345741271972656, "sampling/sampling_logp_difference/mean": 0.012819704599678516, "step": 279, "step_time": 318.77565642807167 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03437500074505806, "completions/max_length": 10000.0, "completions/max_terminated_length": 9557.0, "completions/mean_length": 1725.05322265625, "completions/mean_terminated_length": 1430.4757080078125, "completions/min_length": 64.0, "completions/min_terminated_length": 64.0, "entropy": 0.10805615161856015, "epoch": 0.18691588785046728, "frac_reward_zero_std": 0.03333333507180214, "grad_norm": 2.3821061422423806e-05, "learning_rate": 5e-06, "loss": -0.0, "num_tokens": 686513401.0, "reward": 0.7746930718421936, "reward_std": 0.33800774812698364, "rewards/TRLRewardAdapter/mean": 0.7746930718421936, "rewards/TRLRewardAdapter/std": 0.33800771832466125, "sampling/importance_sampling_ratio/max": 2.607626438140869, "sampling/importance_sampling_ratio/mean": 0.21217791736125946, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 6.933712482452393, "sampling/sampling_logp_difference/mean": 0.015684176236391068, "step": 280, "step_time": 401.5517226041993 }, { "epoch": 0.18691588785046728, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.016739129979649315, "eval_completions/max_length": 9985.173913043478, "eval_completions/max_terminated_length": 9293.08695652174, "eval_completions/mean_length": 1833.2982283882473, "eval_completions/mean_terminated_length": 1694.2373046875, "eval_completions/min_length": 57.0, "eval_completions/min_terminated_length": 57.0, "eval_entropy": 0.1382943929537483, "eval_frac_reward_zero_std": 1.0, "eval_loss": 0.0, "eval_num_tokens": 686513401.0, "eval_reward": 0.7089462228443312, "eval_reward_std": 0.37524975901064667, "eval_rewards/TRLRewardAdapter/mean": 0.7089462487593942, "eval_rewards/TRLRewardAdapter/std": 0.3752497654894124, "eval_runtime": 1420.5535, "eval_samples_per_second": 3.214, "eval_sampling/importance_sampling_ratio/max": 1.6600624115570732, "eval_sampling/importance_sampling_ratio/mean": 0.18698328020779983, "eval_sampling/importance_sampling_ratio/min": 0.0, "eval_sampling/sampling_logp_difference/max": 12.649951416513193, "eval_sampling/sampling_logp_difference/mean": 0.01847402646165827, "eval_steps_per_second": 0.016, "step": 280 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01145833358168602, "completions/max_length": 10000.0, "completions/max_terminated_length": 9479.0, "completions/mean_length": 1774.701171875, "completions/mean_terminated_length": 1679.3603515625, "completions/min_length": 83.0, "completions/min_terminated_length": 83.0, "entropy": 0.11289169390996297, "epoch": 0.1875834445927904, "frac_reward_zero_std": 0.03333333507180214, "grad_norm": 0.00010446592065931904, "learning_rate": 5e-06, "loss": 0.0, "num_tokens": 688668410.0, "reward": 0.7098009586334229, "reward_std": 0.3887292742729187, "rewards/TRLRewardAdapter/mean": 0.7098008990287781, "rewards/TRLRewardAdapter/std": 0.3887292742729187, "sampling/importance_sampling_ratio/max": 2.432777166366577, "sampling/importance_sampling_ratio/mean": 0.13810035586357117, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 6.612423419952393, "sampling/sampling_logp_difference/mean": 0.015719646587967873, "step": 281, "step_time": 236.17086878593545 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01458333432674408, "completions/max_length": 10000.0, "completions/max_terminated_length": 9871.0, "completions/mean_length": 1689.018798828125, "completions/mean_terminated_length": 1566.023193359375, "completions/min_length": 71.0, "completions/min_terminated_length": 71.0, "entropy": 0.12497401610016823, "epoch": 0.1882510013351135, "frac_reward_zero_std": 0.03333333507180214, "grad_norm": 0.00028970918103722793, "learning_rate": 5e-06, "loss": -0.0, "num_tokens": 690762988.0, "reward": 0.7836923003196716, "reward_std": 0.325212299823761, "rewards/TRLRewardAdapter/mean": 0.7836922407150269, "rewards/TRLRewardAdapter/std": 0.3252122700214386, "sampling/importance_sampling_ratio/max": 1.9688891172409058, "sampling/importance_sampling_ratio/mean": 0.2223399132490158, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 18.625, "sampling/sampling_logp_difference/mean": 0.01751098595559597, "step": 282, "step_time": 319.4096165818628 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02187500149011612, "completions/max_length": 10000.0, "completions/max_terminated_length": 9770.0, "completions/mean_length": 1704.127197265625, "completions/mean_terminated_length": 1518.5963134765625, "completions/min_length": 65.0, "completions/min_terminated_length": 65.0, "entropy": 0.09435640648007393, "epoch": 0.18891855807743657, "frac_reward_zero_std": 0.06666667014360428, "grad_norm": 2.5089432325168798e-05, "learning_rate": 5e-06, "loss": -0.0001, "num_tokens": 692872870.0, "reward": 0.6602045297622681, "reward_std": 0.4161762297153473, "rewards/TRLRewardAdapter/mean": 0.6602045297622681, "rewards/TRLRewardAdapter/std": 0.4161762595176697, "sampling/importance_sampling_ratio/max": 1.5146187543869019, "sampling/importance_sampling_ratio/mean": 0.17781797051429749, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 16.36874771118164, "sampling/sampling_logp_difference/mean": 0.01350733358412981, "step": 283, "step_time": 303.1086556959199 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02812500111758709, "completions/max_length": 10000.0, "completions/max_terminated_length": 9771.0, "completions/mean_length": 1917.873046875, "completions/mean_terminated_length": 1683.98486328125, "completions/min_length": 55.0, "completions/min_terminated_length": 55.0, "entropy": 0.12287574261426926, "epoch": 0.18958611481975968, "frac_reward_zero_std": 0.06666667014360428, "grad_norm": 0.00011322379043624696, "learning_rate": 5e-06, "loss": 0.0001, "num_tokens": 695235948.0, "reward": 0.7345224022865295, "reward_std": 0.3541407585144043, "rewards/TRLRewardAdapter/mean": 0.7345224022865295, "rewards/TRLRewardAdapter/std": 0.3541407585144043, "sampling/importance_sampling_ratio/max": 2.8060495853424072, "sampling/importance_sampling_ratio/mean": 0.12157253175973892, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 9.528618812561035, "sampling/sampling_logp_difference/mean": 0.016967980191111565, "step": 284, "step_time": 292.7031805010047 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0010416667209938169, "completions/max_length": 10000.0, "completions/max_terminated_length": 9965.0, "completions/mean_length": 1260.0916748046875, "completions/mean_terminated_length": 1250.9781494140625, "completions/min_length": 72.0, "completions/min_terminated_length": 72.0, "entropy": 0.10690330465634663, "epoch": 0.19025367156208278, "frac_reward_zero_std": 0.06666667014360428, "grad_norm": 0.0005371294640089259, "learning_rate": 5e-06, "loss": -0.0001, "num_tokens": 696896740.0, "reward": 0.7893272638320923, "reward_std": 0.3330768942832947, "rewards/TRLRewardAdapter/mean": 0.7893272638320923, "rewards/TRLRewardAdapter/std": 0.3330768942832947, "sampling/importance_sampling_ratio/max": 1.966488242149353, "sampling/importance_sampling_ratio/mean": 0.2179436981678009, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 34.306854248046875, "sampling/sampling_logp_difference/mean": 0.01572699472308159, "step": 285, "step_time": 219.034973138012 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01458333432674408, "completions/max_length": 10000.0, "completions/max_terminated_length": 9836.0, "completions/mean_length": 1977.19384765625, "completions/mean_terminated_length": 1858.4630126953125, "completions/min_length": 82.0, "completions/min_terminated_length": 82.0, "entropy": 0.10333067799607913, "epoch": 0.1909212283044059, "frac_reward_zero_std": 0.06666667014360428, "grad_norm": 6.006643311945785e-05, "learning_rate": 5e-06, "loss": -0.0001, "num_tokens": 699194526.0, "reward": 0.7333978414535522, "reward_std": 0.36783480644226074, "rewards/TRLRewardAdapter/mean": 0.7333977818489075, "rewards/TRLRewardAdapter/std": 0.36783477663993835, "sampling/importance_sampling_ratio/max": 1.5894172191619873, "sampling/importance_sampling_ratio/mean": 0.14274069666862488, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 9.139067649841309, "sampling/sampling_logp_difference/mean": 0.014552712440490723, "step": 286, "step_time": 252.77355951000936 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03333333507180214, "completions/max_length": 10000.0, "completions/max_terminated_length": 9986.0, "completions/mean_length": 1962.8636474609375, "completions/mean_terminated_length": 1685.720947265625, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "entropy": 0.10164220134417216, "epoch": 0.19158878504672897, "frac_reward_zero_std": 0.13333334028720856, "grad_norm": 2.09954148653542e-05, "learning_rate": 5e-06, "loss": -0.0, "num_tokens": 701523163.0, "reward": 0.7280219793319702, "reward_std": 0.38853955268859863, "rewards/TRLRewardAdapter/mean": 0.7280219197273254, "rewards/TRLRewardAdapter/std": 0.38853955268859863, "sampling/importance_sampling_ratio/max": 1.8647034168243408, "sampling/importance_sampling_ratio/mean": 0.19138243794441223, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 50.75, "sampling/sampling_logp_difference/mean": 0.014640224166214466, "step": 287, "step_time": 342.7276264372049 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01250000111758709, "completions/max_length": 10000.0, "completions/max_terminated_length": 9929.0, "completions/mean_length": 2141.579345703125, "completions/mean_terminated_length": 2042.1053466796875, "completions/min_length": 58.0, "completions/min_terminated_length": 58.0, "entropy": 0.11899746085206668, "epoch": 0.19225634178905207, "frac_reward_zero_std": 0.0, "grad_norm": 1.51981344205254e-05, "learning_rate": 5e-06, "loss": -0.0, "num_tokens": 704067527.0, "reward": 0.7364770770072937, "reward_std": 0.3300555646419525, "rewards/TRLRewardAdapter/mean": 0.7364770174026489, "rewards/TRLRewardAdapter/std": 0.3300555646419525, "sampling/importance_sampling_ratio/max": 2.2067067623138428, "sampling/importance_sampling_ratio/mean": 0.14310625195503235, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 5.387245178222656, "sampling/sampling_logp_difference/mean": 0.016340944916009903, "step": 288, "step_time": 301.50097795494366 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.04583333432674408, "completions/max_length": 10000.0, "completions/max_terminated_length": 9994.0, "completions/mean_length": 2431.2333984375, "completions/mean_terminated_length": 2067.668212890625, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "entropy": 0.1025647980471452, "epoch": 0.19292389853137518, "frac_reward_zero_std": 0.13333334028720856, "grad_norm": 3.0174478911182774e-05, "learning_rate": 5e-06, "loss": -0.0001, "num_tokens": 706898983.0, "reward": 0.6203720569610596, "reward_std": 0.41456615924835205, "rewards/TRLRewardAdapter/mean": 0.6203719973564148, "rewards/TRLRewardAdapter/std": 0.41456615924835205, "sampling/importance_sampling_ratio/max": 1.4541593790054321, "sampling/importance_sampling_ratio/mean": 0.10818520933389664, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 41.5, "sampling/sampling_logp_difference/mean": 0.014418198727071285, "step": 289, "step_time": 364.7657838250743 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0010416667209938169, "completions/max_length": 10000.0, "completions/max_terminated_length": 7872.0, "completions/mean_length": 1218.0677490234375, "completions/mean_terminated_length": 1208.9102783203125, "completions/min_length": 70.0, "completions/min_terminated_length": 70.0, "entropy": 0.09761699289083481, "epoch": 0.19359145527369825, "frac_reward_zero_std": 0.0, "grad_norm": 0.0002631344706363635, "learning_rate": 5e-06, "loss": 0.0, "num_tokens": 708509224.0, "reward": 0.8409294486045837, "reward_std": 0.25718411803245544, "rewards/TRLRewardAdapter/mean": 0.840929388999939, "rewards/TRLRewardAdapter/std": 0.25718411803245544, "sampling/importance_sampling_ratio/max": 1.937679409980774, "sampling/importance_sampling_ratio/mean": 0.2096283882856369, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 11.0, "sampling/sampling_logp_difference/mean": 0.014043429866433144, "step": 290, "step_time": 99.90653180005029 }, { "epoch": 0.19359145527369825, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.018913043015029118, "eval_completions/max_length": 10000.0, "eval_completions/max_terminated_length": 9494.04347826087, "eval_completions/mean_length": 1978.6569240404212, "eval_completions/mean_terminated_length": 1824.1121507727582, "eval_completions/min_length": 56.82608695652174, "eval_completions/min_terminated_length": 56.82608695652174, "eval_entropy": 0.1349705062482668, "eval_frac_reward_zero_std": 1.0, "eval_loss": 0.0, "eval_num_tokens": 708509224.0, "eval_reward": 0.7024224944736647, "eval_reward_std": 0.371480959913005, "eval_rewards/TRLRewardAdapter/mean": 0.7024225074311962, "eval_rewards/TRLRewardAdapter/std": 0.3714809650960176, "eval_runtime": 1426.0961, "eval_samples_per_second": 3.202, "eval_sampling/importance_sampling_ratio/max": 1.6431700768678084, "eval_sampling/importance_sampling_ratio/mean": 0.1778839593348296, "eval_sampling/importance_sampling_ratio/min": 0.0, "eval_sampling/sampling_logp_difference/max": 8.130084151807038, "eval_sampling/sampling_logp_difference/mean": 0.018224084749817848, "eval_steps_per_second": 0.016, "step": 290 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01458333432674408, "completions/max_length": 10000.0, "completions/max_terminated_length": 9916.0, "completions/mean_length": 2253.735595703125, "completions/mean_terminated_length": 2139.09716796875, "completions/min_length": 85.0, "completions/min_terminated_length": 85.0, "entropy": 0.11782154565056165, "epoch": 0.19425901201602136, "frac_reward_zero_std": 0.03333333507180214, "grad_norm": 3.162097903749782e-05, "learning_rate": 5e-06, "loss": -0.0, "num_tokens": 711174506.0, "reward": 0.6940136551856995, "reward_std": 0.37632015347480774, "rewards/TRLRewardAdapter/mean": 0.6940135955810547, "rewards/TRLRewardAdapter/std": 0.37632015347480774, "sampling/importance_sampling_ratio/max": 1.0784353017807007, "sampling/importance_sampling_ratio/mean": 0.0876234918832779, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 12.75, "sampling/sampling_logp_difference/mean": 0.016957402229309082, "step": 291, "step_time": 300.5591160140466 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03958333656191826, "completions/max_length": 10000.0, "completions/max_terminated_length": 9978.0, "completions/mean_length": 2388.720947265625, "completions/mean_terminated_length": 2075.02392578125, "completions/min_length": 68.0, "completions/min_terminated_length": 68.0, "entropy": 0.10999281580249469, "epoch": 0.19492656875834447, "frac_reward_zero_std": 0.03333333507180214, "grad_norm": 1.8060437718701647e-05, "learning_rate": 5e-06, "loss": -0.0001, "num_tokens": 713948766.0, "reward": 0.733180582523346, "reward_std": 0.3414601981639862, "rewards/TRLRewardAdapter/mean": 0.733180582523346, "rewards/TRLRewardAdapter/std": 0.3414601683616638, "sampling/importance_sampling_ratio/max": 1.803954005241394, "sampling/importance_sampling_ratio/mean": 0.15315808355808258, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 25.25, "sampling/sampling_logp_difference/mean": 0.015403145924210548, "step": 292, "step_time": 405.99677769711707 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008333333767950535, "completions/max_length": 10000.0, "completions/max_terminated_length": 9719.0, "completions/mean_length": 1795.540771484375, "completions/mean_terminated_length": 1726.595703125, "completions/min_length": 159.0, "completions/min_terminated_length": 159.0, "entropy": 0.1101151891052723, "epoch": 0.19559412550066757, "frac_reward_zero_std": 0.03333333507180214, "grad_norm": 1.686826853937934e-05, "learning_rate": 5e-06, "loss": -0.0, "num_tokens": 716136549.0, "reward": 0.7799509167671204, "reward_std": 0.31721630692481995, "rewards/TRLRewardAdapter/mean": 0.7799509167671204, "rewards/TRLRewardAdapter/std": 0.31721630692481995, "sampling/importance_sampling_ratio/max": 1.7817931175231934, "sampling/importance_sampling_ratio/mean": 0.12063653767108917, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 22.875, "sampling/sampling_logp_difference/mean": 0.015627238899469376, "step": 293, "step_time": 257.1110651210183 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008333333767950535, "completions/max_length": 10000.0, "completions/max_terminated_length": 9878.0, "completions/mean_length": 2222.229248046875, "completions/mean_terminated_length": 2156.869873046875, "completions/min_length": 72.0, "completions/min_terminated_length": 72.0, "entropy": 0.09786522264281909, "epoch": 0.19626168224299065, "frac_reward_zero_std": 0.0, "grad_norm": 0.00016035331305651443, "learning_rate": 5e-06, "loss": -0.0003, "num_tokens": 718688193.0, "reward": 0.7052214741706848, "reward_std": 0.36044177412986755, "rewards/TRLRewardAdapter/mean": 0.70522141456604, "rewards/TRLRewardAdapter/std": 0.36044177412986755, "sampling/importance_sampling_ratio/max": 1.421708583831787, "sampling/importance_sampling_ratio/mean": 0.1181941032409668, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 7.625, "sampling/sampling_logp_difference/mean": 0.013697223737835884, "step": 294, "step_time": 291.91782705602236 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02083333395421505, "completions/max_length": 10000.0, "completions/max_terminated_length": 9871.0, "completions/mean_length": 2036.5980224609375, "completions/mean_terminated_length": 1867.163818359375, "completions/min_length": 64.0, "completions/min_terminated_length": 64.0, "entropy": 0.10734442993998528, "epoch": 0.19692923898531375, "frac_reward_zero_std": 0.06666667014360428, "grad_norm": 3.34380144879898e-05, "learning_rate": 5e-06, "loss": -0.0, "num_tokens": 721053439.0, "reward": 0.760565459728241, "reward_std": 0.3163716495037079, "rewards/TRLRewardAdapter/mean": 0.760565459728241, "rewards/TRLRewardAdapter/std": 0.3163716197013855, "sampling/importance_sampling_ratio/max": 2.4954898357391357, "sampling/importance_sampling_ratio/mean": 0.13628315925598145, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 8.0, "sampling/sampling_logp_difference/mean": 0.015144869685173035, "step": 295, "step_time": 335.83585277898237 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02708333544433117, "completions/max_length": 10000.0, "completions/max_terminated_length": 9646.0, "completions/mean_length": 1810.6927490234375, "completions/mean_terminated_length": 1582.724853515625, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "entropy": 0.108238502095143, "epoch": 0.19759679572763686, "frac_reward_zero_std": 0.06666667014360428, "grad_norm": 2.7954794132101414e-05, "learning_rate": 5e-06, "loss": -0.0, "num_tokens": 723313240.0, "reward": 0.7617979645729065, "reward_std": 0.3436415493488312, "rewards/TRLRewardAdapter/mean": 0.7617979049682617, "rewards/TRLRewardAdapter/std": 0.3436415493488312, "sampling/importance_sampling_ratio/max": 1.606345534324646, "sampling/importance_sampling_ratio/mean": 0.13092930614948273, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 9.5, "sampling/sampling_logp_difference/mean": 0.015647169202566147, "step": 296, "step_time": 343.97549177496694 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0260416679084301, "completions/max_length": 10000.0, "completions/max_terminated_length": 9106.0, "completions/mean_length": 1662.5750732421875, "completions/mean_terminated_length": 1439.649169921875, "completions/min_length": 85.0, "completions/min_terminated_length": 85.0, "entropy": 0.10516721134384473, "epoch": 0.19826435246995994, "frac_reward_zero_std": 0.0, "grad_norm": 1.948154781166584e-05, "learning_rate": 5e-06, "loss": -0.0, "num_tokens": 725413280.0, "reward": 0.7541065812110901, "reward_std": 0.34785404801368713, "rewards/TRLRewardAdapter/mean": 0.7541065216064453, "rewards/TRLRewardAdapter/std": 0.34785404801368713, "sampling/importance_sampling_ratio/max": 2.3593225479125977, "sampling/importance_sampling_ratio/mean": 0.17629274725914001, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 8.5, "sampling/sampling_logp_difference/mean": 0.015210035257041454, "step": 297, "step_time": 305.4411997089628 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02291666716337204, "completions/max_length": 10000.0, "completions/max_terminated_length": 9875.0, "completions/mean_length": 2179.70849609375, "completions/mean_terminated_length": 1996.2900390625, "completions/min_length": 68.0, "completions/min_terminated_length": 68.0, "entropy": 0.10708245138327281, "epoch": 0.19893190921228304, "frac_reward_zero_std": 0.1666666716337204, "grad_norm": 2.5069352229338365e-05, "learning_rate": 5e-06, "loss": -0.0001, "num_tokens": 727987624.0, "reward": 0.6983985900878906, "reward_std": 0.37873750925064087, "rewards/TRLRewardAdapter/mean": 0.6983985304832458, "rewards/TRLRewardAdapter/std": 0.37873753905296326, "sampling/importance_sampling_ratio/max": 2.9625864028930664, "sampling/importance_sampling_ratio/mean": 0.1818811297416687, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 8.134025573730469, "sampling/sampling_logp_difference/mean": 0.015537147410213947, "step": 298, "step_time": 304.5043096371228 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08125000447034836, "completions/max_length": 10000.0, "completions/max_terminated_length": 9910.0, "completions/mean_length": 2375.72509765625, "completions/mean_terminated_length": 1701.4693603515625, "completions/min_length": 54.0, "completions/min_terminated_length": 54.0, "entropy": 0.09730868910749753, "epoch": 0.19959946595460615, "frac_reward_zero_std": 0.13333334028720856, "grad_norm": 3.7104020135746155e-05, "learning_rate": 5e-06, "loss": -0.0001, "num_tokens": 730692864.0, "reward": 0.6575818061828613, "reward_std": 0.4202057719230652, "rewards/TRLRewardAdapter/mean": 0.6575817465782166, "rewards/TRLRewardAdapter/std": 0.4202057719230652, "sampling/importance_sampling_ratio/max": 2.2161448001861572, "sampling/importance_sampling_ratio/mean": 0.22500436007976532, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 18.25, "sampling/sampling_logp_difference/mean": 0.013592058792710304, "step": 299, "step_time": 399.9799558450468 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01250000111758709, "completions/max_length": 10000.0, "completions/max_terminated_length": 9989.0, "completions/mean_length": 1684.81884765625, "completions/mean_terminated_length": 1579.563232421875, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "entropy": 0.10858343914151192, "epoch": 0.20026702269692923, "frac_reward_zero_std": 0.06666667014360428, "grad_norm": 0.0001306947133167222, "learning_rate": 5e-06, "loss": 0.0, "num_tokens": 732742034.0, "reward": 0.7771788239479065, "reward_std": 0.3245118260383606, "rewards/TRLRewardAdapter/mean": 0.7771787643432617, "rewards/TRLRewardAdapter/std": 0.3245118260383606, "sampling/importance_sampling_ratio/max": 2.9473679065704346, "sampling/importance_sampling_ratio/mean": 0.12557853758335114, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 16.68871307373047, "sampling/sampling_logp_difference/mean": 0.015683284029364586, "step": 300, "step_time": 276.1945032570511 }, { "epoch": 0.20026702269692923, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.017826086477093075, "eval_completions/max_length": 10000.0, "eval_completions/max_terminated_length": 9307.826086956522, "eval_completions/mean_length": 1934.3262514860733, "eval_completions/mean_terminated_length": 1788.2049507472825, "eval_completions/min_length": 56.21739130434783, "eval_completions/min_terminated_length": 56.21739130434783, "eval_entropy": 0.13091751447190408, "eval_frac_reward_zero_std": 1.0, "eval_loss": 0.0, "eval_num_tokens": 732742034.0, "eval_reward": 0.7025077990863634, "eval_reward_std": 0.37347308837849164, "eval_rewards/TRLRewardAdapter/mean": 0.7025078094523886, "eval_rewards/TRLRewardAdapter/std": 0.37347308837849164, "eval_runtime": 1410.1843, "eval_samples_per_second": 3.238, "eval_sampling/importance_sampling_ratio/max": 1.6452701143596484, "eval_sampling/importance_sampling_ratio/mean": 0.18183852278667947, "eval_sampling/importance_sampling_ratio/min": 0.0, "eval_sampling/sampling_logp_difference/max": 10.120422891948534, "eval_sampling/sampling_logp_difference/mean": 0.0177623493111004, "eval_steps_per_second": 0.016, "step": 300 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 7447.0, "completions/max_terminated_length": 7447.0, "completions/mean_length": 1112.1031494140625, "completions/mean_terminated_length": 1112.1031494140625, "completions/min_length": 78.0, "completions/min_terminated_length": 78.0, "entropy": 0.09371868272622426, "epoch": 0.20093457943925233, "frac_reward_zero_std": 0.0, "grad_norm": 0.00013246752570666694, "learning_rate": 5e-06, "loss": -0.0002, "num_tokens": 734349749.0, "reward": 0.8718351125717163, "reward_std": 0.1956755816936493, "rewards/TRLRewardAdapter/mean": 0.8718350529670715, "rewards/TRLRewardAdapter/std": 0.1956755816936493, "sampling/importance_sampling_ratio/max": 2.257770538330078, "sampling/importance_sampling_ratio/mean": 0.15717639029026031, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 39.0, "sampling/sampling_logp_difference/mean": 0.013737666420638561, "step": 301, "step_time": 81.85522327700164 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.07604166865348816, "completions/max_length": 10000.0, "completions/max_terminated_length": 9896.0, "completions/mean_length": 1884.353271484375, "completions/mean_terminated_length": 1216.436279296875, "completions/min_length": 56.0, "completions/min_terminated_length": 56.0, "entropy": 0.09998659541209538, "epoch": 0.20160213618157544, "frac_reward_zero_std": 0.10000000894069672, "grad_norm": 0.00029079257122865737, "learning_rate": 5e-06, "loss": 0.0001, "num_tokens": 736649032.0, "reward": 0.7525711059570312, "reward_std": 0.358489990234375, "rewards/TRLRewardAdapter/mean": 0.7525710463523865, "rewards/TRLRewardAdapter/std": 0.358489990234375, "sampling/importance_sampling_ratio/max": 2.189220905303955, "sampling/importance_sampling_ratio/mean": 0.18926319479942322, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 32.5, "sampling/sampling_logp_difference/mean": 0.014394660480320454, "step": 302, "step_time": 348.260880479007 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02395833469927311, "completions/max_length": 10000.0, "completions/max_terminated_length": 9756.0, "completions/mean_length": 1983.0084228515625, "completions/mean_terminated_length": 1786.2198486328125, "completions/min_length": 137.0, "completions/min_terminated_length": 137.0, "entropy": 0.1012981819609801, "epoch": 0.20226969292389854, "frac_reward_zero_std": 0.03333333507180214, "grad_norm": 0.001048743020140143, "learning_rate": 5e-06, "loss": -0.0001, "num_tokens": 738972400.0, "reward": 0.700247585773468, "reward_std": 0.37068191170692444, "rewards/TRLRewardAdapter/mean": 0.700247585773468, "rewards/TRLRewardAdapter/std": 0.37068191170692444, "sampling/importance_sampling_ratio/max": 1.5652391910552979, "sampling/importance_sampling_ratio/mean": 0.12927336990833282, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 8.517156600952148, "sampling/sampling_logp_difference/mean": 0.014497852884232998, "step": 303, "step_time": 273.74277070420794 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.07083334028720856, "completions/max_length": 10000.0, "completions/max_terminated_length": 9995.0, "completions/mean_length": 2320.105224609375, "completions/mean_terminated_length": 1734.6424560546875, "completions/min_length": 82.0, "completions/min_terminated_length": 82.0, "entropy": 0.09457909936706226, "epoch": 0.20293724966622162, "frac_reward_zero_std": 0.06666667014360428, "grad_norm": 3.210710890614191e-05, "learning_rate": 5e-06, "loss": -0.0, "num_tokens": 741617589.0, "reward": 0.738425612449646, "reward_std": 0.36928844451904297, "rewards/TRLRewardAdapter/mean": 0.7384255528450012, "rewards/TRLRewardAdapter/std": 0.36928844451904297, "sampling/importance_sampling_ratio/max": 1.7157522439956665, "sampling/importance_sampling_ratio/mean": 0.14002734422683716, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 32.0, "sampling/sampling_logp_difference/mean": 0.013515541329979897, "step": 304, "step_time": 395.5306137611624 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02187500149011612, "completions/max_length": 10000.0, "completions/max_terminated_length": 9565.0, "completions/mean_length": 1593.3126220703125, "completions/mean_terminated_length": 1405.303466796875, "completions/min_length": 57.0, "completions/min_terminated_length": 57.0, "entropy": 0.0956262859205405, "epoch": 0.20360480640854473, "frac_reward_zero_std": 0.0, "grad_norm": 0.00020498760185877775, "learning_rate": 5e-06, "loss": -0.0, "num_tokens": 743561057.0, "reward": 0.7802461385726929, "reward_std": 0.32869333028793335, "rewards/TRLRewardAdapter/mean": 0.7802460789680481, "rewards/TRLRewardAdapter/std": 0.32869333028793335, "sampling/importance_sampling_ratio/max": 2.2061874866485596, "sampling/importance_sampling_ratio/mean": 0.1362752616405487, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 7.705451965332031, "sampling/sampling_logp_difference/mean": 0.013812852092087269, "step": 305, "step_time": 262.4923269440187 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.05937500298023224, "completions/max_length": 10000.0, "completions/max_terminated_length": 9997.0, "completions/mean_length": 2047.486572265625, "completions/mean_terminated_length": 1545.5006103515625, "completions/min_length": 55.0, "completions/min_terminated_length": 55.0, "entropy": 0.0920707955956459, "epoch": 0.20427236315086783, "frac_reward_zero_std": 0.10000000894069672, "grad_norm": 0.0004168348191555049, "learning_rate": 5e-06, "loss": 0.0001, "num_tokens": 745980884.0, "reward": 0.7054135799407959, "reward_std": 0.40979936718940735, "rewards/TRLRewardAdapter/mean": 0.7054135799407959, "rewards/TRLRewardAdapter/std": 0.40979936718940735, "sampling/importance_sampling_ratio/max": 2.942514419555664, "sampling/importance_sampling_ratio/mean": 0.240927055478096, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 13.75, "sampling/sampling_logp_difference/mean": 0.013871627859771252, "step": 306, "step_time": 360.0371124398662 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 10000.0, "completions/max_terminated_length": 9886.0, "completions/mean_length": 1931.8646240234375, "completions/mean_terminated_length": 1803.7989501953125, "completions/min_length": 75.0, "completions/min_terminated_length": 75.0, "entropy": 0.10419184093674023, "epoch": 0.2049399198931909, "frac_reward_zero_std": 0.10000000894069672, "grad_norm": 0.00022004617447412007, "learning_rate": 5e-06, "loss": -0.0001, "num_tokens": 748336018.0, "reward": 0.6660276651382446, "reward_std": 0.3975760042667389, "rewards/TRLRewardAdapter/mean": 0.6660276651382446, "rewards/TRLRewardAdapter/std": 0.3975760042667389, "sampling/importance_sampling_ratio/max": 2.3010358810424805, "sampling/importance_sampling_ratio/mean": 0.11421068012714386, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 58.20545196533203, "sampling/sampling_logp_difference/mean": 0.014530137181282043, "step": 307, "step_time": 262.5047838008031 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0052083334885537624, "completions/max_length": 10000.0, "completions/max_terminated_length": 9906.0, "completions/mean_length": 1851.6053466796875, "completions/mean_terminated_length": 1808.9434814453125, "completions/min_length": 76.0, "completions/min_terminated_length": 76.0, "entropy": 0.09615873048702876, "epoch": 0.205607476635514, "frac_reward_zero_std": 0.06666667014360428, "grad_norm": 0.0003722070010980717, "learning_rate": 5e-06, "loss": -0.0001, "num_tokens": 750608631.0, "reward": 0.7767348885536194, "reward_std": 0.32986316084861755, "rewards/TRLRewardAdapter/mean": 0.7767348885536194, "rewards/TRLRewardAdapter/std": 0.32986316084861755, "sampling/importance_sampling_ratio/max": 2.6345999240875244, "sampling/importance_sampling_ratio/mean": 0.167377308011055, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 31.5, "sampling/sampling_logp_difference/mean": 0.014032064937055111, "step": 308, "step_time": 253.12095715885516 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0364583358168602, "completions/max_length": 10000.0, "completions/max_terminated_length": 9941.0, "completions/mean_length": 2546.85009765625, "completions/mean_terminated_length": 2264.8388671875, "completions/min_length": 73.0, "completions/min_terminated_length": 73.0, "entropy": 0.11821780602137248, "epoch": 0.20627503337783712, "frac_reward_zero_std": 0.1666666716337204, "grad_norm": 1.3170496800687292e-05, "learning_rate": 5e-06, "loss": -0.0, "num_tokens": 753490055.0, "reward": 0.5404131412506104, "reward_std": 0.4172389805316925, "rewards/TRLRewardAdapter/mean": 0.5404130816459656, "rewards/TRLRewardAdapter/std": 0.41723892092704773, "sampling/importance_sampling_ratio/max": 1.4580450057983398, "sampling/importance_sampling_ratio/mean": 0.09388913959264755, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 27.25, "sampling/sampling_logp_difference/mean": 0.01564604975283146, "step": 309, "step_time": 338.82647415890824 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00937500037252903, "completions/max_length": 10000.0, "completions/max_terminated_length": 9204.0, "completions/mean_length": 1561.7032470703125, "completions/mean_terminated_length": 1481.8453369140625, "completions/min_length": 60.0, "completions/min_terminated_length": 60.0, "entropy": 0.09929702803492546, "epoch": 0.20694259012016022, "frac_reward_zero_std": 0.0, "grad_norm": 0.00015564302285773807, "learning_rate": 5e-06, "loss": 0.0009, "num_tokens": 755467946.0, "reward": 0.7591413259506226, "reward_std": 0.3384576141834259, "rewards/TRLRewardAdapter/mean": 0.7591412663459778, "rewards/TRLRewardAdapter/std": 0.3384576141834259, "sampling/importance_sampling_ratio/max": 1.520285725593567, "sampling/importance_sampling_ratio/mean": 0.1691083461046219, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 12.469564437866211, "sampling/sampling_logp_difference/mean": 0.01402970403432846, "step": 310, "step_time": 186.12574516504537 }, { "epoch": 0.20694259012016022, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.019130434112056442, "eval_completions/max_length": 9998.695652173914, "eval_completions/max_terminated_length": 9656.347826086956, "eval_completions/mean_length": 1941.7601796025815, "eval_completions/mean_terminated_length": 1784.7737453294837, "eval_completions/min_length": 54.65217391304348, "eval_completions/min_terminated_length": 54.65217391304348, "eval_entropy": 0.1298241498677627, "eval_frac_reward_zero_std": 1.0, "eval_loss": 0.0, "eval_num_tokens": 755467946.0, "eval_reward": 0.699274682480356, "eval_reward_std": 0.3730618513148764, "eval_rewards/TRLRewardAdapter/mean": 0.6992747006209, "eval_rewards/TRLRewardAdapter/std": 0.3730618513148764, "eval_runtime": 1422.5284, "eval_samples_per_second": 3.21, "eval_sampling/importance_sampling_ratio/max": 1.8323951441308726, "eval_sampling/importance_sampling_ratio/mean": 0.1845878109983776, "eval_sampling/importance_sampling_ratio/min": 0.0, "eval_sampling/sampling_logp_difference/max": 9.193245276160862, "eval_sampling/sampling_logp_difference/mean": 0.017426891900275066, "eval_steps_per_second": 0.016, "step": 310 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01979166828095913, "completions/max_length": 10000.0, "completions/max_terminated_length": 9944.0, "completions/mean_length": 2560.97412109375, "completions/mean_terminated_length": 2410.7705078125, "completions/min_length": 163.0, "completions/min_terminated_length": 163.0, "entropy": 0.09950154026349385, "epoch": 0.2076101468624833, "frac_reward_zero_std": 0.03333333507180214, "grad_norm": 2.0510773452327364e-05, "learning_rate": 5e-06, "loss": -0.0, "num_tokens": 758345809.0, "reward": 0.6829601526260376, "reward_std": 0.3658391237258911, "rewards/TRLRewardAdapter/mean": 0.6829601526260376, "rewards/TRLRewardAdapter/std": 0.3658391237258911, "sampling/importance_sampling_ratio/max": 2.1477506160736084, "sampling/importance_sampling_ratio/mean": 0.10449697077274323, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 11.0, "sampling/sampling_logp_difference/mean": 0.01412828266620636, "step": 311, "step_time": 331.5604370579822 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01875000074505806, "completions/max_length": 10000.0, "completions/max_terminated_length": 9782.0, "completions/mean_length": 1769.6334228515625, "completions/mean_terminated_length": 1612.365234375, "completions/min_length": 65.0, "completions/min_terminated_length": 65.0, "entropy": 0.10397279510895412, "epoch": 0.2082777036048064, "frac_reward_zero_std": 0.0, "grad_norm": 0.0001187376250047252, "learning_rate": 5e-06, "loss": -0.0001, "num_tokens": 760422065.0, "reward": 0.7353810667991638, "reward_std": 0.36850106716156006, "rewards/TRLRewardAdapter/mean": 0.735381007194519, "rewards/TRLRewardAdapter/std": 0.36850106716156006, "sampling/importance_sampling_ratio/max": 2.435037851333618, "sampling/importance_sampling_ratio/mean": 0.12392375618219376, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 8.889067649841309, "sampling/sampling_logp_difference/mean": 0.014994464814662933, "step": 312, "step_time": 243.5100998210255 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0010416667209938169, "completions/max_length": 10000.0, "completions/max_terminated_length": 9441.0, "completions/mean_length": 1501.737548828125, "completions/mean_terminated_length": 1492.8759765625, "completions/min_length": 66.0, "completions/min_terminated_length": 66.0, "entropy": 0.10231960068146388, "epoch": 0.2089452603471295, "frac_reward_zero_std": 0.06666667014360428, "grad_norm": 5.958686305581468e-05, "learning_rate": 5e-06, "loss": -0.0, "num_tokens": 762337173.0, "reward": 0.7210914492607117, "reward_std": 0.3631596863269806, "rewards/TRLRewardAdapter/mean": 0.7210913896560669, "rewards/TRLRewardAdapter/std": 0.3631596565246582, "sampling/importance_sampling_ratio/max": 2.6292572021484375, "sampling/importance_sampling_ratio/mean": 0.16656053066253662, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 5.0361480712890625, "sampling/sampling_logp_difference/mean": 0.01455700397491455, "step": 313, "step_time": 222.82447400409728 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.04895833507180214, "completions/max_length": 10000.0, "completions/max_terminated_length": 9998.0, "completions/mean_length": 1930.7781982421875, "completions/mean_terminated_length": 1515.3856201171875, "completions/min_length": 62.0, "completions/min_terminated_length": 62.0, "entropy": 0.0883619524538517, "epoch": 0.2096128170894526, "frac_reward_zero_std": 0.06666667014360428, "grad_norm": 0.00013984736742912832, "learning_rate": 5e-06, "loss": -0.0002, "num_tokens": 764668032.0, "reward": 0.7929713129997253, "reward_std": 0.3021608293056488, "rewards/TRLRewardAdapter/mean": 0.7929713129997253, "rewards/TRLRewardAdapter/std": 0.3021608293056488, "sampling/importance_sampling_ratio/max": 1.8746856451034546, "sampling/importance_sampling_ratio/mean": 0.18686965107917786, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 9.705451965332031, "sampling/sampling_logp_difference/mean": 0.012704277411103249, "step": 314, "step_time": 392.83745688502677 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09479167312383652, "completions/max_length": 10000.0, "completions/max_terminated_length": 9839.0, "completions/mean_length": 2616.92626953125, "completions/mean_terminated_length": 1843.7847900390625, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "entropy": 0.09777299066384633, "epoch": 0.2102803738317757, "frac_reward_zero_std": 0.06666667014360428, "grad_norm": 9.685042278455952e-06, "learning_rate": 5e-06, "loss": -0.0, "num_tokens": 767651161.0, "reward": 0.679030179977417, "reward_std": 0.3740423321723938, "rewards/TRLRewardAdapter/mean": 0.679030179977417, "rewards/TRLRewardAdapter/std": 0.3740423321723938, "sampling/importance_sampling_ratio/max": 2.079751491546631, "sampling/importance_sampling_ratio/mean": 0.11346697807312012, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 8.5, "sampling/sampling_logp_difference/mean": 0.013860500417649746, "step": 315, "step_time": 395.29386340873316 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008333333767950535, "completions/max_length": 10000.0, "completions/max_terminated_length": 9690.0, "completions/mean_length": 1713.55419921875, "completions/mean_terminated_length": 1643.9202880859375, "completions/min_length": 49.0, "completions/min_terminated_length": 49.0, "entropy": 0.10366478934884071, "epoch": 0.2109479305740988, "frac_reward_zero_std": 0.0, "grad_norm": 4.747062514414401e-05, "learning_rate": 5e-06, "loss": -0.0001, "num_tokens": 769718637.0, "reward": 0.7644399404525757, "reward_std": 0.34488967061042786, "rewards/TRLRewardAdapter/mean": 0.7644398808479309, "rewards/TRLRewardAdapter/std": 0.34488967061042786, "sampling/importance_sampling_ratio/max": 2.1799259185791016, "sampling/importance_sampling_ratio/mean": 0.17290851473808289, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 22.75, "sampling/sampling_logp_difference/mean": 0.01463338453322649, "step": 316, "step_time": 203.72274958493654 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.04062500223517418, "completions/max_length": 10000.0, "completions/max_terminated_length": 9868.0, "completions/mean_length": 2444.30322265625, "completions/mean_terminated_length": 2124.355224609375, "completions/min_length": 40.0, "completions/min_terminated_length": 40.0, "entropy": 0.11091287558277448, "epoch": 0.2116154873164219, "frac_reward_zero_std": 0.10000000894069672, "grad_norm": 3.0180620529934608e-05, "learning_rate": 5e-06, "loss": -0.0001, "num_tokens": 772515824.0, "reward": 0.6883836388587952, "reward_std": 0.3832058608531952, "rewards/TRLRewardAdapter/mean": 0.6883836388587952, "rewards/TRLRewardAdapter/std": 0.3832058012485504, "sampling/importance_sampling_ratio/max": 1.9734209775924683, "sampling/importance_sampling_ratio/mean": 0.10358522087335587, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 48.05926513671875, "sampling/sampling_logp_difference/mean": 0.015416703186929226, "step": 317, "step_time": 358.7336107851006 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.04270833730697632, "completions/max_length": 10000.0, "completions/max_terminated_length": 9943.0, "completions/mean_length": 1754.38232421875, "completions/mean_terminated_length": 1386.5146484375, "completions/min_length": 57.0, "completions/min_terminated_length": 57.0, "entropy": 0.0965503379702568, "epoch": 0.21228304405874499, "frac_reward_zero_std": 0.03333333507180214, "grad_norm": 0.00031549588426241427, "learning_rate": 5e-06, "loss": -0.0002, "num_tokens": 774682111.0, "reward": 0.7829747200012207, "reward_std": 0.3300350606441498, "rewards/TRLRewardAdapter/mean": 0.7829746603965759, "rewards/TRLRewardAdapter/std": 0.3300350606441498, "sampling/importance_sampling_ratio/max": 2.037757396697998, "sampling/importance_sampling_ratio/mean": 0.17959393560886383, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 35.639068603515625, "sampling/sampling_logp_difference/mean": 0.013918578624725342, "step": 318, "step_time": 283.367317216238 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0031250002793967724, "completions/max_length": 10000.0, "completions/max_terminated_length": 9826.0, "completions/mean_length": 1550.0865478515625, "completions/mean_terminated_length": 1523.59765625, "completions/min_length": 142.0, "completions/min_terminated_length": 142.0, "entropy": 0.08609992886583011, "epoch": 0.2129506008010681, "frac_reward_zero_std": 0.0, "grad_norm": 3.964720282258147e-05, "learning_rate": 5e-06, "loss": -0.0, "num_tokens": 776572658.0, "reward": 0.8225895762443542, "reward_std": 0.2649379372596741, "rewards/TRLRewardAdapter/mean": 0.8225895762443542, "rewards/TRLRewardAdapter/std": 0.2649379074573517, "sampling/importance_sampling_ratio/max": 2.674725294113159, "sampling/importance_sampling_ratio/mean": 0.12616734206676483, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 12.112422943115234, "sampling/sampling_logp_difference/mean": 0.012735745869576931, "step": 319, "step_time": 185.1704116249457 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.04583333432674408, "completions/max_length": 10000.0, "completions/max_terminated_length": 9930.0, "completions/mean_length": 2612.95849609375, "completions/mean_terminated_length": 2258.122314453125, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "entropy": 0.10868696868419647, "epoch": 0.2136181575433912, "frac_reward_zero_std": 0.06666667014360428, "grad_norm": 0.00021411950827535475, "learning_rate": 5e-06, "loss": 0.0001, "num_tokens": 779569066.0, "reward": 0.6923909187316895, "reward_std": 0.36985573172569275, "rewards/TRLRewardAdapter/mean": 0.6923908591270447, "rewards/TRLRewardAdapter/std": 0.36985573172569275, "sampling/importance_sampling_ratio/max": 2.267881393432617, "sampling/importance_sampling_ratio/mean": 0.0952213928103447, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 31.25, "sampling/sampling_logp_difference/mean": 0.015212601982057095, "step": 320, "step_time": 395.47266579500865 }, { "epoch": 0.2136181575433912, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.019999999552965164, "eval_completions/max_length": 10000.0, "eval_completions/max_terminated_length": 9365.217391304348, "eval_completions/mean_length": 1741.158012058424, "eval_completions/mean_terminated_length": 1572.327827785326, "eval_completions/min_length": 51.04347826086956, "eval_completions/min_terminated_length": 51.04347826086956, "eval_entropy": 0.12628621316474417, "eval_frac_reward_zero_std": 1.0, "eval_loss": 0.0, "eval_num_tokens": 779569066.0, "eval_reward": 0.6963908154031505, "eval_reward_std": 0.3840089595836142, "eval_rewards/TRLRewardAdapter/mean": 0.6963908231776693, "eval_rewards/TRLRewardAdapter/std": 0.3840089634708736, "eval_runtime": 1403.0768, "eval_samples_per_second": 3.254, "eval_sampling/importance_sampling_ratio/max": 1.6757786429446677, "eval_sampling/importance_sampling_ratio/mean": 0.21752562276695087, "eval_sampling/importance_sampling_ratio/min": 0.0, "eval_sampling/sampling_logp_difference/max": 9.25979443218397, "eval_sampling/sampling_logp_difference/mean": 0.017025981827274612, "eval_steps_per_second": 0.016, "step": 320 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.05416667088866234, "completions/max_length": 10000.0, "completions/max_terminated_length": 9939.0, "completions/mean_length": 3102.237548828125, "completions/mean_terminated_length": 2707.21142578125, "completions/min_length": 128.0, "completions/min_terminated_length": 128.0, "entropy": 0.1187516413629055, "epoch": 0.21428571428571427, "frac_reward_zero_std": 0.20000001788139343, "grad_norm": 1.7206840979941895e-05, "learning_rate": 5e-06, "loss": -0.0, "num_tokens": 783035374.0, "reward": 0.5158903002738953, "reward_std": 0.44454652070999146, "rewards/TRLRewardAdapter/mean": 0.5158902406692505, "rewards/TRLRewardAdapter/std": 0.44454649090766907, "sampling/importance_sampling_ratio/max": 1.7050336599349976, "sampling/importance_sampling_ratio/mean": 0.07478219270706177, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 34.1461067199707, "sampling/sampling_logp_difference/mean": 0.016480354592204094, "step": 321, "step_time": 324.33244097582065 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02812500111758709, "completions/max_length": 10000.0, "completions/max_terminated_length": 9921.0, "completions/mean_length": 2024.2427978515625, "completions/mean_terminated_length": 1793.432861328125, "completions/min_length": 62.0, "completions/min_terminated_length": 62.0, "entropy": 0.10391741742690404, "epoch": 0.21495327102803738, "frac_reward_zero_std": 0.03333333507180214, "grad_norm": 4.20750428940982e-05, "learning_rate": 5e-06, "loss": -0.0, "num_tokens": 785514423.0, "reward": 0.7526126503944397, "reward_std": 0.33708491921424866, "rewards/TRLRewardAdapter/mean": 0.7526126503944397, "rewards/TRLRewardAdapter/std": 0.33708491921424866, "sampling/importance_sampling_ratio/max": 1.5544379949569702, "sampling/importance_sampling_ratio/mean": 0.1203264445066452, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 11.705451965332031, "sampling/sampling_logp_difference/mean": 0.014918230473995209, "step": 322, "step_time": 325.5981641468825 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.07083334028720856, "completions/max_length": 10000.0, "completions/max_terminated_length": 9897.0, "completions/mean_length": 2631.17822265625, "completions/mean_terminated_length": 2069.429443359375, "completions/min_length": 62.0, "completions/min_terminated_length": 62.0, "entropy": 0.09168566639224689, "epoch": 0.21562082777036048, "frac_reward_zero_std": 0.13333334028720856, "grad_norm": 0.00016550270254905275, "learning_rate": 5e-06, "loss": 0.0002, "num_tokens": 788501858.0, "reward": 0.6590148210525513, "reward_std": 0.4105294942855835, "rewards/TRLRewardAdapter/mean": 0.6590147614479065, "rewards/TRLRewardAdapter/std": 0.4105294346809387, "sampling/importance_sampling_ratio/max": 2.3580141067504883, "sampling/importance_sampling_ratio/mean": 0.1690845787525177, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 31.406776428222656, "sampling/sampling_logp_difference/mean": 0.013228707015514374, "step": 323, "step_time": 358.2171625409974 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03541667014360428, "completions/max_length": 10000.0, "completions/max_terminated_length": 9991.0, "completions/mean_length": 2270.77099609375, "completions/mean_terminated_length": 1986.9761962890625, "completions/min_length": 72.0, "completions/min_terminated_length": 72.0, "entropy": 0.10941057528058688, "epoch": 0.2162883845126836, "frac_reward_zero_std": 0.10000000894069672, "grad_norm": 2.9769739436933876e-05, "learning_rate": 5e-06, "loss": -0.0, "num_tokens": 791104966.0, "reward": 0.6710265874862671, "reward_std": 0.4001048803329468, "rewards/TRLRewardAdapter/mean": 0.6710265278816223, "rewards/TRLRewardAdapter/std": 0.4001048803329468, "sampling/importance_sampling_ratio/max": 2.1252901554107666, "sampling/importance_sampling_ratio/mean": 0.16682331264019012, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 46.0, "sampling/sampling_logp_difference/mean": 0.01496621873229742, "step": 324, "step_time": 265.1962076300988 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0364583358168602, "completions/max_length": 10000.0, "completions/max_terminated_length": 9983.0, "completions/mean_length": 1693.3489990234375, "completions/mean_terminated_length": 1379.0433349609375, "completions/min_length": 68.0, "completions/min_terminated_length": 68.0, "entropy": 0.10158506159981091, "epoch": 0.21695594125500667, "frac_reward_zero_std": 0.06666667014360428, "grad_norm": 0.000431559373954973, "learning_rate": 5e-06, "loss": -0.0, "num_tokens": 793144213.0, "reward": 0.7488828897476196, "reward_std": 0.377636194229126, "rewards/TRLRewardAdapter/mean": 0.7488828897476196, "rewards/TRLRewardAdapter/std": 0.377636194229126, "sampling/importance_sampling_ratio/max": 2.0246670246124268, "sampling/importance_sampling_ratio/mean": 0.1878536492586136, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 52.326438903808594, "sampling/sampling_logp_difference/mean": 0.014406590722501278, "step": 325, "step_time": 238.46075395378284 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01458333432674408, "completions/max_length": 10000.0, "completions/max_terminated_length": 9746.0, "completions/mean_length": 1836.8958740234375, "completions/mean_terminated_length": 1716.0887451171875, "completions/min_length": 89.0, "completions/min_terminated_length": 89.0, "entropy": 0.10838529591759045, "epoch": 0.21762349799732977, "frac_reward_zero_std": 0.06666667014360428, "grad_norm": 4.2027703643942926e-05, "learning_rate": 5e-06, "loss": -0.0001, "num_tokens": 795350289.0, "reward": 0.7370970845222473, "reward_std": 0.36484524607658386, "rewards/TRLRewardAdapter/mean": 0.7370970249176025, "rewards/TRLRewardAdapter/std": 0.36484524607658386, "sampling/importance_sampling_ratio/max": 2.2141005992889404, "sampling/importance_sampling_ratio/mean": 0.16093119978904724, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 22.59498405456543, "sampling/sampling_logp_difference/mean": 0.015358611941337585, "step": 326, "step_time": 226.44723719719332 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0052083334885537624, "completions/max_length": 10000.0, "completions/max_terminated_length": 9296.0, "completions/mean_length": 1421.919921875, "completions/mean_terminated_length": 1377.0084228515625, "completions/min_length": 70.0, "completions/min_terminated_length": 70.0, "entropy": 0.10490456223487854, "epoch": 0.21829105473965288, "frac_reward_zero_std": 0.06666667014360428, "grad_norm": 0.0004407370645107769, "learning_rate": 5e-06, "loss": -0.0, "num_tokens": 797148740.0, "reward": 0.754063606262207, "reward_std": 0.34642475843429565, "rewards/TRLRewardAdapter/mean": 0.7540635466575623, "rewards/TRLRewardAdapter/std": 0.34642475843429565, "sampling/importance_sampling_ratio/max": 2.067284345626831, "sampling/importance_sampling_ratio/mean": 0.1551162451505661, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 7.281775951385498, "sampling/sampling_logp_difference/mean": 0.014844253659248352, "step": 327, "step_time": 187.4237482282333 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.07083334028720856, "completions/max_length": 10000.0, "completions/max_terminated_length": 9979.0, "completions/mean_length": 2206.003173828125, "completions/mean_terminated_length": 1611.842041015625, "completions/min_length": 70.0, "completions/min_terminated_length": 70.0, "entropy": 0.10141642019152641, "epoch": 0.21895861148197596, "frac_reward_zero_std": 0.06666667014360428, "grad_norm": 0.0005475368184976881, "learning_rate": 5e-06, "loss": 0.0, "num_tokens": 799665799.0, "reward": 0.7140475511550903, "reward_std": 0.37346190214157104, "rewards/TRLRewardAdapter/mean": 0.7140474915504456, "rewards/TRLRewardAdapter/std": 0.37346187233924866, "sampling/importance_sampling_ratio/max": 1.612217664718628, "sampling/importance_sampling_ratio/mean": 0.19139894843101501, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 13.887245178222656, "sampling/sampling_logp_difference/mean": 0.014350979588925838, "step": 328, "step_time": 375.65703316906 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.04062500223517418, "completions/max_length": 10000.0, "completions/max_terminated_length": 9435.0, "completions/mean_length": 2002.13134765625, "completions/mean_terminated_length": 1663.4593505859375, "completions/min_length": 57.0, "completions/min_terminated_length": 57.0, "entropy": 0.1059787670771281, "epoch": 0.21962616822429906, "frac_reward_zero_std": 0.10000000894069672, "grad_norm": 0.0004153217141021022, "learning_rate": 5e-06, "loss": 0.0003, "num_tokens": 802074629.0, "reward": 0.6841700673103333, "reward_std": 0.4027036428451538, "rewards/TRLRewardAdapter/mean": 0.6841700077056885, "rewards/TRLRewardAdapter/std": 0.4027036428451538, "sampling/importance_sampling_ratio/max": 2.3738980293273926, "sampling/importance_sampling_ratio/mean": 0.12649157643318176, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 7.598222255706787, "sampling/sampling_logp_difference/mean": 0.01447667833417654, "step": 329, "step_time": 346.02976318006404 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.010416666977107525, "completions/max_length": 10000.0, "completions/max_terminated_length": 9834.0, "completions/mean_length": 1577.63232421875, "completions/mean_terminated_length": 1488.9757080078125, "completions/min_length": 63.0, "completions/min_terminated_length": 63.0, "entropy": 0.10453488181034724, "epoch": 0.22029372496662217, "frac_reward_zero_std": 0.0, "grad_norm": 0.0001921543093676613, "learning_rate": 5e-06, "loss": -0.0, "num_tokens": 803990820.0, "reward": 0.8119446039199829, "reward_std": 0.30548205971717834, "rewards/TRLRewardAdapter/mean": 0.8119446039199829, "rewards/TRLRewardAdapter/std": 0.30548205971717834, "sampling/importance_sampling_ratio/max": 2.7707951068878174, "sampling/importance_sampling_ratio/mean": 0.1526930332183838, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 11.969564437866211, "sampling/sampling_logp_difference/mean": 0.01512165553867817, "step": 330, "step_time": 297.1410278099356 }, { "epoch": 0.22029372496662217, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.022608694863384186, "eval_completions/max_length": 10000.0, "eval_completions/max_terminated_length": 9196.869565217392, "eval_completions/mean_length": 1679.4401696246603, "eval_completions/mean_terminated_length": 1487.0829441236413, "eval_completions/min_length": 46.21739130434783, "eval_completions/min_terminated_length": 46.21739130434783, "eval_entropy": 0.13093598672877188, "eval_frac_reward_zero_std": 1.0, "eval_loss": 0.0, "eval_num_tokens": 803990820.0, "eval_reward": 0.6885355296342269, "eval_reward_std": 0.3920143881569738, "eval_rewards/TRLRewardAdapter/mean": 0.6885355503662772, "eval_rewards/TRLRewardAdapter/std": 0.39201438945272693, "eval_runtime": 1411.7792, "eval_samples_per_second": 3.234, "eval_sampling/importance_sampling_ratio/max": 1.8152945819108381, "eval_sampling/importance_sampling_ratio/mean": 0.24847112790397977, "eval_sampling/importance_sampling_ratio/min": 0.0, "eval_sampling/sampling_logp_difference/max": 9.598424828570822, "eval_sampling/sampling_logp_difference/mean": 0.017497203596260235, "eval_steps_per_second": 0.016, "step": 330 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0020833334419876337, "completions/max_length": 10000.0, "completions/max_terminated_length": 9906.0, "completions/mean_length": 1711.181396484375, "completions/mean_terminated_length": 1693.8768310546875, "completions/min_length": 66.0, "completions/min_terminated_length": 66.0, "entropy": 0.11430738990505536, "epoch": 0.22096128170894527, "frac_reward_zero_std": 0.03333333507180214, "grad_norm": 0.0005167956280475763, "learning_rate": 5e-06, "loss": -0.0001, "num_tokens": 806146354.0, "reward": 0.7947856187820435, "reward_std": 0.297523558139801, "rewards/TRLRewardAdapter/mean": 0.7947855591773987, "rewards/TRLRewardAdapter/std": 0.297523558139801, "sampling/importance_sampling_ratio/max": 2.1497926712036133, "sampling/importance_sampling_ratio/mean": 0.14952746033668518, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 6.25, "sampling/sampling_logp_difference/mean": 0.016129275783896446, "step": 331, "step_time": 249.90913660498336 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01666666753590107, "completions/max_length": 10000.0, "completions/max_terminated_length": 9775.0, "completions/mean_length": 1953.924072265625, "completions/mean_terminated_length": 1817.5498046875, "completions/min_length": 75.0, "completions/min_terminated_length": 75.0, "entropy": 0.11262395853797595, "epoch": 0.22162883845126835, "frac_reward_zero_std": 0.06666667014360428, "grad_norm": 0.0003006793659117356, "learning_rate": 5e-06, "loss": 0.0006, "num_tokens": 808488457.0, "reward": 0.6723732352256775, "reward_std": 0.40506666898727417, "rewards/TRLRewardAdapter/mean": 0.6723731756210327, "rewards/TRLRewardAdapter/std": 0.4050666391849518, "sampling/importance_sampling_ratio/max": 1.9856936931610107, "sampling/importance_sampling_ratio/mean": 0.21119622886180878, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 15.892158508300781, "sampling/sampling_logp_difference/mean": 0.01616116426885128, "step": 332, "step_time": 298.74203094490804 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.05625000223517418, "completions/max_length": 10000.0, "completions/max_terminated_length": 9916.0, "completions/mean_length": 1687.7396240234375, "completions/mean_terminated_length": 1192.306884765625, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "entropy": 0.09616716330250104, "epoch": 0.22229639519359146, "frac_reward_zero_std": 0.06666667014360428, "grad_norm": 0.00010364927991924652, "learning_rate": 5e-06, "loss": 0.0001, "num_tokens": 810543951.0, "reward": 0.7872012257575989, "reward_std": 0.3322116732597351, "rewards/TRLRewardAdapter/mean": 0.7872011661529541, "rewards/TRLRewardAdapter/std": 0.3322116732597351, "sampling/importance_sampling_ratio/max": 2.039047956466675, "sampling/importance_sampling_ratio/mean": 0.1671074628829956, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 40.748069763183594, "sampling/sampling_logp_difference/mean": 0.013727816753089428, "step": 333, "step_time": 383.27524130709935 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013541667722165585, "completions/max_length": 10000.0, "completions/max_terminated_length": 9488.0, "completions/mean_length": 1716.2177734375, "completions/mean_terminated_length": 1602.5015869140625, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "entropy": 0.1110846425096194, "epoch": 0.22296395193591456, "frac_reward_zero_std": 0.0, "grad_norm": 0.0001384592288489382, "learning_rate": 5e-06, "loss": 0.0002, "num_tokens": 812634048.0, "reward": 0.787259578704834, "reward_std": 0.31135186553001404, "rewards/TRLRewardAdapter/mean": 0.7872595191001892, "rewards/TRLRewardAdapter/std": 0.3113518953323364, "sampling/importance_sampling_ratio/max": 1.4319279193878174, "sampling/importance_sampling_ratio/mean": 0.156551331281662, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 6.601531982421875, "sampling/sampling_logp_difference/mean": 0.015573202632367611, "step": 334, "step_time": 201.59245887724683 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0052083334885537624, "completions/max_length": 10000.0, "completions/max_terminated_length": 9697.0, "completions/mean_length": 1925.2115478515625, "completions/mean_terminated_length": 1882.93505859375, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "entropy": 0.128117007513841, "epoch": 0.22363150867823764, "frac_reward_zero_std": 0.0, "grad_norm": 0.00010501669688507905, "learning_rate": 5e-06, "loss": 0.0022, "num_tokens": 814901259.0, "reward": 0.7323259115219116, "reward_std": 0.3214132785797119, "rewards/TRLRewardAdapter/mean": 0.7323258519172668, "rewards/TRLRewardAdapter/std": 0.3214132785797119, "sampling/importance_sampling_ratio/max": 2.5492923259735107, "sampling/importance_sampling_ratio/mean": 0.1458236426115036, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 6.356596946716309, "sampling/sampling_logp_difference/mean": 0.016979176551103592, "step": 335, "step_time": 158.91421883902512 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0364583358168602, "completions/max_length": 10000.0, "completions/max_terminated_length": 9898.0, "completions/mean_length": 2163.822021484375, "completions/mean_terminated_length": 1867.31787109375, "completions/min_length": 63.0, "completions/min_terminated_length": 63.0, "entropy": 0.12343500802914302, "epoch": 0.22429906542056074, "frac_reward_zero_std": 0.06666667014360428, "grad_norm": 5.0731499749527445e-05, "learning_rate": 5e-06, "loss": -0.0001, "num_tokens": 817474240.0, "reward": 0.7099234461784363, "reward_std": 0.38396117091178894, "rewards/TRLRewardAdapter/mean": 0.7099234461784363, "rewards/TRLRewardAdapter/std": 0.38396117091178894, "sampling/importance_sampling_ratio/max": 2.393326759338379, "sampling/importance_sampling_ratio/mean": 0.16618125140666962, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 48.53517150878906, "sampling/sampling_logp_difference/mean": 0.0171926561743021, "step": 336, "step_time": 256.6471797480481 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02291666716337204, "completions/max_length": 10000.0, "completions/max_terminated_length": 9765.0, "completions/mean_length": 1546.9927978515625, "completions/mean_terminated_length": 1348.734619140625, "completions/min_length": 53.0, "completions/min_terminated_length": 53.0, "entropy": 0.1023904134829839, "epoch": 0.22496662216288385, "frac_reward_zero_std": 0.03333333507180214, "grad_norm": 0.0001975264045911629, "learning_rate": 5e-06, "loss": -0.0, "num_tokens": 819409209.0, "reward": 0.812597393989563, "reward_std": 0.2929944097995758, "rewards/TRLRewardAdapter/mean": 0.8125973343849182, "rewards/TRLRewardAdapter/std": 0.2929944396018982, "sampling/importance_sampling_ratio/max": 2.168818235397339, "sampling/importance_sampling_ratio/mean": 0.1937912404537201, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 10.25, "sampling/sampling_logp_difference/mean": 0.014927709475159645, "step": 337, "step_time": 329.90549651382025 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.04479166865348816, "completions/max_length": 10000.0, "completions/max_terminated_length": 9967.0, "completions/mean_length": 2678.55322265625, "completions/mean_terminated_length": 2335.2353515625, "completions/min_length": 81.0, "completions/min_terminated_length": 81.0, "entropy": 0.11772703255216281, "epoch": 0.22563417890520696, "frac_reward_zero_std": 0.03333333507180214, "grad_norm": 0.00021839811716180073, "learning_rate": 5e-06, "loss": 0.0, "num_tokens": 822502956.0, "reward": 0.6325283646583557, "reward_std": 0.4135013520717621, "rewards/TRLRewardAdapter/mean": 0.6325283050537109, "rewards/TRLRewardAdapter/std": 0.4135013520717621, "sampling/importance_sampling_ratio/max": 1.9407854080200195, "sampling/importance_sampling_ratio/mean": 0.1388053297996521, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 11.5625, "sampling/sampling_logp_difference/mean": 0.016517939046025276, "step": 338, "step_time": 345.7666780540021 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02708333544433117, "completions/max_length": 10000.0, "completions/max_terminated_length": 9887.0, "completions/mean_length": 1604.557373046875, "completions/mean_terminated_length": 1370.8511962890625, "completions/min_length": 55.0, "completions/min_terminated_length": 55.0, "entropy": 0.10546848053733508, "epoch": 0.22630173564753003, "frac_reward_zero_std": 0.06666667014360428, "grad_norm": 0.0001395289057905417, "learning_rate": 5e-06, "loss": -0.0, "num_tokens": 824492579.0, "reward": 0.7647187113761902, "reward_std": 0.34906792640686035, "rewards/TRLRewardAdapter/mean": 0.7647187113761902, "rewards/TRLRewardAdapter/std": 0.34906789660453796, "sampling/importance_sampling_ratio/max": 2.340996026992798, "sampling/importance_sampling_ratio/mean": 0.22360830008983612, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 9.0, "sampling/sampling_logp_difference/mean": 0.01473576482385397, "step": 339, "step_time": 309.51778552588075 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.04479166865348816, "completions/max_length": 10000.0, "completions/max_terminated_length": 9999.0, "completions/mean_length": 2145.25, "completions/mean_terminated_length": 1776.9246826171875, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "entropy": 0.10379100839296977, "epoch": 0.22696929238985314, "frac_reward_zero_std": 0.0, "grad_norm": 0.0002664844517591628, "learning_rate": 5e-06, "loss": 0.0008, "num_tokens": 827009107.0, "reward": 0.6698890328407288, "reward_std": 0.39395618438720703, "rewards/TRLRewardAdapter/mean": 0.6698890328407288, "rewards/TRLRewardAdapter/std": 0.39395618438720703, "sampling/importance_sampling_ratio/max": 1.712769865989685, "sampling/importance_sampling_ratio/mean": 0.20941023528575897, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 21.25, "sampling/sampling_logp_difference/mean": 0.014570489525794983, "step": 340, "step_time": 341.01591805613134 }, { "epoch": 0.22696929238985314, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.019565216994479946, "eval_completions/max_length": 10000.0, "eval_completions/max_terminated_length": 8925.0, "eval_completions/mean_length": 1363.6088761039402, "eval_completions/mean_terminated_length": 1191.3130970830503, "eval_completions/min_length": 44.26086956521739, "eval_completions/min_terminated_length": 44.26086956521739, "eval_entropy": 0.1363375164244486, "eval_frac_reward_zero_std": 1.0, "eval_loss": 0.0, "eval_num_tokens": 827009107.0, "eval_reward": 0.6686303745145383, "eval_reward_std": 0.4147705370965211, "eval_rewards/TRLRewardAdapter/mean": 0.668630397838095, "eval_rewards/TRLRewardAdapter/std": 0.4147705370965211, "eval_runtime": 1403.4429, "eval_samples_per_second": 3.253, "eval_sampling/importance_sampling_ratio/max": 1.9660521071890127, "eval_sampling/importance_sampling_ratio/mean": 0.33457874603893445, "eval_sampling/importance_sampling_ratio/min": 0.0, "eval_sampling/sampling_logp_difference/max": 9.299169146496316, "eval_sampling/sampling_logp_difference/mean": 0.01806003705638906, "eval_steps_per_second": 0.016, "step": 340 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03750000149011612, "completions/max_length": 10000.0, "completions/max_terminated_length": 9757.0, "completions/mean_length": 2377.42626953125, "completions/mean_terminated_length": 2080.442626953125, "completions/min_length": 64.0, "completions/min_terminated_length": 64.0, "entropy": 0.12293744459748268, "epoch": 0.22763684913217624, "frac_reward_zero_std": 0.03333333507180214, "grad_norm": 5.679476607830292e-05, "learning_rate": 5e-06, "loss": -0.0001, "num_tokens": 829715692.0, "reward": 0.6453534960746765, "reward_std": 0.39842161536216736, "rewards/TRLRewardAdapter/mean": 0.6453534364700317, "rewards/TRLRewardAdapter/std": 0.39842161536216736, "sampling/importance_sampling_ratio/max": 1.3500069379806519, "sampling/importance_sampling_ratio/mean": 0.11502180248498917, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 40.656776428222656, "sampling/sampling_logp_difference/mean": 0.016479311510920525, "step": 341, "step_time": 287.44653078797273 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.05000000447034836, "completions/max_length": 10000.0, "completions/max_terminated_length": 9993.0, "completions/mean_length": 1818.904296875, "completions/mean_terminated_length": 1388.3201904296875, "completions/min_length": 80.0, "completions/min_terminated_length": 80.0, "entropy": 0.10970323160290718, "epoch": 0.22830440587449932, "frac_reward_zero_std": 0.06666667014360428, "grad_norm": 0.00023423856694939447, "learning_rate": 5e-06, "loss": 0.0001, "num_tokens": 831900656.0, "reward": 0.7314399480819702, "reward_std": 0.38275906443595886, "rewards/TRLRewardAdapter/mean": 0.7314398884773254, "rewards/TRLRewardAdapter/std": 0.3827590346336365, "sampling/importance_sampling_ratio/max": 1.4386348724365234, "sampling/importance_sampling_ratio/mean": 0.19588816165924072, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 62.0, "sampling/sampling_logp_difference/mean": 0.014873548410832882, "step": 342, "step_time": 376.3740388189908 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03958333656191826, "completions/max_length": 10000.0, "completions/max_terminated_length": 9916.0, "completions/mean_length": 2138.736572265625, "completions/mean_terminated_length": 1814.7364501953125, "completions/min_length": 68.0, "completions/min_terminated_length": 68.0, "entropy": 0.11614190166195233, "epoch": 0.22897196261682243, "frac_reward_zero_std": 0.06666667014360428, "grad_norm": 0.0007065659127880114, "learning_rate": 5e-06, "loss": 0.0014, "num_tokens": 834429811.0, "reward": 0.6185291409492493, "reward_std": 0.41528213024139404, "rewards/TRLRewardAdapter/mean": 0.6185291409492493, "rewards/TRLRewardAdapter/std": 0.41528210043907166, "sampling/importance_sampling_ratio/max": 2.192384719848633, "sampling/importance_sampling_ratio/mean": 0.1679522842168808, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 10.75, "sampling/sampling_logp_difference/mean": 0.016091082245111465, "step": 343, "step_time": 384.4770694161998 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01770833507180214, "completions/max_length": 10000.0, "completions/max_terminated_length": 9967.0, "completions/mean_length": 1994.52099609375, "completions/mean_terminated_length": 1850.2015380859375, "completions/min_length": 85.0, "completions/min_terminated_length": 85.0, "entropy": 0.1333464433749517, "epoch": 0.22963951935914553, "frac_reward_zero_std": 0.10000000894069672, "grad_norm": 0.001276343913316877, "learning_rate": 5e-06, "loss": 0.0, "num_tokens": 836851431.0, "reward": 0.7229328155517578, "reward_std": 0.35997435450553894, "rewards/TRLRewardAdapter/mean": 0.722932755947113, "rewards/TRLRewardAdapter/std": 0.35997435450553894, "sampling/importance_sampling_ratio/max": 1.7190277576446533, "sampling/importance_sampling_ratio/mean": 0.17261621356010437, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 23.75, "sampling/sampling_logp_difference/mean": 0.018375366926193237, "step": 344, "step_time": 334.4334642729955 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03020833432674408, "completions/max_length": 10000.0, "completions/max_terminated_length": 9978.0, "completions/mean_length": 1585.9771728515625, "completions/mean_terminated_length": 1323.88623046875, "completions/min_length": 76.0, "completions/min_terminated_length": 76.0, "entropy": 0.11355810612440109, "epoch": 0.23030707610146864, "frac_reward_zero_std": 0.10000000894069672, "grad_norm": 0.0004998112519615589, "learning_rate": 5e-06, "loss": 0.0019, "num_tokens": 838807281.0, "reward": 0.7186142206192017, "reward_std": 0.38562047481536865, "rewards/TRLRewardAdapter/mean": 0.7186142206192017, "rewards/TRLRewardAdapter/std": 0.38562044501304626, "sampling/importance_sampling_ratio/max": 1.5710340738296509, "sampling/importance_sampling_ratio/mean": 0.22883126139640808, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 9.131656646728516, "sampling/sampling_logp_difference/mean": 0.01573476754128933, "step": 345, "step_time": 319.548228747095 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02916666865348816, "completions/max_length": 10000.0, "completions/max_terminated_length": 9992.0, "completions/mean_length": 2724.750244140625, "completions/mean_terminated_length": 2506.18017578125, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "entropy": 0.14136363317569098, "epoch": 0.23097463284379172, "frac_reward_zero_std": 0.06666667014360428, "grad_norm": 2.080572258097688e-05, "learning_rate": 5e-06, "loss": -0.0, "num_tokens": 841906945.0, "reward": 0.673337459564209, "reward_std": 0.3850811719894409, "rewards/TRLRewardAdapter/mean": 0.6733373999595642, "rewards/TRLRewardAdapter/std": 0.3850811719894409, "sampling/importance_sampling_ratio/max": 1.6514900922775269, "sampling/importance_sampling_ratio/mean": 0.17282992601394653, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 8.25, "sampling/sampling_logp_difference/mean": 0.019511645659804344, "step": 346, "step_time": 301.49431834497955 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0260416679084301, "completions/max_length": 10000.0, "completions/max_terminated_length": 9995.0, "completions/mean_length": 2041.947998046875, "completions/mean_terminated_length": 1829.165771484375, "completions/min_length": 53.0, "completions/min_terminated_length": 53.0, "entropy": 0.13297562301158905, "epoch": 0.23164218958611482, "frac_reward_zero_std": 0.06666667014360428, "grad_norm": 1.579383109577412e-05, "learning_rate": 5e-06, "loss": -0.0, "num_tokens": 844310415.0, "reward": 0.7001851201057434, "reward_std": 0.38877367973327637, "rewards/TRLRewardAdapter/mean": 0.7001850605010986, "rewards/TRLRewardAdapter/std": 0.38877370953559875, "sampling/importance_sampling_ratio/max": 2.1705262660980225, "sampling/importance_sampling_ratio/mean": 0.20685473084449768, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 13.963586807250977, "sampling/sampling_logp_difference/mean": 0.018198197707533836, "step": 347, "step_time": 260.7450239841128 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0031250002793967724, "completions/max_length": 10000.0, "completions/max_terminated_length": 9746.0, "completions/mean_length": 1102.339599609375, "completions/mean_terminated_length": 1074.447265625, "completions/min_length": 63.0, "completions/min_terminated_length": 63.0, "entropy": 0.11333346366882324, "epoch": 0.23230974632843793, "frac_reward_zero_std": 0.03333333507180214, "grad_norm": 0.0002279954994031621, "learning_rate": 5e-06, "loss": -0.0003, "num_tokens": 845753141.0, "reward": 0.8601921796798706, "reward_std": 0.2562459409236908, "rewards/TRLRewardAdapter/mean": 0.8601921200752258, "rewards/TRLRewardAdapter/std": 0.2562459111213684, "sampling/importance_sampling_ratio/max": 1.656379222869873, "sampling/importance_sampling_ratio/mean": 0.21028709411621094, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 13.705451965332031, "sampling/sampling_logp_difference/mean": 0.016491001471877098, "step": 348, "step_time": 202.3181457798928 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03854166716337204, "completions/max_length": 10000.0, "completions/max_terminated_length": 9599.0, "completions/mean_length": 2035.8250732421875, "completions/mean_terminated_length": 1716.5677490234375, "completions/min_length": 61.0, "completions/min_terminated_length": 61.0, "entropy": 0.13909355302651724, "epoch": 0.232977303070761, "frac_reward_zero_std": 0.10000000894069672, "grad_norm": 8.597829663801544e-05, "learning_rate": 5e-06, "loss": -0.0001, "num_tokens": 848188973.0, "reward": 0.7115203142166138, "reward_std": 0.3933531641960144, "rewards/TRLRewardAdapter/mean": 0.711520254611969, "rewards/TRLRewardAdapter/std": 0.393353134393692, "sampling/importance_sampling_ratio/max": 2.8254058361053467, "sampling/importance_sampling_ratio/mean": 0.18634216487407684, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 30.889068603515625, "sampling/sampling_logp_difference/mean": 0.019264576956629753, "step": 349, "step_time": 276.7927411380224 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.04270833730697632, "completions/max_length": 10000.0, "completions/max_terminated_length": 9891.0, "completions/mean_length": 1991.8668212890625, "completions/mean_terminated_length": 1634.5941162109375, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "entropy": 0.13202410688002905, "epoch": 0.2336448598130841, "frac_reward_zero_std": 0.13333334028720856, "grad_norm": 1.5278912545119678e-05, "learning_rate": 5e-06, "loss": -0.0001, "num_tokens": 850550285.0, "reward": 0.7061288356781006, "reward_std": 0.3922370970249176, "rewards/TRLRewardAdapter/mean": 0.7061288356781006, "rewards/TRLRewardAdapter/std": 0.3922370970249176, "sampling/importance_sampling_ratio/max": 2.9627997875213623, "sampling/importance_sampling_ratio/mean": 0.14909811317920685, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 52.01682662963867, "sampling/sampling_logp_difference/mean": 0.018754715099930763, "step": 350, "step_time": 330.9446811849484 }, { "epoch": 0.2336448598130841, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.01739130387811557, "eval_completions/max_length": 9934.95652173913, "eval_completions/max_terminated_length": 8753.0, "eval_completions/mean_length": 1220.9580157736073, "eval_completions/mean_terminated_length": 1065.4657407014267, "eval_completions/min_length": 43.869565217391305, "eval_completions/min_terminated_length": 43.869565217391305, "eval_entropy": 0.14931210292422253, "eval_frac_reward_zero_std": 1.0, "eval_loss": 0.0, "eval_num_tokens": 850550285.0, "eval_reward": 0.6547403491061666, "eval_reward_std": 0.4262373278970304, "eval_rewards/TRLRewardAdapter/mean": 0.6547403724297233, "eval_rewards/TRLRewardAdapter/std": 0.4262373304885367, "eval_runtime": 1368.6587, "eval_samples_per_second": 3.336, "eval_sampling/importance_sampling_ratio/max": 2.1241061791129736, "eval_sampling/importance_sampling_ratio/mean": 0.362956324349279, "eval_sampling/importance_sampling_ratio/min": 0.0, "eval_sampling/sampling_logp_difference/max": 10.83530472672504, "eval_sampling/sampling_logp_difference/mean": 0.01981212474081827, "eval_steps_per_second": 0.017, "step": 350 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03020833432674408, "completions/max_length": 10000.0, "completions/max_terminated_length": 9961.0, "completions/mean_length": 1817.290771484375, "completions/mean_terminated_length": 1562.405029296875, "completions/min_length": 65.0, "completions/min_terminated_length": 65.0, "entropy": 0.12520866096019745, "epoch": 0.23431241655540722, "frac_reward_zero_std": 0.1666666716337204, "grad_norm": 0.0004102146018310035, "learning_rate": 5e-06, "loss": 0.0017, "num_tokens": 852706948.0, "reward": 0.6161428689956665, "reward_std": 0.4352540969848633, "rewards/TRLRewardAdapter/mean": 0.6161428689956665, "rewards/TRLRewardAdapter/std": 0.4352540969848633, "sampling/importance_sampling_ratio/max": 1.80019211769104, "sampling/importance_sampling_ratio/mean": 0.2325478047132492, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 27.75, "sampling/sampling_logp_difference/mean": 0.017227737233042717, "step": 351, "step_time": 264.3557600622298 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0052083334885537624, "completions/max_length": 10000.0, "completions/max_terminated_length": 9859.0, "completions/mean_length": 1701.675048828125, "completions/mean_terminated_length": 1658.228271484375, "completions/min_length": 70.0, "completions/min_terminated_length": 70.0, "entropy": 0.13404163469870886, "epoch": 0.2349799732977303, "frac_reward_zero_std": 0.03333333507180214, "grad_norm": 7.540867674800156e-05, "learning_rate": 5e-06, "loss": -0.0001, "num_tokens": 854781996.0, "reward": 0.7395737767219543, "reward_std": 0.36316126585006714, "rewards/TRLRewardAdapter/mean": 0.7395737171173096, "rewards/TRLRewardAdapter/std": 0.36316123604774475, "sampling/importance_sampling_ratio/max": 2.537388563156128, "sampling/importance_sampling_ratio/mean": 0.17159411311149597, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 8.25, "sampling/sampling_logp_difference/mean": 0.018519176170229912, "step": 352, "step_time": 207.02315141703002 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03333333507180214, "completions/max_length": 10000.0, "completions/max_terminated_length": 9909.0, "completions/mean_length": 2183.8251953125, "completions/mean_terminated_length": 1914.3017578125, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "entropy": 0.14316587646802267, "epoch": 0.2356475300400534, "frac_reward_zero_std": 0.0, "grad_norm": 0.00017501388878264607, "learning_rate": 5e-06, "loss": -0.0001, "num_tokens": 857350980.0, "reward": 0.7274529933929443, "reward_std": 0.36119571328163147, "rewards/TRLRewardAdapter/mean": 0.7274529933929443, "rewards/TRLRewardAdapter/std": 0.36119571328163147, "sampling/importance_sampling_ratio/max": 2.1846439838409424, "sampling/importance_sampling_ratio/mean": 0.16682612895965576, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 46.0, "sampling/sampling_logp_difference/mean": 0.019993921741843224, "step": 353, "step_time": 354.47062009596266 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02916666865348816, "completions/max_length": 10000.0, "completions/max_terminated_length": 9508.0, "completions/mean_length": 1678.674072265625, "completions/mean_terminated_length": 1428.677001953125, "completions/min_length": 72.0, "completions/min_terminated_length": 72.0, "entropy": 0.13562311480442682, "epoch": 0.2363150867823765, "frac_reward_zero_std": 0.06666667014360428, "grad_norm": 2.863346888084661e-05, "learning_rate": 5e-06, "loss": 0.0, "num_tokens": 859357195.0, "reward": 0.7362613081932068, "reward_std": 0.36792442202568054, "rewards/TRLRewardAdapter/mean": 0.736261248588562, "rewards/TRLRewardAdapter/std": 0.36792442202568054, "sampling/importance_sampling_ratio/max": 1.5882370471954346, "sampling/importance_sampling_ratio/mean": 0.14124226570129395, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 21.137245178222656, "sampling/sampling_logp_difference/mean": 0.018185675144195557, "step": 354, "step_time": 302.6195867699571 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.006250000558793545, "completions/max_length": 10000.0, "completions/max_terminated_length": 9997.0, "completions/mean_length": 1383.5865478515625, "completions/mean_terminated_length": 1329.3951416015625, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "entropy": 0.12846249714493752, "epoch": 0.2369826435246996, "frac_reward_zero_std": 0.0, "grad_norm": 4.8723397451165743e-05, "learning_rate": 5e-06, "loss": -0.0001, "num_tokens": 861086718.0, "reward": 0.8255972862243652, "reward_std": 0.30315932631492615, "rewards/TRLRewardAdapter/mean": 0.8255972266197205, "rewards/TRLRewardAdapter/std": 0.30315932631492615, "sampling/importance_sampling_ratio/max": 2.5608973503112793, "sampling/importance_sampling_ratio/mean": 0.2574557662010193, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 13.884025573730469, "sampling/sampling_logp_difference/mean": 0.018607450649142265, "step": 355, "step_time": 259.70201629714575 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02708333544433117, "completions/max_length": 10000.0, "completions/max_terminated_length": 9888.0, "completions/mean_length": 1873.4583740234375, "completions/mean_terminated_length": 1647.2376708984375, "completions/min_length": 57.0, "completions/min_terminated_length": 57.0, "entropy": 0.1320402647058169, "epoch": 0.2376502002670227, "frac_reward_zero_std": 0.13333334028720856, "grad_norm": 5.272497687066297e-05, "learning_rate": 5e-06, "loss": -0.0, "num_tokens": 863309910.0, "reward": 0.7105941772460938, "reward_std": 0.3835107982158661, "rewards/TRLRewardAdapter/mean": 0.710594117641449, "rewards/TRLRewardAdapter/std": 0.3835107982158661, "sampling/importance_sampling_ratio/max": 2.254347562789917, "sampling/importance_sampling_ratio/mean": 0.14407117664813995, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 8.0, "sampling/sampling_logp_difference/mean": 0.017888760194182396, "step": 356, "step_time": 239.33708252594806 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02395833469927311, "completions/max_length": 10000.0, "completions/max_terminated_length": 9793.0, "completions/mean_length": 1401.72607421875, "completions/mean_terminated_length": 1190.6690673828125, "completions/min_length": 60.0, "completions/min_terminated_length": 60.0, "entropy": 0.12448605770866077, "epoch": 0.2383177570093458, "frac_reward_zero_std": 0.06666667014360428, "grad_norm": 0.0001926863464830754, "learning_rate": 5e-06, "loss": 0.0, "num_tokens": 865070159.0, "reward": 0.8065359592437744, "reward_std": 0.3256911337375641, "rewards/TRLRewardAdapter/mean": 0.8065358996391296, "rewards/TRLRewardAdapter/std": 0.3256911039352417, "sampling/importance_sampling_ratio/max": 1.9851510524749756, "sampling/importance_sampling_ratio/mean": 0.22039350867271423, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 7.25, "sampling/sampling_logp_difference/mean": 0.01764789968729019, "step": 357, "step_time": 308.76672888884787 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01875000074505806, "completions/max_length": 10000.0, "completions/max_terminated_length": 9893.0, "completions/mean_length": 1381.315673828125, "completions/mean_terminated_length": 1216.62744140625, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "entropy": 0.1401604861021042, "epoch": 0.2389853137516689, "frac_reward_zero_std": 0.06666667014360428, "grad_norm": 0.00014779115020880778, "learning_rate": 5e-06, "loss": -0.0, "num_tokens": 866873694.0, "reward": 0.765040397644043, "reward_std": 0.3560459017753601, "rewards/TRLRewardAdapter/mean": 0.7650403380393982, "rewards/TRLRewardAdapter/std": 0.3560459017753601, "sampling/importance_sampling_ratio/max": 2.649456024169922, "sampling/importance_sampling_ratio/mean": 0.13200777769088745, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 9.469562530517578, "sampling/sampling_logp_difference/mean": 0.019342178478837013, "step": 358, "step_time": 212.97583388595376 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0260416679084301, "completions/max_length": 10000.0, "completions/max_terminated_length": 9928.0, "completions/mean_length": 1406.9990234375, "completions/mean_terminated_length": 1177.2396240234375, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "entropy": 0.11873867983619373, "epoch": 0.23965287049399198, "frac_reward_zero_std": 0.10000000894069672, "grad_norm": 0.00017359977173399827, "learning_rate": 5e-06, "loss": 0.0001, "num_tokens": 868647453.0, "reward": 0.754093587398529, "reward_std": 0.3906100392341614, "rewards/TRLRewardAdapter/mean": 0.7540935277938843, "rewards/TRLRewardAdapter/std": 0.3906100392341614, "sampling/importance_sampling_ratio/max": 2.2563538551330566, "sampling/importance_sampling_ratio/mean": 0.2627699375152588, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 22.5, "sampling/sampling_logp_difference/mean": 0.01707237772643566, "step": 359, "step_time": 330.3579585030675 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03229166939854622, "completions/max_length": 10000.0, "completions/max_terminated_length": 9893.0, "completions/mean_length": 1400.6219482421875, "completions/mean_terminated_length": 1113.6673583984375, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "entropy": 0.11045852924386661, "epoch": 0.24032042723631508, "frac_reward_zero_std": 0.13333334028720856, "grad_norm": 0.0002079579494627486, "learning_rate": 5e-06, "loss": -0.0002, "num_tokens": 870424626.0, "reward": 0.7380775809288025, "reward_std": 0.3774445056915283, "rewards/TRLRewardAdapter/mean": 0.7380775213241577, "rewards/TRLRewardAdapter/std": 0.3774445354938507, "sampling/importance_sampling_ratio/max": 2.552074432373047, "sampling/importance_sampling_ratio/mean": 0.2531130909919739, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 5.4913330078125, "sampling/sampling_logp_difference/mean": 0.01603877544403076, "step": 360, "step_time": 260.4986584360013 }, { "epoch": 0.24032042723631508, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.01543478214222452, "eval_completions/max_length": 10000.0, "eval_completions/max_terminated_length": 8110.04347826087, "eval_completions/mean_length": 1012.9917151409646, "eval_completions/mean_terminated_length": 872.1429284137229, "eval_completions/min_length": 42.21739130434783, "eval_completions/min_terminated_length": 42.21739130434783, "eval_entropy": 0.14778215794459634, "eval_frac_reward_zero_std": 1.0, "eval_loss": 0.0, "eval_num_tokens": 870424626.0, "eval_reward": 0.6494331644928973, "eval_reward_std": 0.43829307089681213, "eval_rewards/TRLRewardAdapter/mean": 0.6494331748589225, "eval_rewards/TRLRewardAdapter/std": 0.4382930773755778, "eval_runtime": 1372.255, "eval_samples_per_second": 3.327, "eval_sampling/importance_sampling_ratio/max": 2.176101021144701, "eval_sampling/importance_sampling_ratio/mean": 0.3793244271174721, "eval_sampling/importance_sampling_ratio/min": 0.0, "eval_sampling/sampling_logp_difference/max": 6.464524393496306, "eval_sampling/sampling_logp_difference/mean": 0.019843799021580944, "eval_steps_per_second": 0.017, "step": 360 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0052083334885537624, "completions/max_length": 10000.0, "completions/max_terminated_length": 9583.0, "completions/mean_length": 1330.4417724609375, "completions/mean_terminated_length": 1285.05126953125, "completions/min_length": 49.0, "completions/min_terminated_length": 49.0, "entropy": 0.12741938357551894, "epoch": 0.2409879839786382, "frac_reward_zero_std": 0.1666666716337204, "grad_norm": 0.0009493488790001777, "learning_rate": 5e-06, "loss": 0.0004, "num_tokens": 872224538.0, "reward": 0.6310896873474121, "reward_std": 0.42943936586380005, "rewards/TRLRewardAdapter/mean": 0.6310896277427673, "rewards/TRLRewardAdapter/std": 0.42943933606147766, "sampling/importance_sampling_ratio/max": 2.833554267883301, "sampling/importance_sampling_ratio/mean": 0.2028043270111084, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 6.535170555114746, "sampling/sampling_logp_difference/mean": 0.017911778762936592, "step": 361, "step_time": 168.31216456077527 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03333333507180214, "completions/max_length": 10000.0, "completions/max_terminated_length": 9466.0, "completions/mean_length": 1579.4229736328125, "completions/mean_terminated_length": 1289.0582275390625, "completions/min_length": 40.0, "completions/min_terminated_length": 40.0, "entropy": 0.13620174676179886, "epoch": 0.2416555407209613, "frac_reward_zero_std": 0.10000000894069672, "grad_norm": 0.00032180120586578257, "learning_rate": 5e-06, "loss": -0.0, "num_tokens": 874217296.0, "reward": 0.733893096446991, "reward_std": 0.37597817182540894, "rewards/TRLRewardAdapter/mean": 0.733893096446991, "rewards/TRLRewardAdapter/std": 0.3759782016277313, "sampling/importance_sampling_ratio/max": 2.0492587089538574, "sampling/importance_sampling_ratio/mean": 0.1048356294631958, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 9.656388282775879, "sampling/sampling_logp_difference/mean": 0.019514653831720352, "step": 362, "step_time": 375.23942524706945 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02291666716337204, "completions/max_length": 10000.0, "completions/max_terminated_length": 9959.0, "completions/mean_length": 1769.8209228515625, "completions/mean_terminated_length": 1576.7889404296875, "completions/min_length": 84.0, "completions/min_terminated_length": 84.0, "entropy": 0.12552577132980028, "epoch": 0.24232309746328437, "frac_reward_zero_std": 0.10000000894069672, "grad_norm": 7.532441041140184e-05, "learning_rate": 5e-06, "loss": -0.0, "num_tokens": 876352996.0, "reward": 0.6642109155654907, "reward_std": 0.40695634484291077, "rewards/TRLRewardAdapter/mean": 0.6642109155654907, "rewards/TRLRewardAdapter/std": 0.4069563150405884, "sampling/importance_sampling_ratio/max": 1.9355859756469727, "sampling/importance_sampling_ratio/mean": 0.14920559525489807, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 55.5, "sampling/sampling_logp_difference/mean": 0.017496414482593536, "step": 363, "step_time": 231.85506577289198 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01250000111758709, "completions/max_length": 10000.0, "completions/max_terminated_length": 9210.0, "completions/mean_length": 1466.4271240234375, "completions/mean_terminated_length": 1358.4071044921875, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "entropy": 0.1255921684205532, "epoch": 0.24299065420560748, "frac_reward_zero_std": 0.1666666716337204, "grad_norm": 0.0002455195777988184, "learning_rate": 5e-06, "loss": 0.0001, "num_tokens": 878242110.0, "reward": 0.6692106127738953, "reward_std": 0.4204270839691162, "rewards/TRLRewardAdapter/mean": 0.6692105531692505, "rewards/TRLRewardAdapter/std": 0.4204271137714386, "sampling/importance_sampling_ratio/max": 2.832261800765991, "sampling/importance_sampling_ratio/mean": 0.22300605475902557, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 4.5, "sampling/sampling_logp_difference/mean": 0.017766326665878296, "step": 364, "step_time": 192.19582971918862 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.04375000298023224, "completions/max_length": 10000.0, "completions/max_terminated_length": 9991.0, "completions/mean_length": 1399.3875732421875, "completions/mean_terminated_length": 1005.8953857421875, "completions/min_length": 40.0, "completions/min_terminated_length": 40.0, "entropy": 0.10435495898127556, "epoch": 0.24365821094793058, "frac_reward_zero_std": 0.03333333507180214, "grad_norm": 0.00122379595150848, "learning_rate": 5e-06, "loss": -0.0002, "num_tokens": 880040690.0, "reward": 0.7741227746009827, "reward_std": 0.3663959801197052, "rewards/TRLRewardAdapter/mean": 0.7741227746009827, "rewards/TRLRewardAdapter/std": 0.3663959503173828, "sampling/importance_sampling_ratio/max": 2.854332208633423, "sampling/importance_sampling_ratio/mean": 0.30882713198661804, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 12.0, "sampling/sampling_logp_difference/mean": 0.01571706123650074, "step": 365, "step_time": 394.4810718666995 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0364583358168602, "completions/max_length": 10000.0, "completions/max_terminated_length": 9777.0, "completions/mean_length": 1489.4647216796875, "completions/mean_terminated_length": 1167.4443359375, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "entropy": 0.11945288255810738, "epoch": 0.24432576769025366, "frac_reward_zero_std": 0.23333334922790527, "grad_norm": 4.262182502974078e-05, "learning_rate": 5e-06, "loss": 0.0, "num_tokens": 881921360.0, "reward": 0.645667314529419, "reward_std": 0.4385000169277191, "rewards/TRLRewardAdapter/mean": 0.6456672549247742, "rewards/TRLRewardAdapter/std": 0.43849998712539673, "sampling/importance_sampling_ratio/max": 2.588698148727417, "sampling/importance_sampling_ratio/mean": 0.23819732666015625, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 55.75, "sampling/sampling_logp_difference/mean": 0.01702318899333477, "step": 366, "step_time": 290.443426001817 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.04062500223517418, "completions/max_length": 10000.0, "completions/max_terminated_length": 9874.0, "completions/mean_length": 1898.38134765625, "completions/mean_terminated_length": 1555.3160400390625, "completions/min_length": 74.0, "completions/min_terminated_length": 74.0, "entropy": 0.13739087929328284, "epoch": 0.24499332443257676, "frac_reward_zero_std": 0.0, "grad_norm": 0.00021947431650526185, "learning_rate": 5e-06, "loss": -0.0001, "num_tokens": 884203838.0, "reward": 0.6976116895675659, "reward_std": 0.4013725817203522, "rewards/TRLRewardAdapter/mean": 0.6976116895675659, "rewards/TRLRewardAdapter/std": 0.4013725519180298, "sampling/importance_sampling_ratio/max": 2.2604215145111084, "sampling/importance_sampling_ratio/mean": 0.1659325361251831, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 35.40638732910156, "sampling/sampling_logp_difference/mean": 0.0194308552891016, "step": 367, "step_time": 325.4613282548962 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02291666716337204, "completions/max_length": 10000.0, "completions/max_terminated_length": 9643.0, "completions/mean_length": 1352.643798828125, "completions/mean_terminated_length": 1149.8272705078125, "completions/min_length": 56.0, "completions/min_terminated_length": 56.0, "entropy": 0.13472397128740946, "epoch": 0.24566088117489987, "frac_reward_zero_std": 0.06666667014360428, "grad_norm": 0.0001711672973941245, "learning_rate": 5e-06, "loss": -0.0001, "num_tokens": 885971176.0, "reward": 0.7750689387321472, "reward_std": 0.3600175380706787, "rewards/TRLRewardAdapter/mean": 0.7750689387321472, "rewards/TRLRewardAdapter/std": 0.3600175380706787, "sampling/importance_sampling_ratio/max": 1.8514795303344727, "sampling/importance_sampling_ratio/mean": 0.23152145743370056, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 51.0, "sampling/sampling_logp_difference/mean": 0.018696732819080353, "step": 368, "step_time": 309.8933251998387 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.05833333730697632, "completions/max_length": 10000.0, "completions/max_terminated_length": 8871.0, "completions/mean_length": 1931.3511962890625, "completions/mean_terminated_length": 1431.523193359375, "completions/min_length": 44.0, "completions/min_terminated_length": 44.0, "entropy": 0.1436427185932795, "epoch": 0.24632843791722298, "frac_reward_zero_std": 0.10000000894069672, "grad_norm": 7.011779140747186e-05, "learning_rate": 5e-06, "loss": 0.0005, "num_tokens": 888226553.0, "reward": 0.7359805703163147, "reward_std": 0.3765852153301239, "rewards/TRLRewardAdapter/mean": 0.7359805703163147, "rewards/TRLRewardAdapter/std": 0.3765852153301239, "sampling/importance_sampling_ratio/max": 1.9028465747833252, "sampling/importance_sampling_ratio/mean": 0.18743281066417694, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 39.0, "sampling/sampling_logp_difference/mean": 0.019459163770079613, "step": 369, "step_time": 284.7481839960674 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0416666679084301, "completions/max_length": 10000.0, "completions/max_terminated_length": 9526.0, "completions/mean_length": 1438.0167236328125, "completions/mean_terminated_length": 1065.7564697265625, "completions/min_length": 84.0, "completions/min_terminated_length": 84.0, "entropy": 0.12859858324130377, "epoch": 0.24699599465954605, "frac_reward_zero_std": 0.10000000894069672, "grad_norm": 0.0002543823350367404, "learning_rate": 5e-06, "loss": -0.0003, "num_tokens": 890026729.0, "reward": 0.7450841665267944, "reward_std": 0.3793600797653198, "rewards/TRLRewardAdapter/mean": 0.7450841069221497, "rewards/TRLRewardAdapter/std": 0.37936004996299744, "sampling/importance_sampling_ratio/max": 1.9422801733016968, "sampling/importance_sampling_ratio/mean": 0.1754659116268158, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 32.137245178222656, "sampling/sampling_logp_difference/mean": 0.017581382766366005, "step": 370, "step_time": 319.0533270899905 }, { "epoch": 0.24699599465954605, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.01956521675152623, "eval_completions/max_length": 10000.0, "eval_completions/max_terminated_length": 8595.391304347826, "eval_completions/mean_length": 1091.2369305154552, "eval_completions/mean_terminated_length": 913.3799199643342, "eval_completions/min_length": 42.65217391304348, "eval_completions/min_terminated_length": 42.65217391304348, "eval_entropy": 0.15972497994485108, "eval_frac_reward_zero_std": 1.0, "eval_loss": 0.0, "eval_num_tokens": 890026729.0, "eval_reward": 0.6501307072846786, "eval_reward_std": 0.43588489164476807, "eval_rewards/TRLRewardAdapter/mean": 0.6501307254252227, "eval_rewards/TRLRewardAdapter/std": 0.4358848942362744, "eval_runtime": 1371.2543, "eval_samples_per_second": 3.33, "eval_sampling/importance_sampling_ratio/max": 1.9864731975223706, "eval_sampling/importance_sampling_ratio/mean": 0.3642306807248489, "eval_sampling/importance_sampling_ratio/min": 0.0, "eval_sampling/sampling_logp_difference/max": 8.964183797007022, "eval_sampling/sampling_logp_difference/mean": 0.021180871226217434, "eval_steps_per_second": 0.017, "step": 370 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.05104167014360428, "completions/max_length": 10000.0, "completions/max_terminated_length": 9953.0, "completions/mean_length": 2176.7177734375, "completions/mean_terminated_length": 1755.926513671875, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "entropy": 0.14406026899814606, "epoch": 0.24766355140186916, "frac_reward_zero_std": 0.06666667014360428, "grad_norm": 8.07598103396848e-05, "learning_rate": 5e-06, "loss": 0.0, "num_tokens": 892550426.0, "reward": 0.7196031212806702, "reward_std": 0.3774089813232422, "rewards/TRLRewardAdapter/mean": 0.7196031212806702, "rewards/TRLRewardAdapter/std": 0.3774089813232422, "sampling/importance_sampling_ratio/max": 1.9085787534713745, "sampling/importance_sampling_ratio/mean": 0.19788451492786407, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 10.559269905090332, "sampling/sampling_logp_difference/mean": 0.02025892399251461, "step": 371, "step_time": 352.3917192228837 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.010416666977107525, "completions/max_length": 10000.0, "completions/max_terminated_length": 9937.0, "completions/mean_length": 1276.6385498046875, "completions/mean_terminated_length": 1184.8135986328125, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "entropy": 0.12987594306468964, "epoch": 0.24833110814419226, "frac_reward_zero_std": 0.0, "grad_norm": 1.9281188436474192e-05, "learning_rate": 5e-06, "loss": -0.0, "num_tokens": 894195967.0, "reward": 0.8034877181053162, "reward_std": 0.34727323055267334, "rewards/TRLRewardAdapter/mean": 0.8034876585006714, "rewards/TRLRewardAdapter/std": 0.34727320075035095, "sampling/importance_sampling_ratio/max": 1.5741980075836182, "sampling/importance_sampling_ratio/mean": 0.2691190242767334, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 46.28517150878906, "sampling/sampling_logp_difference/mean": 0.018529701977968216, "step": 372, "step_time": 271.7317273950903 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.04583333432674408, "completions/max_length": 10000.0, "completions/max_terminated_length": 9950.0, "completions/mean_length": 2101.040771484375, "completions/mean_terminated_length": 1721.6146240234375, "completions/min_length": 73.0, "completions/min_terminated_length": 73.0, "entropy": 0.1590020606915156, "epoch": 0.24899866488651534, "frac_reward_zero_std": 0.06666667014360428, "grad_norm": 0.00010294107793836717, "learning_rate": 5e-06, "loss": -0.0, "num_tokens": 896649926.0, "reward": 0.6788924336433411, "reward_std": 0.40693163871765137, "rewards/TRLRewardAdapter/mean": 0.6788923740386963, "rewards/TRLRewardAdapter/std": 0.40693163871765137, "sampling/importance_sampling_ratio/max": 2.356010675430298, "sampling/importance_sampling_ratio/mean": 0.16186407208442688, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 33.5, "sampling/sampling_logp_difference/mean": 0.02139914594590664, "step": 373, "step_time": 349.3840790211689 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 7202.0, "completions/max_terminated_length": 7202.0, "completions/mean_length": 780.268798828125, "completions/mean_terminated_length": 780.268798828125, "completions/min_length": 65.0, "completions/min_terminated_length": 65.0, "entropy": 0.12646987165013948, "epoch": 0.24966622162883845, "frac_reward_zero_std": 0.0, "grad_norm": 0.00013045813688583545, "learning_rate": 5e-06, "loss": -0.0001, "num_tokens": 897825256.0, "reward": 0.8793486952781677, "reward_std": 0.23487135767936707, "rewards/TRLRewardAdapter/mean": 0.879348635673523, "rewards/TRLRewardAdapter/std": 0.23487135767936707, "sampling/importance_sampling_ratio/max": 2.563138723373413, "sampling/importance_sampling_ratio/mean": 0.22278806567192078, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 21.546110153198242, "sampling/sampling_logp_difference/mean": 0.01785392500460148, "step": 374, "step_time": 96.94627083977684 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 10000.0, "completions/max_terminated_length": 9945.0, "completions/mean_length": 1855.951171875, "completions/mean_terminated_length": 1313.0145263671875, "completions/min_length": 61.0, "completions/min_terminated_length": 61.0, "entropy": 0.1341493104894956, "epoch": 0.25033377837116155, "frac_reward_zero_std": 0.06666667014360428, "grad_norm": 4.1897006324520635e-05, "learning_rate": 5e-06, "loss": 0.0, "num_tokens": 900084249.0, "reward": 0.7653608322143555, "reward_std": 0.3656857907772064, "rewards/TRLRewardAdapter/mean": 0.7653607726097107, "rewards/TRLRewardAdapter/std": 0.3656857907772064, "sampling/importance_sampling_ratio/max": 1.7895448207855225, "sampling/importance_sampling_ratio/mean": 0.2244529277086258, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 13.25, "sampling/sampling_logp_difference/mean": 0.01857566088438034, "step": 375, "step_time": 369.20787573501 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 10000.0, "completions/max_terminated_length": 9669.0, "completions/mean_length": 1690.9688720703125, "completions/mean_terminated_length": 1559.079345703125, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "entropy": 0.11678779125213623, "epoch": 0.25100133511348466, "frac_reward_zero_std": 0.10000000894069672, "grad_norm": 0.0018782334361082224, "learning_rate": 5e-06, "loss": -0.0001, "num_tokens": 902189531.0, "reward": 0.6823432445526123, "reward_std": 0.40899714827537537, "rewards/TRLRewardAdapter/mean": 0.6823432445526123, "rewards/TRLRewardAdapter/std": 0.40899714827537537, "sampling/importance_sampling_ratio/max": 2.125499725341797, "sampling/importance_sampling_ratio/mean": 0.19758561253547668, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 5.836272239685059, "sampling/sampling_logp_difference/mean": 0.016621176153421402, "step": 376, "step_time": 190.0303479740396 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01875000074505806, "completions/max_length": 10000.0, "completions/max_terminated_length": 9964.0, "completions/mean_length": 2072.27197265625, "completions/mean_terminated_length": 1920.7867431640625, "completions/min_length": 62.0, "completions/min_terminated_length": 62.0, "entropy": 0.15465720991293588, "epoch": 0.25166889185580776, "frac_reward_zero_std": 0.06666667014360428, "grad_norm": 4.2812784538310404e-05, "learning_rate": 5e-06, "loss": 0.0004, "num_tokens": 904620480.0, "reward": 0.7118815183639526, "reward_std": 0.37264326214790344, "rewards/TRLRewardAdapter/mean": 0.7118815183639526, "rewards/TRLRewardAdapter/std": 0.37264329195022583, "sampling/importance_sampling_ratio/max": 1.7843348979949951, "sampling/importance_sampling_ratio/mean": 0.10885202884674072, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 11.273773193359375, "sampling/sampling_logp_difference/mean": 0.020947663113474846, "step": 377, "step_time": 281.14050943707116 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.06041666865348816, "completions/max_length": 10000.0, "completions/max_terminated_length": 9847.0, "completions/mean_length": 2074.032470703125, "completions/mean_terminated_length": 1564.3802490234375, "completions/min_length": 78.0, "completions/min_terminated_length": 78.0, "entropy": 0.13521029303471246, "epoch": 0.2523364485981308, "frac_reward_zero_std": 0.03333333507180214, "grad_norm": 5.650773105917391e-05, "learning_rate": 5e-06, "loss": 0.0, "num_tokens": 907085791.0, "reward": 0.7166579961776733, "reward_std": 0.3877881169319153, "rewards/TRLRewardAdapter/mean": 0.7166579365730286, "rewards/TRLRewardAdapter/std": 0.3877881169319153, "sampling/importance_sampling_ratio/max": 2.213341236114502, "sampling/importance_sampling_ratio/mean": 0.20329007506370544, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 17.3624210357666, "sampling/sampling_logp_difference/mean": 0.018558727577328682, "step": 378, "step_time": 283.9504918382736 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01770833507180214, "completions/max_length": 10000.0, "completions/max_terminated_length": 9961.0, "completions/mean_length": 1894.1417236328125, "completions/mean_terminated_length": 1748.0128173828125, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "entropy": 0.14966544757286707, "epoch": 0.2530040053404539, "frac_reward_zero_std": 0.03333333507180214, "grad_norm": 6.105431645202359e-05, "learning_rate": 5e-06, "loss": 0.003, "num_tokens": 909345607.0, "reward": 0.6986480951309204, "reward_std": 0.3750589191913605, "rewards/TRLRewardAdapter/mean": 0.6986480951309204, "rewards/TRLRewardAdapter/std": 0.3750589191913605, "sampling/importance_sampling_ratio/max": 2.6818807125091553, "sampling/importance_sampling_ratio/mean": 0.1963980495929718, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 44.5, "sampling/sampling_logp_difference/mean": 0.01989619992673397, "step": 379, "step_time": 245.04537532303948 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02187500149011612, "completions/max_length": 10000.0, "completions/max_terminated_length": 9824.0, "completions/mean_length": 1838.6292724609375, "completions/mean_terminated_length": 1656.1064453125, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "entropy": 0.14940202484528223, "epoch": 0.253671562082777, "frac_reward_zero_std": 0.13333334028720856, "grad_norm": 4.731805682938416e-05, "learning_rate": 5e-06, "loss": -0.0, "num_tokens": 911577219.0, "reward": 0.7031553983688354, "reward_std": 0.4031859040260315, "rewards/TRLRewardAdapter/mean": 0.7031553983688354, "rewards/TRLRewardAdapter/std": 0.4031859040260315, "sampling/importance_sampling_ratio/max": 2.1324005126953125, "sampling/importance_sampling_ratio/mean": 0.20278063416481018, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 38.5, "sampling/sampling_logp_difference/mean": 0.020965853706002235, "step": 380, "step_time": 277.9992950861342 }, { "epoch": 0.253671562082777, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.018043477776581825, "eval_completions/max_length": 10000.0, "eval_completions/max_terminated_length": 8549.347826086956, "eval_completions/mean_length": 1093.4693285071332, "eval_completions/mean_terminated_length": 930.0583814538044, "eval_completions/min_length": 42.608695652173914, "eval_completions/min_terminated_length": 42.608695652173914, "eval_entropy": 0.15507311341555222, "eval_frac_reward_zero_std": 1.0, "eval_loss": 0.0, "eval_num_tokens": 911577219.0, "eval_reward": 0.6482417168824569, "eval_reward_std": 0.43553035933038464, "eval_rewards/TRLRewardAdapter/mean": 0.6482417220654695, "eval_rewards/TRLRewardAdapter/std": 0.43553035933038464, "eval_runtime": 1372.0149, "eval_samples_per_second": 3.328, "eval_sampling/importance_sampling_ratio/max": 1.9031142566515051, "eval_sampling/importance_sampling_ratio/mean": 0.36837883487991663, "eval_sampling/importance_sampling_ratio/min": 0.0, "eval_sampling/sampling_logp_difference/max": 12.951594819193302, "eval_sampling/sampling_logp_difference/mean": 0.02080158864998299, "eval_steps_per_second": 0.017, "step": 380 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9739.0, "completions/max_terminated_length": 9739.0, "completions/mean_length": 1086.3125, "completions/mean_terminated_length": 1086.3125, "completions/min_length": 84.0, "completions/min_terminated_length": 84.0, "entropy": 0.11626867577433586, "epoch": 0.25433911882510013, "frac_reward_zero_std": 0.06666667014360428, "grad_norm": 0.000804994812370417, "learning_rate": 5e-06, "loss": -0.0, "num_tokens": 913093199.0, "reward": 0.7921163439750671, "reward_std": 0.3477799892425537, "rewards/TRLRewardAdapter/mean": 0.7921162843704224, "rewards/TRLRewardAdapter/std": 0.3477799892425537, "sampling/importance_sampling_ratio/max": 2.2704732418060303, "sampling/importance_sampling_ratio/mean": 0.21486973762512207, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 10.955453872680664, "sampling/sampling_logp_difference/mean": 0.016971806064248085, "step": 381, "step_time": 149.24424150492996 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 10000.0, "completions/max_terminated_length": 8887.0, "completions/mean_length": 1121.1552734375, "completions/mean_terminated_length": 980.22119140625, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "entropy": 0.13593050092458725, "epoch": 0.25500667556742324, "frac_reward_zero_std": 0.06666667014360428, "grad_norm": 0.0014400824252248457, "learning_rate": 5e-06, "loss": 0.0027, "num_tokens": 914610852.0, "reward": 0.7866131067276001, "reward_std": 0.34916025400161743, "rewards/TRLRewardAdapter/mean": 0.7866131067276001, "rewards/TRLRewardAdapter/std": 0.34916022419929504, "sampling/importance_sampling_ratio/max": 2.2979559898376465, "sampling/importance_sampling_ratio/mean": 0.20804493129253387, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 28.125, "sampling/sampling_logp_difference/mean": 0.018508320674300194, "step": 382, "step_time": 256.0380858631106 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.06875000149011612, "completions/max_length": 10000.0, "completions/max_terminated_length": 9976.0, "completions/mean_length": 2343.69482421875, "completions/mean_terminated_length": 1778.4642333984375, "completions/min_length": 52.0, "completions/min_terminated_length": 52.0, "entropy": 0.12580247720082602, "epoch": 0.25567423230974634, "frac_reward_zero_std": 0.03333333507180214, "grad_norm": 3.7335965417324955e-05, "learning_rate": 5e-06, "loss": -0.0, "num_tokens": 917272639.0, "reward": 0.6723805069923401, "reward_std": 0.41620245575904846, "rewards/TRLRewardAdapter/mean": 0.6723804473876953, "rewards/TRLRewardAdapter/std": 0.41620245575904846, "sampling/importance_sampling_ratio/max": 2.5427374839782715, "sampling/importance_sampling_ratio/mean": 0.1992255300283432, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 41.75, "sampling/sampling_logp_difference/mean": 0.017099758610129356, "step": 383, "step_time": 362.0799884790322 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01666666753590107, "completions/max_length": 10000.0, "completions/max_terminated_length": 9733.0, "completions/mean_length": 2166.1689453125, "completions/mean_terminated_length": 2033.3919677734375, "completions/min_length": 68.0, "completions/min_terminated_length": 68.0, "entropy": 0.1513983036080996, "epoch": 0.25634178905206945, "frac_reward_zero_std": 0.03333333507180214, "grad_norm": 0.0001044314853625996, "learning_rate": 5e-06, "loss": 0.0001, "num_tokens": 919787809.0, "reward": 0.6603935956954956, "reward_std": 0.4078635573387146, "rewards/TRLRewardAdapter/mean": 0.6603935956954956, "rewards/TRLRewardAdapter/std": 0.4078635573387146, "sampling/importance_sampling_ratio/max": 1.8395601511001587, "sampling/importance_sampling_ratio/mean": 0.12045484781265259, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 7.3449835777282715, "sampling/sampling_logp_difference/mean": 0.020258335396647453, "step": 384, "step_time": 222.23596003605053 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.04062500223517418, "completions/max_length": 10000.0, "completions/max_terminated_length": 9855.0, "completions/mean_length": 2118.058349609375, "completions/mean_terminated_length": 1784.29541015625, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "entropy": 0.16176308939854303, "epoch": 0.2570093457943925, "frac_reward_zero_std": 0.10000000894069672, "grad_norm": 2.125512464092648e-05, "learning_rate": 5e-06, "loss": -0.0, "num_tokens": 922262457.0, "reward": 0.7059398889541626, "reward_std": 0.3826860785484314, "rewards/TRLRewardAdapter/mean": 0.7059398889541626, "rewards/TRLRewardAdapter/std": 0.3826860785484314, "sampling/importance_sampling_ratio/max": 1.4399378299713135, "sampling/importance_sampling_ratio/mean": 0.1270807981491089, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 32.5, "sampling/sampling_logp_difference/mean": 0.022135494276881218, "step": 385, "step_time": 340.00093251198996 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01875000074505806, "completions/max_length": 10000.0, "completions/max_terminated_length": 9991.0, "completions/mean_length": 1106.1761474609375, "completions/mean_terminated_length": 936.2304077148438, "completions/min_length": 40.0, "completions/min_terminated_length": 40.0, "entropy": 0.12578571711977324, "epoch": 0.2576769025367156, "frac_reward_zero_std": 0.10000000894069672, "grad_norm": 0.0003862498016048056, "learning_rate": 5e-06, "loss": 0.0001, "num_tokens": 923742530.0, "reward": 0.7577168941497803, "reward_std": 0.38164034485816956, "rewards/TRLRewardAdapter/mean": 0.7577168941497803, "rewards/TRLRewardAdapter/std": 0.38164031505584717, "sampling/importance_sampling_ratio/max": 2.0448668003082275, "sampling/importance_sampling_ratio/mean": 0.23582319915294647, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 16.356597900390625, "sampling/sampling_logp_difference/mean": 0.017740121111273766, "step": 386, "step_time": 245.78766402206384 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02083333395421505, "completions/max_length": 10000.0, "completions/max_terminated_length": 9858.0, "completions/mean_length": 1883.1083984375, "completions/mean_terminated_length": 1710.408447265625, "completions/min_length": 54.0, "completions/min_terminated_length": 54.0, "entropy": 0.12339989965160687, "epoch": 0.2583444592790387, "frac_reward_zero_std": 0.0, "grad_norm": 0.0005271753104231166, "learning_rate": 5e-06, "loss": -0.0, "num_tokens": 925978186.0, "reward": 0.7178963422775269, "reward_std": 0.3935711681842804, "rewards/TRLRewardAdapter/mean": 0.7178963422775269, "rewards/TRLRewardAdapter/std": 0.3935711681842804, "sampling/importance_sampling_ratio/max": 2.543313980102539, "sampling/importance_sampling_ratio/mean": 0.1394425332546234, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 9.625, "sampling/sampling_logp_difference/mean": 0.0175404604524374, "step": 387, "step_time": 271.5617952479515 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03437500074505806, "completions/max_length": 10000.0, "completions/max_terminated_length": 9708.0, "completions/mean_length": 2281.571044921875, "completions/mean_terminated_length": 2006.8046875, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "entropy": 0.13645566999912262, "epoch": 0.2590120160213618, "frac_reward_zero_std": 0.03333333507180214, "grad_norm": 0.0001619302197703405, "learning_rate": 5e-06, "loss": 0.0026, "num_tokens": 928601582.0, "reward": 0.6870055198669434, "reward_std": 0.381174772977829, "rewards/TRLRewardAdapter/mean": 0.6870054602622986, "rewards/TRLRewardAdapter/std": 0.3811747431755066, "sampling/importance_sampling_ratio/max": 1.665347695350647, "sampling/importance_sampling_ratio/mean": 0.11203374713659286, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 4.969563007354736, "sampling/sampling_logp_difference/mean": 0.01872081868350506, "step": 388, "step_time": 235.77637089486234 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08437500149011612, "completions/max_length": 10000.0, "completions/max_terminated_length": 9429.0, "completions/mean_length": 2474.157470703125, "completions/mean_terminated_length": 1780.6495361328125, "completions/min_length": 41.0, "completions/min_terminated_length": 41.0, "entropy": 0.14102640251318613, "epoch": 0.2596795727636849, "frac_reward_zero_std": 0.10000000894069672, "grad_norm": 0.00040214383596269803, "learning_rate": 5e-06, "loss": -0.0001, "num_tokens": 931376133.0, "reward": 0.675704300403595, "reward_std": 0.3934566080570221, "rewards/TRLRewardAdapter/mean": 0.6757042407989502, "rewards/TRLRewardAdapter/std": 0.3934566378593445, "sampling/importance_sampling_ratio/max": 1.6078296899795532, "sampling/importance_sampling_ratio/mean": 0.12566161155700684, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 12.291631698608398, "sampling/sampling_logp_difference/mean": 0.019310200586915016, "step": 389, "step_time": 354.6826764991274 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0020833334419876337, "completions/max_length": 10000.0, "completions/max_terminated_length": 8384.0, "completions/mean_length": 1588.2542724609375, "completions/mean_terminated_length": 1570.693115234375, "completions/min_length": 64.0, "completions/min_terminated_length": 64.0, "entropy": 0.15610901017983755, "epoch": 0.260347129506008, "frac_reward_zero_std": 0.03333333507180214, "grad_norm": 1.85816668116058e-05, "learning_rate": 5e-06, "loss": 0.0, "num_tokens": 933342681.0, "reward": 0.7829093337059021, "reward_std": 0.3053264617919922, "rewards/TRLRewardAdapter/mean": 0.7829092741012573, "rewards/TRLRewardAdapter/std": 0.3053264319896698, "sampling/importance_sampling_ratio/max": 1.8912959098815918, "sampling/importance_sampling_ratio/mean": 0.1410655826330185, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 5.59375, "sampling/sampling_logp_difference/mean": 0.020688354969024658, "step": 390, "step_time": 139.5501988498727 }, { "epoch": 0.260347129506008, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.026739129594162754, "eval_completions/max_length": 10000.0, "eval_completions/max_terminated_length": 9241.04347826087, "eval_completions/mean_length": 1450.6160570227582, "eval_completions/mean_terminated_length": 1215.786894425102, "eval_completions/min_length": 50.04347826086956, "eval_completions/min_terminated_length": 50.04347826086956, "eval_entropy": 0.1663537200378335, "eval_frac_reward_zero_std": 1.0, "eval_loss": 0.0, "eval_num_tokens": 933342681.0, "eval_reward": 0.6423870921134949, "eval_reward_std": 0.4229615771252176, "eval_rewards/TRLRewardAdapter/mean": 0.6423870947050012, "eval_rewards/TRLRewardAdapter/std": 0.4229615745337113, "eval_runtime": 1417.8597, "eval_samples_per_second": 3.22, "eval_sampling/importance_sampling_ratio/max": 1.7976010364034902, "eval_sampling/importance_sampling_ratio/mean": 0.2982616826244023, "eval_sampling/importance_sampling_ratio/min": 0.0, "eval_sampling/sampling_logp_difference/max": 6.935947972795238, "eval_sampling/sampling_logp_difference/mean": 0.0217001650320447, "eval_steps_per_second": 0.016, "step": 390 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02500000223517418, "completions/max_length": 10000.0, "completions/max_terminated_length": 9970.0, "completions/mean_length": 2086.705322265625, "completions/mean_terminated_length": 1883.80029296875, "completions/min_length": 60.0, "completions/min_terminated_length": 60.0, "entropy": 0.15016180773576102, "epoch": 0.26101468624833113, "frac_reward_zero_std": 0.10000000894069672, "grad_norm": 0.000337952950132825, "learning_rate": 5e-06, "loss": 0.0, "num_tokens": 935793182.0, "reward": 0.668552041053772, "reward_std": 0.40904638171195984, "rewards/TRLRewardAdapter/mean": 0.668552041053772, "rewards/TRLRewardAdapter/std": 0.40904635190963745, "sampling/importance_sampling_ratio/max": 1.8202277421951294, "sampling/importance_sampling_ratio/mean": 0.14552481472492218, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 56.056854248046875, "sampling/sampling_logp_difference/mean": 0.02067953161895275, "step": 391, "step_time": 308.52002315595746 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0989583358168602, "completions/max_length": 10000.0, "completions/max_terminated_length": 9789.0, "completions/mean_length": 2712.987548828125, "completions/mean_terminated_length": 1912.6796875, "completions/min_length": 52.0, "completions/min_terminated_length": 52.0, "entropy": 0.1316201314330101, "epoch": 0.2616822429906542, "frac_reward_zero_std": 0.10000000894069672, "grad_norm": 0.00021804476138400904, "learning_rate": 5e-06, "loss": -0.0001, "num_tokens": 938860562.0, "reward": 0.7085332870483398, "reward_std": 0.37272319197654724, "rewards/TRLRewardAdapter/mean": 0.7085332274436951, "rewards/TRLRewardAdapter/std": 0.37272319197654724, "sampling/importance_sampling_ratio/max": 1.511287808418274, "sampling/importance_sampling_ratio/mean": 0.12070278823375702, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 6.75, "sampling/sampling_logp_difference/mean": 0.017771996557712555, "step": 392, "step_time": 391.37035794905387 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08125000447034836, "completions/max_length": 10000.0, "completions/max_terminated_length": 9984.0, "completions/mean_length": 2992.424072265625, "completions/mean_terminated_length": 2372.706298828125, "completions/min_length": 62.0, "completions/min_terminated_length": 62.0, "entropy": 0.17508464554945627, "epoch": 0.2623497997329773, "frac_reward_zero_std": 0.06666667014360428, "grad_norm": 8.267425452104245e-06, "learning_rate": 5e-06, "loss": -0.0, "num_tokens": 942196681.0, "reward": 0.629449725151062, "reward_std": 0.39736807346343994, "rewards/TRLRewardAdapter/mean": 0.629449725151062, "rewards/TRLRewardAdapter/std": 0.39736807346343994, "sampling/importance_sampling_ratio/max": 1.1490951776504517, "sampling/importance_sampling_ratio/mean": 0.044362571090459824, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 56.248069763183594, "sampling/sampling_logp_difference/mean": 0.022919822484254837, "step": 393, "step_time": 396.4554894999601 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1354166716337204, "completions/max_length": 10000.0, "completions/max_terminated_length": 9949.0, "completions/mean_length": 3714.234619140625, "completions/mean_terminated_length": 2729.717041015625, "completions/min_length": 68.0, "completions/min_terminated_length": 68.0, "entropy": 0.17053831120332083, "epoch": 0.2630173564753004, "frac_reward_zero_std": 0.1666666716337204, "grad_norm": 1.0191835025985177e-05, "learning_rate": 5e-06, "loss": -0.0, "num_tokens": 946243114.0, "reward": 0.559101939201355, "reward_std": 0.4031083285808563, "rewards/TRLRewardAdapter/mean": 0.559101939201355, "rewards/TRLRewardAdapter/std": 0.4031083285808563, "sampling/importance_sampling_ratio/max": 2.4719746112823486, "sampling/importance_sampling_ratio/mean": 0.07111819088459015, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 43.12621307373047, "sampling/sampling_logp_difference/mean": 0.0217319056391716, "step": 394, "step_time": 376.9579858472571 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00729166716337204, "completions/max_length": 10000.0, "completions/max_terminated_length": 9737.0, "completions/mean_length": 2014.3948974609375, "completions/mean_terminated_length": 1955.7386474609375, "completions/min_length": 86.0, "completions/min_terminated_length": 86.0, "entropy": 0.17502794166405997, "epoch": 0.2636849132176235, "frac_reward_zero_std": 0.03333333507180214, "grad_norm": 1.473735109308606e-05, "learning_rate": 5e-06, "loss": -0.0, "num_tokens": 948618853.0, "reward": 0.7732231616973877, "reward_std": 0.29242008924484253, "rewards/TRLRewardAdapter/mean": 0.7732231020927429, "rewards/TRLRewardAdapter/std": 0.29242008924484253, "sampling/importance_sampling_ratio/max": 2.2563130855560303, "sampling/importance_sampling_ratio/mean": 0.05922776460647583, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 18.0, "sampling/sampling_logp_difference/mean": 0.023191016167402267, "step": 395, "step_time": 223.79521680087782 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.04270833730697632, "completions/max_length": 10000.0, "completions/max_terminated_length": 9738.0, "completions/mean_length": 2216.42724609375, "completions/mean_terminated_length": 1869.1729736328125, "completions/min_length": 60.0, "completions/min_terminated_length": 60.0, "entropy": 0.14131479958693186, "epoch": 0.2643524699599466, "frac_reward_zero_std": 0.03333333507180214, "grad_norm": 3.38100713433684e-05, "learning_rate": 5e-06, "loss": -0.0, "num_tokens": 951216799.0, "reward": 0.7581943869590759, "reward_std": 0.3328278064727783, "rewards/TRLRewardAdapter/mean": 0.7581943273544312, "rewards/TRLRewardAdapter/std": 0.33282777667045593, "sampling/importance_sampling_ratio/max": 1.7143505811691284, "sampling/importance_sampling_ratio/mean": 0.13557863235473633, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 17.0, "sampling/sampling_logp_difference/mean": 0.019376251846551895, "step": 396, "step_time": 324.43397783406544 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 10000.0, "completions/max_terminated_length": 9436.0, "completions/mean_length": 1801.3719482421875, "completions/mean_terminated_length": 1671.2349853515625, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "entropy": 0.17197711020708084, "epoch": 0.2650200267022697, "frac_reward_zero_std": 0.03333333507180214, "grad_norm": 7.665604888260225e-05, "learning_rate": 5e-06, "loss": -0.0001, "num_tokens": 953389604.0, "reward": 0.7762883901596069, "reward_std": 0.30502769351005554, "rewards/TRLRewardAdapter/mean": 0.7762883305549622, "rewards/TRLRewardAdapter/std": 0.30502772331237793, "sampling/importance_sampling_ratio/max": 1.3915154933929443, "sampling/importance_sampling_ratio/mean": 0.09827818721532822, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 8.24807071685791, "sampling/sampling_logp_difference/mean": 0.022552315145730972, "step": 397, "step_time": 288.3731270239223 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.07500000298023224, "completions/max_length": 10000.0, "completions/max_terminated_length": 9062.0, "completions/mean_length": 2054.25439453125, "completions/mean_terminated_length": 1410.0045166015625, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "entropy": 0.12804735700289407, "epoch": 0.2656875834445928, "frac_reward_zero_std": 0.10000000894069672, "grad_norm": 3.026070074831449e-05, "learning_rate": 5e-06, "loss": 0.0003, "num_tokens": 955811032.0, "reward": 0.7290059924125671, "reward_std": 0.37476101517677307, "rewards/TRLRewardAdapter/mean": 0.7290059328079224, "rewards/TRLRewardAdapter/std": 0.37476104497909546, "sampling/importance_sampling_ratio/max": 1.7840882539749146, "sampling/importance_sampling_ratio/mean": 0.11578605324029922, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 27.134021759033203, "sampling/sampling_logp_difference/mean": 0.017632098868489265, "step": 398, "step_time": 393.6341925120214 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09583333879709244, "completions/max_length": 10000.0, "completions/max_terminated_length": 9901.0, "completions/mean_length": 2885.638671875, "completions/mean_terminated_length": 2131.581787109375, "completions/min_length": 54.0, "completions/min_terminated_length": 54.0, "entropy": 0.15518027544021606, "epoch": 0.26635514018691586, "frac_reward_zero_std": 0.06666667014360428, "grad_norm": 1.1584914518724495e-05, "learning_rate": 5e-06, "loss": 0.0001, "num_tokens": 959074301.0, "reward": 0.6443060636520386, "reward_std": 0.3824462592601776, "rewards/TRLRewardAdapter/mean": 0.6443060636520386, "rewards/TRLRewardAdapter/std": 0.3824462592601776, "sampling/importance_sampling_ratio/max": 2.082005262374878, "sampling/importance_sampling_ratio/mean": 0.07381443679332733, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 51.0, "sampling/sampling_logp_difference/mean": 0.02014206163585186, "step": 399, "step_time": 391.6560095989844 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.07395833730697632, "completions/max_length": 10000.0, "completions/max_terminated_length": 9904.0, "completions/mean_length": 2783.580322265625, "completions/mean_terminated_length": 2207.24072265625, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.16383168349663416, "epoch": 0.26702269692923897, "frac_reward_zero_std": 0.06666667014360428, "grad_norm": 3.927021094112977e-06, "learning_rate": 5e-06, "loss": -0.0, "num_tokens": 962170442.0, "reward": 0.6765222549438477, "reward_std": 0.38263800740242004, "rewards/TRLRewardAdapter/mean": 0.6765221953392029, "rewards/TRLRewardAdapter/std": 0.38263797760009766, "sampling/importance_sampling_ratio/max": 0.9627533555030823, "sampling/importance_sampling_ratio/mean": 0.05882517248392105, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 34.5, "sampling/sampling_logp_difference/mean": 0.02149171195924282, "step": 400, "step_time": 359.0179472381715 }, { "epoch": 0.26702269692923897, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.030869564324941322, "eval_completions/max_length": 10000.0, "eval_completions/max_terminated_length": 9216.434782608696, "eval_completions/mean_length": 1360.754102623981, "eval_completions/mean_terminated_length": 1085.8644674549932, "eval_completions/min_length": 49.17391304347826, "eval_completions/min_terminated_length": 49.17391304347826, "eval_entropy": 0.16026884446973386, "eval_frac_reward_zero_std": 1.0, "eval_loss": 0.0, "eval_num_tokens": 962170442.0, "eval_reward": 0.6105995074562405, "eval_reward_std": 0.4395737324071967, "eval_rewards/TRLRewardAdapter/mean": 0.6105995255967845, "eval_rewards/TRLRewardAdapter/std": 0.43957373370294983, "eval_runtime": 1407.7478, "eval_samples_per_second": 3.243, "eval_sampling/importance_sampling_ratio/max": 2.0015333165293154, "eval_sampling/importance_sampling_ratio/mean": 0.3261256904705711, "eval_sampling/importance_sampling_ratio/min": 0.0, "eval_sampling/sampling_logp_difference/max": 8.257099167160366, "eval_sampling/sampling_logp_difference/mean": 0.021078407845419388, "eval_steps_per_second": 0.016, "step": 400 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0677083358168602, "completions/max_length": 10000.0, "completions/max_terminated_length": 9774.0, "completions/mean_length": 2200.77099609375, "completions/mean_terminated_length": 1634.3463134765625, "completions/min_length": 74.0, "completions/min_terminated_length": 74.0, "entropy": 0.14611802250146866, "epoch": 0.2676902536715621, "frac_reward_zero_std": 0.10000000894069672, "grad_norm": 6.639497654919898e-06, "learning_rate": 5e-06, "loss": -0.0, "num_tokens": 964729710.0, "reward": 0.7407668232917786, "reward_std": 0.3499545454978943, "rewards/TRLRewardAdapter/mean": 0.7407667636871338, "rewards/TRLRewardAdapter/std": 0.3499545156955719, "sampling/importance_sampling_ratio/max": 1.8409531116485596, "sampling/importance_sampling_ratio/mean": 0.1157359853386879, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 4.8068528175354, "sampling/sampling_logp_difference/mean": 0.01971454732120037, "step": 401, "step_time": 381.1008231488522 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02812500111758709, "completions/max_length": 10000.0, "completions/max_terminated_length": 9897.0, "completions/mean_length": 2185.182373046875, "completions/mean_terminated_length": 1959.0299072265625, "completions/min_length": 69.0, "completions/min_terminated_length": 69.0, "entropy": 0.16746549059947333, "epoch": 0.2683578104138852, "frac_reward_zero_std": 0.03333333507180214, "grad_norm": 7.980767541501329e-06, "learning_rate": 5e-06, "loss": -0.0, "num_tokens": 967351325.0, "reward": 0.7581387162208557, "reward_std": 0.3126850426197052, "rewards/TRLRewardAdapter/mean": 0.7581386566162109, "rewards/TRLRewardAdapter/std": 0.3126850426197052, "sampling/importance_sampling_ratio/max": 2.1669907569885254, "sampling/importance_sampling_ratio/mean": 0.11185529083013535, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 13.25, "sampling/sampling_logp_difference/mean": 0.022351957857608795, "step": 402, "step_time": 347.2106857220642 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09270834177732468, "completions/max_length": 10000.0, "completions/max_terminated_length": 9658.0, "completions/mean_length": 2964.775146484375, "completions/mean_terminated_length": 2245.90576171875, "completions/min_length": 77.0, "completions/min_terminated_length": 77.0, "entropy": 0.16202715039253235, "epoch": 0.2690253671562083, "frac_reward_zero_std": 0.10000000894069672, "grad_norm": 6.908802012653152e-05, "learning_rate": 5e-06, "loss": -0.0001, "num_tokens": 970623941.0, "reward": 0.6669699549674988, "reward_std": 0.37962353229522705, "rewards/TRLRewardAdapter/mean": 0.6669699549674988, "rewards/TRLRewardAdapter/std": 0.37962353229522705, "sampling/importance_sampling_ratio/max": 1.5657083988189697, "sampling/importance_sampling_ratio/mean": 0.0983724370598793, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 24.101531982421875, "sampling/sampling_logp_difference/mean": 0.021145585924386978, "step": 403, "step_time": 383.6222748470027 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01458333432674408, "completions/max_length": 10000.0, "completions/max_terminated_length": 9853.0, "completions/mean_length": 1710.7938232421875, "completions/mean_terminated_length": 1588.1204833984375, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "entropy": 0.13176462550957999, "epoch": 0.2696929238985314, "frac_reward_zero_std": 0.03333333507180214, "grad_norm": 2.4318131838482612e-05, "learning_rate": 5e-06, "loss": -0.0, "num_tokens": 972673599.0, "reward": 0.8085875511169434, "reward_std": 0.28127777576446533, "rewards/TRLRewardAdapter/mean": 0.8085874915122986, "rewards/TRLRewardAdapter/std": 0.28127777576446533, "sampling/importance_sampling_ratio/max": 2.368854522705078, "sampling/importance_sampling_ratio/mean": 0.1496104747056961, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 3.9077272415161133, "sampling/sampling_logp_difference/mean": 0.01796255074441433, "step": 404, "step_time": 293.5846458537271 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02916666865348816, "completions/max_length": 10000.0, "completions/max_terminated_length": 9886.0, "completions/mean_length": 2007.712646484375, "completions/mean_terminated_length": 1767.600830078125, "completions/min_length": 62.0, "completions/min_terminated_length": 62.0, "entropy": 0.1441312531630198, "epoch": 0.2703604806408545, "frac_reward_zero_std": 0.03333333507180214, "grad_norm": 0.00024781995780243, "learning_rate": 5e-06, "loss": 0.0001, "num_tokens": 975041739.0, "reward": 0.7064379453659058, "reward_std": 0.3841613531112671, "rewards/TRLRewardAdapter/mean": 0.7064379453659058, "rewards/TRLRewardAdapter/std": 0.3841613531112671, "sampling/importance_sampling_ratio/max": 1.5672305822372437, "sampling/importance_sampling_ratio/mean": 0.10929770767688751, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 7.41016960144043, "sampling/sampling_logp_difference/mean": 0.01968231610953808, "step": 405, "step_time": 339.1389740679879 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013541667722165585, "completions/max_length": 10000.0, "completions/max_terminated_length": 9795.0, "completions/mean_length": 1765.4698486328125, "completions/mean_terminated_length": 1652.4298095703125, "completions/min_length": 64.0, "completions/min_terminated_length": 64.0, "entropy": 0.13180537770191827, "epoch": 0.27102803738317754, "frac_reward_zero_std": 0.03333333507180214, "grad_norm": 4.1950340850529255e-05, "learning_rate": 5e-06, "loss": -0.0, "num_tokens": 977185646.0, "reward": 0.7725633978843689, "reward_std": 0.32077792286872864, "rewards/TRLRewardAdapter/mean": 0.7725633382797241, "rewards/TRLRewardAdapter/std": 0.32077792286872864, "sampling/importance_sampling_ratio/max": 2.79819655418396, "sampling/importance_sampling_ratio/mean": 0.10723326355218887, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 4.03125, "sampling/sampling_logp_difference/mean": 0.017820317298173904, "step": 406, "step_time": 290.0956874979893 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.06145833805203438, "completions/max_length": 10000.0, "completions/max_terminated_length": 9927.0, "completions/mean_length": 2984.994873046875, "completions/mean_terminated_length": 2525.6328125, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "entropy": 0.1766236275434494, "epoch": 0.27169559412550065, "frac_reward_zero_std": 0.13333334028720856, "grad_norm": 1.0341662534642958e-05, "learning_rate": 5e-06, "loss": -0.0, "num_tokens": 980526761.0, "reward": 0.6100820302963257, "reward_std": 0.4067338705062866, "rewards/TRLRewardAdapter/mean": 0.6100819706916809, "rewards/TRLRewardAdapter/std": 0.4067338705062866, "sampling/importance_sampling_ratio/max": 1.7174222469329834, "sampling/importance_sampling_ratio/mean": 0.07787225395441055, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 12.25, "sampling/sampling_logp_difference/mean": 0.022664804011583328, "step": 407, "step_time": 381.98086314089596 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02708333544433117, "completions/max_length": 10000.0, "completions/max_terminated_length": 9905.0, "completions/mean_length": 1684.2021484375, "completions/mean_terminated_length": 1452.7130126953125, "completions/min_length": 52.0, "completions/min_terminated_length": 52.0, "entropy": 0.14318853865067163, "epoch": 0.27236315086782376, "frac_reward_zero_std": 0.20000001788139343, "grad_norm": 0.00036408703151548437, "learning_rate": 5e-06, "loss": -0.0001, "num_tokens": 982565643.0, "reward": 0.7209904193878174, "reward_std": 0.386615127325058, "rewards/TRLRewardAdapter/mean": 0.7209903597831726, "rewards/TRLRewardAdapter/std": 0.386615127325058, "sampling/importance_sampling_ratio/max": 2.50241756439209, "sampling/importance_sampling_ratio/mean": 0.11275880038738251, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 4.65625, "sampling/sampling_logp_difference/mean": 0.01986583136022091, "step": 408, "step_time": 347.3044920390239 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03750000149011612, "completions/max_length": 10000.0, "completions/max_terminated_length": 9293.0, "completions/mean_length": 2204.367919921875, "completions/mean_terminated_length": 1900.6417236328125, "completions/min_length": 81.0, "completions/min_terminated_length": 81.0, "entropy": 0.15813934803009033, "epoch": 0.27303070761014686, "frac_reward_zero_std": 0.03333333507180214, "grad_norm": 2.8221699382844762e-05, "learning_rate": 5e-06, "loss": -0.0, "num_tokens": 985138060.0, "reward": 0.7356215119361877, "reward_std": 0.3306973874568939, "rewards/TRLRewardAdapter/mean": 0.735621452331543, "rewards/TRLRewardAdapter/std": 0.3306973874568939, "sampling/importance_sampling_ratio/max": 2.7405922412872314, "sampling/importance_sampling_ratio/mean": 0.06565914303064346, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 46.20545196533203, "sampling/sampling_logp_difference/mean": 0.02095605805516243, "step": 409, "step_time": 377.63309240702074 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0677083358168602, "completions/max_length": 10000.0, "completions/max_terminated_length": 9984.0, "completions/mean_length": 2134.904296875, "completions/mean_terminated_length": 1563.696044921875, "completions/min_length": 86.0, "completions/min_terminated_length": 86.0, "entropy": 0.16594761361678442, "epoch": 0.27369826435246997, "frac_reward_zero_std": 0.13333334028720856, "grad_norm": 8.741283500959033e-06, "learning_rate": 5e-06, "loss": -0.0, "num_tokens": 987624016.0, "reward": 0.7235954403877258, "reward_std": 0.3731254041194916, "rewards/TRLRewardAdapter/mean": 0.7235954403877258, "rewards/TRLRewardAdapter/std": 0.3731254041194916, "sampling/importance_sampling_ratio/max": 1.1907539367675781, "sampling/importance_sampling_ratio/mean": 0.09292031824588776, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 37.0, "sampling/sampling_logp_difference/mean": 0.02109154686331749, "step": 410, "step_time": 390.1976270779269 }, { "epoch": 0.27369826435246997, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.0254347819187071, "eval_completions/max_length": 10000.0, "eval_completions/max_terminated_length": 8709.347826086956, "eval_completions/mean_length": 1175.0382265837295, "eval_completions/mean_terminated_length": 944.4248551078465, "eval_completions/min_length": 48.52173913043478, "eval_completions/min_terminated_length": 48.52173913043478, "eval_entropy": 0.15156039789966916, "eval_frac_reward_zero_std": 1.0, "eval_loss": 0.0, "eval_num_tokens": 987624016.0, "eval_reward": 0.5887503572132277, "eval_reward_std": 0.4525719595992047, "eval_rewards/TRLRewardAdapter/mean": 0.5887503649877466, "eval_rewards/TRLRewardAdapter/std": 0.45257195830345154, "eval_runtime": 1394.1299, "eval_samples_per_second": 3.275, "eval_sampling/importance_sampling_ratio/max": 1.9533383379811826, "eval_sampling/importance_sampling_ratio/mean": 0.38132030549256696, "eval_sampling/importance_sampling_ratio/min": 0.0, "eval_sampling/sampling_logp_difference/max": 9.986089903375376, "eval_sampling/sampling_logp_difference/mean": 0.020180312916636467, "eval_steps_per_second": 0.016, "step": 410 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03541667014360428, "completions/max_length": 10000.0, "completions/max_terminated_length": 9789.0, "completions/mean_length": 2437.878173828125, "completions/mean_terminated_length": 2160.21923828125, "completions/min_length": 52.0, "completions/min_terminated_length": 52.0, "entropy": 0.15434736758470535, "epoch": 0.27436582109479307, "frac_reward_zero_std": 0.06666667014360428, "grad_norm": 2.5926855163843807e-05, "learning_rate": 5e-06, "loss": -0.0, "num_tokens": 990468731.0, "reward": 0.6518151760101318, "reward_std": 0.39131486415863037, "rewards/TRLRewardAdapter/mean": 0.6518151760101318, "rewards/TRLRewardAdapter/std": 0.39131489396095276, "sampling/importance_sampling_ratio/max": 1.3977112770080566, "sampling/importance_sampling_ratio/mean": 0.14134684205055237, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 10.193679809570312, "sampling/sampling_logp_difference/mean": 0.020641865208745003, "step": 411, "step_time": 364.6299540338805 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03750000149011612, "completions/max_length": 10000.0, "completions/max_terminated_length": 9930.0, "completions/mean_length": 2153.855224609375, "completions/mean_terminated_length": 1848.1612548828125, "completions/min_length": 75.0, "completions/min_terminated_length": 75.0, "entropy": 0.12206229070822398, "epoch": 0.2750333778371162, "frac_reward_zero_std": 0.06666667014360428, "grad_norm": 3.2122712200836976e-05, "learning_rate": 5e-06, "loss": -0.0, "num_tokens": 992959408.0, "reward": 0.7532145380973816, "reward_std": 0.33592459559440613, "rewards/TRLRewardAdapter/mean": 0.7532145380973816, "rewards/TRLRewardAdapter/std": 0.33592459559440613, "sampling/importance_sampling_ratio/max": 1.700029969215393, "sampling/importance_sampling_ratio/mean": 0.12136764824390411, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 8.887248039245605, "sampling/sampling_logp_difference/mean": 0.016496261581778526, "step": 412, "step_time": 294.22386969823856 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.06354167312383652, "completions/max_length": 10000.0, "completions/max_terminated_length": 9705.0, "completions/mean_length": 1970.7667236328125, "completions/mean_terminated_length": 1425.9576416015625, "completions/min_length": 62.0, "completions/min_terminated_length": 62.0, "entropy": 0.1632708559433619, "epoch": 0.2757009345794392, "frac_reward_zero_std": 0.03333333507180214, "grad_norm": 5.645914531594319e-05, "learning_rate": 5e-06, "loss": 0.0009, "num_tokens": 995285936.0, "reward": 0.7726653218269348, "reward_std": 0.3186584711074829, "rewards/TRLRewardAdapter/mean": 0.77266526222229, "rewards/TRLRewardAdapter/std": 0.3186584711074829, "sampling/importance_sampling_ratio/max": 1.1842550039291382, "sampling/importance_sampling_ratio/mean": 0.08007583767175674, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 27.0, "sampling/sampling_logp_difference/mean": 0.02121994085609913, "step": 413, "step_time": 374.91678724985104 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02708333544433117, "completions/max_length": 10000.0, "completions/max_terminated_length": 8203.0, "completions/mean_length": 1652.7177734375, "completions/mean_terminated_length": 1420.3521728515625, "completions/min_length": 127.0, "completions/min_terminated_length": 127.0, "entropy": 0.14093205084403357, "epoch": 0.27636849132176233, "frac_reward_zero_std": 0.03333333507180214, "grad_norm": 0.0001646008496378371, "learning_rate": 5e-06, "loss": -0.0002, "num_tokens": 997245121.0, "reward": 0.7620292901992798, "reward_std": 0.34299641847610474, "rewards/TRLRewardAdapter/mean": 0.7620292901992798, "rewards/TRLRewardAdapter/std": 0.3429964482784271, "sampling/importance_sampling_ratio/max": 0.9462296962738037, "sampling/importance_sampling_ratio/mean": 0.056335847824811935, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 10.844983100891113, "sampling/sampling_logp_difference/mean": 0.01917768456041813, "step": 414, "step_time": 322.16713806882035 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9817.0, "completions/max_terminated_length": 9817.0, "completions/mean_length": 2085.140625, "completions/mean_terminated_length": 2085.140625, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "entropy": 0.13451137642065683, "epoch": 0.27703604806408544, "frac_reward_zero_std": 0.0, "grad_norm": 9.969909391895965e-06, "learning_rate": 5e-06, "loss": -0.0, "num_tokens": 999730472.0, "reward": 0.7757638096809387, "reward_std": 0.28308799862861633, "rewards/TRLRewardAdapter/mean": 0.775763750076294, "rewards/TRLRewardAdapter/std": 0.28308799862861633, "sampling/importance_sampling_ratio/max": 2.076106071472168, "sampling/importance_sampling_ratio/mean": 0.13609851896762848, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 15.637249946594238, "sampling/sampling_logp_difference/mean": 0.018313398584723473, "step": 415, "step_time": 197.6264984671725 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.07187500596046448, "completions/max_length": 10000.0, "completions/max_terminated_length": 9997.0, "completions/mean_length": 2428.841796875, "completions/mean_terminated_length": 1842.5230712890625, "completions/min_length": 82.0, "completions/min_terminated_length": 82.0, "entropy": 0.1639251708984375, "epoch": 0.27770360480640854, "frac_reward_zero_std": 0.10000000894069672, "grad_norm": 0.00011642205021780068, "learning_rate": 5e-06, "loss": -0.0, "num_tokens": 1002471248.0, "reward": 0.7048565149307251, "reward_std": 0.38382983207702637, "rewards/TRLRewardAdapter/mean": 0.7048565149307251, "rewards/TRLRewardAdapter/std": 0.38382983207702637, "sampling/importance_sampling_ratio/max": 1.5904319286346436, "sampling/importance_sampling_ratio/mean": 0.12232930958271027, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 25.53516960144043, "sampling/sampling_logp_difference/mean": 0.021794654428958893, "step": 416, "step_time": 303.2557416759664 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0416666679084301, "completions/max_length": 10000.0, "completions/max_terminated_length": 9451.0, "completions/mean_length": 2099.602294921875, "completions/mean_terminated_length": 1756.1064453125, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "entropy": 0.1544155776500702, "epoch": 0.27837116154873165, "frac_reward_zero_std": 0.03333333507180214, "grad_norm": 7.827698018343941e-05, "learning_rate": 5e-06, "loss": -0.0, "num_tokens": 1004936434.0, "reward": 0.7461509108543396, "reward_std": 0.33451542258262634, "rewards/TRLRewardAdapter/mean": 0.7461508512496948, "rewards/TRLRewardAdapter/std": 0.33451542258262634, "sampling/importance_sampling_ratio/max": 1.1950381994247437, "sampling/importance_sampling_ratio/mean": 0.07465216517448425, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 5.887245178222656, "sampling/sampling_logp_difference/mean": 0.02035471796989441, "step": 417, "step_time": 327.0528466090327 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01145833358168602, "completions/max_length": 10000.0, "completions/max_terminated_length": 8690.0, "completions/mean_length": 1475.784423828125, "completions/mean_terminated_length": 1376.9788818359375, "completions/min_length": 55.0, "completions/min_terminated_length": 55.0, "entropy": 0.13591650873422623, "epoch": 0.27903871829105475, "frac_reward_zero_std": 0.03333333507180214, "grad_norm": 4.033884770630608e-05, "learning_rate": 5e-06, "loss": -0.0, "num_tokens": 1006824067.0, "reward": 0.8221532702445984, "reward_std": 0.26744791865348816, "rewards/TRLRewardAdapter/mean": 0.8221532106399536, "rewards/TRLRewardAdapter/std": 0.26744791865348816, "sampling/importance_sampling_ratio/max": 2.4354710578918457, "sampling/importance_sampling_ratio/mean": 0.1379142850637436, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 6.3449835777282715, "sampling/sampling_logp_difference/mean": 0.018972687423229218, "step": 418, "step_time": 267.8548329808982 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 10000.0, "completions/max_terminated_length": 9882.0, "completions/mean_length": 2310.94189453125, "completions/mean_terminated_length": 2062.907470703125, "completions/min_length": 121.0, "completions/min_terminated_length": 121.0, "entropy": 0.1483866274356842, "epoch": 0.27970627503337786, "frac_reward_zero_std": 0.03333333507180214, "grad_norm": 0.0003909318342495618, "learning_rate": 5e-06, "loss": 0.0002, "num_tokens": 1009483115.0, "reward": 0.6879848837852478, "reward_std": 0.3791978061199188, "rewards/TRLRewardAdapter/mean": 0.687984824180603, "rewards/TRLRewardAdapter/std": 0.37919777631759644, "sampling/importance_sampling_ratio/max": 1.0974615812301636, "sampling/importance_sampling_ratio/mean": 0.04428809508681297, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 22.5, "sampling/sampling_logp_difference/mean": 0.019995903596282005, "step": 419, "step_time": 249.74647208792157 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01875000074505806, "completions/max_length": 10000.0, "completions/max_terminated_length": 9648.0, "completions/mean_length": 2509.938720703125, "completions/mean_terminated_length": 2366.81640625, "completions/min_length": 58.0, "completions/min_terminated_length": 58.0, "entropy": 0.15841978043317795, "epoch": 0.2803738317757009, "frac_reward_zero_std": 0.0, "grad_norm": 3.734606442845393e-05, "learning_rate": 5e-06, "loss": 0.0, "num_tokens": 1012338224.0, "reward": 0.7217106223106384, "reward_std": 0.29967164993286133, "rewards/TRLRewardAdapter/mean": 0.7217105627059937, "rewards/TRLRewardAdapter/std": 0.29967164993286133, "sampling/importance_sampling_ratio/max": 2.8210153579711914, "sampling/importance_sampling_ratio/mean": 0.07853854447603226, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 10.75, "sampling/sampling_logp_difference/mean": 0.020767051726579666, "step": 420, "step_time": 254.25566426618025 }, { "epoch": 0.2803738317757009, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.021304347228420818, "eval_completions/max_length": 10000.0, "eval_completions/max_terminated_length": 8956.91304347826, "eval_completions/mean_length": 1016.0084626570991, "eval_completions/mean_terminated_length": 820.3952291737432, "eval_completions/min_length": 48.43478260869565, "eval_completions/min_terminated_length": 48.43478260869565, "eval_entropy": 0.14185479607271112, "eval_frac_reward_zero_std": 1.0, "eval_loss": 0.0, "eval_num_tokens": 1012338224.0, "eval_reward": 0.5690510013829106, "eval_reward_std": 0.46317493138105975, "eval_rewards/TRLRewardAdapter/mean": 0.5690510117489359, "eval_rewards/TRLRewardAdapter/std": 0.46317493785982544, "eval_runtime": 1379.0543, "eval_samples_per_second": 3.311, "eval_sampling/importance_sampling_ratio/max": 2.0219175608261772, "eval_sampling/importance_sampling_ratio/mean": 0.41673310554545856, "eval_sampling/importance_sampling_ratio/min": 0.0, "eval_sampling/sampling_logp_difference/max": 8.008140040480573, "eval_sampling/sampling_logp_difference/mean": 0.019339308790538624, "eval_steps_per_second": 0.017, "step": 420 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.05520833656191826, "completions/max_length": 10000.0, "completions/max_terminated_length": 9992.0, "completions/mean_length": 2317.682373046875, "completions/mean_terminated_length": 1868.770751953125, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "entropy": 0.152202899257342, "epoch": 0.281041388518024, "frac_reward_zero_std": 0.13333334028720856, "grad_norm": 4.11649488188538e-06, "learning_rate": 5e-06, "loss": -0.0, "num_tokens": 1015083775.0, "reward": 0.7044776082038879, "reward_std": 0.3585164546966553, "rewards/TRLRewardAdapter/mean": 0.7044775485992432, "rewards/TRLRewardAdapter/std": 0.3585164546966553, "sampling/importance_sampling_ratio/max": 1.4163119792938232, "sampling/importance_sampling_ratio/mean": 0.1181924119591713, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 16.156776428222656, "sampling/sampling_logp_difference/mean": 0.020426075905561447, "step": 421, "step_time": 371.2534849351505 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.05937500298023224, "completions/max_length": 10000.0, "completions/max_terminated_length": 9968.0, "completions/mean_length": 2307.058349609375, "completions/mean_terminated_length": 1821.4573974609375, "completions/min_length": 62.0, "completions/min_terminated_length": 62.0, "entropy": 0.13785811265309653, "epoch": 0.2817089452603471, "frac_reward_zero_std": 0.03333333507180214, "grad_norm": 4.1231267703821755e-05, "learning_rate": 5e-06, "loss": -0.0, "num_tokens": 1017740087.0, "reward": 0.7226883172988892, "reward_std": 0.3642849624156952, "rewards/TRLRewardAdapter/mean": 0.7226883172988892, "rewards/TRLRewardAdapter/std": 0.3642849624156952, "sampling/importance_sampling_ratio/max": 1.6585584878921509, "sampling/importance_sampling_ratio/mean": 0.15118804574012756, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 41.36874771118164, "sampling/sampling_logp_difference/mean": 0.01871716044843197, "step": 422, "step_time": 370.9003166531911 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03020833432674408, "completions/max_length": 10000.0, "completions/max_terminated_length": 9982.0, "completions/mean_length": 1467.5677490234375, "completions/mean_terminated_length": 1201.7884521484375, "completions/min_length": 64.0, "completions/min_terminated_length": 64.0, "entropy": 0.13273724913597107, "epoch": 0.2823765020026702, "frac_reward_zero_std": 0.0, "grad_norm": 1.2911630226932836e-05, "learning_rate": 5e-06, "loss": -0.0, "num_tokens": 1019526136.0, "reward": 0.7962033748626709, "reward_std": 0.3106580972671509, "rewards/TRLRewardAdapter/mean": 0.7962033748626709, "rewards/TRLRewardAdapter/std": 0.3106580674648285, "sampling/importance_sampling_ratio/max": 1.7415729761123657, "sampling/importance_sampling_ratio/mean": 0.12137118726968765, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 7.65625, "sampling/sampling_logp_difference/mean": 0.01802809163928032, "step": 423, "step_time": 375.1692789949011 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 10000.0, "completions/max_terminated_length": 9900.0, "completions/mean_length": 2783.85546875, "completions/mean_terminated_length": 2302.779052734375, "completions/min_length": 70.0, "completions/min_terminated_length": 70.0, "entropy": 0.15752914299567541, "epoch": 0.28304405874499333, "frac_reward_zero_std": 0.13333334028720856, "grad_norm": 6.202810614675275e-06, "learning_rate": 5e-06, "loss": -0.0, "num_tokens": 1022687117.0, "reward": 0.5960420966148376, "reward_std": 0.43868494033813477, "rewards/TRLRewardAdapter/mean": 0.5960420370101929, "rewards/TRLRewardAdapter/std": 0.43868494033813477, "sampling/importance_sampling_ratio/max": 1.7352423667907715, "sampling/importance_sampling_ratio/mean": 0.0915873795747757, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 52.5, "sampling/sampling_logp_difference/mean": 0.021091794595122337, "step": 424, "step_time": 399.0264687628951 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.004166666883975267, "completions/max_length": 10000.0, "completions/max_terminated_length": 9991.0, "completions/mean_length": 1617.9073486328125, "completions/mean_terminated_length": 1582.835693359375, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "entropy": 0.12652760495742163, "epoch": 0.28371161548731644, "frac_reward_zero_std": 0.03333333507180214, "grad_norm": 1.4878113577376164e-05, "learning_rate": 5e-06, "loss": -0.0, "num_tokens": 1024720372.0, "reward": 0.7811723947525024, "reward_std": 0.3302939534187317, "rewards/TRLRewardAdapter/mean": 0.7811723947525024, "rewards/TRLRewardAdapter/std": 0.3302939534187317, "sampling/importance_sampling_ratio/max": 2.247945785522461, "sampling/importance_sampling_ratio/mean": 0.19393962621688843, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 7.5, "sampling/sampling_logp_difference/mean": 0.017858104780316353, "step": 425, "step_time": 239.27856384404004 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.06041666865348816, "completions/max_length": 10000.0, "completions/max_terminated_length": 9945.0, "completions/mean_length": 2382.06689453125, "completions/mean_terminated_length": 1892.2216796875, "completions/min_length": 56.0, "completions/min_terminated_length": 56.0, "entropy": 0.141318346063296, "epoch": 0.28437917222963954, "frac_reward_zero_std": 0.06666667014360428, "grad_norm": 6.785782772239021e-06, "learning_rate": 5e-06, "loss": -0.0, "num_tokens": 1027405428.0, "reward": 0.693681538105011, "reward_std": 0.37878090143203735, "rewards/TRLRewardAdapter/mean": 0.693681538105011, "rewards/TRLRewardAdapter/std": 0.37878093123435974, "sampling/importance_sampling_ratio/max": 2.1549086570739746, "sampling/importance_sampling_ratio/mean": 0.11494819819927216, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 35.25, "sampling/sampling_logp_difference/mean": 0.018982084468007088, "step": 426, "step_time": 375.53750344295986 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.06145833805203438, "completions/max_length": 10000.0, "completions/max_terminated_length": 9798.0, "completions/mean_length": 2086.901123046875, "completions/mean_terminated_length": 1568.729248046875, "completions/min_length": 64.0, "completions/min_terminated_length": 64.0, "entropy": 0.16476844251155853, "epoch": 0.2850467289719626, "frac_reward_zero_std": 0.10000000894069672, "grad_norm": 9.536870449060761e-06, "learning_rate": 5e-06, "loss": -0.0, "num_tokens": 1029832437.0, "reward": 0.7288017868995667, "reward_std": 0.36465123295783997, "rewards/TRLRewardAdapter/mean": 0.7288017272949219, "rewards/TRLRewardAdapter/std": 0.36465123295783997, "sampling/importance_sampling_ratio/max": 1.233811616897583, "sampling/importance_sampling_ratio/mean": 0.08492270112037659, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 7.5, "sampling/sampling_logp_difference/mean": 0.020242679864168167, "step": 427, "step_time": 317.4531303399708 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00937500037252903, "completions/max_length": 10000.0, "completions/max_terminated_length": 9657.0, "completions/mean_length": 1336.050048828125, "completions/mean_terminated_length": 1254.0567626953125, "completions/min_length": 52.0, "completions/min_terminated_length": 52.0, "entropy": 0.14349542061487833, "epoch": 0.2857142857142857, "frac_reward_zero_std": 0.03333333507180214, "grad_norm": 2.949736152276316e-05, "learning_rate": 5e-06, "loss": -0.0, "num_tokens": 1031554917.0, "reward": 0.8104223012924194, "reward_std": 0.3054373860359192, "rewards/TRLRewardAdapter/mean": 0.8104222416877747, "rewards/TRLRewardAdapter/std": 0.3054373860359192, "sampling/importance_sampling_ratio/max": 2.3019700050354004, "sampling/importance_sampling_ratio/mean": 0.15856459736824036, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 7.375814437866211, "sampling/sampling_logp_difference/mean": 0.019741246476769447, "step": 428, "step_time": 249.96828323916998 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0260416679084301, "completions/max_length": 10000.0, "completions/max_terminated_length": 8760.0, "completions/mean_length": 1523.7646484375, "completions/mean_terminated_length": 1297.1273193359375, "completions/min_length": 53.0, "completions/min_terminated_length": 53.0, "entropy": 0.14647366851568222, "epoch": 0.2863818424566088, "frac_reward_zero_std": 0.03333333507180214, "grad_norm": 1.1606818298727233e-05, "learning_rate": 5e-06, "loss": -0.0, "num_tokens": 1033461603.0, "reward": 0.8183388113975525, "reward_std": 0.2743135988712311, "rewards/TRLRewardAdapter/mean": 0.8183387517929077, "rewards/TRLRewardAdapter/std": 0.2743135988712311, "sampling/importance_sampling_ratio/max": 1.3489980697631836, "sampling/importance_sampling_ratio/mean": 0.1166277676820755, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 33.0899658203125, "sampling/sampling_logp_difference/mean": 0.020077239722013474, "step": 429, "step_time": 348.6719399410067 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02916666865348816, "completions/max_length": 10000.0, "completions/max_terminated_length": 9834.0, "completions/mean_length": 1647.806396484375, "completions/mean_terminated_length": 1396.8819580078125, "completions/min_length": 65.0, "completions/min_terminated_length": 65.0, "entropy": 0.1363809828956922, "epoch": 0.2870493991989319, "frac_reward_zero_std": 0.06666667014360428, "grad_norm": 2.4324119716196374e-05, "learning_rate": 5e-06, "loss": 0.0, "num_tokens": 1035470857.0, "reward": 0.6872861385345459, "reward_std": 0.40567052364349365, "rewards/TRLRewardAdapter/mean": 0.6872861385345459, "rewards/TRLRewardAdapter/std": 0.40567052364349365, "sampling/importance_sampling_ratio/max": 2.71059513092041, "sampling/importance_sampling_ratio/mean": 0.09320861846208572, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 42.0, "sampling/sampling_logp_difference/mean": 0.018936743959784508, "step": 430, "step_time": 256.45466834306717 }, { "epoch": 0.2870493991989319, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.019565216913495376, "eval_completions/max_length": 10000.0, "eval_completions/max_terminated_length": 8454.260869565218, "eval_completions/mean_length": 938.8702021059783, "eval_completions/mean_terminated_length": 758.2232480256454, "eval_completions/min_length": 47.52173913043478, "eval_completions/min_terminated_length": 47.52173913043478, "eval_entropy": 0.14101981987123904, "eval_frac_reward_zero_std": 1.0, "eval_loss": 0.0, "eval_num_tokens": 1035470857.0, "eval_reward": 0.5718358651451443, "eval_reward_std": 0.46482246466304944, "eval_rewards/TRLRewardAdapter/mean": 0.5718358703281569, "eval_rewards/TRLRewardAdapter/std": 0.4648224737333215, "eval_runtime": 1377.6697, "eval_samples_per_second": 3.314, "eval_sampling/importance_sampling_ratio/max": 1.9318569069323333, "eval_sampling/importance_sampling_ratio/mean": 0.434755625932113, "eval_sampling/importance_sampling_ratio/min": 0.0, "eval_sampling/sampling_logp_difference/max": 5.268201765806778, "eval_sampling/sampling_logp_difference/mean": 0.01915525418260823, "eval_steps_per_second": 0.017, "step": 430 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0260416679084301, "completions/max_length": 10000.0, "completions/max_terminated_length": 9974.0, "completions/mean_length": 2360.768798828125, "completions/mean_terminated_length": 2156.51123046875, "completions/min_length": 64.0, "completions/min_terminated_length": 64.0, "entropy": 0.16346114873886108, "epoch": 0.287716955941255, "frac_reward_zero_std": 0.13333334028720856, "grad_norm": 2.7337753111009525e-05, "learning_rate": 5e-06, "loss": 0.0, "num_tokens": 1038192107.0, "reward": 0.6243202686309814, "reward_std": 0.3965376615524292, "rewards/TRLRewardAdapter/mean": 0.6243202090263367, "rewards/TRLRewardAdapter/std": 0.3965376615524292, "sampling/importance_sampling_ratio/max": 2.502486228942871, "sampling/importance_sampling_ratio/mean": 0.11559741199016571, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 57.601531982421875, "sampling/sampling_logp_difference/mean": 0.02147383987903595, "step": 431, "step_time": 357.9941860110266 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.07604166865348816, "completions/max_length": 10000.0, "completions/max_terminated_length": 9622.0, "completions/mean_length": 2526.826171875, "completions/mean_terminated_length": 1911.7845458984375, "completions/min_length": 60.0, "completions/min_terminated_length": 60.0, "entropy": 0.13791124522686005, "epoch": 0.2883845126835781, "frac_reward_zero_std": 0.1666666716337204, "grad_norm": 9.563640395304006e-06, "learning_rate": 5e-06, "loss": -0.0, "num_tokens": 1041071460.0, "reward": 0.6321632266044617, "reward_std": 0.4194490909576416, "rewards/TRLRewardAdapter/mean": 0.6321631669998169, "rewards/TRLRewardAdapter/std": 0.4194490909576416, "sampling/importance_sampling_ratio/max": 2.664485216140747, "sampling/importance_sampling_ratio/mean": 0.11889753490686417, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 43.0, "sampling/sampling_logp_difference/mean": 0.019459381699562073, "step": 432, "step_time": 370.7090881409822 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0677083358168602, "completions/max_length": 10000.0, "completions/max_terminated_length": 9956.0, "completions/mean_length": 2660.242919921875, "completions/mean_terminated_length": 2127.187744140625, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "entropy": 0.16431138664484024, "epoch": 0.2890520694259012, "frac_reward_zero_std": 0.03333333507180214, "grad_norm": 7.797964085332968e-06, "learning_rate": 5e-06, "loss": -0.0, "num_tokens": 1044134509.0, "reward": 0.6348368525505066, "reward_std": 0.4011939465999603, "rewards/TRLRewardAdapter/mean": 0.6348368525505066, "rewards/TRLRewardAdapter/std": 0.4011939465999603, "sampling/importance_sampling_ratio/max": 1.0976961851119995, "sampling/importance_sampling_ratio/mean": 0.05484776943922043, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 37.45545196533203, "sampling/sampling_logp_difference/mean": 0.021542727947235107, "step": 433, "step_time": 300.984622546006 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 10000.0, "completions/max_terminated_length": 9941.0, "completions/mean_length": 2701.907470703125, "completions/mean_terminated_length": 2215.367919921875, "completions/min_length": 68.0, "completions/min_terminated_length": 68.0, "entropy": 0.1585269719362259, "epoch": 0.2897196261682243, "frac_reward_zero_std": 0.10000000894069672, "grad_norm": 4.026110527643052e-05, "learning_rate": 5e-06, "loss": -0.0, "num_tokens": 1047213396.0, "reward": 0.5865404009819031, "reward_std": 0.41685375571250916, "rewards/TRLRewardAdapter/mean": 0.5865403413772583, "rewards/TRLRewardAdapter/std": 0.41685375571250916, "sampling/importance_sampling_ratio/max": 1.6978684663772583, "sampling/importance_sampling_ratio/mean": 0.12435837090015411, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 34.69087219238281, "sampling/sampling_logp_difference/mean": 0.021131372079253197, "step": 434, "step_time": 335.34367272700183 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.04375000298023224, "completions/max_length": 10000.0, "completions/max_terminated_length": 9803.0, "completions/mean_length": 1791.331298828125, "completions/mean_terminated_length": 1415.771240234375, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "entropy": 0.13533909618854523, "epoch": 0.2903871829105474, "frac_reward_zero_std": 0.03333333507180214, "grad_norm": 9.796914802203965e-05, "learning_rate": 5e-06, "loss": 0.0012, "num_tokens": 1049396882.0, "reward": 0.7549695372581482, "reward_std": 0.3322158753871918, "rewards/TRLRewardAdapter/mean": 0.7549694776535034, "rewards/TRLRewardAdapter/std": 0.3322158753871918, "sampling/importance_sampling_ratio/max": 2.4988250732421875, "sampling/importance_sampling_ratio/mean": 0.1072368174791336, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 17.0, "sampling/sampling_logp_difference/mean": 0.01824800670146942, "step": 435, "step_time": 339.8566935042618 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0520833358168602, "completions/max_length": 10000.0, "completions/max_terminated_length": 9615.0, "completions/mean_length": 2526.107421875, "completions/mean_terminated_length": 2115.453857421875, "completions/min_length": 70.0, "completions/min_terminated_length": 70.0, "entropy": 0.17291886111100516, "epoch": 0.2910547396528705, "frac_reward_zero_std": 0.23333334922790527, "grad_norm": 0.00012586192575670386, "learning_rate": 5e-06, "loss": -0.0, "num_tokens": 1052313593.0, "reward": 0.6181530356407166, "reward_std": 0.43599626421928406, "rewards/TRLRewardAdapter/mean": 0.6181529760360718, "rewards/TRLRewardAdapter/std": 0.43599623441696167, "sampling/importance_sampling_ratio/max": 1.5639766454696655, "sampling/importance_sampling_ratio/mean": 0.098874531686306, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 55.5, "sampling/sampling_logp_difference/mean": 0.02296399138867855, "step": 436, "step_time": 328.07485921296757 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02187500149011612, "completions/max_length": 10000.0, "completions/max_terminated_length": 9223.0, "completions/mean_length": 1608.1812744140625, "completions/mean_terminated_length": 1420.5047607421875, "completions/min_length": 80.0, "completions/min_terminated_length": 80.0, "entropy": 0.14653341472148895, "epoch": 0.2917222963951936, "frac_reward_zero_std": 0.10000000894069672, "grad_norm": 0.0008032067892463516, "learning_rate": 5e-06, "loss": 0.0, "num_tokens": 1054293735.0, "reward": 0.7335160374641418, "reward_std": 0.36981073021888733, "rewards/TRLRewardAdapter/mean": 0.7335159778594971, "rewards/TRLRewardAdapter/std": 0.36981070041656494, "sampling/importance_sampling_ratio/max": 1.7399553060531616, "sampling/importance_sampling_ratio/mean": 0.13991734385490417, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 6.0, "sampling/sampling_logp_difference/mean": 0.019785944372415543, "step": 437, "step_time": 198.26195538486354 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03958333656191826, "completions/max_length": 10000.0, "completions/max_terminated_length": 9959.0, "completions/mean_length": 1793.701171875, "completions/mean_terminated_length": 1455.48046875, "completions/min_length": 66.0, "completions/min_terminated_length": 66.0, "entropy": 0.1462513655424118, "epoch": 0.2923898531375167, "frac_reward_zero_std": 0.03333333507180214, "grad_norm": 2.3648445384772884e-05, "learning_rate": 5e-06, "loss": -0.0, "num_tokens": 1056518056.0, "reward": 0.7642224431037903, "reward_std": 0.344278484582901, "rewards/TRLRewardAdapter/mean": 0.7642223834991455, "rewards/TRLRewardAdapter/std": 0.344278484582901, "sampling/importance_sampling_ratio/max": 1.7580795288085938, "sampling/importance_sampling_ratio/mean": 0.1544979363679886, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 51.5, "sampling/sampling_logp_difference/mean": 0.020020417869091034, "step": 438, "step_time": 349.43762900913134 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 10000.0, "completions/max_terminated_length": 9970.0, "completions/mean_length": 1655.9385986328125, "completions/mean_terminated_length": 1386.7752685546875, "completions/min_length": 67.0, "completions/min_terminated_length": 67.0, "entropy": 0.13992980619271597, "epoch": 0.2930574098798398, "frac_reward_zero_std": 0.10000000894069672, "grad_norm": 5.374788770664219e-05, "learning_rate": 5e-06, "loss": -0.0001, "num_tokens": 1058555565.0, "reward": 0.7428486347198486, "reward_std": 0.3629087209701538, "rewards/TRLRewardAdapter/mean": 0.7428485751152039, "rewards/TRLRewardAdapter/std": 0.3629086911678314, "sampling/importance_sampling_ratio/max": 2.5367181301116943, "sampling/importance_sampling_ratio/mean": 0.14650705456733704, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 10.806852340698242, "sampling/sampling_logp_difference/mean": 0.019994467496871948, "step": 439, "step_time": 374.1058148709126 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03541667014360428, "completions/max_length": 10000.0, "completions/max_terminated_length": 8687.0, "completions/mean_length": 1759.6053466796875, "completions/mean_terminated_length": 1457.0421142578125, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "entropy": 0.1643164629737536, "epoch": 0.2937249666221629, "frac_reward_zero_std": 0.10000000894069672, "grad_norm": 1.1761337452268878e-05, "learning_rate": 5e-06, "loss": -0.0, "num_tokens": 1060665490.0, "reward": 0.6925541162490845, "reward_std": 0.3833613097667694, "rewards/TRLRewardAdapter/mean": 0.6925541162490845, "rewards/TRLRewardAdapter/std": 0.3833613395690918, "sampling/importance_sampling_ratio/max": 1.297868251800537, "sampling/importance_sampling_ratio/mean": 0.07822896540164948, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 11.884025573730469, "sampling/sampling_logp_difference/mean": 0.02169259637594223, "step": 440, "step_time": 305.6746244688984 }, { "epoch": 0.2937249666221629, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.01869565159406351, "eval_completions/max_length": 10000.0, "eval_completions/max_terminated_length": 8496.173913043478, "eval_completions/mean_length": 912.1893363620924, "eval_completions/mean_terminated_length": 739.0834735372792, "eval_completions/min_length": 47.47826086956522, "eval_completions/min_terminated_length": 47.47826086956522, "eval_entropy": 0.13763439720091614, "eval_frac_reward_zero_std": 1.0, "eval_loss": 0.0, "eval_num_tokens": 1060665490.0, "eval_reward": 0.5670366961023082, "eval_reward_std": 0.46804932407710864, "eval_rewards/TRLRewardAdapter/mean": 0.5670367064683334, "eval_rewards/TRLRewardAdapter/std": 0.46804932537286176, "eval_runtime": 1371.6965, "eval_samples_per_second": 3.329, "eval_sampling/importance_sampling_ratio/max": 1.8031463571216748, "eval_sampling/importance_sampling_ratio/mean": 0.4354217998359514, "eval_sampling/importance_sampling_ratio/min": 0.0, "eval_sampling/sampling_logp_difference/max": 11.196097446524579, "eval_sampling/sampling_logp_difference/mean": 0.0187706767540911, "eval_steps_per_second": 0.017, "step": 440 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00729166716337204, "completions/max_length": 10000.0, "completions/max_terminated_length": 9249.0, "completions/mean_length": 1521.0240478515625, "completions/mean_terminated_length": 1458.743896484375, "completions/min_length": 61.0, "completions/min_terminated_length": 61.0, "entropy": 0.1339197059472402, "epoch": 0.29439252336448596, "frac_reward_zero_std": 0.06666667014360428, "grad_norm": 0.00014237446121685747, "learning_rate": 5e-06, "loss": -0.0001, "num_tokens": 1062608745.0, "reward": 0.7958080768585205, "reward_std": 0.3131636381149292, "rewards/TRLRewardAdapter/mean": 0.7958080172538757, "rewards/TRLRewardAdapter/std": 0.3131636083126068, "sampling/importance_sampling_ratio/max": 1.7913273572921753, "sampling/importance_sampling_ratio/mean": 0.14956077933311462, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 13.25, "sampling/sampling_logp_difference/mean": 0.018366774544119835, "step": 441, "step_time": 170.85905134119093 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03541667014360428, "completions/max_length": 10000.0, "completions/max_terminated_length": 9924.0, "completions/mean_length": 1822.9063720703125, "completions/mean_terminated_length": 1522.6673583984375, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "entropy": 0.14660820364952087, "epoch": 0.29506008010680906, "frac_reward_zero_std": 0.10000000894069672, "grad_norm": 7.18018544648677e-05, "learning_rate": 5e-06, "loss": -0.0, "num_tokens": 1064750735.0, "reward": 0.6770097613334656, "reward_std": 0.39446181058883667, "rewards/TRLRewardAdapter/mean": 0.6770097017288208, "rewards/TRLRewardAdapter/std": 0.39446181058883667, "sampling/importance_sampling_ratio/max": 1.466745376586914, "sampling/importance_sampling_ratio/mean": 0.05353599786758423, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 34.0, "sampling/sampling_logp_difference/mean": 0.020100468769669533, "step": 442, "step_time": 326.9499144658912 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.04375000298023224, "completions/max_length": 10000.0, "completions/max_terminated_length": 9983.0, "completions/mean_length": 2837.310546875, "completions/mean_terminated_length": 2509.605712890625, "completions/min_length": 57.0, "completions/min_terminated_length": 57.0, "entropy": 0.1508529782295227, "epoch": 0.29572763684913217, "frac_reward_zero_std": 0.06666667014360428, "grad_norm": 3.998193236700814e-06, "learning_rate": 5e-06, "loss": -0.0, "num_tokens": 1067936473.0, "reward": 0.661625862121582, "reward_std": 0.3770330548286438, "rewards/TRLRewardAdapter/mean": 0.6616258025169373, "rewards/TRLRewardAdapter/std": 0.3770330250263214, "sampling/importance_sampling_ratio/max": 2.0324759483337402, "sampling/importance_sampling_ratio/mean": 0.09827671200037003, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 31.875, "sampling/sampling_logp_difference/mean": 0.020297380164265633, "step": 443, "step_time": 355.7978513709968 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.04479166865348816, "completions/max_length": 10000.0, "completions/max_terminated_length": 9881.0, "completions/mean_length": 2324.136474609375, "completions/mean_terminated_length": 1964.199462890625, "completions/min_length": 58.0, "completions/min_terminated_length": 58.0, "entropy": 0.1281721517443657, "epoch": 0.2963951935914553, "frac_reward_zero_std": 0.03333333507180214, "grad_norm": 8.783551253114223e-06, "learning_rate": 5e-06, "loss": -0.0, "num_tokens": 1070659292.0, "reward": 0.6710386276245117, "reward_std": 0.3909524083137512, "rewards/TRLRewardAdapter/mean": 0.6710385680198669, "rewards/TRLRewardAdapter/std": 0.39095237851142883, "sampling/importance_sampling_ratio/max": 1.3135050535202026, "sampling/importance_sampling_ratio/mean": 0.10854421555995941, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 36.253543853759766, "sampling/sampling_logp_difference/mean": 0.017920782789587975, "step": 444, "step_time": 308.0118069353048 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01145833358168602, "completions/max_length": 10000.0, "completions/max_terminated_length": 8405.0, "completions/mean_length": 1516.9188232421875, "completions/mean_terminated_length": 1418.590087890625, "completions/min_length": 56.0, "completions/min_terminated_length": 56.0, "entropy": 0.14088616768519083, "epoch": 0.2970627503337784, "frac_reward_zero_std": 0.06666667014360428, "grad_norm": 0.00016056104477905805, "learning_rate": 5e-06, "loss": 0.0, "num_tokens": 1072557294.0, "reward": 0.7483590245246887, "reward_std": 0.35885515809059143, "rewards/TRLRewardAdapter/mean": 0.748358964920044, "rewards/TRLRewardAdapter/std": 0.35885512828826904, "sampling/importance_sampling_ratio/max": 1.2652015686035156, "sampling/importance_sampling_ratio/mean": 0.14782318472862244, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 3.8566031455993652, "sampling/sampling_logp_difference/mean": 0.019155627116560936, "step": 445, "step_time": 168.53015929891262 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01250000111758709, "completions/max_length": 10000.0, "completions/max_terminated_length": 9438.0, "completions/mean_length": 1839.868896484375, "completions/mean_terminated_length": 1736.5758056640625, "completions/min_length": 63.0, "completions/min_terminated_length": 63.0, "entropy": 0.12924895932277045, "epoch": 0.2977303070761015, "frac_reward_zero_std": 0.03333333507180214, "grad_norm": 0.00011015875897609374, "learning_rate": 5e-06, "loss": -0.0, "num_tokens": 1074739664.0, "reward": 0.7420682907104492, "reward_std": 0.3695714771747589, "rewards/TRLRewardAdapter/mean": 0.7420682311058044, "rewards/TRLRewardAdapter/std": 0.3695714473724365, "sampling/importance_sampling_ratio/max": 1.9629265069961548, "sampling/importance_sampling_ratio/mean": 0.18020571768283844, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 19.998071670532227, "sampling/sampling_logp_difference/mean": 0.01786736026406288, "step": 446, "step_time": 270.29260785807855 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0052083334885537624, "completions/max_length": 10000.0, "completions/max_terminated_length": 9305.0, "completions/mean_length": 1327.9656982421875, "completions/mean_terminated_length": 1282.562255859375, "completions/min_length": 54.0, "completions/min_terminated_length": 54.0, "entropy": 0.11922571435570717, "epoch": 0.2983978638184246, "frac_reward_zero_std": 0.10000000894069672, "grad_norm": 8.000186043740395e-05, "learning_rate": 5e-06, "loss": 0.0001, "num_tokens": 1076441615.0, "reward": 0.7362866997718811, "reward_std": 0.3715859353542328, "rewards/TRLRewardAdapter/mean": 0.7362866997718811, "rewards/TRLRewardAdapter/std": 0.3715859651565552, "sampling/importance_sampling_ratio/max": 1.6425632238388062, "sampling/importance_sampling_ratio/mean": 0.15398268401622772, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 9.75, "sampling/sampling_logp_difference/mean": 0.017164824530482292, "step": 447, "step_time": 262.92641219799407 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02291666716337204, "completions/max_length": 10000.0, "completions/max_terminated_length": 9682.0, "completions/mean_length": 1452.8792724609375, "completions/mean_terminated_length": 1252.4136962890625, "completions/min_length": 74.0, "completions/min_terminated_length": 74.0, "entropy": 0.12010694791873296, "epoch": 0.29906542056074764, "frac_reward_zero_std": 0.06666667014360428, "grad_norm": 1.092225495615177e-05, "learning_rate": 5e-06, "loss": -0.0, "num_tokens": 1078304123.0, "reward": 0.7731122374534607, "reward_std": 0.36165565252304077, "rewards/TRLRewardAdapter/mean": 0.7731121778488159, "rewards/TRLRewardAdapter/std": 0.36165568232536316, "sampling/importance_sampling_ratio/max": 1.7371288537979126, "sampling/importance_sampling_ratio/mean": 0.20017072558403015, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 40.5, "sampling/sampling_logp_difference/mean": 0.016630901023745537, "step": 448, "step_time": 355.0532427370781 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03020833432674408, "completions/max_length": 10000.0, "completions/max_terminated_length": 9667.0, "completions/mean_length": 2138.065673828125, "completions/mean_terminated_length": 1893.171875, "completions/min_length": 82.0, "completions/min_terminated_length": 82.0, "entropy": 0.1515829861164093, "epoch": 0.29973297730307075, "frac_reward_zero_std": 0.03333333507180214, "grad_norm": 1.1568184260561617e-05, "learning_rate": 5e-06, "loss": -0.0, "num_tokens": 1080780986.0, "reward": 0.7101452350616455, "reward_std": 0.37155961990356445, "rewards/TRLRewardAdapter/mean": 0.7101451754570007, "rewards/TRLRewardAdapter/std": 0.37155961990356445, "sampling/importance_sampling_ratio/max": 1.8623933792114258, "sampling/importance_sampling_ratio/mean": 0.12047535181045532, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 33.5, "sampling/sampling_logp_difference/mean": 0.020621713250875473, "step": 449, "step_time": 279.2279199080076 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.06145833805203438, "completions/max_length": 10000.0, "completions/max_terminated_length": 9869.0, "completions/mean_length": 2160.478271484375, "completions/mean_terminated_length": 1647.1243896484375, "completions/min_length": 64.0, "completions/min_terminated_length": 64.0, "entropy": 0.16240123411019644, "epoch": 0.30040053404539385, "frac_reward_zero_std": 0.10000000894069672, "grad_norm": 5.469532585230504e-05, "learning_rate": 5e-06, "loss": -0.0, "num_tokens": 1083348709.0, "reward": 0.7003380656242371, "reward_std": 0.4011860489845276, "rewards/TRLRewardAdapter/mean": 0.7003380656242371, "rewards/TRLRewardAdapter/std": 0.4011860191822052, "sampling/importance_sampling_ratio/max": 2.059426784515381, "sampling/importance_sampling_ratio/mean": 0.11138510704040527, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 6.862744331359863, "sampling/sampling_logp_difference/mean": 0.022290954366326332, "step": 450, "step_time": 353.4074497248512 }, { "epoch": 0.30040053404539385, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.016304347340179527, "eval_completions/max_length": 9938.391304347826, "eval_completions/max_terminated_length": 8349.521739130434, "eval_completions/mean_length": 879.2095416525136, "eval_completions/mean_terminated_length": 728.0983369246773, "eval_completions/min_length": 46.26086956521739, "eval_completions/min_terminated_length": 46.26086956521739, "eval_entropy": 0.13127460265937058, "eval_frac_reward_zero_std": 1.0, "eval_loss": 0.0, "eval_num_tokens": 1083348709.0, "eval_reward": 0.5757328090460404, "eval_reward_std": 0.4661246421544448, "eval_rewards/TRLRewardAdapter/mean": 0.5757328220035719, "eval_rewards/TRLRewardAdapter/std": 0.46612464345019794, "eval_runtime": 1349.9714, "eval_samples_per_second": 3.382, "eval_sampling/importance_sampling_ratio/max": 2.086908387101215, "eval_sampling/importance_sampling_ratio/mean": 0.44584326251693396, "eval_sampling/importance_sampling_ratio/min": 0.0, "eval_sampling/sampling_logp_difference/max": 5.100898545721303, "eval_sampling/sampling_logp_difference/mean": 0.018112582033095154, "eval_steps_per_second": 0.017, "step": 450 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00937500037252903, "completions/max_length": 10000.0, "completions/max_terminated_length": 9845.0, "completions/mean_length": 1519.6136474609375, "completions/mean_terminated_length": 1439.357421875, "completions/min_length": 65.0, "completions/min_terminated_length": 65.0, "entropy": 0.1164094905058543, "epoch": 0.30106809078771696, "frac_reward_zero_std": 0.0, "grad_norm": 0.00010813629225041508, "learning_rate": 5e-06, "loss": -0.0001, "num_tokens": 1085267570.0, "reward": 0.7602836489677429, "reward_std": 0.3460500240325928, "rewards/TRLRewardAdapter/mean": 0.7602835893630981, "rewards/TRLRewardAdapter/std": 0.34605005383491516, "sampling/importance_sampling_ratio/max": 1.5390955209732056, "sampling/importance_sampling_ratio/mean": 0.09526308625936508, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 4.43804931640625, "sampling/sampling_logp_difference/mean": 0.016416369006037712, "step": 451, "step_time": 194.08848080399912 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03958333656191826, "completions/max_length": 10000.0, "completions/max_terminated_length": 9515.0, "completions/mean_length": 1538.44482421875, "completions/mean_terminated_length": 1189.7039794921875, "completions/min_length": 70.0, "completions/min_terminated_length": 70.0, "entropy": 0.13455420235792795, "epoch": 0.30173564753004006, "frac_reward_zero_std": 0.13333334028720856, "grad_norm": 4.2653540308775205e-05, "learning_rate": 5e-06, "loss": 0.0, "num_tokens": 1087179613.0, "reward": 0.7378401756286621, "reward_std": 0.38395896553993225, "rewards/TRLRewardAdapter/mean": 0.7378401160240173, "rewards/TRLRewardAdapter/std": 0.38395893573760986, "sampling/importance_sampling_ratio/max": 1.8240032196044922, "sampling/importance_sampling_ratio/mean": 0.12777462601661682, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 37.5, "sampling/sampling_logp_difference/mean": 0.018433628603816032, "step": 452, "step_time": 280.2858090089867 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03229166939854622, "completions/max_length": 10000.0, "completions/max_terminated_length": 9977.0, "completions/mean_length": 1602.31884765625, "completions/mean_terminated_length": 1322.0947265625, "completions/min_length": 63.0, "completions/min_terminated_length": 63.0, "entropy": 0.12304471929868062, "epoch": 0.30240320427236317, "frac_reward_zero_std": 0.13333334028720856, "grad_norm": 2.076289269685158e-05, "learning_rate": 5e-06, "loss": -0.0, "num_tokens": 1089106767.0, "reward": 0.7308738231658936, "reward_std": 0.398247629404068, "rewards/TRLRewardAdapter/mean": 0.7308738231658936, "rewards/TRLRewardAdapter/std": 0.3982475697994232, "sampling/importance_sampling_ratio/max": 1.7882709503173828, "sampling/importance_sampling_ratio/mean": 0.16929270327091217, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 10.309269905090332, "sampling/sampling_logp_difference/mean": 0.01733020506799221, "step": 453, "step_time": 353.28661172592547 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02812500111758709, "completions/max_length": 10000.0, "completions/max_terminated_length": 8889.0, "completions/mean_length": 1921.800048828125, "completions/mean_terminated_length": 1688.025634765625, "completions/min_length": 66.0, "completions/min_terminated_length": 66.0, "entropy": 0.1400022655725479, "epoch": 0.3030707610146863, "frac_reward_zero_std": 0.13333334028720856, "grad_norm": 1.5665438961663598e-05, "learning_rate": 5e-06, "loss": -0.0, "num_tokens": 1091383343.0, "reward": 0.7241469621658325, "reward_std": 0.36844944953918457, "rewards/TRLRewardAdapter/mean": 0.7241469025611877, "rewards/TRLRewardAdapter/std": 0.36844944953918457, "sampling/importance_sampling_ratio/max": 1.5347509384155273, "sampling/importance_sampling_ratio/mean": 0.13453052937984467, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 44.0, "sampling/sampling_logp_difference/mean": 0.01880502700805664, "step": 454, "step_time": 204.68793746491428 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02708333544433117, "completions/max_length": 10000.0, "completions/max_terminated_length": 9552.0, "completions/mean_length": 1676.623046875, "completions/mean_terminated_length": 1444.9228515625, "completions/min_length": 63.0, "completions/min_terminated_length": 63.0, "entropy": 0.12734531983733177, "epoch": 0.3037383177570093, "frac_reward_zero_std": 0.03333333507180214, "grad_norm": 0.00015638595355830576, "learning_rate": 5e-06, "loss": 0.0, "num_tokens": 1093457477.0, "reward": 0.7606227993965149, "reward_std": 0.34416642785072327, "rewards/TRLRewardAdapter/mean": 0.7606227993965149, "rewards/TRLRewardAdapter/std": 0.34416642785072327, "sampling/importance_sampling_ratio/max": 2.2178127765655518, "sampling/importance_sampling_ratio/mean": 0.15247192978858948, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 37.387245178222656, "sampling/sampling_logp_difference/mean": 0.017657829448580742, "step": 455, "step_time": 288.1592599049909 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02500000223517418, "completions/max_length": 10000.0, "completions/max_terminated_length": 9509.0, "completions/mean_length": 1556.300048828125, "completions/mean_terminated_length": 1339.794921875, "completions/min_length": 73.0, "completions/min_terminated_length": 73.0, "entropy": 0.12159636616706848, "epoch": 0.30440587449933243, "frac_reward_zero_std": 0.03333333507180214, "grad_norm": 0.0004494922788587261, "learning_rate": 5e-06, "loss": 0.0003, "num_tokens": 1095353125.0, "reward": 0.7639448046684265, "reward_std": 0.3388427793979645, "rewards/TRLRewardAdapter/mean": 0.7639447450637817, "rewards/TRLRewardAdapter/std": 0.3388427793979645, "sampling/importance_sampling_ratio/max": 2.3663089275360107, "sampling/importance_sampling_ratio/mean": 0.12197338044643402, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 22.125, "sampling/sampling_logp_difference/mean": 0.017055800184607506, "step": 456, "step_time": 314.4909379669698 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01145833358168602, "completions/max_length": 10000.0, "completions/max_terminated_length": 9949.0, "completions/mean_length": 1790.416748046875, "completions/mean_terminated_length": 1695.258056640625, "completions/min_length": 72.0, "completions/min_terminated_length": 72.0, "entropy": 0.13158567001422247, "epoch": 0.30507343124165553, "frac_reward_zero_std": 0.06666667014360428, "grad_norm": 0.00010546026275937773, "learning_rate": 5e-06, "loss": -0.0001, "num_tokens": 1097584629.0, "reward": 0.684446394443512, "reward_std": 0.38439276814460754, "rewards/TRLRewardAdapter/mean": 0.6844463348388672, "rewards/TRLRewardAdapter/std": 0.38439276814460754, "sampling/importance_sampling_ratio/max": 1.5260934829711914, "sampling/importance_sampling_ratio/mean": 0.121995709836483, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 8.75, "sampling/sampling_logp_difference/mean": 0.017944378778338432, "step": 457, "step_time": 179.93881693901494 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.05520833656191826, "completions/max_length": 10000.0, "completions/max_terminated_length": 7989.0, "completions/mean_length": 1717.201171875, "completions/mean_terminated_length": 1233.20068359375, "completions/min_length": 65.0, "completions/min_terminated_length": 65.0, "entropy": 0.12508745367328325, "epoch": 0.30574098798397864, "frac_reward_zero_std": 0.0, "grad_norm": 1.8664346437013276e-05, "learning_rate": 5e-06, "loss": -0.0, "num_tokens": 1099660790.0, "reward": 0.7720000147819519, "reward_std": 0.34740713238716125, "rewards/TRLRewardAdapter/mean": 0.7720000147819519, "rewards/TRLRewardAdapter/std": 0.34740710258483887, "sampling/importance_sampling_ratio/max": 2.5649466514587402, "sampling/importance_sampling_ratio/mean": 0.16928842663764954, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 12.12307071685791, "sampling/sampling_logp_difference/mean": 0.017549658194184303, "step": 458, "step_time": 366.96951941319276 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.06145833805203438, "completions/max_length": 10000.0, "completions/max_terminated_length": 9809.0, "completions/mean_length": 1993.69287109375, "completions/mean_terminated_length": 1469.4173583984375, "completions/min_length": 63.0, "completions/min_terminated_length": 63.0, "entropy": 0.13449877500534058, "epoch": 0.30640854472630175, "frac_reward_zero_std": 0.13333334028720856, "grad_norm": 1.2345129494692192e-05, "learning_rate": 5e-06, "loss": -0.0, "num_tokens": 1102017903.0, "reward": 0.7173423767089844, "reward_std": 0.39526909589767456, "rewards/TRLRewardAdapter/mean": 0.7173423171043396, "rewards/TRLRewardAdapter/std": 0.39526909589767456, "sampling/importance_sampling_ratio/max": 1.7217919826507568, "sampling/importance_sampling_ratio/mean": 0.1831737756729126, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 56.248069763183594, "sampling/sampling_logp_difference/mean": 0.01842273585498333, "step": 459, "step_time": 391.75563450180925 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0364583358168602, "completions/max_length": 10000.0, "completions/max_terminated_length": 9920.0, "completions/mean_length": 1966.901123046875, "completions/mean_terminated_length": 1662.946044921875, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.15694565822680792, "epoch": 0.30707610146862485, "frac_reward_zero_std": 0.06666667014360428, "grad_norm": 1.3812721811095862e-05, "learning_rate": 5e-06, "loss": -0.0, "num_tokens": 1104337744.0, "reward": 0.6677801012992859, "reward_std": 0.4045330882072449, "rewards/TRLRewardAdapter/mean": 0.6677800416946411, "rewards/TRLRewardAdapter/std": 0.4045330584049225, "sampling/importance_sampling_ratio/max": 1.3031691312789917, "sampling/importance_sampling_ratio/mean": 0.07409267127513885, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 31.375, "sampling/sampling_logp_difference/mean": 0.020821204409003258, "step": 460, "step_time": 237.84823096299078 }, { "epoch": 0.30707610146862485, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.015217390842735767, "eval_completions/max_length": 9784.347826086956, "eval_completions/max_terminated_length": 8277.130434782608, "eval_completions/mean_length": 780.9382390561311, "eval_completions/mean_terminated_length": 638.5773978855299, "eval_completions/min_length": 42.91304347826087, "eval_completions/min_terminated_length": 42.91304347826087, "eval_entropy": 0.12822301556234775, "eval_frac_reward_zero_std": 1.0, "eval_loss": 0.0, "eval_num_tokens": 1104337744.0, "eval_reward": 0.571580206570418, "eval_reward_std": 0.4703063252179519, "eval_rewards/TRLRewardAdapter/mean": 0.5715802182321963, "eval_rewards/TRLRewardAdapter/std": 0.47030632133069245, "eval_runtime": 1318.0746, "eval_samples_per_second": 3.464, "eval_sampling/importance_sampling_ratio/max": 2.0289980846902598, "eval_sampling/importance_sampling_ratio/mean": 0.4685443795245627, "eval_sampling/importance_sampling_ratio/min": 0.0, "eval_sampling/sampling_logp_difference/max": 9.42632730629133, "eval_sampling/sampling_logp_difference/mean": 0.017828825537277306, "eval_steps_per_second": 0.017, "step": 460 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.05416667088866234, "completions/max_length": 10000.0, "completions/max_terminated_length": 9948.0, "completions/mean_length": 2104.322998046875, "completions/mean_terminated_length": 1652.1474609375, "completions/min_length": 54.0, "completions/min_terminated_length": 54.0, "entropy": 0.1384902944167455, "epoch": 0.30774365821094796, "frac_reward_zero_std": 0.10000000894069672, "grad_norm": 0.00010786232523568076, "learning_rate": 5e-06, "loss": 0.0025, "num_tokens": 1106787174.0, "reward": 0.6643402576446533, "reward_std": 0.421670138835907, "rewards/TRLRewardAdapter/mean": 0.6643401980400085, "rewards/TRLRewardAdapter/std": 0.4216701090335846, "sampling/importance_sampling_ratio/max": 1.5634605884552002, "sampling/importance_sampling_ratio/mean": 0.14923112094402313, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 16.3624210357666, "sampling/sampling_logp_difference/mean": 0.018605276942253113, "step": 461, "step_time": 379.5581347740954 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.05104167014360428, "completions/max_length": 10000.0, "completions/max_terminated_length": 9590.0, "completions/mean_length": 2419.58447265625, "completions/mean_terminated_length": 2011.856201171875, "completions/min_length": 58.0, "completions/min_terminated_length": 58.0, "entropy": 0.14195796847343445, "epoch": 0.308411214953271, "frac_reward_zero_std": 0.1666666716337204, "grad_norm": 8.58436126508363e-05, "learning_rate": 5e-06, "loss": 0.0001, "num_tokens": 1109583831.0, "reward": 0.5997645258903503, "reward_std": 0.4448320269584656, "rewards/TRLRewardAdapter/mean": 0.5997645258903503, "rewards/TRLRewardAdapter/std": 0.4448320269584656, "sampling/importance_sampling_ratio/max": 1.634164571762085, "sampling/importance_sampling_ratio/mean": 0.1126721128821373, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 30.0, "sampling/sampling_logp_difference/mean": 0.01929285377264023, "step": 462, "step_time": 364.14394916500896 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0020833334419876337, "completions/max_length": 10000.0, "completions/max_terminated_length": 7320.0, "completions/mean_length": 892.2990112304688, "completions/mean_terminated_length": 873.2849731445312, "completions/min_length": 86.0, "completions/min_terminated_length": 86.0, "entropy": 0.13416030009587607, "epoch": 0.3090787716955941, "frac_reward_zero_std": 0.03333333507180214, "grad_norm": 7.21046808397878e-05, "learning_rate": 5e-06, "loss": 0.0, "num_tokens": 1110882678.0, "reward": 0.8008140921592712, "reward_std": 0.3337944447994232, "rewards/TRLRewardAdapter/mean": 0.8008140325546265, "rewards/TRLRewardAdapter/std": 0.3337944447994232, "sampling/importance_sampling_ratio/max": 1.9674084186553955, "sampling/importance_sampling_ratio/mean": 0.14492559432983398, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 5.637245178222656, "sampling/sampling_logp_difference/mean": 0.018747767433524132, "step": 463, "step_time": 101.94672154204454 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02187500149011612, "completions/max_length": 10000.0, "completions/max_terminated_length": 7185.0, "completions/mean_length": 866.784423828125, "completions/mean_terminated_length": 662.5271606445312, "completions/min_length": 70.0, "completions/min_terminated_length": 70.0, "entropy": 0.10746399809916814, "epoch": 0.3097463284379172, "frac_reward_zero_std": 0.06666667014360428, "grad_norm": 3.339311734557559e-05, "learning_rate": 5e-06, "loss": -0.0, "num_tokens": 1112161415.0, "reward": 0.8570549488067627, "reward_std": 0.29453879594802856, "rewards/TRLRewardAdapter/mean": 0.8570548892021179, "rewards/TRLRewardAdapter/std": 0.29453879594802856, "sampling/importance_sampling_ratio/max": 2.88474178314209, "sampling/importance_sampling_ratio/mean": 0.27043747901916504, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 4.851531982421875, "sampling/sampling_logp_difference/mean": 0.015491249971091747, "step": 464, "step_time": 267.1496363970218 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01666666753590107, "completions/max_length": 10000.0, "completions/max_terminated_length": 8257.0, "completions/mean_length": 1039.9344482421875, "completions/mean_terminated_length": 888.06884765625, "completions/min_length": 53.0, "completions/min_terminated_length": 53.0, "entropy": 0.12312941004832585, "epoch": 0.3104138851802403, "frac_reward_zero_std": 0.06666667014360428, "grad_norm": 0.00012551630905463876, "learning_rate": 5e-06, "loss": -0.0002, "num_tokens": 1113626472.0, "reward": 0.7911298871040344, "reward_std": 0.35457006096839905, "rewards/TRLRewardAdapter/mean": 0.7911298871040344, "rewards/TRLRewardAdapter/std": 0.35457003116607666, "sampling/importance_sampling_ratio/max": 1.849848747253418, "sampling/importance_sampling_ratio/mean": 0.19412913918495178, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 7.753543376922607, "sampling/sampling_logp_difference/mean": 0.018196595832705498, "step": 465, "step_time": 267.7232872841414 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01770833507180214, "completions/max_length": 10000.0, "completions/max_terminated_length": 6609.0, "completions/mean_length": 1051.3896484375, "completions/mean_terminated_length": 890.0679321289062, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "entropy": 0.1130288653075695, "epoch": 0.31108144192256343, "frac_reward_zero_std": 0.10000000894069672, "grad_norm": 0.00031192767912245937, "learning_rate": 5e-06, "loss": 0.0022, "num_tokens": 1115060734.0, "reward": 0.7763126492500305, "reward_std": 0.3531741797924042, "rewards/TRLRewardAdapter/mean": 0.7763126492500305, "rewards/TRLRewardAdapter/std": 0.3531741499900818, "sampling/importance_sampling_ratio/max": 2.111855983734131, "sampling/importance_sampling_ratio/mean": 0.22941944003105164, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 4.362421035766602, "sampling/sampling_logp_difference/mean": 0.015924865379929543, "step": 466, "step_time": 190.29956065595616 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0020833334419876337, "completions/max_length": 10000.0, "completions/max_terminated_length": 9750.0, "completions/mean_length": 1191.0447998046875, "completions/mean_terminated_length": 1172.654541015625, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "entropy": 0.13259267310301462, "epoch": 0.31174899866488653, "frac_reward_zero_std": 0.03333333507180214, "grad_norm": 8.032408783024695e-05, "learning_rate": 5e-06, "loss": 0.0, "num_tokens": 1116624009.0, "reward": 0.8118919730186462, "reward_std": 0.3052067160606384, "rewards/TRLRewardAdapter/mean": 0.8118919134140015, "rewards/TRLRewardAdapter/std": 0.30520668625831604, "sampling/importance_sampling_ratio/max": 1.6159693002700806, "sampling/importance_sampling_ratio/mean": 0.1328769028186798, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 5.390133857727051, "sampling/sampling_logp_difference/mean": 0.018322162330150604, "step": 467, "step_time": 153.91164600709453 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02187500149011612, "completions/max_length": 10000.0, "completions/max_terminated_length": 9833.0, "completions/mean_length": 1731.628173828125, "completions/mean_terminated_length": 1546.71240234375, "completions/min_length": 62.0, "completions/min_terminated_length": 62.0, "entropy": 0.13372106850147247, "epoch": 0.31241655540720964, "frac_reward_zero_std": 0.06666667014360428, "grad_norm": 3.541234190137748e-05, "learning_rate": 5e-06, "loss": -0.0001, "num_tokens": 1118723876.0, "reward": 0.760236918926239, "reward_std": 0.34924471378326416, "rewards/TRLRewardAdapter/mean": 0.7602368593215942, "rewards/TRLRewardAdapter/std": 0.34924474358558655, "sampling/importance_sampling_ratio/max": 1.7984751462936401, "sampling/importance_sampling_ratio/mean": 0.08759679645299911, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 57.5, "sampling/sampling_logp_difference/mean": 0.01825888268649578, "step": 468, "step_time": 293.61412092193495 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0020833334419876337, "completions/max_length": 10000.0, "completions/max_terminated_length": 9124.0, "completions/mean_length": 1177.4459228515625, "completions/mean_terminated_length": 1159.0272216796875, "completions/min_length": 53.0, "completions/min_terminated_length": 53.0, "entropy": 0.1421645258863767, "epoch": 0.3130841121495327, "frac_reward_zero_std": 0.03333333507180214, "grad_norm": 1.5909933848738365e-05, "learning_rate": 5e-06, "loss": -0.0, "num_tokens": 1120271312.0, "reward": 0.768653929233551, "reward_std": 0.35597506165504456, "rewards/TRLRewardAdapter/mean": 0.7686538696289062, "rewards/TRLRewardAdapter/std": 0.35597506165504456, "sampling/importance_sampling_ratio/max": 2.3194234371185303, "sampling/importance_sampling_ratio/mean": 0.1635880023241043, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 5.28125, "sampling/sampling_logp_difference/mean": 0.019662223756313324, "step": 469, "step_time": 165.14670492114965 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03750000149011612, "completions/max_length": 10000.0, "completions/max_terminated_length": 9984.0, "completions/mean_length": 1751.868896484375, "completions/mean_terminated_length": 1430.512939453125, "completions/min_length": 61.0, "completions/min_terminated_length": 61.0, "entropy": 0.1539461985230446, "epoch": 0.3137516688918558, "frac_reward_zero_std": 0.10000000894069672, "grad_norm": 0.0003032670038798173, "learning_rate": 5e-06, "loss": 0.0002, "num_tokens": 1122445330.0, "reward": 0.6834098696708679, "reward_std": 0.4167514145374298, "rewards/TRLRewardAdapter/mean": 0.6834098100662231, "rewards/TRLRewardAdapter/std": 0.4167513847351074, "sampling/importance_sampling_ratio/max": 2.1342036724090576, "sampling/importance_sampling_ratio/mean": 0.18382127583026886, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 33.5, "sampling/sampling_logp_difference/mean": 0.020521605387330055, "step": 470, "step_time": 311.40440267289523 }, { "epoch": 0.3137516688918558, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.013478260527810326, "eval_completions/max_length": 9961.260869565218, "eval_completions/max_terminated_length": 7978.304347826087, "eval_completions/mean_length": 698.9560732634171, "eval_completions/mean_terminated_length": 572.1074431046196, "eval_completions/min_length": 35.56521739130435, "eval_completions/min_terminated_length": 35.56521739130435, "eval_entropy": 0.12874137156683466, "eval_frac_reward_zero_std": 1.0, "eval_loss": 0.0, "eval_num_tokens": 1122445330.0, "eval_reward": 0.5708152224188265, "eval_reward_std": 0.4737745329089787, "eval_rewards/TRLRewardAdapter/mean": 0.5708152327848517, "eval_rewards/TRLRewardAdapter/std": 0.4737745393877444, "eval_runtime": 1347.615, "eval_samples_per_second": 3.388, "eval_sampling/importance_sampling_ratio/max": 2.0178773403167725, "eval_sampling/importance_sampling_ratio/mean": 0.5024278643338577, "eval_sampling/importance_sampling_ratio/min": 3.274328825159537e-41, "eval_sampling/sampling_logp_difference/max": 4.409424387890359, "eval_sampling/sampling_logp_difference/mean": 0.017932492915702904, "eval_steps_per_second": 0.017, "step": 470 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03333333507180214, "completions/max_length": 10000.0, "completions/max_terminated_length": 9889.0, "completions/mean_length": 1856.2969970703125, "completions/mean_terminated_length": 1575.4794921875, "completions/min_length": 58.0, "completions/min_terminated_length": 58.0, "entropy": 0.1510892411073049, "epoch": 0.3144192256341789, "frac_reward_zero_std": 0.06666667014360428, "grad_norm": 0.00011354507528444527, "learning_rate": 5e-06, "loss": 0.0001, "num_tokens": 1124673423.0, "reward": 0.7210040092468262, "reward_std": 0.3806203603744507, "rewards/TRLRewardAdapter/mean": 0.7210039496421814, "rewards/TRLRewardAdapter/std": 0.3806203305721283, "sampling/importance_sampling_ratio/max": 2.024156332015991, "sampling/importance_sampling_ratio/mean": 0.1742403209209442, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 10.277875900268555, "sampling/sampling_logp_difference/mean": 0.020530758425593376, "step": 471, "step_time": 315.2219439339824 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.07604166865348816, "completions/max_length": 10000.0, "completions/max_terminated_length": 9736.0, "completions/mean_length": 2284.821044921875, "completions/mean_terminated_length": 1649.8624267578125, "completions/min_length": 64.0, "completions/min_terminated_length": 64.0, "entropy": 0.142660786708196, "epoch": 0.315086782376502, "frac_reward_zero_std": 0.03333333507180214, "grad_norm": 0.000104906957603836, "learning_rate": 5e-06, "loss": 0.0006, "num_tokens": 1127315651.0, "reward": 0.6917271018028259, "reward_std": 0.4006955027580261, "rewards/TRLRewardAdapter/mean": 0.6917270421981812, "rewards/TRLRewardAdapter/std": 0.40069547295570374, "sampling/importance_sampling_ratio/max": 1.179449439048767, "sampling/importance_sampling_ratio/mean": 0.1331029236316681, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 13.625, "sampling/sampling_logp_difference/mean": 0.018781526014208794, "step": 472, "step_time": 327.01129676692653 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.05104167014360428, "completions/max_length": 10000.0, "completions/max_terminated_length": 9451.0, "completions/mean_length": 1777.466796875, "completions/mean_terminated_length": 1335.200927734375, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "entropy": 0.18189746141433716, "epoch": 0.3157543391188251, "frac_reward_zero_std": 0.10000000894069672, "grad_norm": 0.00013236682541343527, "learning_rate": 5e-06, "loss": -0.0, "num_tokens": 1129412003.0, "reward": 0.6694416403770447, "reward_std": 0.42682284116744995, "rewards/TRLRewardAdapter/mean": 0.6694415807723999, "rewards/TRLRewardAdapter/std": 0.42682284116744995, "sampling/importance_sampling_ratio/max": 1.5294855833053589, "sampling/importance_sampling_ratio/mean": 0.19002680480480194, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 54.213584899902344, "sampling/sampling_logp_difference/mean": 0.021680200472474098, "step": 473, "step_time": 269.9501321921125 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 10000.0, "completions/max_terminated_length": 9890.0, "completions/mean_length": 1702.425048828125, "completions/mean_terminated_length": 1434.7613525390625, "completions/min_length": 53.0, "completions/min_terminated_length": 53.0, "entropy": 0.15067343910535178, "epoch": 0.3164218958611482, "frac_reward_zero_std": 0.06666667014360428, "grad_norm": 1.3536141428764015e-05, "learning_rate": 5e-06, "loss": -0.0, "num_tokens": 1131479867.0, "reward": 0.6814439296722412, "reward_std": 0.4124011695384979, "rewards/TRLRewardAdapter/mean": 0.6814439296722412, "rewards/TRLRewardAdapter/std": 0.4124011695384979, "sampling/importance_sampling_ratio/max": 2.2327444553375244, "sampling/importance_sampling_ratio/mean": 0.13615484535694122, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 34.0, "sampling/sampling_logp_difference/mean": 0.02032286301255226, "step": 474, "step_time": 343.25317218585405 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02187500149011612, "completions/max_length": 10000.0, "completions/max_terminated_length": 9104.0, "completions/mean_length": 1564.7427978515625, "completions/mean_terminated_length": 1376.0947265625, "completions/min_length": 75.0, "completions/min_terminated_length": 75.0, "entropy": 0.14932885517676672, "epoch": 0.3170894526034713, "frac_reward_zero_std": 0.03333333507180214, "grad_norm": 0.00023649647192306938, "learning_rate": 5e-06, "loss": -0.0, "num_tokens": 1133415972.0, "reward": 0.6937757134437561, "reward_std": 0.401867538690567, "rewards/TRLRewardAdapter/mean": 0.6937757134437561, "rewards/TRLRewardAdapter/std": 0.401867538690567, "sampling/importance_sampling_ratio/max": 1.8092361688613892, "sampling/importance_sampling_ratio/mean": 0.14728862047195435, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 3.132781982421875, "sampling/sampling_logp_difference/mean": 0.0204616691917181, "step": 475, "step_time": 265.6745270539541 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00937500037252903, "completions/max_length": 10000.0, "completions/max_terminated_length": 9997.0, "completions/mean_length": 1211.6490478515625, "completions/mean_terminated_length": 1128.4783935546875, "completions/min_length": 74.0, "completions/min_terminated_length": 74.0, "entropy": 0.14256848643223444, "epoch": 0.3177570093457944, "frac_reward_zero_std": 0.03333333507180214, "grad_norm": 0.0006774781848029593, "learning_rate": 5e-06, "loss": -0.0003, "num_tokens": 1135048371.0, "reward": 0.6553782224655151, "reward_std": 0.436519593000412, "rewards/TRLRewardAdapter/mean": 0.6553782224655151, "rewards/TRLRewardAdapter/std": 0.436519593000412, "sampling/importance_sampling_ratio/max": 2.292904853820801, "sampling/importance_sampling_ratio/mean": 0.14191660284996033, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 32.75, "sampling/sampling_logp_difference/mean": 0.01995532214641571, "step": 476, "step_time": 201.82967737305444 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 10000.0, "completions/max_terminated_length": 9315.0, "completions/mean_length": 1603.5313720703125, "completions/mean_terminated_length": 1332.677490234375, "completions/min_length": 62.0, "completions/min_terminated_length": 62.0, "entropy": 0.15634546677271524, "epoch": 0.3184245660881175, "frac_reward_zero_std": 0.13333334028720856, "grad_norm": 5.224401191816429e-05, "learning_rate": 5e-06, "loss": 0.0001, "num_tokens": 1137053553.0, "reward": 0.6716458797454834, "reward_std": 0.4151684641838074, "rewards/TRLRewardAdapter/mean": 0.6716458797454834, "rewards/TRLRewardAdapter/std": 0.415168434381485, "sampling/importance_sampling_ratio/max": 2.552377939224243, "sampling/importance_sampling_ratio/mean": 0.12332174926996231, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 27.5, "sampling/sampling_logp_difference/mean": 0.020490551367402077, "step": 477, "step_time": 253.0815248900326 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5661.0, "completions/max_terminated_length": 5661.0, "completions/mean_length": 705.6260986328125, "completions/mean_terminated_length": 705.6260986328125, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "entropy": 0.13356670488913855, "epoch": 0.3190921228304406, "frac_reward_zero_std": 0.10000000894069672, "grad_norm": 5.566107996170886e-05, "learning_rate": 5e-06, "loss": -0.0, "num_tokens": 1138170154.0, "reward": 0.8253412246704102, "reward_std": 0.32158344984054565, "rewards/TRLRewardAdapter/mean": 0.8253411650657654, "rewards/TRLRewardAdapter/std": 0.32158342003822327, "sampling/importance_sampling_ratio/max": 2.365358591079712, "sampling/importance_sampling_ratio/mean": 0.19510158896446228, "sampling/importance_sampling_ratio/min": 1.401298464324817e-44, "sampling/sampling_logp_difference/max": 11.25, "sampling/sampling_logp_difference/mean": 0.01855531707406044, "step": 478, "step_time": 57.30444364901632 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01770833507180214, "completions/max_length": 10000.0, "completions/max_terminated_length": 9906.0, "completions/mean_length": 1815.885498046875, "completions/mean_terminated_length": 1668.3458251953125, "completions/min_length": 90.0, "completions/min_terminated_length": 90.0, "entropy": 0.15962567180395126, "epoch": 0.3197596795727637, "frac_reward_zero_std": 0.10000000894069672, "grad_norm": 0.0004816850332332863, "learning_rate": 5e-06, "loss": 0.0002, "num_tokens": 1140376508.0, "reward": 0.6153501868247986, "reward_std": 0.43704453110694885, "rewards/TRLRewardAdapter/mean": 0.6153501272201538, "rewards/TRLRewardAdapter/std": 0.43704453110694885, "sampling/importance_sampling_ratio/max": 2.349919080734253, "sampling/importance_sampling_ratio/mean": 0.1161067932844162, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 12.931852340698242, "sampling/sampling_logp_difference/mean": 0.02152225747704506, "step": 479, "step_time": 260.20068631717004 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.04583333432674408, "completions/max_length": 10000.0, "completions/max_terminated_length": 9778.0, "completions/mean_length": 1798.3834228515625, "completions/mean_terminated_length": 1404.419189453125, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "entropy": 0.14237653215726218, "epoch": 0.3204272363150868, "frac_reward_zero_std": 0.13333334028720856, "grad_norm": 0.0001328156433916704, "learning_rate": 5e-06, "loss": -0.0, "num_tokens": 1142483052.0, "reward": 0.6467602252960205, "reward_std": 0.42773106694221497, "rewards/TRLRewardAdapter/mean": 0.6467601656913757, "rewards/TRLRewardAdapter/std": 0.42773106694221497, "sampling/importance_sampling_ratio/max": 1.320502758026123, "sampling/importance_sampling_ratio/mean": 0.1327502429485321, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 14.5625, "sampling/sampling_logp_difference/mean": 0.01975078694522381, "step": 480, "step_time": 361.53687628300395 }, { "epoch": 0.3204272363150868, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.018478260294574757, "eval_completions/max_length": 9894.0, "eval_completions/max_terminated_length": 7243.347826086957, "eval_completions/mean_length": 726.6880254330842, "eval_completions/mean_terminated_length": 552.230921206267, "eval_completions/min_length": 30.82608695652174, "eval_completions/min_terminated_length": 30.82608695652174, "eval_entropy": 0.14289194831381674, "eval_frac_reward_zero_std": 1.0, "eval_loss": 0.0, "eval_num_tokens": 1142483052.0, "eval_reward": 0.5712977038777393, "eval_reward_std": 0.47361812125081604, "eval_rewards/TRLRewardAdapter/mean": 0.5712977220182833, "eval_rewards/TRLRewardAdapter/std": 0.4736181290253349, "eval_runtime": 1338.1691, "eval_samples_per_second": 3.412, "eval_sampling/importance_sampling_ratio/max": 2.0513690813728003, "eval_sampling/importance_sampling_ratio/mean": 0.4796307955099189, "eval_sampling/importance_sampling_ratio/min": 0.0, "eval_sampling/sampling_logp_difference/max": 9.125092449395552, "eval_sampling/sampling_logp_difference/mean": 0.019478026210613873, "eval_steps_per_second": 0.017, "step": 480 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0260416679084301, "completions/max_length": 10000.0, "completions/max_terminated_length": 9854.0, "completions/mean_length": 1614.925048828125, "completions/mean_terminated_length": 1390.7252197265625, "completions/min_length": 53.0, "completions/min_terminated_length": 53.0, "entropy": 0.15162128458420435, "epoch": 0.3210947930574099, "frac_reward_zero_std": 0.13333334028720856, "grad_norm": 0.00017029700885454345, "learning_rate": 5e-06, "loss": 0.0, "num_tokens": 1144530052.0, "reward": 0.6693063974380493, "reward_std": 0.42378881573677063, "rewards/TRLRewardAdapter/mean": 0.6693063974380493, "rewards/TRLRewardAdapter/std": 0.42378875613212585, "sampling/importance_sampling_ratio/max": 2.7290875911712646, "sampling/importance_sampling_ratio/mean": 0.15938401222229004, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 28.375, "sampling/sampling_logp_difference/mean": 0.02150644175708294, "step": 481, "step_time": 294.3449838120723 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03958333656191826, "completions/max_length": 10000.0, "completions/max_terminated_length": 8992.0, "completions/mean_length": 1380.7427978515625, "completions/mean_terminated_length": 1025.502197265625, "completions/min_length": 63.0, "completions/min_terminated_length": 63.0, "entropy": 0.16551095495621362, "epoch": 0.32176234979973295, "frac_reward_zero_std": 0.10000000894069672, "grad_norm": 0.00042823567945698813, "learning_rate": 5e-06, "loss": -0.0, "num_tokens": 1146263341.0, "reward": 0.7496243119239807, "reward_std": 0.39322158694267273, "rewards/TRLRewardAdapter/mean": 0.7496242523193359, "rewards/TRLRewardAdapter/std": 0.39322158694267273, "sampling/importance_sampling_ratio/max": 1.6557434797286987, "sampling/importance_sampling_ratio/mean": 0.13640671968460083, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 12.25, "sampling/sampling_logp_difference/mean": 0.02245541289448738, "step": 482, "step_time": 368.4741881591035 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.06145833805203438, "completions/max_length": 10000.0, "completions/max_terminated_length": 9723.0, "completions/mean_length": 1788.779296875, "completions/mean_terminated_length": 1251.08544921875, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "entropy": 0.13768712927897772, "epoch": 0.32242990654205606, "frac_reward_zero_std": 0.10000000894069672, "grad_norm": 3.557773872157847e-05, "learning_rate": 5e-06, "loss": 0.0, "num_tokens": 1148397017.0, "reward": 0.7339757680892944, "reward_std": 0.3973951041698456, "rewards/TRLRewardAdapter/mean": 0.7339757084846497, "rewards/TRLRewardAdapter/std": 0.3973951041698456, "sampling/importance_sampling_ratio/max": 1.9516383409500122, "sampling/importance_sampling_ratio/mean": 0.1830388754606247, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 22.49374771118164, "sampling/sampling_logp_difference/mean": 0.01931105926632881, "step": 483, "step_time": 334.8791346300859 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.06979166716337204, "completions/max_length": 10000.0, "completions/max_terminated_length": 8251.0, "completions/mean_length": 1740.8917236328125, "completions/mean_terminated_length": 1121.2274169921875, "completions/min_length": 73.0, "completions/min_terminated_length": 73.0, "entropy": 0.16260085006554922, "epoch": 0.32309746328437916, "frac_reward_zero_std": 0.20000001788139343, "grad_norm": 0.0013241277052487492, "learning_rate": 5e-06, "loss": 0.0002, "num_tokens": 1150536881.0, "reward": 0.6694756150245667, "reward_std": 0.4251081943511963, "rewards/TRLRewardAdapter/mean": 0.6694755554199219, "rewards/TRLRewardAdapter/std": 0.4251081943511963, "sampling/importance_sampling_ratio/max": 2.505915403366089, "sampling/importance_sampling_ratio/mean": 0.15183928608894348, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 4.75, "sampling/sampling_logp_difference/mean": 0.021897850558161736, "step": 484, "step_time": 369.33717213990167 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.04062500223517418, "completions/max_length": 10000.0, "completions/max_terminated_length": 8803.0, "completions/mean_length": 1964.58349609375, "completions/mean_terminated_length": 1624.3214111328125, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "entropy": 0.1667043293515841, "epoch": 0.32376502002670227, "frac_reward_zero_std": 0.13333334028720856, "grad_norm": 5.47995247701662e-05, "learning_rate": 5e-06, "loss": -0.0001, "num_tokens": 1152900225.0, "reward": 0.612661600112915, "reward_std": 0.4496302902698517, "rewards/TRLRewardAdapter/mean": 0.6126615405082703, "rewards/TRLRewardAdapter/std": 0.4496302604675293, "sampling/importance_sampling_ratio/max": 1.7000831365585327, "sampling/importance_sampling_ratio/mean": 0.21881048381328583, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 30.722213745117188, "sampling/sampling_logp_difference/mean": 0.022280097007751465, "step": 485, "step_time": 303.3173383260146 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0010416667209938169, "completions/max_length": 10000.0, "completions/max_terminated_length": 9068.0, "completions/mean_length": 1300.3167724609375, "completions/mean_terminated_length": 1291.2451171875, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "entropy": 0.1688236196835836, "epoch": 0.32443257676902537, "frac_reward_zero_std": 0.06666667014360428, "grad_norm": 0.00013571560736537323, "learning_rate": 5e-06, "loss": 0.0, "num_tokens": 1154589713.0, "reward": 0.7488123774528503, "reward_std": 0.350400447845459, "rewards/TRLRewardAdapter/mean": 0.7488123774528503, "rewards/TRLRewardAdapter/std": 0.350400447845459, "sampling/importance_sampling_ratio/max": 2.095289707183838, "sampling/importance_sampling_ratio/mean": 0.09110398590564728, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 15.5, "sampling/sampling_logp_difference/mean": 0.022459886968135834, "step": 486, "step_time": 131.9149736490799 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03541667014360428, "completions/max_length": 10000.0, "completions/max_terminated_length": 9809.0, "completions/mean_length": 1639.5355224609375, "completions/mean_terminated_length": 1332.563720703125, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "entropy": 0.17486637085676193, "epoch": 0.3251001335113485, "frac_reward_zero_std": 0.1666666716337204, "grad_norm": 0.000271661128660408, "learning_rate": 5e-06, "loss": -0.0, "num_tokens": 1156618515.0, "reward": 0.6212724447250366, "reward_std": 0.4425092041492462, "rewards/TRLRewardAdapter/mean": 0.6212723851203918, "rewards/TRLRewardAdapter/std": 0.4425092041492462, "sampling/importance_sampling_ratio/max": 1.2587523460388184, "sampling/importance_sampling_ratio/mean": 0.11116810888051987, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 6.137245178222656, "sampling/sampling_logp_difference/mean": 0.02250073291361332, "step": 487, "step_time": 375.1880715958541 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.04791666939854622, "completions/max_length": 10000.0, "completions/max_terminated_length": 9896.0, "completions/mean_length": 1938.41259765625, "completions/mean_terminated_length": 1532.6871337890625, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "entropy": 0.15120662997166315, "epoch": 0.3257676902536716, "frac_reward_zero_std": 0.10000000894069672, "grad_norm": 4.322489731091061e-05, "learning_rate": 5e-06, "loss": -0.0, "num_tokens": 1158889055.0, "reward": 0.7585658431053162, "reward_std": 0.3663095235824585, "rewards/TRLRewardAdapter/mean": 0.7585657835006714, "rewards/TRLRewardAdapter/std": 0.3663095235824585, "sampling/importance_sampling_ratio/max": 2.2215850353240967, "sampling/importance_sampling_ratio/mean": 0.13539783656597137, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 20.45545196533203, "sampling/sampling_logp_difference/mean": 0.02074335515499115, "step": 488, "step_time": 324.8826077438425 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.04270833730697632, "completions/max_length": 10000.0, "completions/max_terminated_length": 9819.0, "completions/mean_length": 1845.291748046875, "completions/mean_terminated_length": 1481.4798583984375, "completions/min_length": 54.0, "completions/min_terminated_length": 54.0, "entropy": 0.17812000960111618, "epoch": 0.32643524699599463, "frac_reward_zero_std": 0.10000000894069672, "grad_norm": 0.0003034182071906594, "learning_rate": 5e-06, "loss": 0.0001, "num_tokens": 1161117559.0, "reward": 0.7166981101036072, "reward_std": 0.3842087984085083, "rewards/TRLRewardAdapter/mean": 0.7166980504989624, "rewards/TRLRewardAdapter/std": 0.3842087686061859, "sampling/importance_sampling_ratio/max": 2.209094762802124, "sampling/importance_sampling_ratio/mean": 0.16884931921958923, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 48.25, "sampling/sampling_logp_difference/mean": 0.023242805153131485, "step": 489, "step_time": 282.8519260629546 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02916666865348816, "completions/max_length": 10000.0, "completions/max_terminated_length": 9769.0, "completions/mean_length": 2185.641845703125, "completions/mean_terminated_length": 1950.87548828125, "completions/min_length": 59.0, "completions/min_terminated_length": 59.0, "entropy": 0.1685851737856865, "epoch": 0.32710280373831774, "frac_reward_zero_std": 0.03333333507180214, "grad_norm": 3.500481617270771e-05, "learning_rate": 5e-06, "loss": -0.0, "num_tokens": 1163681663.0, "reward": 0.6823210716247559, "reward_std": 0.3917297422885895, "rewards/TRLRewardAdapter/mean": 0.6823210120201111, "rewards/TRLRewardAdapter/std": 0.3917297422885895, "sampling/importance_sampling_ratio/max": 2.225994348526001, "sampling/importance_sampling_ratio/mean": 0.13135415315628052, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 44.171112060546875, "sampling/sampling_logp_difference/mean": 0.021968943998217583, "step": 490, "step_time": 255.47974084422458 }, { "epoch": 0.32710280373831774, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.022173912588344967, "eval_completions/max_length": 10000.0, "eval_completions/max_terminated_length": 7395.869565217391, "eval_completions/mean_length": 832.797803795856, "eval_completions/mean_terminated_length": 625.1243246327276, "eval_completions/min_length": 32.26086956521739, "eval_completions/min_terminated_length": 32.26086956521739, "eval_entropy": 0.17291572625222412, "eval_frac_reward_zero_std": 1.0, "eval_loss": 0.0, "eval_num_tokens": 1163681663.0, "eval_reward": 0.5669787266980046, "eval_reward_std": 0.4716553053130274, "eval_rewards/TRLRewardAdapter/mean": 0.5669787422470425, "eval_rewards/TRLRewardAdapter/std": 0.4716553066087806, "eval_runtime": 1364.6959, "eval_samples_per_second": 3.346, "eval_sampling/importance_sampling_ratio/max": 2.184776731159376, "eval_sampling/importance_sampling_ratio/mean": 0.431898295879364, "eval_sampling/importance_sampling_ratio/min": 0.0, "eval_sampling/sampling_logp_difference/max": 4.023249703904857, "eval_sampling/sampling_logp_difference/mean": 0.022742077870213467, "eval_steps_per_second": 0.017, "step": 490 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 10000.0, "completions/max_terminated_length": 8113.0, "completions/mean_length": 1361.2073974609375, "completions/mean_terminated_length": 1082.53662109375, "completions/min_length": 81.0, "completions/min_terminated_length": 81.0, "entropy": 0.18182542423407236, "epoch": 0.32777036048064084, "frac_reward_zero_std": 0.10000000894069672, "grad_norm": 0.00016032138098861804, "learning_rate": 5e-06, "loss": -0.0001, "num_tokens": 1165415366.0, "reward": 0.7338240146636963, "reward_std": 0.3729579746723175, "rewards/TRLRewardAdapter/mean": 0.7338239550590515, "rewards/TRLRewardAdapter/std": 0.3729579746723175, "sampling/importance_sampling_ratio/max": 1.6528823375701904, "sampling/importance_sampling_ratio/mean": 0.12400255352258682, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 3.894009828567505, "sampling/sampling_logp_difference/mean": 0.023760691285133362, "step": 491, "step_time": 366.7387579830829 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03333333507180214, "completions/max_length": 10000.0, "completions/max_terminated_length": 9794.0, "completions/mean_length": 2044.721923828125, "completions/mean_terminated_length": 1770.4019775390625, "completions/min_length": 55.0, "completions/min_terminated_length": 55.0, "entropy": 0.2030633588631948, "epoch": 0.32843791722296395, "frac_reward_zero_std": 0.03333333507180214, "grad_norm": 5.4882387877305484e-05, "learning_rate": 5e-06, "loss": -0.0, "num_tokens": 1167833083.0, "reward": 0.68135005235672, "reward_std": 0.40752875804901123, "rewards/TRLRewardAdapter/mean": 0.6813499927520752, "rewards/TRLRewardAdapter/std": 0.40752872824668884, "sampling/importance_sampling_ratio/max": 2.6609363555908203, "sampling/importance_sampling_ratio/mean": 0.15550199151039124, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 39.889068603515625, "sampling/sampling_logp_difference/mean": 0.026530874893069267, "step": 492, "step_time": 313.59114656597376 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.06875000149011612, "completions/max_length": 10000.0, "completions/max_terminated_length": 9851.0, "completions/mean_length": 2025.478271484375, "completions/mean_terminated_length": 1436.7550048828125, "completions/min_length": 62.0, "completions/min_terminated_length": 62.0, "entropy": 0.18492119759321213, "epoch": 0.32910547396528705, "frac_reward_zero_std": 0.06666667014360428, "grad_norm": 3.6739076343970725e-05, "learning_rate": 5e-06, "loss": -0.0, "num_tokens": 1170249414.0, "reward": 0.7316636443138123, "reward_std": 0.35951629281044006, "rewards/TRLRewardAdapter/mean": 0.7316635847091675, "rewards/TRLRewardAdapter/std": 0.35951632261276245, "sampling/importance_sampling_ratio/max": 2.0611870288848877, "sampling/importance_sampling_ratio/mean": 0.10774962604045868, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 24.878543853759766, "sampling/sampling_logp_difference/mean": 0.023943258449435234, "step": 493, "step_time": 397.8149154782295 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.04479166865348816, "completions/max_length": 10000.0, "completions/max_terminated_length": 9717.0, "completions/mean_length": 2279.0615234375, "completions/mean_terminated_length": 1917.0108642578125, "completions/min_length": 85.0, "completions/min_terminated_length": 85.0, "entropy": 0.20447117338577905, "epoch": 0.32977303070761016, "frac_reward_zero_std": 0.03333333507180214, "grad_norm": 2.2690763442714426e-05, "learning_rate": 5e-06, "loss": -0.0, "num_tokens": 1172864289.0, "reward": 0.6363876461982727, "reward_std": 0.4154975712299347, "rewards/TRLRewardAdapter/mean": 0.6363876461982727, "rewards/TRLRewardAdapter/std": 0.4154975712299347, "sampling/importance_sampling_ratio/max": 2.3018267154693604, "sampling/importance_sampling_ratio/mean": 0.0437544621527195, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 50.5, "sampling/sampling_logp_difference/mean": 0.026322027668356895, "step": 494, "step_time": 246.00137807813007 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03333333507180214, "completions/max_length": 10000.0, "completions/max_terminated_length": 9589.0, "completions/mean_length": 1569.494873046875, "completions/mean_terminated_length": 1278.7877197265625, "completions/min_length": 41.0, "completions/min_terminated_length": 41.0, "entropy": 0.17804802457491556, "epoch": 0.33044058744993327, "frac_reward_zero_std": 0.13333334028720856, "grad_norm": 1.9141791842313875e-05, "learning_rate": 5e-06, "loss": -0.0, "num_tokens": 1174856700.0, "reward": 0.7421854138374329, "reward_std": 0.3740813732147217, "rewards/TRLRewardAdapter/mean": 0.7421854138374329, "rewards/TRLRewardAdapter/std": 0.3740813732147217, "sampling/importance_sampling_ratio/max": 1.8141990900039673, "sampling/importance_sampling_ratio/mean": 0.24616733193397522, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 27.263917922973633, "sampling/sampling_logp_difference/mean": 0.023210005834698677, "step": 495, "step_time": 295.112463326077 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.07395833730697632, "completions/max_length": 10000.0, "completions/max_terminated_length": 9743.0, "completions/mean_length": 1846.029296875, "completions/mean_terminated_length": 1194.812255859375, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "entropy": 0.16553605844577154, "epoch": 0.3311081441922563, "frac_reward_zero_std": 0.06666667014360428, "grad_norm": 0.000288083374926876, "learning_rate": 5e-06, "loss": -0.0001, "num_tokens": 1177084376.0, "reward": 0.7757378816604614, "reward_std": 0.3538806438446045, "rewards/TRLRewardAdapter/mean": 0.7757378220558167, "rewards/TRLRewardAdapter/std": 0.3538806140422821, "sampling/importance_sampling_ratio/max": 2.247955083847046, "sampling/importance_sampling_ratio/mean": 0.13600388169288635, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 23.496212005615234, "sampling/sampling_logp_difference/mean": 0.02224390208721161, "step": 496, "step_time": 404.9773743778933 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.05312500149011612, "completions/max_length": 10000.0, "completions/max_terminated_length": 9938.0, "completions/mean_length": 1942.1021728515625, "completions/mean_terminated_length": 1490.0087890625, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "entropy": 0.18828566372394562, "epoch": 0.3317757009345794, "frac_reward_zero_std": 0.03333333507180214, "grad_norm": 0.00021840255756535652, "learning_rate": 5e-06, "loss": 0.0, "num_tokens": 1179410682.0, "reward": 0.7472516894340515, "reward_std": 0.3541564345359802, "rewards/TRLRewardAdapter/mean": 0.7472516298294067, "rewards/TRLRewardAdapter/std": 0.3541564345359802, "sampling/importance_sampling_ratio/max": 2.0484161376953125, "sampling/importance_sampling_ratio/mean": 0.13096053898334503, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 30.68878746032715, "sampling/sampling_logp_difference/mean": 0.024409810081124306, "step": 497, "step_time": 264.54987795301713 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.05625000223517418, "completions/max_length": 10000.0, "completions/max_terminated_length": 8435.0, "completions/mean_length": 2150.826171875, "completions/mean_terminated_length": 1682.9945068359375, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "entropy": 0.20991143584251404, "epoch": 0.3324432576769025, "frac_reward_zero_std": 0.03333333507180214, "grad_norm": 0.00013516845346626404, "learning_rate": 5e-06, "loss": -0.0001, "num_tokens": 1181981907.0, "reward": 0.6519324779510498, "reward_std": 0.4050239622592926, "rewards/TRLRewardAdapter/mean": 0.6519324779510498, "rewards/TRLRewardAdapter/std": 0.4050239324569702, "sampling/importance_sampling_ratio/max": 1.771150827407837, "sampling/importance_sampling_ratio/mean": 0.08447426557540894, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 11.625, "sampling/sampling_logp_difference/mean": 0.026558799669146538, "step": 498, "step_time": 371.5339455730282 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0416666679084301, "completions/max_length": 10000.0, "completions/max_terminated_length": 8846.0, "completions/mean_length": 1665.690673828125, "completions/mean_terminated_length": 1303.329345703125, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "entropy": 0.2112407311797142, "epoch": 0.33311081441922563, "frac_reward_zero_std": 0.06666667014360428, "grad_norm": 4.4276765332261786e-05, "learning_rate": 5e-06, "loss": -0.0, "num_tokens": 1184008522.0, "reward": 0.7310296893119812, "reward_std": 0.37727856636047363, "rewards/TRLRewardAdapter/mean": 0.7310296297073364, "rewards/TRLRewardAdapter/std": 0.37727856636047363, "sampling/importance_sampling_ratio/max": 1.3328044414520264, "sampling/importance_sampling_ratio/mean": 0.11056562513113022, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 46.501155853271484, "sampling/sampling_logp_difference/mean": 0.02712889388203621, "step": 499, "step_time": 306.5680051088566 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0468750037252903, "completions/max_length": 10000.0, "completions/max_terminated_length": 9294.0, "completions/mean_length": 2230.22314453125, "completions/mean_terminated_length": 1848.102783203125, "completions/min_length": 72.0, "completions/min_terminated_length": 72.0, "entropy": 0.19834584991137186, "epoch": 0.33377837116154874, "frac_reward_zero_std": 0.06666667014360428, "grad_norm": 7.498762571817365e-06, "learning_rate": 5e-06, "loss": -0.0, "num_tokens": 1186610048.0, "reward": 0.6966833472251892, "reward_std": 0.3858523964881897, "rewards/TRLRewardAdapter/mean": 0.6966832876205444, "rewards/TRLRewardAdapter/std": 0.3858523368835449, "sampling/importance_sampling_ratio/max": 1.9350123405456543, "sampling/importance_sampling_ratio/mean": 0.15264973044395447, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 45.5, "sampling/sampling_logp_difference/mean": 0.025467975065112114, "step": 500, "step_time": 281.1144681678852 }, { "epoch": 0.33377837116154874, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.021739129867890606, "eval_completions/max_length": 10000.0, "eval_completions/max_terminated_length": 7512.826086956522, "eval_completions/mean_length": 870.1614990234375, "eval_completions/mean_terminated_length": 667.5173154084579, "eval_completions/min_length": 31.434782608695652, "eval_completions/min_terminated_length": 31.434782608695652, "eval_entropy": 0.18571184834708337, "eval_frac_reward_zero_std": 1.0, "eval_loss": 0.0, "eval_num_tokens": 1186610048.0, "eval_reward": 0.567176430121712, "eval_reward_std": 0.46998561853947846, "eval_rewards/TRLRewardAdapter/mean": 0.5671764430792435, "eval_rewards/TRLRewardAdapter/std": 0.4699856263139974, "eval_runtime": 1379.4519, "eval_samples_per_second": 3.31, "eval_sampling/importance_sampling_ratio/max": 2.1152656078338623, "eval_sampling/importance_sampling_ratio/mean": 0.4131690017555071, "eval_sampling/importance_sampling_ratio/min": 0.0, "eval_sampling/sampling_logp_difference/max": 9.087159348570783, "eval_sampling/sampling_logp_difference/mean": 0.024203084328252335, "eval_steps_per_second": 0.017, "step": 500 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03958333656191826, "completions/max_length": 10000.0, "completions/max_terminated_length": 9399.0, "completions/mean_length": 2028.80224609375, "completions/mean_terminated_length": 1700.2711181640625, "completions/min_length": 61.0, "completions/min_terminated_length": 61.0, "entropy": 0.20123462627331415, "epoch": 0.33444592790387184, "frac_reward_zero_std": 0.20000001788139343, "grad_norm": 0.00015914781274468866, "learning_rate": 5e-06, "loss": -0.0, "num_tokens": 1189002370.0, "reward": 0.5788272619247437, "reward_std": 0.4487795829772949, "rewards/TRLRewardAdapter/mean": 0.5788272023200989, "rewards/TRLRewardAdapter/std": 0.4487795829772949, "sampling/importance_sampling_ratio/max": 2.1653690338134766, "sampling/importance_sampling_ratio/mean": 0.09585600346326828, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 34.0, "sampling/sampling_logp_difference/mean": 0.025316888466477394, "step": 501, "step_time": 319.4242523369612 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02708333544433117, "completions/max_length": 10000.0, "completions/max_terminated_length": 9947.0, "completions/mean_length": 2193.40625, "completions/mean_terminated_length": 1976.092041015625, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "entropy": 0.19405356297890344, "epoch": 0.33511348464619495, "frac_reward_zero_std": 0.06666667014360428, "grad_norm": 1.7184927868837977e-05, "learning_rate": 5e-06, "loss": -0.0, "num_tokens": 1191553000.0, "reward": 0.6868009567260742, "reward_std": 0.3953346610069275, "rewards/TRLRewardAdapter/mean": 0.6868008971214294, "rewards/TRLRewardAdapter/std": 0.3953346908092499, "sampling/importance_sampling_ratio/max": 1.9082674980163574, "sampling/importance_sampling_ratio/mean": 0.1284399926662445, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 6.208865165710449, "sampling/sampling_logp_difference/mean": 0.02595978043973446, "step": 502, "step_time": 262.2893954968313 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03750000149011612, "completions/max_length": 10000.0, "completions/max_terminated_length": 9602.0, "completions/mean_length": 1912.38134765625, "completions/mean_terminated_length": 1597.2791748046875, "completions/min_length": 78.0, "completions/min_terminated_length": 78.0, "entropy": 0.2041697899500529, "epoch": 0.335781041388518, "frac_reward_zero_std": 0.06666667014360428, "grad_norm": 8.967905017478174e-05, "learning_rate": 5e-06, "loss": -0.0001, "num_tokens": 1193829430.0, "reward": 0.6172937750816345, "reward_std": 0.43361786007881165, "rewards/TRLRewardAdapter/mean": 0.6172937154769897, "rewards/TRLRewardAdapter/std": 0.43361786007881165, "sampling/importance_sampling_ratio/max": 1.8688385486602783, "sampling/importance_sampling_ratio/mean": 0.09471754729747772, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 53.387245178222656, "sampling/sampling_logp_difference/mean": 0.026471436023712158, "step": 503, "step_time": 306.95913217705674 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01875000074505806, "completions/max_length": 10000.0, "completions/max_terminated_length": 9593.0, "completions/mean_length": 1446.2740478515625, "completions/mean_terminated_length": 1282.8270263671875, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "entropy": 0.2038698469599088, "epoch": 0.3364485981308411, "frac_reward_zero_std": 0.1666666716337204, "grad_norm": 0.0003269919567682824, "learning_rate": 5e-06, "loss": 0.0001, "num_tokens": 1195675229.0, "reward": 0.6155627369880676, "reward_std": 0.4433470368385315, "rewards/TRLRewardAdapter/mean": 0.6155626773834229, "rewards/TRLRewardAdapter/std": 0.4433470368385315, "sampling/importance_sampling_ratio/max": 1.3217064142227173, "sampling/importance_sampling_ratio/mean": 0.113059401512146, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 15.809269905090332, "sampling/sampling_logp_difference/mean": 0.026238638907670975, "step": 504, "step_time": 270.0080957141472 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01875000074505806, "completions/max_length": 10000.0, "completions/max_terminated_length": 8515.0, "completions/mean_length": 1478.4114990234375, "completions/mean_terminated_length": 1315.57861328125, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "entropy": 0.18830319245656332, "epoch": 0.3371161548731642, "frac_reward_zero_std": 0.06666667014360428, "grad_norm": 0.00031163463839482786, "learning_rate": 5e-06, "loss": 0.0002, "num_tokens": 1197506600.0, "reward": 0.7263709306716919, "reward_std": 0.38845887780189514, "rewards/TRLRewardAdapter/mean": 0.7263708710670471, "rewards/TRLRewardAdapter/std": 0.38845884799957275, "sampling/importance_sampling_ratio/max": 1.853257417678833, "sampling/importance_sampling_ratio/mean": 0.1488519161939621, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 18.306852340698242, "sampling/sampling_logp_difference/mean": 0.0248336773365736, "step": 505, "step_time": 165.64339051698335 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.05312500149011612, "completions/max_length": 10000.0, "completions/max_terminated_length": 8013.0, "completions/mean_length": 2270.3564453125, "completions/mean_terminated_length": 1836.6798095703125, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "entropy": 0.18708030382792154, "epoch": 0.3377837116154873, "frac_reward_zero_std": 0.23333334922790527, "grad_norm": 7.193888628642462e-05, "learning_rate": 5e-06, "loss": 0.0, "num_tokens": 1200142718.0, "reward": 0.5758631825447083, "reward_std": 0.4434363543987274, "rewards/TRLRewardAdapter/mean": 0.5758631229400635, "rewards/TRLRewardAdapter/std": 0.44343632459640503, "sampling/importance_sampling_ratio/max": 2.2239315509796143, "sampling/importance_sampling_ratio/mean": 0.11157344281673431, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 45.75, "sampling/sampling_logp_difference/mean": 0.024317843839526176, "step": 506, "step_time": 353.1430842189584 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08229167014360428, "completions/max_length": 10000.0, "completions/max_terminated_length": 8967.0, "completions/mean_length": 2076.588623046875, "completions/mean_terminated_length": 1366.0897216796875, "completions/min_length": 49.0, "completions/min_terminated_length": 49.0, "entropy": 0.18312927087148032, "epoch": 0.3384512683578104, "frac_reward_zero_std": 0.10000000894069672, "grad_norm": 6.685991892466458e-05, "learning_rate": 5e-06, "loss": -0.0, "num_tokens": 1202631027.0, "reward": 0.6962519884109497, "reward_std": 0.4005505442619324, "rewards/TRLRewardAdapter/mean": 0.6962519288063049, "rewards/TRLRewardAdapter/std": 0.40055051445961, "sampling/importance_sampling_ratio/max": 1.4643410444259644, "sampling/importance_sampling_ratio/mean": 0.17876949906349182, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 52.78517150878906, "sampling/sampling_logp_difference/mean": 0.02407976984977722, "step": 507, "step_time": 386.4537497139536 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.05000000447034836, "completions/max_length": 10000.0, "completions/max_terminated_length": 9229.0, "completions/mean_length": 2135.103271484375, "completions/mean_terminated_length": 1721.1612548828125, "completions/min_length": 80.0, "completions/min_terminated_length": 80.0, "entropy": 0.19933857520421347, "epoch": 0.3391188251001335, "frac_reward_zero_std": 0.06666667014360428, "grad_norm": 1.0644788810883908e-05, "learning_rate": 5e-06, "loss": -0.0, "num_tokens": 1205161558.0, "reward": 0.6661067605018616, "reward_std": 0.41371873021125793, "rewards/TRLRewardAdapter/mean": 0.6661067605018616, "rewards/TRLRewardAdapter/std": 0.41371870040893555, "sampling/importance_sampling_ratio/max": 1.50993013381958, "sampling/importance_sampling_ratio/mean": 0.08696932345628738, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 27.296110153198242, "sampling/sampling_logp_difference/mean": 0.02537660114467144, "step": 508, "step_time": 285.44751247100066 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03229166939854622, "completions/max_length": 10000.0, "completions/max_terminated_length": 8854.0, "completions/mean_length": 1898.9573974609375, "completions/mean_terminated_length": 1628.6318359375, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "entropy": 0.18634562442700067, "epoch": 0.33978638184245663, "frac_reward_zero_std": 0.03333333507180214, "grad_norm": 2.0022182075979113e-05, "learning_rate": 5e-06, "loss": -0.0, "num_tokens": 1207441869.0, "reward": 0.7299714684486389, "reward_std": 0.3621157705783844, "rewards/TRLRewardAdapter/mean": 0.7299714684486389, "rewards/TRLRewardAdapter/std": 0.3621158003807068, "sampling/importance_sampling_ratio/max": 1.9583896398544312, "sampling/importance_sampling_ratio/mean": 0.1107674390077591, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 7.75, "sampling/sampling_logp_difference/mean": 0.023996898904442787, "step": 509, "step_time": 348.94722513016313 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.07604166865348816, "completions/max_length": 10000.0, "completions/max_terminated_length": 9999.0, "completions/mean_length": 2581.002197265625, "completions/mean_terminated_length": 1970.4193115234375, "completions/min_length": 40.0, "completions/min_terminated_length": 40.0, "entropy": 0.17358088244994482, "epoch": 0.3404539385847797, "frac_reward_zero_std": 0.10000000894069672, "grad_norm": 0.00012152300937440678, "learning_rate": 5e-06, "loss": -0.0, "num_tokens": 1210360495.0, "reward": 0.6781449913978577, "reward_std": 0.388424813747406, "rewards/TRLRewardAdapter/mean": 0.6781449913978577, "rewards/TRLRewardAdapter/std": 0.3884248435497284, "sampling/importance_sampling_ratio/max": 2.429227590560913, "sampling/importance_sampling_ratio/mean": 0.17000862956047058, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 12.0, "sampling/sampling_logp_difference/mean": 0.02301516756415367, "step": 510, "step_time": 390.408548056148 }, { "epoch": 0.3404539385847797, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.019347825533022053, "eval_completions/max_length": 10000.0, "eval_completions/max_terminated_length": 7568.260869565217, "eval_completions/mean_length": 863.2612835427989, "eval_completions/mean_terminated_length": 683.077261219854, "eval_completions/min_length": 30.782608695652176, "eval_completions/min_terminated_length": 30.782608695652176, "eval_entropy": 0.18829013148079748, "eval_frac_reward_zero_std": 1.0, "eval_loss": 0.0, "eval_num_tokens": 1210360495.0, "eval_reward": 0.5655863375767417, "eval_reward_std": 0.46955289659292804, "eval_rewards/TRLRewardAdapter/mean": 0.5655863492385201, "eval_rewards/TRLRewardAdapter/std": 0.46955290566320007, "eval_runtime": 1368.6676, "eval_samples_per_second": 3.336, "eval_sampling/importance_sampling_ratio/max": 1.8836352099543032, "eval_sampling/importance_sampling_ratio/mean": 0.3961800103602202, "eval_sampling/importance_sampling_ratio/min": 0.0, "eval_sampling/sampling_logp_difference/max": 9.716563338818757, "eval_sampling/sampling_logp_difference/mean": 0.024459308742181114, "eval_steps_per_second": 0.017, "step": 510 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.05520833656191826, "completions/max_length": 10000.0, "completions/max_terminated_length": 9773.0, "completions/mean_length": 2533.236572265625, "completions/mean_terminated_length": 2096.920654296875, "completions/min_length": 68.0, "completions/min_terminated_length": 68.0, "entropy": 0.19366666674613953, "epoch": 0.3411214953271028, "frac_reward_zero_std": 0.06666667014360428, "grad_norm": 3.416107787071817e-05, "learning_rate": 5e-06, "loss": -0.0, "num_tokens": 1213258706.0, "reward": 0.629569411277771, "reward_std": 0.4287901520729065, "rewards/TRLRewardAdapter/mean": 0.6295693516731262, "rewards/TRLRewardAdapter/std": 0.4287901520729065, "sampling/importance_sampling_ratio/max": 2.2922885417938232, "sampling/importance_sampling_ratio/mean": 0.0855657085776329, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 6.0, "sampling/sampling_logp_difference/mean": 0.024603839963674545, "step": 511, "step_time": 391.0721011067508 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.04479166865348816, "completions/max_length": 10000.0, "completions/max_terminated_length": 9954.0, "completions/mean_length": 1617.4959716796875, "completions/mean_terminated_length": 1224.423095703125, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "entropy": 0.16873174657424292, "epoch": 0.3417890520694259, "frac_reward_zero_std": 0.10000000894069672, "grad_norm": 3.014923989029216e-05, "learning_rate": 5e-06, "loss": -0.0, "num_tokens": 1215233806.0, "reward": 0.710712194442749, "reward_std": 0.40057966113090515, "rewards/TRLRewardAdapter/mean": 0.710712194442749, "rewards/TRLRewardAdapter/std": 0.40057969093322754, "sampling/importance_sampling_ratio/max": 1.283668875694275, "sampling/importance_sampling_ratio/mean": 0.13144078850746155, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 51.101531982421875, "sampling/sampling_logp_difference/mean": 0.022480199113488197, "step": 512, "step_time": 350.044991797884 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02291666716337204, "completions/max_length": 10000.0, "completions/max_terminated_length": 8968.0, "completions/mean_length": 1447.596923828125, "completions/mean_terminated_length": 1247.0074462890625, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "entropy": 0.18492172410090765, "epoch": 0.342456608811749, "frac_reward_zero_std": 0.06666667014360428, "grad_norm": 0.0004935184676289702, "learning_rate": 5e-06, "loss": -0.0004, "num_tokens": 1217037227.0, "reward": 0.7428246140480042, "reward_std": 0.36879128217697144, "rewards/TRLRewardAdapter/mean": 0.7428245544433594, "rewards/TRLRewardAdapter/std": 0.36879125237464905, "sampling/importance_sampling_ratio/max": 1.776299238204956, "sampling/importance_sampling_ratio/mean": 0.1434594839811325, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 28.75, "sampling/sampling_logp_difference/mean": 0.02371971867978573, "step": 513, "step_time": 250.91035283985548 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.06041666865348816, "completions/max_length": 10000.0, "completions/max_terminated_length": 9896.0, "completions/mean_length": 1854.072998046875, "completions/mean_terminated_length": 1330.277099609375, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "entropy": 0.16488241404294968, "epoch": 0.3431241655540721, "frac_reward_zero_std": 0.06666667014360428, "grad_norm": 0.000195408626433357, "learning_rate": 5e-06, "loss": 0.0003, "num_tokens": 1219273681.0, "reward": 0.7614519596099854, "reward_std": 0.35473597049713135, "rewards/TRLRewardAdapter/mean": 0.7614519000053406, "rewards/TRLRewardAdapter/std": 0.35473594069480896, "sampling/importance_sampling_ratio/max": 1.802841067314148, "sampling/importance_sampling_ratio/mean": 0.10388629138469696, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 10.656776428222656, "sampling/sampling_logp_difference/mean": 0.021793192252516747, "step": 514, "step_time": 339.7424647809239 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03958333656191826, "completions/max_length": 10000.0, "completions/max_terminated_length": 9274.0, "completions/mean_length": 1998.52197265625, "completions/mean_terminated_length": 1668.742919921875, "completions/min_length": 49.0, "completions/min_terminated_length": 49.0, "entropy": 0.19217066218455633, "epoch": 0.3437917222963952, "frac_reward_zero_std": 0.1666666716337204, "grad_norm": 2.289986638682444e-05, "learning_rate": 5e-06, "loss": 0.0, "num_tokens": 1221689286.0, "reward": 0.6287978291511536, "reward_std": 0.42524704337120056, "rewards/TRLRewardAdapter/mean": 0.6287977695465088, "rewards/TRLRewardAdapter/std": 0.42524704337120056, "sampling/importance_sampling_ratio/max": 1.6907435655593872, "sampling/importance_sampling_ratio/mean": 0.15440590679645538, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 41.25, "sampling/sampling_logp_difference/mean": 0.024767789989709854, "step": 515, "step_time": 301.15365467604715 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0468750037252903, "completions/max_length": 10000.0, "completions/max_terminated_length": 9732.0, "completions/mean_length": 2350.22314453125, "completions/mean_terminated_length": 1974.00439453125, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "entropy": 0.18724621335665384, "epoch": 0.3444592790387183, "frac_reward_zero_std": 0.20000001788139343, "grad_norm": 4.432799709386719e-05, "learning_rate": 5e-06, "loss": -0.0, "num_tokens": 1224423036.0, "reward": 0.5615423917770386, "reward_std": 0.44386330246925354, "rewards/TRLRewardAdapter/mean": 0.5615423917770386, "rewards/TRLRewardAdapter/std": 0.44386327266693115, "sampling/importance_sampling_ratio/max": 1.9699798822402954, "sampling/importance_sampling_ratio/mean": 0.10968831926584244, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 46.498069763183594, "sampling/sampling_logp_difference/mean": 0.024201322346925735, "step": 516, "step_time": 377.75782835518476 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03854166716337204, "completions/max_length": 10000.0, "completions/max_terminated_length": 9888.0, "completions/mean_length": 1669.3594970703125, "completions/mean_terminated_length": 1335.4117431640625, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "entropy": 0.1748036121328672, "epoch": 0.34512683578104136, "frac_reward_zero_std": 0.13333334028720856, "grad_norm": 1.966825654064541e-05, "learning_rate": 5e-06, "loss": -0.0, "num_tokens": 1226502005.0, "reward": 0.7609049081802368, "reward_std": 0.3696092963218689, "rewards/TRLRewardAdapter/mean": 0.7609049081802368, "rewards/TRLRewardAdapter/std": 0.3696092665195465, "sampling/importance_sampling_ratio/max": 2.625661611557007, "sampling/importance_sampling_ratio/mean": 0.20420432090759277, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 14.25, "sampling/sampling_logp_difference/mean": 0.023668739944696426, "step": 517, "step_time": 249.48678450402804 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.12083333730697632, "completions/max_length": 10000.0, "completions/max_terminated_length": 9930.0, "completions/mean_length": 2636.41357421875, "completions/mean_terminated_length": 1624.356689453125, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "entropy": 0.19055751462777457, "epoch": 0.34579439252336447, "frac_reward_zero_std": 0.20000001788139343, "grad_norm": 3.06366874183604e-05, "learning_rate": 5e-06, "loss": 0.0, "num_tokens": 1229466754.0, "reward": 0.6298866868019104, "reward_std": 0.4351421892642975, "rewards/TRLRewardAdapter/mean": 0.6298866271972656, "rewards/TRLRewardAdapter/std": 0.4351422190666199, "sampling/importance_sampling_ratio/max": 2.0538835525512695, "sampling/importance_sampling_ratio/mean": 0.13617494702339172, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 50.80926513671875, "sampling/sampling_logp_difference/mean": 0.024552464485168457, "step": 518, "step_time": 388.33889812708367 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.05312500149011612, "completions/max_length": 10000.0, "completions/max_terminated_length": 9554.0, "completions/mean_length": 1885.095947265625, "completions/mean_terminated_length": 1429.80419921875, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "entropy": 0.18462053189675012, "epoch": 0.3464619492656876, "frac_reward_zero_std": 0.23333334922790527, "grad_norm": 3.3130901198188865e-05, "learning_rate": 5e-06, "loss": -0.0, "num_tokens": 1231733374.0, "reward": 0.565338134765625, "reward_std": 0.4513801336288452, "rewards/TRLRewardAdapter/mean": 0.565338134765625, "rewards/TRLRewardAdapter/std": 0.4513801336288452, "sampling/importance_sampling_ratio/max": 1.9078989028930664, "sampling/importance_sampling_ratio/mean": 0.16893965005874634, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 12.5, "sampling/sampling_logp_difference/mean": 0.024366222321987152, "step": 519, "step_time": 356.5724942620145 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.004166666883975267, "completions/max_length": 10000.0, "completions/max_terminated_length": 9339.0, "completions/mean_length": 1211.572998046875, "completions/mean_terminated_length": 1174.80126953125, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "entropy": 0.1634524886806806, "epoch": 0.3471295060080107, "frac_reward_zero_std": 0.13333334028720856, "grad_norm": 0.0005110728881814056, "learning_rate": 5e-06, "loss": 0.0008, "num_tokens": 1233368068.0, "reward": 0.7740312814712524, "reward_std": 0.3650806248188019, "rewards/TRLRewardAdapter/mean": 0.7740312814712524, "rewards/TRLRewardAdapter/std": 0.3650806248188019, "sampling/importance_sampling_ratio/max": 2.122714042663574, "sampling/importance_sampling_ratio/mean": 0.18235823512077332, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 13.0, "sampling/sampling_logp_difference/mean": 0.021696632727980614, "step": 520, "step_time": 176.200779470033 }, { "epoch": 0.3471295060080107, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.017391303918607857, "eval_completions/max_length": 10000.0, "eval_completions/max_terminated_length": 7485.217391304348, "eval_completions/mean_length": 755.146288001019, "eval_completions/mean_terminated_length": 591.6506666100544, "eval_completions/min_length": 28.347826086956523, "eval_completions/min_terminated_length": 28.347826086956523, "eval_entropy": 0.1689087081214656, "eval_frac_reward_zero_std": 1.0, "eval_loss": 0.0, "eval_num_tokens": 1233368068.0, "eval_reward": 0.5673869485440461, "eval_reward_std": 0.4722440838813782, "eval_rewards/TRLRewardAdapter/mean": 0.5673869550228119, "eval_rewards/TRLRewardAdapter/std": 0.47224409165589704, "eval_runtime": 1364.8496, "eval_samples_per_second": 3.345, "eval_sampling/importance_sampling_ratio/max": 1.986682451289633, "eval_sampling/importance_sampling_ratio/mean": 0.4681600034236908, "eval_sampling/importance_sampling_ratio/min": 0.0, "eval_sampling/sampling_logp_difference/max": 8.37354390517525, "eval_sampling/sampling_logp_difference/mean": 0.02245210248814977, "eval_steps_per_second": 0.017, "step": 520 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.04479166865348816, "completions/max_length": 10000.0, "completions/max_terminated_length": 9981.0, "completions/mean_length": 1900.279296875, "completions/mean_terminated_length": 1520.4666748046875, "completions/min_length": 64.0, "completions/min_terminated_length": 64.0, "entropy": 0.17130400985479355, "epoch": 0.3477970627503338, "frac_reward_zero_std": 0.06666667014360428, "grad_norm": 6.493275683999784e-05, "learning_rate": 5e-06, "loss": -0.0, "num_tokens": 1235615376.0, "reward": 0.7285114526748657, "reward_std": 0.3713792860507965, "rewards/TRLRewardAdapter/mean": 0.7285114526748657, "rewards/TRLRewardAdapter/std": 0.3713792562484741, "sampling/importance_sampling_ratio/max": 2.988496780395508, "sampling/importance_sampling_ratio/mean": 0.12787823379039764, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 49.0, "sampling/sampling_logp_difference/mean": 0.02194860205054283, "step": 521, "step_time": 241.62214529025368 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.05416667088866234, "completions/max_length": 10000.0, "completions/max_terminated_length": 9935.0, "completions/mean_length": 2118.213623046875, "completions/mean_terminated_length": 1666.8336181640625, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "entropy": 0.19350279619296393, "epoch": 0.3484646194926569, "frac_reward_zero_std": 0.13333334028720856, "grad_norm": 4.114477254596894e-05, "learning_rate": 5e-06, "loss": -0.0, "num_tokens": 1238128285.0, "reward": 0.6687093377113342, "reward_std": 0.40229013562202454, "rewards/TRLRewardAdapter/mean": 0.6687093377113342, "rewards/TRLRewardAdapter/std": 0.4022901654243469, "sampling/importance_sampling_ratio/max": 2.1254842281341553, "sampling/importance_sampling_ratio/mean": 0.1345578134059906, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 16.34498405456543, "sampling/sampling_logp_difference/mean": 0.024475490674376488, "step": 522, "step_time": 283.9742278229678 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.07604166865348816, "completions/max_length": 10000.0, "completions/max_terminated_length": 9842.0, "completions/mean_length": 1857.830322265625, "completions/mean_terminated_length": 1187.73046875, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "entropy": 0.165101687113444, "epoch": 0.34913217623498, "frac_reward_zero_std": 0.13333334028720856, "grad_norm": 7.375667768717149e-05, "learning_rate": 5e-06, "loss": -0.0, "num_tokens": 1240353690.0, "reward": 0.6953634023666382, "reward_std": 0.42165306210517883, "rewards/TRLRewardAdapter/mean": 0.6953633427619934, "rewards/TRLRewardAdapter/std": 0.42165306210517883, "sampling/importance_sampling_ratio/max": 2.5020172595977783, "sampling/importance_sampling_ratio/mean": 0.19075576961040497, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 23.25, "sampling/sampling_logp_difference/mean": 0.021771913394331932, "step": 523, "step_time": 394.8803025800735 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.06041666865348816, "completions/max_length": 10000.0, "completions/max_terminated_length": 9729.0, "completions/mean_length": 1728.0167236328125, "completions/mean_terminated_length": 1196.115234375, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "entropy": 0.16083335628112158, "epoch": 0.34979973297730305, "frac_reward_zero_std": 0.10000000894069672, "grad_norm": 3.556702692923404e-05, "learning_rate": 5e-06, "loss": -0.0, "num_tokens": 1242444426.0, "reward": 0.6920388340950012, "reward_std": 0.4172891080379486, "rewards/TRLRewardAdapter/mean": 0.6920387744903564, "rewards/TRLRewardAdapter/std": 0.4172890782356262, "sampling/importance_sampling_ratio/max": 2.0222225189208984, "sampling/importance_sampling_ratio/mean": 0.2319495677947998, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 19.38402557373047, "sampling/sampling_logp_difference/mean": 0.021689817309379578, "step": 524, "step_time": 383.51149913086556 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02500000223517418, "completions/max_length": 10000.0, "completions/max_terminated_length": 7714.0, "completions/mean_length": 1225.237548828125, "completions/mean_terminated_length": 1000.24365234375, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "entropy": 0.16664718836545944, "epoch": 0.35046728971962615, "frac_reward_zero_std": 0.10000000894069672, "grad_norm": 0.00010745414928151397, "learning_rate": 5e-06, "loss": -0.0001, "num_tokens": 1244065486.0, "reward": 0.7586848139762878, "reward_std": 0.36686384677886963, "rewards/TRLRewardAdapter/mean": 0.7586848139762878, "rewards/TRLRewardAdapter/std": 0.36686381697654724, "sampling/importance_sampling_ratio/max": 2.045579671859741, "sampling/importance_sampling_ratio/mean": 0.1508396714925766, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 4.4859089851379395, "sampling/sampling_logp_difference/mean": 0.02293723076581955, "step": 525, "step_time": 296.74653904500883 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.07916667312383652, "completions/max_length": 10000.0, "completions/max_terminated_length": 9497.0, "completions/mean_length": 1986.986572265625, "completions/mean_terminated_length": 1298.0848388671875, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "entropy": 0.15071594715118408, "epoch": 0.35113484646194926, "frac_reward_zero_std": 0.13333334028720856, "grad_norm": 1.3457027753881262e-05, "learning_rate": 5e-06, "loss": -0.0, "num_tokens": 1246427457.0, "reward": 0.6784467697143555, "reward_std": 0.42711588740348816, "rewards/TRLRewardAdapter/mean": 0.6784467101097107, "rewards/TRLRewardAdapter/std": 0.42711588740348816, "sampling/importance_sampling_ratio/max": 1.9992610216140747, "sampling/importance_sampling_ratio/mean": 0.1636747121810913, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 12.75, "sampling/sampling_logp_difference/mean": 0.020528379827737808, "step": 526, "step_time": 396.72098129824735 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03437500074505806, "completions/max_length": 10000.0, "completions/max_terminated_length": 9493.0, "completions/mean_length": 1300.6229248046875, "completions/mean_terminated_length": 990.9363403320312, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "entropy": 0.1735694855451584, "epoch": 0.35180240320427236, "frac_reward_zero_std": 0.13333334028720856, "grad_norm": 0.00014925258401693312, "learning_rate": 5e-06, "loss": 0.0, "num_tokens": 1248094263.0, "reward": 0.7791379690170288, "reward_std": 0.3687152862548828, "rewards/TRLRewardAdapter/mean": 0.779137909412384, "rewards/TRLRewardAdapter/std": 0.3687152862548828, "sampling/importance_sampling_ratio/max": 2.9953930377960205, "sampling/importance_sampling_ratio/mean": 0.16778495907783508, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 26.25, "sampling/sampling_logp_difference/mean": 0.022857142612338066, "step": 527, "step_time": 282.8428936980199 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.010416666977107525, "completions/max_length": 10000.0, "completions/max_terminated_length": 7027.0, "completions/mean_length": 1688.5948486328125, "completions/mean_terminated_length": 1601.106201171875, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "entropy": 0.18917085975408554, "epoch": 0.35246995994659547, "frac_reward_zero_std": 0.03333333507180214, "grad_norm": 0.0001967595235182563, "learning_rate": 5e-06, "loss": -0.0, "num_tokens": 1250158418.0, "reward": 0.6997719407081604, "reward_std": 0.38099414110183716, "rewards/TRLRewardAdapter/mean": 0.6997718811035156, "rewards/TRLRewardAdapter/std": 0.38099411129951477, "sampling/importance_sampling_ratio/max": 1.625059723854065, "sampling/importance_sampling_ratio/mean": 0.1248302087187767, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 14.25, "sampling/sampling_logp_difference/mean": 0.02503451704978943, "step": 528, "step_time": 207.51462176104542 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.04479166865348816, "completions/max_length": 10000.0, "completions/max_terminated_length": 8570.0, "completions/mean_length": 2108.44384765625, "completions/mean_terminated_length": 1738.3924560546875, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "entropy": 0.18033957233031592, "epoch": 0.3531375166889186, "frac_reward_zero_std": 0.10000000894069672, "grad_norm": 2.8966450565964662e-05, "learning_rate": 5e-06, "loss": -0.0, "num_tokens": 1252675932.0, "reward": 0.6696116328239441, "reward_std": 0.407985657453537, "rewards/TRLRewardAdapter/mean": 0.6696116328239441, "rewards/TRLRewardAdapter/std": 0.407985657453537, "sampling/importance_sampling_ratio/max": 2.3705408573150635, "sampling/importance_sampling_ratio/mean": 0.13501521944999695, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 13.25, "sampling/sampling_logp_difference/mean": 0.023583579808473587, "step": 529, "step_time": 222.67599717481062 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0010416667209938169, "completions/max_length": 10000.0, "completions/max_terminated_length": 7246.0, "completions/mean_length": 1039.5198974609375, "completions/mean_terminated_length": 1030.17626953125, "completions/min_length": 42.0, "completions/min_terminated_length": 42.0, "entropy": 0.17613415916760763, "epoch": 0.3538050734312417, "frac_reward_zero_std": 0.03333333507180214, "grad_norm": 8.276141368539882e-05, "learning_rate": 5e-06, "loss": -0.0001, "num_tokens": 1254175375.0, "reward": 0.8525848984718323, "reward_std": 0.26196911931037903, "rewards/TRLRewardAdapter/mean": 0.8525848388671875, "rewards/TRLRewardAdapter/std": 0.26196911931037903, "sampling/importance_sampling_ratio/max": 1.6898552179336548, "sampling/importance_sampling_ratio/mean": 0.15709862112998962, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 5.964843273162842, "sampling/sampling_logp_difference/mean": 0.023698974400758743, "step": 530, "step_time": 130.7280544588575 }, { "epoch": 0.3538050734312417, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.017608695218096607, "eval_completions/max_length": 10000.0, "eval_completions/max_terminated_length": 7372.695652173913, "eval_completions/mean_length": 727.2971562924592, "eval_completions/mean_terminated_length": 561.1683681322181, "eval_completions/min_length": 29.130434782608695, "eval_completions/min_terminated_length": 29.130434782608695, "eval_entropy": 0.1673130010781081, "eval_frac_reward_zero_std": 1.0, "eval_loss": 0.0, "eval_num_tokens": 1254175375.0, "eval_reward": 0.5616976476233938, "eval_reward_std": 0.4741569459438324, "eval_rewards/TRLRewardAdapter/mean": 0.5616976722427036, "eval_rewards/TRLRewardAdapter/std": 0.47415695242259814, "eval_runtime": 1356.2859, "eval_samples_per_second": 3.367, "eval_sampling/importance_sampling_ratio/max": 2.1559083928232607, "eval_sampling/importance_sampling_ratio/mean": 0.4741066875665084, "eval_sampling/importance_sampling_ratio/min": 0.0, "eval_sampling/sampling_logp_difference/max": 9.15972421998563, "eval_sampling/sampling_logp_difference/mean": 0.02282103920436424, "eval_steps_per_second": 0.017, "step": 530 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01875000074505806, "completions/max_length": 10000.0, "completions/max_terminated_length": 9614.0, "completions/mean_length": 1255.16357421875, "completions/mean_terminated_length": 1088.0648193359375, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "entropy": 0.1661213164528211, "epoch": 0.35447263017356473, "frac_reward_zero_std": 0.10000000894069672, "grad_norm": 0.0009730762119648457, "learning_rate": 5e-06, "loss": -0.0001, "num_tokens": 1255853132.0, "reward": 0.7409085631370544, "reward_std": 0.3870871663093567, "rewards/TRLRewardAdapter/mean": 0.7409085035324097, "rewards/TRLRewardAdapter/std": 0.3870871961116791, "sampling/importance_sampling_ratio/max": 1.776301622390747, "sampling/importance_sampling_ratio/mean": 0.23071913421154022, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 20.25, "sampling/sampling_logp_difference/mean": 0.023039620369672775, "step": 531, "step_time": 353.0117586449487 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0020833334419876337, "completions/max_length": 10000.0, "completions/max_terminated_length": 9544.0, "completions/mean_length": 1059.4730224609375, "completions/mean_terminated_length": 1040.8079833984375, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "entropy": 0.14275612185398737, "epoch": 0.35514018691588783, "frac_reward_zero_std": 0.0, "grad_norm": 0.0009737926628328403, "learning_rate": 5e-06, "loss": -0.0004, "num_tokens": 1257305778.0, "reward": 0.8338855504989624, "reward_std": 0.2860679030418396, "rewards/TRLRewardAdapter/mean": 0.8338854908943176, "rewards/TRLRewardAdapter/std": 0.2860679030418396, "sampling/importance_sampling_ratio/max": 2.2904105186462402, "sampling/importance_sampling_ratio/mean": 0.17836277186870575, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 9.223299980163574, "sampling/sampling_logp_difference/mean": 0.02027907967567444, "step": 532, "step_time": 146.06004826596472 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.04895833507180214, "completions/max_length": 10000.0, "completions/max_terminated_length": 9763.0, "completions/mean_length": 1862.5906982421875, "completions/mean_terminated_length": 1443.6878662109375, "completions/min_length": 40.0, "completions/min_terminated_length": 40.0, "entropy": 0.17402488738298416, "epoch": 0.35580774365821094, "frac_reward_zero_std": 0.06666667014360428, "grad_norm": 7.36208313344298e-05, "learning_rate": 5e-06, "loss": 0.0, "num_tokens": 1259577897.0, "reward": 0.6998080015182495, "reward_std": 0.38744184374809265, "rewards/TRLRewardAdapter/mean": 0.6998080015182495, "rewards/TRLRewardAdapter/std": 0.38744181394577026, "sampling/importance_sampling_ratio/max": 1.8356568813323975, "sampling/importance_sampling_ratio/mean": 0.1137848049402237, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 47.45545196533203, "sampling/sampling_logp_difference/mean": 0.023243313655257225, "step": 533, "step_time": 351.745497521013 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0010416667209938169, "completions/max_length": 10000.0, "completions/max_terminated_length": 7828.0, "completions/mean_length": 1126.4000244140625, "completions/mean_terminated_length": 1117.1470947265625, "completions/min_length": 42.0, "completions/min_terminated_length": 42.0, "entropy": 0.1374763771891594, "epoch": 0.35647530040053405, "frac_reward_zero_std": 0.10000000894069672, "grad_norm": 0.00013733612258650207, "learning_rate": 5e-06, "loss": -0.0002, "num_tokens": 1261111401.0, "reward": 0.757836103439331, "reward_std": 0.36188796162605286, "rewards/TRLRewardAdapter/mean": 0.757836103439331, "rewards/TRLRewardAdapter/std": 0.36188796162605286, "sampling/importance_sampling_ratio/max": 2.104832172393799, "sampling/importance_sampling_ratio/mean": 0.1509055495262146, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 16.211973190307617, "sampling/sampling_logp_difference/mean": 0.01939314603805542, "step": 534, "step_time": 109.29908535792492 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01979166828095913, "completions/max_length": 10000.0, "completions/max_terminated_length": 9652.0, "completions/mean_length": 1771.1844482421875, "completions/mean_terminated_length": 1605.033935546875, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "entropy": 0.15853150437275568, "epoch": 0.35714285714285715, "frac_reward_zero_std": 0.10000000894069672, "grad_norm": 1.7818742520088656e-05, "learning_rate": 5e-06, "loss": -0.0, "num_tokens": 1263245850.0, "reward": 0.7202304005622864, "reward_std": 0.3941347599029541, "rewards/TRLRewardAdapter/mean": 0.7202303409576416, "rewards/TRLRewardAdapter/std": 0.3941347599029541, "sampling/importance_sampling_ratio/max": 1.9807506799697876, "sampling/importance_sampling_ratio/mean": 0.1145244836807251, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 33.25, "sampling/sampling_logp_difference/mean": 0.021432092413306236, "step": 535, "step_time": 245.23888318997342 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.07916667312383652, "completions/max_length": 10000.0, "completions/max_terminated_length": 9551.0, "completions/mean_length": 2563.133544921875, "completions/mean_terminated_length": 1923.7647705078125, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "entropy": 0.175484669705232, "epoch": 0.35781041388518026, "frac_reward_zero_std": 0.10000000894069672, "grad_norm": 5.6266519017339347e-05, "learning_rate": 5e-06, "loss": 0.0, "num_tokens": 1266210298.0, "reward": 0.6484798789024353, "reward_std": 0.4021458625793457, "rewards/TRLRewardAdapter/mean": 0.6484798192977905, "rewards/TRLRewardAdapter/std": 0.4021458625793457, "sampling/importance_sampling_ratio/max": 1.813800573348999, "sampling/importance_sampling_ratio/mean": 0.14248616993427277, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 40.91420364379883, "sampling/sampling_logp_difference/mean": 0.023602237924933434, "step": 536, "step_time": 380.2185949559789 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01145833358168602, "completions/max_length": 10000.0, "completions/max_terminated_length": 9859.0, "completions/mean_length": 1658.845947265625, "completions/mean_terminated_length": 1562.1622314453125, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "entropy": 0.1663626879453659, "epoch": 0.35847797062750336, "frac_reward_zero_std": 0.13333334028720856, "grad_norm": 3.726947335715947e-05, "learning_rate": 5e-06, "loss": 0.0, "num_tokens": 1268310406.0, "reward": 0.6559662222862244, "reward_std": 0.41266822814941406, "rewards/TRLRewardAdapter/mean": 0.6559661626815796, "rewards/TRLRewardAdapter/std": 0.41266822814941406, "sampling/importance_sampling_ratio/max": 2.1001133918762207, "sampling/importance_sampling_ratio/mean": 0.16168782114982605, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 6.139067649841309, "sampling/sampling_logp_difference/mean": 0.022304927930235863, "step": 537, "step_time": 186.54094485286623 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.05000000447034836, "completions/max_length": 10000.0, "completions/max_terminated_length": 9990.0, "completions/mean_length": 1846.330322265625, "completions/mean_terminated_length": 1417.189697265625, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "entropy": 0.16568689296642938, "epoch": 0.3591455273698264, "frac_reward_zero_std": 0.13333334028720856, "grad_norm": 1.6409095470321432e-05, "learning_rate": 5e-06, "loss": -0.0, "num_tokens": 1270545059.0, "reward": 0.7413389086723328, "reward_std": 0.3836953639984131, "rewards/TRLRewardAdapter/mean": 0.741338849067688, "rewards/TRLRewardAdapter/std": 0.3836953341960907, "sampling/importance_sampling_ratio/max": 2.642005681991577, "sampling/importance_sampling_ratio/mean": 0.17799553275108337, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 58.101531982421875, "sampling/sampling_logp_difference/mean": 0.022766033187508583, "step": 538, "step_time": 340.6201287341537 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02812500111758709, "completions/max_length": 10000.0, "completions/max_terminated_length": 9825.0, "completions/mean_length": 1954.159423828125, "completions/mean_terminated_length": 1721.3214111328125, "completions/min_length": 86.0, "completions/min_terminated_length": 86.0, "entropy": 0.17906812330087027, "epoch": 0.3598130841121495, "frac_reward_zero_std": 0.0, "grad_norm": 5.087277069412089e-05, "learning_rate": 5e-06, "loss": 0.0, "num_tokens": 1272912604.0, "reward": 0.6977677941322327, "reward_std": 0.3690378963947296, "rewards/TRLRewardAdapter/mean": 0.6977677941322327, "rewards/TRLRewardAdapter/std": 0.3690378665924072, "sampling/importance_sampling_ratio/max": 1.4499925374984741, "sampling/importance_sampling_ratio/mean": 0.06417723000049591, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 41.763919830322266, "sampling/sampling_logp_difference/mean": 0.023791775107383728, "step": 539, "step_time": 321.8652476881398 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00937500037252903, "completions/max_length": 10000.0, "completions/max_terminated_length": 8630.0, "completions/mean_length": 1270.190673828125, "completions/mean_terminated_length": 1187.5740966796875, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "entropy": 0.17029187083244324, "epoch": 0.3604806408544726, "frac_reward_zero_std": 0.03333333507180214, "grad_norm": 3.991934689171173e-05, "learning_rate": 5e-06, "loss": -0.0, "num_tokens": 1274569555.0, "reward": 0.8035699725151062, "reward_std": 0.3227667510509491, "rewards/TRLRewardAdapter/mean": 0.8035699129104614, "rewards/TRLRewardAdapter/std": 0.3227667510509491, "sampling/importance_sampling_ratio/max": 1.9753611087799072, "sampling/importance_sampling_ratio/mean": 0.11749569326639175, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 5.455451965332031, "sampling/sampling_logp_difference/mean": 0.02290407381951809, "step": 540, "step_time": 208.83551880798768 }, { "epoch": 0.3604806408544726, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.02239130376635686, "eval_completions/max_length": 10000.0, "eval_completions/max_terminated_length": 8228.04347826087, "eval_completions/mean_length": 886.3930239470109, "eval_completions/mean_terminated_length": 677.7908882472826, "eval_completions/min_length": 33.608695652173914, "eval_completions/min_terminated_length": 33.608695652173914, "eval_entropy": 0.1562508849993996, "eval_frac_reward_zero_std": 1.0, "eval_loss": 0.0, "eval_num_tokens": 1274569555.0, "eval_reward": 0.5732129112533901, "eval_reward_std": 0.46680597766585974, "eval_rewards/TRLRewardAdapter/mean": 0.5732129242109216, "eval_rewards/TRLRewardAdapter/std": 0.46680598414462543, "eval_runtime": 1376.3998, "eval_samples_per_second": 3.317, "eval_sampling/importance_sampling_ratio/max": 2.2579745313395625, "eval_sampling/importance_sampling_ratio/mean": 0.4222023836944414, "eval_sampling/importance_sampling_ratio/min": 0.0, "eval_sampling/sampling_logp_difference/max": 6.795360995375591, "eval_sampling/sampling_logp_difference/mean": 0.02147500459914622, "eval_steps_per_second": 0.017, "step": 540 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.010416666977107525, "completions/max_length": 10000.0, "completions/max_terminated_length": 9499.0, "completions/mean_length": 1405.604248046875, "completions/mean_terminated_length": 1315.1368408203125, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "entropy": 0.1603437215089798, "epoch": 0.36114819759679573, "frac_reward_zero_std": 0.10000000894069672, "grad_norm": 3.760149742388831e-05, "learning_rate": 5e-06, "loss": -0.0, "num_tokens": 1276407287.0, "reward": 0.7397960424423218, "reward_std": 0.372149258852005, "rewards/TRLRewardAdapter/mean": 0.739795982837677, "rewards/TRLRewardAdapter/std": 0.3721492290496826, "sampling/importance_sampling_ratio/max": 1.8874117136001587, "sampling/importance_sampling_ratio/mean": 0.15717379748821259, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 4.327360153198242, "sampling/sampling_logp_difference/mean": 0.021715983748435974, "step": 541, "step_time": 247.0451735833194 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.06041666865348816, "completions/max_length": 10000.0, "completions/max_terminated_length": 9972.0, "completions/mean_length": 2684.638671875, "completions/mean_terminated_length": 2214.24951171875, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "entropy": 0.1596498042345047, "epoch": 0.36181575433911883, "frac_reward_zero_std": 0.06666667014360428, "grad_norm": 8.17033459488798e-06, "learning_rate": 5e-06, "loss": -0.0, "num_tokens": 1279454908.0, "reward": 0.6498976349830627, "reward_std": 0.3917047083377838, "rewards/TRLRewardAdapter/mean": 0.649897575378418, "rewards/TRLRewardAdapter/std": 0.3917047083377838, "sampling/importance_sampling_ratio/max": 1.6167631149291992, "sampling/importance_sampling_ratio/mean": 0.1273314505815506, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 33.5, "sampling/sampling_logp_difference/mean": 0.02169795334339142, "step": 542, "step_time": 363.8207862698473 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03333333507180214, "completions/max_length": 10000.0, "completions/max_terminated_length": 8997.0, "completions/mean_length": 1412.7552490234375, "completions/mean_terminated_length": 1116.643310546875, "completions/min_length": 67.0, "completions/min_terminated_length": 67.0, "entropy": 0.144728514055411, "epoch": 0.36248331108144194, "frac_reward_zero_std": 0.06666667014360428, "grad_norm": 0.0002187559472429106, "learning_rate": 5e-06, "loss": 0.0001, "num_tokens": 1281215281.0, "reward": 0.7564172148704529, "reward_std": 0.36479121446609497, "rewards/TRLRewardAdapter/mean": 0.7564171552658081, "rewards/TRLRewardAdapter/std": 0.3647911846637726, "sampling/importance_sampling_ratio/max": 2.472182273864746, "sampling/importance_sampling_ratio/mean": 0.15047021210193634, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 8.578078269958496, "sampling/sampling_logp_difference/mean": 0.019927596673369408, "step": 543, "step_time": 375.91543715191074 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0031250002793967724, "completions/max_length": 10000.0, "completions/max_terminated_length": 8870.0, "completions/mean_length": 1652.107421875, "completions/mean_terminated_length": 1625.9383544921875, "completions/min_length": 44.0, "completions/min_terminated_length": 44.0, "entropy": 0.16494979212681452, "epoch": 0.36315086782376504, "frac_reward_zero_std": 0.13333334028720856, "grad_norm": 7.782099613949147e-06, "learning_rate": 5e-06, "loss": -0.0, "num_tokens": 1283290392.0, "reward": 0.6985541582107544, "reward_std": 0.3744993507862091, "rewards/TRLRewardAdapter/mean": 0.6985540986061096, "rewards/TRLRewardAdapter/std": 0.3744993507862091, "sampling/importance_sampling_ratio/max": 1.8612210750579834, "sampling/importance_sampling_ratio/mean": 0.11271993815898895, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 6.966457843780518, "sampling/sampling_logp_difference/mean": 0.022269010543823242, "step": 544, "step_time": 156.20490996877197 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02083333395421505, "completions/max_length": 10000.0, "completions/max_terminated_length": 9666.0, "completions/mean_length": 1620.66259765625, "completions/mean_terminated_length": 1442.378662109375, "completions/min_length": 66.0, "completions/min_terminated_length": 66.0, "entropy": 0.16493326673905054, "epoch": 0.3638184245660881, "frac_reward_zero_std": 0.06666667014360428, "grad_norm": 5.844915656147409e-05, "learning_rate": 5e-06, "loss": -0.0001, "num_tokens": 1285302132.0, "reward": 0.7391926050186157, "reward_std": 0.37200310826301575, "rewards/TRLRewardAdapter/mean": 0.7391926050186157, "rewards/TRLRewardAdapter/std": 0.37200313806533813, "sampling/importance_sampling_ratio/max": 1.964023232460022, "sampling/importance_sampling_ratio/mean": 0.11696642637252808, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 57.25, "sampling/sampling_logp_difference/mean": 0.022806324064731598, "step": 545, "step_time": 289.36511019896716 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.006250000558793545, "completions/max_length": 10000.0, "completions/max_terminated_length": 7709.0, "completions/mean_length": 1383.214599609375, "completions/mean_terminated_length": 1329.0208740234375, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "entropy": 0.15186189860105515, "epoch": 0.3644859813084112, "frac_reward_zero_std": 0.03333333507180214, "grad_norm": 0.00013785927335410763, "learning_rate": 5e-06, "loss": -0.0, "num_tokens": 1287045826.0, "reward": 0.7907062768936157, "reward_std": 0.31565287709236145, "rewards/TRLRewardAdapter/mean": 0.7907062768936157, "rewards/TRLRewardAdapter/std": 0.31565287709236145, "sampling/importance_sampling_ratio/max": 1.8013827800750732, "sampling/importance_sampling_ratio/mean": 0.16906659305095673, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 7.134025573730469, "sampling/sampling_logp_difference/mean": 0.020799636840820312, "step": 546, "step_time": 155.81284818297718 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.05312500149011612, "completions/max_length": 10000.0, "completions/max_terminated_length": 8432.0, "completions/mean_length": 1922.2740478515625, "completions/mean_terminated_length": 1469.0682373046875, "completions/min_length": 72.0, "completions/min_terminated_length": 72.0, "entropy": 0.1760214144984881, "epoch": 0.3651535380507343, "frac_reward_zero_std": 0.13333334028720856, "grad_norm": 1.3201774608698694e-05, "learning_rate": 5e-06, "loss": -0.0, "num_tokens": 1289337193.0, "reward": 0.6836521029472351, "reward_std": 0.4101541340351105, "rewards/TRLRewardAdapter/mean": 0.6836520433425903, "rewards/TRLRewardAdapter/std": 0.4101541340351105, "sampling/importance_sampling_ratio/max": 1.7343868017196655, "sampling/importance_sampling_ratio/mean": 0.08564875274896622, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 44.0, "sampling/sampling_logp_difference/mean": 0.023436201736330986, "step": 547, "step_time": 304.497659850982 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03229166939854622, "completions/max_length": 10000.0, "completions/max_terminated_length": 9498.0, "completions/mean_length": 1387.3167724609375, "completions/mean_terminated_length": 1099.918212890625, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "entropy": 0.1495002657175064, "epoch": 0.3658210947930574, "frac_reward_zero_std": 0.0, "grad_norm": 9.376835578944993e-05, "learning_rate": 5e-06, "loss": -0.0, "num_tokens": 1291124121.0, "reward": 0.7925580143928528, "reward_std": 0.3351937532424927, "rewards/TRLRewardAdapter/mean": 0.792557954788208, "rewards/TRLRewardAdapter/std": 0.3351937532424927, "sampling/importance_sampling_ratio/max": 2.9260494709014893, "sampling/importance_sampling_ratio/mean": 0.14699886739253998, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 30.306852340698242, "sampling/sampling_logp_difference/mean": 0.020927006378769875, "step": 548, "step_time": 365.7065826409962 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02395833469927311, "completions/max_length": 10000.0, "completions/max_terminated_length": 9964.0, "completions/mean_length": 1220.4510498046875, "completions/mean_terminated_length": 1004.9444580078125, "completions/min_length": 49.0, "completions/min_terminated_length": 49.0, "entropy": 0.13130521401762962, "epoch": 0.3664886515353805, "frac_reward_zero_std": 0.03333333507180214, "grad_norm": 6.526120132082608e-05, "learning_rate": 5e-06, "loss": 0.0, "num_tokens": 1292782698.0, "reward": 0.7953214049339294, "reward_std": 0.34553804993629456, "rewards/TRLRewardAdapter/mean": 0.7953213453292847, "rewards/TRLRewardAdapter/std": 0.34553802013397217, "sampling/importance_sampling_ratio/max": 2.1354622840881348, "sampling/importance_sampling_ratio/mean": 0.22063109278678894, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 5.75, "sampling/sampling_logp_difference/mean": 0.018961859866976738, "step": 549, "step_time": 337.6669520168798 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01458333432674408, "completions/max_length": 10000.0, "completions/max_terminated_length": 9452.0, "completions/mean_length": 1301.61572265625, "completions/mean_terminated_length": 1172.8868408203125, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "entropy": 0.14090751856565475, "epoch": 0.3671562082777036, "frac_reward_zero_std": 0.03333333507180214, "grad_norm": 2.738508611269706e-05, "learning_rate": 5e-06, "loss": -0.0, "num_tokens": 1294465593.0, "reward": 0.7942811846733093, "reward_std": 0.33181342482566833, "rewards/TRLRewardAdapter/mean": 0.7942811250686646, "rewards/TRLRewardAdapter/std": 0.33181342482566833, "sampling/importance_sampling_ratio/max": 1.8457024097442627, "sampling/importance_sampling_ratio/mean": 0.11924993991851807, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 17.998071670532227, "sampling/sampling_logp_difference/mean": 0.0199785977602005, "step": 550, "step_time": 300.75409084907733 }, { "epoch": 0.3671562082777036, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.024347825299786484, "eval_completions/max_length": 10000.0, "eval_completions/max_terminated_length": 8530.08695652174, "eval_completions/mean_length": 923.3449760105299, "eval_completions/mean_terminated_length": 696.9194203252378, "eval_completions/min_length": 33.43478260869565, "eval_completions/min_terminated_length": 33.43478260869565, "eval_entropy": 0.15479747627092444, "eval_frac_reward_zero_std": 1.0, "eval_loss": 0.0, "eval_num_tokens": 1294465593.0, "eval_reward": 0.5779172311658445, "eval_reward_std": 0.4667109471300374, "eval_rewards/TRLRewardAdapter/mean": 0.5779172337573507, "eval_rewards/TRLRewardAdapter/std": 0.46671096267907514, "eval_runtime": 1383.3217, "eval_samples_per_second": 3.301, "eval_sampling/importance_sampling_ratio/max": 2.0858219706493877, "eval_sampling/importance_sampling_ratio/mean": 0.4276330976382546, "eval_sampling/importance_sampling_ratio/min": 0.0, "eval_sampling/sampling_logp_difference/max": 6.26993903906449, "eval_sampling/sampling_logp_difference/mean": 0.021141550868101742, "eval_steps_per_second": 0.017, "step": 550 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0677083358168602, "completions/max_length": 10000.0, "completions/max_terminated_length": 9888.0, "completions/mean_length": 1852.7333984375, "completions/mean_terminated_length": 1261.0323486328125, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "entropy": 0.16569185008605322, "epoch": 0.3678237650200267, "frac_reward_zero_std": 0.10000000894069672, "grad_norm": 3.1749194077698964e-05, "learning_rate": 5e-06, "loss": -0.0001, "num_tokens": 1296692185.0, "reward": 0.7534855008125305, "reward_std": 0.3557766079902649, "rewards/TRLRewardAdapter/mean": 0.7534855008125305, "rewards/TRLRewardAdapter/std": 0.3557765483856201, "sampling/importance_sampling_ratio/max": 1.7129346132278442, "sampling/importance_sampling_ratio/mean": 0.1275942325592041, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 54.5949821472168, "sampling/sampling_logp_difference/mean": 0.022130636498332024, "step": 551, "step_time": 392.4808739287546 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.006250000558793545, "completions/max_length": 10000.0, "completions/max_terminated_length": 7924.0, "completions/mean_length": 1147.15625, "completions/mean_terminated_length": 1091.4779052734375, "completions/min_length": 72.0, "completions/min_terminated_length": 72.0, "entropy": 0.1387349913517634, "epoch": 0.3684913217623498, "frac_reward_zero_std": 0.0, "grad_norm": 0.000166462188449691, "learning_rate": 5e-06, "loss": 0.0, "num_tokens": 1298230031.0, "reward": 0.8261508941650391, "reward_std": 0.28958919644355774, "rewards/TRLRewardAdapter/mean": 0.8261508345603943, "rewards/TRLRewardAdapter/std": 0.28958919644355774, "sampling/importance_sampling_ratio/max": 1.3658475875854492, "sampling/importance_sampling_ratio/mean": 0.13092727959156036, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 6.285170555114746, "sampling/sampling_logp_difference/mean": 0.01941961981356144, "step": 552, "step_time": 155.5904058261076 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 10000.0, "completions/max_terminated_length": 9959.0, "completions/mean_length": 2116.369873046875, "completions/mean_terminated_length": 1862.0592041015625, "completions/min_length": 71.0, "completions/min_terminated_length": 71.0, "entropy": 0.1700902283191681, "epoch": 0.3691588785046729, "frac_reward_zero_std": 0.03333333507180214, "grad_norm": 0.00022422645816335475, "learning_rate": 5e-06, "loss": 0.0027, "num_tokens": 1300699602.0, "reward": 0.6588638424873352, "reward_std": 0.40848782658576965, "rewards/TRLRewardAdapter/mean": 0.6588638424873352, "rewards/TRLRewardAdapter/std": 0.40848779678344727, "sampling/importance_sampling_ratio/max": 2.0118913650512695, "sampling/importance_sampling_ratio/mean": 0.10663725435733795, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 56.75, "sampling/sampling_logp_difference/mean": 0.022810814902186394, "step": 553, "step_time": 333.47960346296895 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.05416667088866234, "completions/max_length": 10000.0, "completions/max_terminated_length": 9761.0, "completions/mean_length": 2106.08447265625, "completions/mean_terminated_length": 1654.0098876953125, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "entropy": 0.16337889432907104, "epoch": 0.369826435246996, "frac_reward_zero_std": 0.06666667014360428, "grad_norm": 7.591298575174741e-05, "learning_rate": 5e-06, "loss": 0.0, "num_tokens": 1303174243.0, "reward": 0.724273681640625, "reward_std": 0.3708961606025696, "rewards/TRLRewardAdapter/mean": 0.7242736220359802, "rewards/TRLRewardAdapter/std": 0.3708961606025696, "sampling/importance_sampling_ratio/max": 2.2376701831817627, "sampling/importance_sampling_ratio/mean": 0.09919063001871109, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 34.31729507446289, "sampling/sampling_logp_difference/mean": 0.022513289004564285, "step": 554, "step_time": 400.61978760396596 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.11666667461395264, "completions/max_length": 10000.0, "completions/max_terminated_length": 10000.0, "completions/mean_length": 2356.11669921875, "completions/mean_terminated_length": 1346.5472412109375, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "entropy": 0.13757477700710297, "epoch": 0.3704939919893191, "frac_reward_zero_std": 0.13333334028720856, "grad_norm": 3.7501095844594555e-05, "learning_rate": 5e-06, "loss": -0.0, "num_tokens": 1305893587.0, "reward": 0.6894934773445129, "reward_std": 0.4150082767009735, "rewards/TRLRewardAdapter/mean": 0.6894934177398682, "rewards/TRLRewardAdapter/std": 0.4150082767009735, "sampling/importance_sampling_ratio/max": 2.0836124420166016, "sampling/importance_sampling_ratio/mean": 0.2021070420742035, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 51.806854248046875, "sampling/sampling_logp_difference/mean": 0.01924159936606884, "step": 555, "step_time": 379.7900834950851 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01979166828095913, "completions/max_length": 10000.0, "completions/max_terminated_length": 9383.0, "completions/mean_length": 1361.2125244140625, "completions/mean_terminated_length": 1186.7843017578125, "completions/min_length": 49.0, "completions/min_terminated_length": 49.0, "entropy": 0.1252410002052784, "epoch": 0.3711615487316422, "frac_reward_zero_std": 0.03333333507180214, "grad_norm": 2.7917430900529976e-05, "learning_rate": 5e-06, "loss": -0.0, "num_tokens": 1307661311.0, "reward": 0.819071352481842, "reward_std": 0.301696240901947, "rewards/TRLRewardAdapter/mean": 0.819071352481842, "rewards/TRLRewardAdapter/std": 0.301696240901947, "sampling/importance_sampling_ratio/max": 2.1403579711914062, "sampling/importance_sampling_ratio/mean": 0.15974298119544983, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 6.387245178222656, "sampling/sampling_logp_difference/mean": 0.017600281164050102, "step": 556, "step_time": 213.3534155798843 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02187500149011612, "completions/max_length": 10000.0, "completions/max_terminated_length": 9866.0, "completions/mean_length": 1854.478271484375, "completions/mean_terminated_length": 1672.309814453125, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "entropy": 0.13721854984760284, "epoch": 0.3718291054739653, "frac_reward_zero_std": 0.03333333507180214, "grad_norm": 0.0004243414304995071, "learning_rate": 5e-06, "loss": 0.0, "num_tokens": 1309872106.0, "reward": 0.7260686755180359, "reward_std": 0.3694683909416199, "rewards/TRLRewardAdapter/mean": 0.7260686159133911, "rewards/TRLRewardAdapter/std": 0.3694683611392975, "sampling/importance_sampling_ratio/max": 2.0583841800689697, "sampling/importance_sampling_ratio/mean": 0.13823740184307098, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 28.539257049560547, "sampling/sampling_logp_difference/mean": 0.018762096762657166, "step": 557, "step_time": 223.0522999019595 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.05000000447034836, "completions/max_length": 10000.0, "completions/max_terminated_length": 9956.0, "completions/mean_length": 2045.36474609375, "completions/mean_terminated_length": 1626.6995849609375, "completions/min_length": 40.0, "completions/min_terminated_length": 40.0, "entropy": 0.15015187859535217, "epoch": 0.3724966622162884, "frac_reward_zero_std": 0.03333333507180214, "grad_norm": 4.3661384464765844e-05, "learning_rate": 5e-06, "loss": -0.0001, "num_tokens": 1312307912.0, "reward": 0.7163940668106079, "reward_std": 0.3854575753211975, "rewards/TRLRewardAdapter/mean": 0.7163940668106079, "rewards/TRLRewardAdapter/std": 0.3854575455188751, "sampling/importance_sampling_ratio/max": 1.6978610754013062, "sampling/importance_sampling_ratio/mean": 0.1739652454853058, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 29.5, "sampling/sampling_logp_difference/mean": 0.020102640613913536, "step": 558, "step_time": 302.2030205918709 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00937500037252903, "completions/max_length": 10000.0, "completions/max_terminated_length": 9867.0, "completions/mean_length": 1113.284423828125, "completions/mean_terminated_length": 1029.182861328125, "completions/min_length": 41.0, "completions/min_terminated_length": 41.0, "entropy": 0.15136014173428217, "epoch": 0.37316421895861146, "frac_reward_zero_std": 0.03333333507180214, "grad_norm": 4.099648604643822e-05, "learning_rate": 5e-06, "loss": -0.0001, "num_tokens": 1313814265.0, "reward": 0.7486979365348816, "reward_std": 0.38022804260253906, "rewards/TRLRewardAdapter/mean": 0.7486979365348816, "rewards/TRLRewardAdapter/std": 0.38022804260253906, "sampling/importance_sampling_ratio/max": 2.477893829345703, "sampling/importance_sampling_ratio/mean": 0.20763841271400452, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 11.25, "sampling/sampling_logp_difference/mean": 0.020737210288643837, "step": 559, "step_time": 244.00873681611847 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08125000447034836, "completions/max_length": 10000.0, "completions/max_terminated_length": 9870.0, "completions/mean_length": 2450.753173828125, "completions/mean_terminated_length": 1783.1326904296875, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "entropy": 0.15011544773976007, "epoch": 0.37383177570093457, "frac_reward_zero_std": 0.1666666716337204, "grad_norm": 3.662875310554416e-05, "learning_rate": 5e-06, "loss": 0.0, "num_tokens": 1316602092.0, "reward": 0.609053373336792, "reward_std": 0.42875611782073975, "rewards/TRLRewardAdapter/mean": 0.609053373336792, "rewards/TRLRewardAdapter/std": 0.42875611782073975, "sampling/importance_sampling_ratio/max": 1.7480120658874512, "sampling/importance_sampling_ratio/mean": 0.06992107629776001, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 35.806854248046875, "sampling/sampling_logp_difference/mean": 0.020632412284612656, "step": 560, "step_time": 384.72755671688356 }, { "epoch": 0.37383177570093457, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.021956521126887073, "eval_completions/max_length": 10000.0, "eval_completions/max_terminated_length": 8523.391304347826, "eval_completions/mean_length": 868.0378019913384, "eval_completions/mean_terminated_length": 663.1272383980129, "eval_completions/min_length": 31.26086956521739, "eval_completions/min_terminated_length": 31.26086956521739, "eval_entropy": 0.1496596265098323, "eval_frac_reward_zero_std": 1.0, "eval_loss": 0.0, "eval_num_tokens": 1316602092.0, "eval_reward": 0.577045933060024, "eval_reward_std": 0.46883403347886127, "eval_rewards/TRLRewardAdapter/mean": 0.5770459434260493, "eval_rewards/TRLRewardAdapter/std": 0.4688340477321459, "eval_runtime": 1374.6372, "eval_samples_per_second": 3.322, "eval_sampling/importance_sampling_ratio/max": 2.03576193685117, "eval_sampling/importance_sampling_ratio/mean": 0.4645899365777555, "eval_sampling/importance_sampling_ratio/min": 0.0, "eval_sampling/sampling_logp_difference/max": 7.035195469856262, "eval_sampling/sampling_logp_difference/mean": 0.02067676162266213, "eval_steps_per_second": 0.017, "step": 560 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.06979166716337204, "completions/max_length": 10000.0, "completions/max_terminated_length": 9997.0, "completions/mean_length": 2396.555419921875, "completions/mean_terminated_length": 1826.0841064453125, "completions/min_length": 70.0, "completions/min_terminated_length": 70.0, "entropy": 0.16457987328370413, "epoch": 0.37449933244325767, "frac_reward_zero_std": 0.06666667014360428, "grad_norm": 1.8099854751733396e-05, "learning_rate": 5e-06, "loss": -0.0, "num_tokens": 1319328545.0, "reward": 0.6628841757774353, "reward_std": 0.40671154856681824, "rewards/TRLRewardAdapter/mean": 0.6628841161727905, "rewards/TRLRewardAdapter/std": 0.40671154856681824, "sampling/importance_sampling_ratio/max": 1.3165245056152344, "sampling/importance_sampling_ratio/mean": 0.1318296641111374, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 48.963584899902344, "sampling/sampling_logp_difference/mean": 0.021936869248747826, "step": 561, "step_time": 365.09843726688996 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02916666865348816, "completions/max_length": 10000.0, "completions/max_terminated_length": 9375.0, "completions/mean_length": 2197.796875, "completions/mean_terminated_length": 1963.3958740234375, "completions/min_length": 81.0, "completions/min_terminated_length": 81.0, "entropy": 0.15623395641644797, "epoch": 0.3751668891855808, "frac_reward_zero_std": 0.06666667014360428, "grad_norm": 8.762229072294557e-06, "learning_rate": 5e-06, "loss": -0.0, "num_tokens": 1321880414.0, "reward": 0.6704311370849609, "reward_std": 0.390243798494339, "rewards/TRLRewardAdapter/mean": 0.6704310774803162, "rewards/TRLRewardAdapter/std": 0.3902438282966614, "sampling/importance_sampling_ratio/max": 1.5883628129959106, "sampling/importance_sampling_ratio/mean": 0.08935713768005371, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 37.25, "sampling/sampling_logp_difference/mean": 0.02143513411283493, "step": 562, "step_time": 265.24981106002815 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.04062500223517418, "completions/max_length": 10000.0, "completions/max_terminated_length": 9964.0, "completions/mean_length": 2020.540771484375, "completions/mean_terminated_length": 1682.6483154296875, "completions/min_length": 40.0, "completions/min_terminated_length": 40.0, "entropy": 0.15463079263766608, "epoch": 0.3758344459279039, "frac_reward_zero_std": 0.10000000894069672, "grad_norm": 3.0192242947188956e-05, "learning_rate": 5e-06, "loss": -0.0, "num_tokens": 1324262373.0, "reward": 0.720316469669342, "reward_std": 0.3743928074836731, "rewards/TRLRewardAdapter/mean": 0.720316469669342, "rewards/TRLRewardAdapter/std": 0.3743927776813507, "sampling/importance_sampling_ratio/max": 1.5779961347579956, "sampling/importance_sampling_ratio/mean": 0.17765629291534424, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 28.931852340698242, "sampling/sampling_logp_difference/mean": 0.021447723731398582, "step": 563, "step_time": 278.3674356510164 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01145833358168602, "completions/max_length": 10000.0, "completions/max_terminated_length": 9438.0, "completions/mean_length": 1222.39697265625, "completions/mean_terminated_length": 1120.654296875, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "entropy": 0.1496423656741778, "epoch": 0.376502002670227, "frac_reward_zero_std": 0.06666667014360428, "grad_norm": 0.0001203694514832797, "learning_rate": 5e-06, "loss": -0.0001, "num_tokens": 1325875970.0, "reward": 0.8035911917686462, "reward_std": 0.3216959834098816, "rewards/TRLRewardAdapter/mean": 0.8035911321640015, "rewards/TRLRewardAdapter/std": 0.3216959834098816, "sampling/importance_sampling_ratio/max": 2.8523550033569336, "sampling/importance_sampling_ratio/mean": 0.16629667580127716, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 7.162906169891357, "sampling/sampling_logp_difference/mean": 0.021039273589849472, "step": 564, "step_time": 199.3562791220611 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02916666865348816, "completions/max_length": 10000.0, "completions/max_terminated_length": 9500.0, "completions/mean_length": 1334.956298828125, "completions/mean_terminated_length": 1074.633056640625, "completions/min_length": 61.0, "completions/min_terminated_length": 61.0, "entropy": 0.1587515026330948, "epoch": 0.3771695594125501, "frac_reward_zero_std": 0.13333334028720856, "grad_norm": 0.0003006523385821003, "learning_rate": 5e-06, "loss": 0.0001, "num_tokens": 1327562872.0, "reward": 0.6910471320152283, "reward_std": 0.4204474985599518, "rewards/TRLRewardAdapter/mean": 0.6910470724105835, "rewards/TRLRewardAdapter/std": 0.4204474985599518, "sampling/importance_sampling_ratio/max": 2.0648014545440674, "sampling/importance_sampling_ratio/mean": 0.16121485829353333, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 11.75, "sampling/sampling_logp_difference/mean": 0.02174624241888523, "step": 565, "step_time": 314.4403955900343 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0010416667209938169, "completions/max_length": 10000.0, "completions/max_terminated_length": 9110.0, "completions/mean_length": 1268.6458740234375, "completions/mean_terminated_length": 1259.541259765625, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "entropy": 0.14157777031262717, "epoch": 0.37783711615487314, "frac_reward_zero_std": 0.0, "grad_norm": 0.0002977400639857651, "learning_rate": 5e-06, "loss": 0.0002, "num_tokens": 1329222116.0, "reward": 0.7674854397773743, "reward_std": 0.3411841094493866, "rewards/TRLRewardAdapter/mean": 0.7674854397773743, "rewards/TRLRewardAdapter/std": 0.3411841094493866, "sampling/importance_sampling_ratio/max": 2.6623871326446533, "sampling/importance_sampling_ratio/mean": 0.19941474497318268, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 5.3068528175354, "sampling/sampling_logp_difference/mean": 0.019594496116042137, "step": 566, "step_time": 89.9532415488502 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.05104167014360428, "completions/max_length": 10000.0, "completions/max_terminated_length": 9894.0, "completions/mean_length": 2059.05224609375, "completions/mean_terminated_length": 1631.9320068359375, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "entropy": 0.18096043666203818, "epoch": 0.37850467289719625, "frac_reward_zero_std": 0.0, "grad_norm": 2.0426490344064266e-05, "learning_rate": 5e-06, "loss": -0.0, "num_tokens": 1331722966.0, "reward": 0.7507511377334595, "reward_std": 0.3604431450366974, "rewards/TRLRewardAdapter/mean": 0.7507511377334595, "rewards/TRLRewardAdapter/std": 0.3604431450366974, "sampling/importance_sampling_ratio/max": 1.9343281984329224, "sampling/importance_sampling_ratio/mean": 0.10442008823156357, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 6.31532096862793, "sampling/sampling_logp_difference/mean": 0.024099407717585564, "step": 567, "step_time": 386.8106847630115 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 10000.0, "completions/max_terminated_length": 9770.0, "completions/mean_length": 1615.0084228515625, "completions/mean_terminated_length": 1481.9132080078125, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "entropy": 0.16457737733920416, "epoch": 0.37917222963951935, "frac_reward_zero_std": 0.1666666716337204, "grad_norm": 1.399756722166816e-05, "learning_rate": 5e-06, "loss": -0.0, "num_tokens": 1333743710.0, "reward": 0.6827651262283325, "reward_std": 0.4049307107925415, "rewards/TRLRewardAdapter/mean": 0.6827650666236877, "rewards/TRLRewardAdapter/std": 0.4049307405948639, "sampling/importance_sampling_ratio/max": 2.9449918270111084, "sampling/importance_sampling_ratio/mean": 0.13999870419502258, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 11.5, "sampling/sampling_logp_difference/mean": 0.022860810160636902, "step": 568, "step_time": 308.970461958088 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03437500074505806, "completions/max_length": 10000.0, "completions/max_terminated_length": 9918.0, "completions/mean_length": 1226.4376220703125, "completions/mean_terminated_length": 914.1099853515625, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "entropy": 0.15337036550045013, "epoch": 0.37983978638184246, "frac_reward_zero_std": 0.06666667014360428, "grad_norm": 6.757765646417826e-05, "learning_rate": 5e-06, "loss": -0.0, "num_tokens": 1335333026.0, "reward": 0.8113387227058411, "reward_std": 0.34175950288772583, "rewards/TRLRewardAdapter/mean": 0.8113386631011963, "rewards/TRLRewardAdapter/std": 0.34175947308540344, "sampling/importance_sampling_ratio/max": 2.4042763710021973, "sampling/importance_sampling_ratio/mean": 0.17975442111492157, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 29.012245178222656, "sampling/sampling_logp_difference/mean": 0.02172723226249218, "step": 569, "step_time": 383.5293450288009 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.10729166865348816, "completions/max_length": 10000.0, "completions/max_terminated_length": 9921.0, "completions/mean_length": 2345.98876953125, "completions/mean_terminated_length": 1426.0782470703125, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "entropy": 0.16049070407946905, "epoch": 0.38050734312416556, "frac_reward_zero_std": 0.1666666716337204, "grad_norm": 0.00010066173344596618, "learning_rate": 5e-06, "loss": 0.0, "num_tokens": 1338018071.0, "reward": 0.6382440328598022, "reward_std": 0.4364596903324127, "rewards/TRLRewardAdapter/mean": 0.6382439732551575, "rewards/TRLRewardAdapter/std": 0.4364596903324127, "sampling/importance_sampling_ratio/max": 2.135497570037842, "sampling/importance_sampling_ratio/mean": 0.17206287384033203, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 34.2108039855957, "sampling/sampling_logp_difference/mean": 0.02201642468571663, "step": 570, "step_time": 338.72723484714516 }, { "epoch": 0.38050734312416556, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.019565216792018517, "eval_completions/max_length": 10000.0, "eval_completions/max_terminated_length": 8083.260869565217, "eval_completions/mean_length": 855.2721557617188, "eval_completions/mean_terminated_length": 672.7642875339674, "eval_completions/min_length": 28.91304347826087, "eval_completions/min_terminated_length": 28.91304347826087, "eval_entropy": 0.14937320092450018, "eval_frac_reward_zero_std": 1.0, "eval_loss": 0.0, "eval_num_tokens": 1338018071.0, "eval_reward": 0.5789999236231265, "eval_reward_std": 0.46883673123691394, "eval_rewards/TRLRewardAdapter/mean": 0.578999936580658, "eval_rewards/TRLRewardAdapter/std": 0.468836740307186, "eval_runtime": 1375.9639, "eval_samples_per_second": 3.318, "eval_sampling/importance_sampling_ratio/max": 1.9882370389026145, "eval_sampling/importance_sampling_ratio/mean": 0.49030423423518305, "eval_sampling/importance_sampling_ratio/min": 0.0, "eval_sampling/sampling_logp_difference/max": 5.114213865736256, "eval_sampling/sampling_logp_difference/mean": 0.020756071595394093, "eval_steps_per_second": 0.017, "step": 570 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1041666716337204, "completions/max_length": 10000.0, "completions/max_terminated_length": 9841.0, "completions/mean_length": 2697.180419921875, "completions/mean_terminated_length": 1848.01513671875, "completions/min_length": 40.0, "completions/min_terminated_length": 40.0, "entropy": 0.1709949697057406, "epoch": 0.38117489986648867, "frac_reward_zero_std": 0.1666666716337204, "grad_norm": 2.6933000088601484e-05, "learning_rate": 5e-06, "loss": -0.0, "num_tokens": 1341031908.0, "reward": 0.6576182246208191, "reward_std": 0.41281062364578247, "rewards/TRLRewardAdapter/mean": 0.6576182246208191, "rewards/TRLRewardAdapter/std": 0.41281065344810486, "sampling/importance_sampling_ratio/max": 1.304757833480835, "sampling/importance_sampling_ratio/mean": 0.14588211476802826, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 43.466941833496094, "sampling/sampling_logp_difference/mean": 0.02327210083603859, "step": 571, "step_time": 392.4324764511548 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.04062500223517418, "completions/max_length": 10000.0, "completions/max_terminated_length": 9980.0, "completions/mean_length": 1681.9229736328125, "completions/mean_terminated_length": 1329.691650390625, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "entropy": 0.17178324609994888, "epoch": 0.3818424566088118, "frac_reward_zero_std": 0.06666667014360428, "grad_norm": 1.942732807169173e-05, "learning_rate": 5e-06, "loss": -0.0, "num_tokens": 1343125722.0, "reward": 0.7418578267097473, "reward_std": 0.3936179280281067, "rewards/TRLRewardAdapter/mean": 0.7418577671051025, "rewards/TRLRewardAdapter/std": 0.3936178982257843, "sampling/importance_sampling_ratio/max": 2.157419443130493, "sampling/importance_sampling_ratio/mean": 0.18127994239330292, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 9.05488395690918, "sampling/sampling_logp_difference/mean": 0.023603761568665504, "step": 572, "step_time": 290.6413403558545 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01979166828095913, "completions/max_length": 10000.0, "completions/max_terminated_length": 9329.0, "completions/mean_length": 1438.3271484375, "completions/mean_terminated_length": 1265.4559326171875, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "entropy": 0.1638483057419459, "epoch": 0.3825100133511348, "frac_reward_zero_std": 0.06666667014360428, "grad_norm": 6.714841508450503e-05, "learning_rate": 5e-06, "loss": -0.0001, "num_tokens": 1344915732.0, "reward": 0.7047051191329956, "reward_std": 0.4125956594944, "rewards/TRLRewardAdapter/mean": 0.7047051191329956, "rewards/TRLRewardAdapter/std": 0.41259562969207764, "sampling/importance_sampling_ratio/max": 2.609915018081665, "sampling/importance_sampling_ratio/mean": 0.20818400382995605, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 29.17110824584961, "sampling/sampling_logp_difference/mean": 0.022897472605109215, "step": 573, "step_time": 294.3631983367959 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02708333544433117, "completions/max_length": 10000.0, "completions/max_terminated_length": 9976.0, "completions/mean_length": 1525.1187744140625, "completions/mean_terminated_length": 1289.2012939453125, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "entropy": 0.1430619756380717, "epoch": 0.38317757009345793, "frac_reward_zero_std": 0.03333333507180214, "grad_norm": 0.00031884313209341094, "learning_rate": 5e-06, "loss": -0.0, "num_tokens": 1346770790.0, "reward": 0.7781817317008972, "reward_std": 0.36392727494239807, "rewards/TRLRewardAdapter/mean": 0.7781817317008972, "rewards/TRLRewardAdapter/std": 0.36392727494239807, "sampling/importance_sampling_ratio/max": 2.1415505409240723, "sampling/importance_sampling_ratio/mean": 0.24632516503334045, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 6.0, "sampling/sampling_logp_difference/mean": 0.020191410556435585, "step": 574, "step_time": 269.03374385018833 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00729166716337204, "completions/max_length": 10000.0, "completions/max_terminated_length": 9929.0, "completions/mean_length": 1352.862548828125, "completions/mean_terminated_length": 1289.3472900390625, "completions/min_length": 41.0, "completions/min_terminated_length": 41.0, "entropy": 0.14411567648251852, "epoch": 0.38384512683578104, "frac_reward_zero_std": 0.03333333507180214, "grad_norm": 2.3298767292206547e-05, "learning_rate": 5e-06, "loss": -0.0, "num_tokens": 1348564898.0, "reward": 0.7772690057754517, "reward_std": 0.35805848240852356, "rewards/TRLRewardAdapter/mean": 0.7772690057754517, "rewards/TRLRewardAdapter/std": 0.35805848240852356, "sampling/importance_sampling_ratio/max": 2.2189412117004395, "sampling/importance_sampling_ratio/mean": 0.20505771040916443, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 23.75, "sampling/sampling_logp_difference/mean": 0.020676633343100548, "step": 575, "step_time": 252.76038141117897 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0364583358168602, "completions/max_length": 10000.0, "completions/max_terminated_length": 8620.0, "completions/mean_length": 1209.300048828125, "completions/mean_terminated_length": 876.678955078125, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "entropy": 0.1399358262618383, "epoch": 0.38451268357810414, "frac_reward_zero_std": 0.03333333507180214, "grad_norm": 3.595265089187545e-05, "learning_rate": 5e-06, "loss": -0.0001, "num_tokens": 1350116066.0, "reward": 0.8054818511009216, "reward_std": 0.3349219858646393, "rewards/TRLRewardAdapter/mean": 0.8054817914962769, "rewards/TRLRewardAdapter/std": 0.3349219858646393, "sampling/importance_sampling_ratio/max": 1.9606943130493164, "sampling/importance_sampling_ratio/mean": 0.17543776333332062, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 59.0, "sampling/sampling_logp_difference/mean": 0.0197818111628294, "step": 576, "step_time": 226.4863847978413 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.04791666939854622, "completions/max_length": 10000.0, "completions/max_terminated_length": 9957.0, "completions/mean_length": 1920.8333740234375, "completions/mean_terminated_length": 1514.2232666015625, "completions/min_length": 40.0, "completions/min_terminated_length": 40.0, "entropy": 0.162629596889019, "epoch": 0.38518024032042725, "frac_reward_zero_std": 0.10000000894069672, "grad_norm": 3.9829908627435514e-05, "learning_rate": 5e-06, "loss": -0.0, "num_tokens": 1352411874.0, "reward": 0.728107213973999, "reward_std": 0.37855517864227295, "rewards/TRLRewardAdapter/mean": 0.728107213973999, "rewards/TRLRewardAdapter/std": 0.37855517864227295, "sampling/importance_sampling_ratio/max": 1.9420405626296997, "sampling/importance_sampling_ratio/mean": 0.16834327578544617, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 50.612422943115234, "sampling/sampling_logp_difference/mean": 0.02282695658504963, "step": 577, "step_time": 350.28503700206056 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0364583358168602, "completions/max_length": 10000.0, "completions/max_terminated_length": 9915.0, "completions/mean_length": 1909.5782470703125, "completions/mean_terminated_length": 1603.4541015625, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "entropy": 0.14723086853822073, "epoch": 0.38584779706275035, "frac_reward_zero_std": 0.13333334028720856, "grad_norm": 1.326386354321238e-05, "learning_rate": 5e-06, "loss": -0.0, "num_tokens": 1354646765.0, "reward": 0.7266812920570374, "reward_std": 0.38734960556030273, "rewards/TRLRewardAdapter/mean": 0.7266812920570374, "rewards/TRLRewardAdapter/std": 0.38734960556030273, "sampling/importance_sampling_ratio/max": 1.7782319784164429, "sampling/importance_sampling_ratio/mean": 0.21519342064857483, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 9.887248039245605, "sampling/sampling_logp_difference/mean": 0.020773520693182945, "step": 578, "step_time": 317.2008940981468 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 10000.0, "completions/max_terminated_length": 9650.0, "completions/mean_length": 1501.9625244140625, "completions/mean_terminated_length": 1367.072998046875, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "entropy": 0.15837512910366058, "epoch": 0.38651535380507346, "frac_reward_zero_std": 0.03333333507180214, "grad_norm": 0.00029509145303870387, "learning_rate": 5e-06, "loss": 0.0, "num_tokens": 1356556329.0, "reward": 0.7253735065460205, "reward_std": 0.3941391706466675, "rewards/TRLRewardAdapter/mean": 0.7253734469413757, "rewards/TRLRewardAdapter/std": 0.3941391408443451, "sampling/importance_sampling_ratio/max": 2.4470040798187256, "sampling/importance_sampling_ratio/mean": 0.22336483001708984, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 6.4375, "sampling/sampling_logp_difference/mean": 0.02240585722029209, "step": 579, "step_time": 250.82884403283242 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.06041666865348816, "completions/max_length": 10000.0, "completions/max_terminated_length": 9341.0, "completions/mean_length": 1665.7646484375, "completions/mean_terminated_length": 1129.8603515625, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "entropy": 0.12914156168699265, "epoch": 0.3871829105473965, "frac_reward_zero_std": 0.06666667014360428, "grad_norm": 6.909847404593091e-05, "learning_rate": 5e-06, "loss": -0.0, "num_tokens": 1358652487.0, "reward": 0.7763299345970154, "reward_std": 0.3492048680782318, "rewards/TRLRewardAdapter/mean": 0.7763298749923706, "rewards/TRLRewardAdapter/std": 0.3492048680782318, "sampling/importance_sampling_ratio/max": 2.545245409011841, "sampling/importance_sampling_ratio/mean": 0.2472088634967804, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 28.125, "sampling/sampling_logp_difference/mean": 0.019094431772828102, "step": 580, "step_time": 363.81320697499905 }, { "epoch": 0.3871829105473965, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.02130434718792853, "eval_completions/max_length": 10000.0, "eval_completions/max_terminated_length": 8712.217391304348, "eval_completions/mean_length": 826.7608456818954, "eval_completions/mean_terminated_length": 626.853235659392, "eval_completions/min_length": 28.652173913043477, "eval_completions/min_terminated_length": 28.652173913043477, "eval_entropy": 0.1464863886003909, "eval_frac_reward_zero_std": 1.0, "eval_loss": 0.0, "eval_num_tokens": 1358652487.0, "eval_reward": 0.5799241221469381, "eval_reward_std": 0.47049766519795294, "eval_rewards/TRLRewardAdapter/mean": 0.579924137695976, "eval_rewards/TRLRewardAdapter/std": 0.4704976690852124, "eval_runtime": 1369.9659, "eval_samples_per_second": 3.333, "eval_sampling/importance_sampling_ratio/max": 2.104938859524934, "eval_sampling/importance_sampling_ratio/mean": 0.5093433584855951, "eval_sampling/importance_sampling_ratio/min": 0.0, "eval_sampling/sampling_logp_difference/max": 12.048758698546369, "eval_sampling/sampling_logp_difference/mean": 0.020524917413359104, "eval_steps_per_second": 0.017, "step": 580 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.04791666939854622, "completions/max_length": 10000.0, "completions/max_terminated_length": 9968.0, "completions/mean_length": 1701.455322265625, "completions/mean_terminated_length": 1283.80419921875, "completions/min_length": 41.0, "completions/min_terminated_length": 41.0, "entropy": 0.16002057492733002, "epoch": 0.3878504672897196, "frac_reward_zero_std": 0.13333334028720856, "grad_norm": 0.0003645333427035675, "learning_rate": 5e-06, "loss": 0.0001, "num_tokens": 1360721564.0, "reward": 0.7125982046127319, "reward_std": 0.41270512342453003, "rewards/TRLRewardAdapter/mean": 0.7125981450080872, "rewards/TRLRewardAdapter/std": 0.41270512342453003, "sampling/importance_sampling_ratio/max": 2.253051519393921, "sampling/importance_sampling_ratio/mean": 0.18203027546405792, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 17.625, "sampling/sampling_logp_difference/mean": 0.02277790568768978, "step": 581, "step_time": 316.8643878180301 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08750000596046448, "completions/max_length": 10000.0, "completions/max_terminated_length": 9802.0, "completions/mean_length": 2304.135498046875, "completions/mean_terminated_length": 1566.17578125, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "entropy": 0.14409683893124262, "epoch": 0.3885180240320427, "frac_reward_zero_std": 0.13333334028720856, "grad_norm": 4.676918871057567e-05, "learning_rate": 5e-06, "loss": -0.0001, "num_tokens": 1363336510.0, "reward": 0.692875325679779, "reward_std": 0.40463122725486755, "rewards/TRLRewardAdapter/mean": 0.6928752660751343, "rewards/TRLRewardAdapter/std": 0.40463122725486755, "sampling/importance_sampling_ratio/max": 2.0852112770080566, "sampling/importance_sampling_ratio/mean": 0.1629120111465454, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 47.5, "sampling/sampling_logp_difference/mean": 0.020440509542822838, "step": 582, "step_time": 337.00779427215457 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.06458333879709244, "completions/max_length": 10000.0, "completions/max_terminated_length": 9779.0, "completions/mean_length": 2678.640869140625, "completions/mean_terminated_length": 2173.156982421875, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "entropy": 0.17054496457179388, "epoch": 0.3891855807743658, "frac_reward_zero_std": 0.10000000894069672, "grad_norm": 0.0007106393887099103, "learning_rate": 5e-06, "loss": 0.0, "num_tokens": 1366375685.0, "reward": 0.5870211124420166, "reward_std": 0.42860010266304016, "rewards/TRLRewardAdapter/mean": 0.5870210528373718, "rewards/TRLRewardAdapter/std": 0.4286000728607178, "sampling/importance_sampling_ratio/max": 1.6434991359710693, "sampling/importance_sampling_ratio/mean": 0.14081494510173798, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 65.0, "sampling/sampling_logp_difference/mean": 0.02349480614066124, "step": 583, "step_time": 285.98574150691275 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.07708333432674408, "completions/max_length": 10000.0, "completions/max_terminated_length": 9974.0, "completions/mean_length": 2535.33251953125, "completions/mean_terminated_length": 1911.87255859375, "completions/min_length": 41.0, "completions/min_terminated_length": 41.0, "entropy": 0.15997222065925598, "epoch": 0.38985313751668893, "frac_reward_zero_std": 0.10000000894069672, "grad_norm": 7.572251459035409e-06, "learning_rate": 5e-06, "loss": -0.0, "num_tokens": 1369219012.0, "reward": 0.6584241390228271, "reward_std": 0.4132910966873169, "rewards/TRLRewardAdapter/mean": 0.6584241390228271, "rewards/TRLRewardAdapter/std": 0.4132910668849945, "sampling/importance_sampling_ratio/max": 2.226548910140991, "sampling/importance_sampling_ratio/mean": 0.17484894394874573, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 32.0, "sampling/sampling_logp_difference/mean": 0.02220935933291912, "step": 584, "step_time": 394.13456944806967 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.07083334028720856, "completions/max_length": 10000.0, "completions/max_terminated_length": 9992.0, "completions/mean_length": 1662.6708984375, "completions/mean_terminated_length": 1027.0897216796875, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "entropy": 0.17047365506490073, "epoch": 0.39052069425901204, "frac_reward_zero_std": 0.10000000894069672, "grad_norm": 5.181751853438204e-05, "learning_rate": 5e-06, "loss": -0.0001, "num_tokens": 1371223112.0, "reward": 0.7367725968360901, "reward_std": 0.38783130049705505, "rewards/TRLRewardAdapter/mean": 0.7367725372314453, "rewards/TRLRewardAdapter/std": 0.38783130049705505, "sampling/importance_sampling_ratio/max": 1.8392119407653809, "sampling/importance_sampling_ratio/mean": 0.19695711135864258, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 48.5, "sampling/sampling_logp_difference/mean": 0.02325606346130371, "step": 585, "step_time": 380.2081052970607 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.04062500223517418, "completions/max_length": 10000.0, "completions/max_terminated_length": 9071.0, "completions/mean_length": 1449.1917724609375, "completions/mean_terminated_length": 1087.1053466796875, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "entropy": 0.14232054352760315, "epoch": 0.39118825100133514, "frac_reward_zero_std": 0.13333334028720856, "grad_norm": 0.0002603090317311224, "learning_rate": 5e-06, "loss": -0.0, "num_tokens": 1373056992.0, "reward": 0.7553271651268005, "reward_std": 0.3881354033946991, "rewards/TRLRewardAdapter/mean": 0.7553271055221558, "rewards/TRLRewardAdapter/std": 0.3881354033946991, "sampling/importance_sampling_ratio/max": 2.9608993530273438, "sampling/importance_sampling_ratio/mean": 0.1935717761516571, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 23.88724708557129, "sampling/sampling_logp_difference/mean": 0.020354032516479492, "step": 586, "step_time": 391.79384384898003 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08750000596046448, "completions/max_length": 10000.0, "completions/max_terminated_length": 9945.0, "completions/mean_length": 2243.674072265625, "completions/mean_terminated_length": 1499.9166259765625, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "entropy": 0.16705949107805887, "epoch": 0.3918558077436582, "frac_reward_zero_std": 0.1666666716337204, "grad_norm": 9.923268843706311e-05, "learning_rate": 5e-06, "loss": 0.0, "num_tokens": 1375656135.0, "reward": 0.6610710620880127, "reward_std": 0.4263043701648712, "rewards/TRLRewardAdapter/mean": 0.6610710024833679, "rewards/TRLRewardAdapter/std": 0.4263043701648712, "sampling/importance_sampling_ratio/max": 2.056272506713867, "sampling/importance_sampling_ratio/mean": 0.11332857608795166, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 34.71015167236328, "sampling/sampling_logp_difference/mean": 0.022627322003245354, "step": 587, "step_time": 367.50078433193266 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03854166716337204, "completions/max_length": 10000.0, "completions/max_terminated_length": 9606.0, "completions/mean_length": 1884.4365234375, "completions/mean_terminated_length": 1559.1104736328125, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "entropy": 0.15695008635520935, "epoch": 0.3925233644859813, "frac_reward_zero_std": 0.1666666716337204, "grad_norm": 0.00016756886008514508, "learning_rate": 5e-06, "loss": -0.0001, "num_tokens": 1377940618.0, "reward": 0.5846682786941528, "reward_std": 0.4541855454444885, "rewards/TRLRewardAdapter/mean": 0.5846682190895081, "rewards/TRLRewardAdapter/std": 0.4541855454444885, "sampling/importance_sampling_ratio/max": 2.5590522289276123, "sampling/importance_sampling_ratio/mean": 0.1603311151266098, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 28.0, "sampling/sampling_logp_difference/mean": 0.02192249707877636, "step": 588, "step_time": 323.68802904803306 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.04375000298023224, "completions/max_length": 10000.0, "completions/max_terminated_length": 9820.0, "completions/mean_length": 2852.047119140625, "completions/mean_terminated_length": 2525.016357421875, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "entropy": 0.1636831909418106, "epoch": 0.3931909212283044, "frac_reward_zero_std": 0.06666667014360428, "grad_norm": 6.779209466076819e-06, "learning_rate": 5e-06, "loss": -0.0, "num_tokens": 1381176375.0, "reward": 0.576002836227417, "reward_std": 0.44899678230285645, "rewards/TRLRewardAdapter/mean": 0.576002836227417, "rewards/TRLRewardAdapter/std": 0.44899678230285645, "sampling/importance_sampling_ratio/max": 2.000391721725464, "sampling/importance_sampling_ratio/mean": 0.20237717032432556, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 21.373071670532227, "sampling/sampling_logp_difference/mean": 0.022895798087120056, "step": 589, "step_time": 333.39080166397616 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.05312500149011612, "completions/max_length": 10000.0, "completions/max_terminated_length": 9373.0, "completions/mean_length": 1294.03857421875, "completions/mean_terminated_length": 805.5852661132812, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "entropy": 0.12930850187937418, "epoch": 0.3938584779706275, "frac_reward_zero_std": 0.13333334028720856, "grad_norm": 0.00021296020818957674, "learning_rate": 5e-06, "loss": 0.0034, "num_tokens": 1382851900.0, "reward": 0.7598354816436768, "reward_std": 0.3917582035064697, "rewards/TRLRewardAdapter/mean": 0.759835422039032, "rewards/TRLRewardAdapter/std": 0.39175817370414734, "sampling/importance_sampling_ratio/max": 2.3135509490966797, "sampling/importance_sampling_ratio/mean": 0.3108346462249756, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 7.5, "sampling/sampling_logp_difference/mean": 0.01863671839237213, "step": 590, "step_time": 284.7445003800094 }, { "epoch": 0.3938584779706275, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.019347825492529766, "eval_completions/max_length": 9985.0, "eval_completions/max_terminated_length": 8420.08695652174, "eval_completions/mean_length": 753.6056332795516, "eval_completions/mean_terminated_length": 571.1733504585598, "eval_completions/min_length": 21.043478260869566, "eval_completions/min_terminated_length": 21.043478260869566, "eval_entropy": 0.14849389830361243, "eval_frac_reward_zero_std": 1.0, "eval_loss": 0.0, "eval_num_tokens": 1382851900.0, "eval_reward": 0.577099700336871, "eval_reward_std": 0.4744553099507871, "eval_rewards/TRLRewardAdapter/mean": 0.5770997158859087, "eval_rewards/TRLRewardAdapter/std": 0.4744553164295528, "eval_runtime": 1366.5893, "eval_samples_per_second": 3.341, "eval_sampling/importance_sampling_ratio/max": 2.1062948392785112, "eval_sampling/importance_sampling_ratio/mean": 0.5338882778001868, "eval_sampling/importance_sampling_ratio/min": 0.0, "eval_sampling/sampling_logp_difference/max": 4.9538179065870205, "eval_sampling/sampling_logp_difference/mean": 0.020978268073952717, "eval_steps_per_second": 0.017, "step": 590 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02812500111758709, "completions/max_length": 10000.0, "completions/max_terminated_length": 9524.0, "completions/mean_length": 1476.345947265625, "completions/mean_terminated_length": 1229.6805419921875, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "entropy": 0.14111079027255377, "epoch": 0.3945260347129506, "frac_reward_zero_std": 0.06666667014360428, "grad_norm": 0.0004021007385071464, "learning_rate": 5e-06, "loss": -0.0002, "num_tokens": 1384747176.0, "reward": 0.7336202263832092, "reward_std": 0.39775317907333374, "rewards/TRLRewardAdapter/mean": 0.7336202263832092, "rewards/TRLRewardAdapter/std": 0.39775314927101135, "sampling/importance_sampling_ratio/max": 2.3014702796936035, "sampling/importance_sampling_ratio/mean": 0.20838835835456848, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 32.5, "sampling/sampling_logp_difference/mean": 0.019590046256780624, "step": 591, "step_time": 267.318884575041 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.05937500298023224, "completions/max_length": 10000.0, "completions/max_terminated_length": 9913.0, "completions/mean_length": 2230.25732421875, "completions/mean_terminated_length": 1739.8084716796875, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "entropy": 0.1658633897701899, "epoch": 0.3951935914552737, "frac_reward_zero_std": 0.06666667014360428, "grad_norm": 0.00023671849502533246, "learning_rate": 5e-06, "loss": -0.0001, "num_tokens": 1387334015.0, "reward": 0.6365171670913696, "reward_std": 0.42973122000694275, "rewards/TRLRewardAdapter/mean": 0.6365171670913696, "rewards/TRLRewardAdapter/std": 0.42973119020462036, "sampling/importance_sampling_ratio/max": 1.4882545471191406, "sampling/importance_sampling_ratio/mean": 0.12827172875404358, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 19.5, "sampling/sampling_logp_difference/mean": 0.02250560186803341, "step": 592, "step_time": 403.9395955960499 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02395833469927311, "completions/max_length": 10000.0, "completions/max_terminated_length": 9976.0, "completions/mean_length": 1343.010498046875, "completions/mean_terminated_length": 1130.51220703125, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "entropy": 0.1595576455195745, "epoch": 0.39586114819759677, "frac_reward_zero_std": 0.13333334028720856, "grad_norm": 0.00026588890378468943, "learning_rate": 5e-06, "loss": -0.0, "num_tokens": 1389006025.0, "reward": 0.6821211576461792, "reward_std": 0.43477556109428406, "rewards/TRLRewardAdapter/mean": 0.6821211576461792, "rewards/TRLRewardAdapter/std": 0.43477556109428406, "sampling/importance_sampling_ratio/max": 1.7670725584030151, "sampling/importance_sampling_ratio/mean": 0.27480119466781616, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 36.60660171508789, "sampling/sampling_logp_difference/mean": 0.022981110960245132, "step": 593, "step_time": 288.1138937351061 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0364583358168602, "completions/max_length": 10000.0, "completions/max_terminated_length": 9821.0, "completions/mean_length": 1281.471923828125, "completions/mean_terminated_length": 951.5816650390625, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "entropy": 0.14328760405381522, "epoch": 0.3965287049399199, "frac_reward_zero_std": 0.06666667014360428, "grad_norm": 0.0007163435442271228, "learning_rate": 5e-06, "loss": -0.0001, "num_tokens": 1390612558.0, "reward": 0.7129787802696228, "reward_std": 0.4149150848388672, "rewards/TRLRewardAdapter/mean": 0.712978720664978, "rewards/TRLRewardAdapter/std": 0.4149150252342224, "sampling/importance_sampling_ratio/max": 2.1457245349884033, "sampling/importance_sampling_ratio/mean": 0.24476878345012665, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 22.13724708557129, "sampling/sampling_logp_difference/mean": 0.020732615143060684, "step": 594, "step_time": 316.1868372291792 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 10000.0, "completions/max_terminated_length": 9721.0, "completions/mean_length": 1497.6375732421875, "completions/mean_terminated_length": 1223.3677978515625, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "entropy": 0.13133839766184488, "epoch": 0.397196261682243, "frac_reward_zero_std": 0.20000001788139343, "grad_norm": 0.00024202629985546376, "learning_rate": 5e-06, "loss": 0.0, "num_tokens": 1392485170.0, "reward": 0.7600169777870178, "reward_std": 0.40033194422721863, "rewards/TRLRewardAdapter/mean": 0.7600169777870178, "rewards/TRLRewardAdapter/std": 0.400331974029541, "sampling/importance_sampling_ratio/max": 2.2191355228424072, "sampling/importance_sampling_ratio/mean": 0.2856569290161133, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 52.194156646728516, "sampling/sampling_logp_difference/mean": 0.01954265497624874, "step": 595, "step_time": 288.1600417611189 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02708333544433117, "completions/max_length": 10000.0, "completions/max_terminated_length": 9928.0, "completions/mean_length": 1721.8333740234375, "completions/mean_terminated_length": 1491.391845703125, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "entropy": 0.12919090067346892, "epoch": 0.3978638184245661, "frac_reward_zero_std": 0.20000001788139343, "grad_norm": 0.00037006759999412103, "learning_rate": 5e-06, "loss": 0.0001, "num_tokens": 1394582546.0, "reward": 0.6623679399490356, "reward_std": 0.4387233853340149, "rewards/TRLRewardAdapter/mean": 0.6623678803443909, "rewards/TRLRewardAdapter/std": 0.4387233853340149, "sampling/importance_sampling_ratio/max": 2.2604827880859375, "sampling/importance_sampling_ratio/mean": 0.3100354075431824, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 19.0, "sampling/sampling_logp_difference/mean": 0.019559035077691078, "step": 596, "step_time": 297.0463792128721 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01770833507180214, "completions/max_length": 10000.0, "completions/max_terminated_length": 9444.0, "completions/mean_length": 1910.3834228515625, "completions/mean_terminated_length": 1764.5472412109375, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "entropy": 0.1637455721696218, "epoch": 0.3985313751668892, "frac_reward_zero_std": 0.13333334028720856, "grad_norm": 0.00045136339727094686, "learning_rate": 5e-06, "loss": -0.0001, "num_tokens": 1396926018.0, "reward": 0.6118760108947754, "reward_std": 0.4512173533439636, "rewards/TRLRewardAdapter/mean": 0.6118759512901306, "rewards/TRLRewardAdapter/std": 0.45121732354164124, "sampling/importance_sampling_ratio/max": 1.4498364925384521, "sampling/importance_sampling_ratio/mean": 0.1588914692401886, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 13.0, "sampling/sampling_logp_difference/mean": 0.022702975198626518, "step": 597, "step_time": 253.1290158709744 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01250000111758709, "completions/max_length": 10000.0, "completions/max_terminated_length": 9949.0, "completions/mean_length": 1329.018798828125, "completions/mean_terminated_length": 1219.2593994140625, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "entropy": 0.13784814874331155, "epoch": 0.3991989319092123, "frac_reward_zero_std": 0.10000000894069672, "grad_norm": 0.0005510918960605673, "learning_rate": 5e-06, "loss": 0.0002, "num_tokens": 1398640404.0, "reward": 0.6691594123840332, "reward_std": 0.43661099672317505, "rewards/TRLRewardAdapter/mean": 0.6691593527793884, "rewards/TRLRewardAdapter/std": 0.43661099672317505, "sampling/importance_sampling_ratio/max": 2.0003139972686768, "sampling/importance_sampling_ratio/mean": 0.26237064599990845, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 12.756869316101074, "sampling/sampling_logp_difference/mean": 0.020248621702194214, "step": 598, "step_time": 198.79380327113904 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.05937500298023224, "completions/max_length": 10000.0, "completions/max_terminated_length": 9977.0, "completions/mean_length": 2245.36376953125, "completions/mean_terminated_length": 1755.8682861328125, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "entropy": 0.14850561072429022, "epoch": 0.3998664886515354, "frac_reward_zero_std": 0.30000001192092896, "grad_norm": 0.0004375925776481509, "learning_rate": 5e-06, "loss": 0.0001, "num_tokens": 1401386033.0, "reward": 0.5673505663871765, "reward_std": 0.45688796043395996, "rewards/TRLRewardAdapter/mean": 0.5673505067825317, "rewards/TRLRewardAdapter/std": 0.45688796043395996, "sampling/importance_sampling_ratio/max": 2.229168653488159, "sampling/importance_sampling_ratio/mean": 0.17875085771083832, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 12.25, "sampling/sampling_logp_difference/mean": 0.02028873935341835, "step": 599, "step_time": 347.1967393888626 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08437500149011612, "completions/max_length": 10000.0, "completions/max_terminated_length": 9988.0, "completions/mean_length": 2598.675048828125, "completions/mean_terminated_length": 1916.6416015625, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "entropy": 0.15244260678688684, "epoch": 0.40053404539385845, "frac_reward_zero_std": 0.20000001788139343, "grad_norm": 2.156179940511417e-05, "learning_rate": 5e-06, "loss": -0.0, "num_tokens": 1404349465.0, "reward": 0.5769863724708557, "reward_std": 0.451263427734375, "rewards/TRLRewardAdapter/mean": 0.5769863128662109, "rewards/TRLRewardAdapter/std": 0.451263427734375, "sampling/importance_sampling_ratio/max": 2.1790881156921387, "sampling/importance_sampling_ratio/mean": 0.19286100566387177, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 39.969566345214844, "sampling/sampling_logp_difference/mean": 0.021987024694681168, "step": 600, "step_time": 392.1271449038759 }, { "epoch": 0.40053404539385845, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.017608695137112038, "eval_completions/max_length": 9967.173913043478, "eval_completions/max_terminated_length": 8407.91304347826, "eval_completions/mean_length": 775.5382371985394, "eval_completions/mean_terminated_length": 610.5376971700917, "eval_completions/min_length": 20.26086956521739, "eval_completions/min_terminated_length": 20.26086956521739, "eval_entropy": 0.14455284048681674, "eval_frac_reward_zero_std": 1.0, "eval_loss": 0.0, "eval_num_tokens": 1404349465.0, "eval_reward": 0.588189581166143, "eval_reward_std": 0.4705213956210924, "eval_rewards/TRLRewardAdapter/mean": 0.5881895889406619, "eval_rewards/TRLRewardAdapter/std": 0.4705213930295861, "eval_runtime": 1362.8238, "eval_samples_per_second": 3.35, "eval_sampling/importance_sampling_ratio/max": 1.9258847599444182, "eval_sampling/importance_sampling_ratio/mean": 0.5319232513075289, "eval_sampling/importance_sampling_ratio/min": 0.0, "eval_sampling/sampling_logp_difference/max": 6.4922278549360195, "eval_sampling/sampling_logp_difference/mean": 0.02074409292443939, "eval_steps_per_second": 0.017, "step": 600 } ], "logging_steps": 1.0, "max_steps": 1498, "num_input_tokens_seen": 1404349465, "num_train_epochs": 1, "save_steps": 50, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }